In [3]:
import pandas as pd

data = pd.read_json('data/bloomberg_quint_news.json')

In [4]:
def check_missing_values(df):
    # Check for NaN, missing, or NaT values
    missing_rows = df[df.isnull().any(axis=1)]

    if not missing_rows.empty:
        print("Deleting rows with missing values...")
        df.dropna(inplace=True)
        print("Rows with missing values have been deleted.")
    else:
        print("There are no missing values in the DataFrame.")

# Example usage:
check_missing_values(data)


Deleting rows with missing values...
Rows with missing values have been deleted.


In [None]:
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity

# Download NLTK resources (uncomment the following lines if not already downloaded)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load pre-trained GloVe vectors
glove_model = KeyedVectors.load_word2vec_format('gloves/glove.6B.300d.txt', binary=False)

def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()

    # Remove punctuation and numbers
    text = ''.join([char for char in text if char.isalpha() or char.isspace()])

    # Tokenize the text into words
    tokens = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatize the words
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return tokens

def get_word_vector(word, model):
    if word in model:
        return model[word]
    else:
        return None

def find_closest_word(string, query, model):
    # Preprocess the string and query
    string_tokens = preprocess_text(string)
    query_tokens = preprocess_text(query)

    # Convert tokens into GloVe vectors
    string_vectors = [get_word_vector(word, model) for word in string_tokens]
    query_vector = get_word_vector(query_tokens[0], model)  # Assuming query consists of one word

    # Calculate cosine similarity between query vector and each word vector in the string
    similarities = [cosine_similarity([query_vector], [word_vector])[0][0] for word_vector in string_vectors]

    # Find the index of the word in the string with the highest similarity to the query word
    closest_word_index = np.argmax(similarities)

    # Return the word in the string with the highest similarity to the query word
    return string_tokens[closest_word_index]

# Example usage:
#string = "This is an example string for NLP preprocessing."
#query = "example"
#closest_word = find_closest_word(string, query, glove_model)
#print("Closest word in the string to the query word:", closest_word)


In [16]:
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Download NLTK resources (uncomment the following lines if not already downloaded)
#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('wordnet')

# Load pre-trained GloVe vectors
glove_model = KeyedVectors.load_word2vec_format('gloves/converted_vectors.txt', binary=False)

def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()

    # Remove punctuation and numbers
    text = ''.join([char for char in text if char.isalpha() or char.isspace()])

    # Tokenize the text into words
    tokens = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatize the words
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return tokens

def get_word_vector(word, model):
    if word in model:
        return model[word]
    else:
        return None

def find_closest_word(string, query, model):
    # Preprocess the string and query
    string_tokens = preprocess_text(string)
    query_tokens = preprocess_text(query)

    # Convert tokens into GloVe vectors
    string_vectors = [get_word_vector(word, model) for word in string_tokens]
    query_vector = get_word_vector(query_tokens[0], model)  # Assuming query consists of one word

    # Calculate cosine similarity between query vector and each word vector in the string
    similarities = [cosine_similarity([query_vector], [word_vector])[0][0] for word_vector in string_vectors]

    # Find the index of the word in the string with the highest similarity to the query word
    closest_word_index = np.argmax(similarities)

    # Return the word in the string with the highest similarity to the query word
    return string_tokens[closest_word_index]

def find_top_articles_with_keyword(data, keyword):
    similarity_scores = []
    for index, row in data.iterrows():
        text = row['title'] + " " + row['short_description'] + " " + row['description']
        similarity_score = find_similarity_with_keyword(text, keyword, glove_model)
        similarity_scores.append((index, similarity_score))
    similarity_scores.sort(key=lambda x: x[1], reverse=True)
    top_articles = similarity_scores[:10]
    top_article_indices = [article[0] for article in top_articles]
    return data.iloc[top_article_indices]

def find_similarity_with_keyword(text, keyword, model):
    tokens = preprocess_text(text)
    keyword_vector = get_word_vector(keyword, model)
    if keyword_vector is None:
        return 0  # Keyword not found in the model
    text_vectors = [get_word_vector(word, model) for word in tokens]
    text_vectors = [vec for vec in text_vectors if vec is not None]  # Remove None vectors
    if len(text_vectors) == 0:
        return 0  # No valid vectors found in the text
    similarity_scores = [cosine_similarity([keyword_vector], [vec])[0][0] for vec in text_vectors]
    return np.mean(similarity_scores)

# Sample usage:
keyword = "stock"  # Specify the keyword to search for
top_articles = find_top_articles_with_keyword(data, keyword)
print(top_articles[['title', 'short_description', 'description']])


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\maxim\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\maxim\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\maxim\AppData\Roaming\nltk_data...


                                                  title  \
4010  Major Evergrande Backer Chinese Estates May Se...   
22     All You Need To Know Going Into Trade On Feb. 20   
316   Trump Warns EU of Car Tariffs as Commerce Prob...   
351   Coast Guard Officer Accused of Plotting to Kil...   
69    Zambia Court Orders Liquidation of Billionaire...   
17       All You Need To Know Going Into Trade On May 4   
2663  Qatar Petroleum Plans $10 Billion Bond Sale fo...   
1697  Solara Active Pharma Science - Emerging As Lar...   
18     All You Need To Know Going Into Trade On May 5\n   
9     All You Need To Know Going Into Trade On April 28   

                                      short_description  \
4010  Major Evergrande Backer Chinese Estates May Se...   
22    Stocks in the news, big brokerage calls of the...   
316   Trump Says He'll Slap Auto Tariffs on EU If No...   
351   Coast Guard Officer Accused of Plotting to Kil...   
69    Zambia Court Orders Liquidation of Billionaire...

In [17]:
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Download NLTK resources (uncomment the following lines if not already downloaded)
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

# Load pre-trained GloVe vectors
glove_model = KeyedVectors.load_word2vec_format('gloves/converted_vectors.txt', binary=False)

def preprocess_text(text):
    text = text.lower()

    # Remove punctuation and numbers
    text = ''.join([char for char in text if char.isalpha() or char.isspace()])

    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return tokens

def get_word_vector(word, model):
    if word in model:
        return model[word]
    else:
        return None

def find_similarity_with_keyword(text, keyword, model):
    tokens = preprocess_text(text)
    keyword_vector = get_word_vector(keyword, model)
    if keyword_vector is None:
        return 0  # Keyword not found in the model
    weighted_similarities = []
    for token in tokens:
        token_vector = get_word_vector(token, model)
        if token_vector is not None:
            similarity = cosine_similarity([keyword_vector], [token_vector])[0][0]
            weighted_similarities.append(similarity)
    if len(weighted_similarities) == 0:
        return 0  # No valid vectors found in the text
    return np.mean(weighted_similarities)

def find_top_articles_with_keyword(data, keyword, weights={'title': 0.5, 'short_description': 0.3, 'description': 0.2}):
    weighted_similarity_scores = []
    for index, row in data.iterrows():
        title_score = find_similarity_with_keyword(row['title'], keyword, glove_model) * weights['title']
        short_desc_score = find_similarity_with_keyword(row['short_description'], keyword, glove_model) * weights['short_description']
        desc_score = find_similarity_with_keyword(row['description'], keyword, glove_model) * weights['description']
        total_score = title_score + short_desc_score + desc_score
        weighted_similarity_scores.append((index, total_score))
    weighted_similarity_scores.sort(key=lambda x: x[1], reverse=True)
    top_articles = weighted_similarity_scores[:10]
    top_article_indices = [article[0] for article in top_articles]
    return data.iloc[top_article_indices]

# Sample usage:
keyword = "france"  # Specify the keyword to search for
top_articles = find_top_articles_with_keyword(data, keyword)
print(top_articles[['title', 'short_description', 'description']])


                                                  title  \
2400  Biden Says He Was Unaware of Giuliani Raid, Wo...   
3187  ICICI Securities Q4 Review - Strong Customer A...   
2478  DoorDash Goes on European Deal Hunt Just Month...   
3182  Nestle India Q1 Review - Domestic Business Con...   
292         Eastern Europe Feeds on a Shrinking Ukraine   
386   Sony Sued for Limiting Purchases of Games to P...   
4117  Penn Endowment Posts 41% Return, Buoyed by Sto...   
104   Washington and Boston Are Beating NYC for Entr...   
3310  Cox Agrees to Buy Enterprise Unit of EQT’s Fib...   
2744  Oil Demand in India Drops as Wave of Virus Con...   

                                      short_description  \
2400  Biden Says He Was Unaware of Giuliani Raid, Wo...   
3187  ICICI Securities Q4 Review - Strong Customer A...   
2478  DoorDash Goes on European Deal Hunt Just Month...   
3182  Nestle India Q1 Review - Domestic Business Con...   
292         Eastern Europe Feeds on a Shrinking Ukraine

In [19]:
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import json

# Download NLTK resources (uncomment the following lines if not already downloaded)
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

# Load pre-trained GloVe vectors
glove_model = KeyedVectors.load_word2vec_format('gloves/converted_vectors.txt', binary=False)

def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()

    # Remove punctuation and numbers
    text = ''.join([char for char in text if char.isalpha() or char.isspace()])

    # Tokenize the text into words
    tokens = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatize the words
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return tokens

def get_word_vector(word, model):
    if word in model:
        return model[word]
    else:
        return None

def find_similarity_with_keyword(text_vectors, keyword_vector):
    valid_text_vectors = [vec for vec in text_vectors if vec is not None]  # Filter out NaN values
    if not valid_text_vectors:
        return 0  # No valid vectors found in the text

    similarity_scores = [cosine_similarity([keyword_vector], [vec])[0][0] for vec in valid_text_vectors]
    return np.mean(similarity_scores)


def find_top_articles_with_keyword(data, keyword, weights, user_preferences):
    keyword_vector = get_word_vector(keyword, glove_model)
    if keyword_vector is None:
        return "Keyword not found in the model"
    
    similarity_scores = []
    for index, row in data.iterrows():
        title_tokens = preprocess_text(row['title'])
        short_desc_tokens = preprocess_text(row['short_description'])
        desc_tokens = preprocess_text(row['description'])

        # Convert tokens into GloVe vectors
        title_vectors = [get_word_vector(word, glove_model) for word in title_tokens]
        short_desc_vectors = [get_word_vector(word, glove_model) for word in short_desc_tokens]
        desc_vectors = [get_word_vector(word, glove_model) for word in desc_tokens]

        # Calculate similarity scores for title, short description, and description
        title_similarity = find_similarity_with_keyword(title_vectors, keyword_vector)
        short_desc_similarity = find_similarity_with_keyword(short_desc_vectors, keyword_vector)
        desc_similarity = find_similarity_with_keyword(desc_vectors, keyword_vector)

        # Combine similarity scores with weights
        total_similarity = (weights['title'] * title_similarity +
                            weights['short_description'] * short_desc_similarity +
                            weights['description'] * desc_similarity)
        
        # Apply user preferences weighting
        topic_weight = user_preferences.get(row['category'], 0)
        total_similarity *= topic_weight
        
        similarity_scores.append((index, total_similarity))
    
    similarity_scores.sort(key=lambda x: x[1], reverse=True)
    top_articles = similarity_scores[:10]
    top_article_indices = [article[0] for article in top_articles]
    return data.iloc[top_article_indices]

# Load user preferences from JSON
def load_user_preferences_from_json(json_file):
    with open(json_file, 'r') as f:
        user_data = json.load(f)
    return user_data['user_preferences']

# Sample usage:
keyword = "france"  # Specify the keyword to search for
weights = {'title': 0.5, 'short_description': 0.3, 'description': 0.2}

# Load user preferences
user_preferences = load_user_preferences_from_json('data/user.json')

top_articles = find_top_articles_with_keyword(data, keyword, weights, user_preferences)
print(top_articles[['title', 'short_description', 'description']])


                                                  title  \
2400  Biden Says He Was Unaware of Giuliani Raid, Wo...   
4117  Penn Endowment Posts 41% Return, Buoyed by Sto...   
104   Washington and Boston Are Beating NYC for Entr...   
3673  Cyient Q4 Review - Strong Show Continues: Prab...   
3311  Biden Talks Up Benefits of Vaccines After New ...   
1273  Reliance Jio Q4 Review - Weak Financial Perfor...   
1186  California Tribe Buys Palms Casino in Vegas fo...   
3142  Shopify Turns to ‘Harry Potter’ to Show Heft A...   
1860  Apple Trial Threatens to Reveal App Store's Co...   
3475  Glencore Chair Defends Pay Plan for Commodity ...   

                                      short_description  \
2400  Biden Says He Was Unaware of Giuliani Raid, Wo...   
4117  Penn Endowment Posts 41% Return, Buoyed by Sto...   
104   Washington, Boston Beating NYC for Entrepreneu...   
3673  Cyient Q4 Review - Strong Show Continues: Prab...   
3311  However, masks should remain on anywhere there...