# IREI: Profile-based retrieval
### Víctor Morcuende Castell and Guillermo Nájera Lavid
#### Course 2022-2023

### Preprocessing Phase

In [None]:
import nltk

# Fetch only the NLTK resources this notebook uses: the English stop-word
# list (remove_stopwords) and WordNet (WordNetLemmatizer). 'omw-1.4' is
# included because some NLTK releases need it to load WordNet.
# Downloading 'all' would pull every corpus and model NLTK ships.
for resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

In [None]:
# Read the data
import pandas as pd

train_data = pd.read_csv('dataset/BBC News Train.csv')
test_data = pd.read_csv('dataset/BBC News Test.csv')

# Transform the data into a single dataset.
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# both halves keep their own row labels, so labels repeat and a label-based
# lookup can match more than one row.
# NOTE(review): if the test CSV has no 'Category' column, its rows carry NaN
# in that column after the concat — confirm against the files.
data = pd.concat([train_data, test_data], ignore_index=True)
data.to_csv('dataset/data.csv', index=False)
data.head(10)

In [None]:
# Show the (rows, columns) dimensions of the combined dataset
data.shape

In [None]:
# Number of documents per category, listed from the smallest class to the largest
docs_per_category = data.groupby(['Category']).size()
docs_per_category.sort_values(ascending=True)

In [None]:
# Pie chart of how many documents fall in each category
category_sizes = data.groupby(['Category']).size()
category_sizes.plot(kind='pie', figsize=(10, 6))

In [None]:
# Horizontal bar chart of the per-category document counts, sorted ascending
sorted_category_sizes = data.groupby(['Category']).size().sort_values(ascending=True)
sorted_category_sizes.plot(kind='barh', figsize=(10, 6))

In [None]:
# Remove all punctuations from the text
import string as st

# Translation table that deletes every character of string.punctuation.
# Built once so each call is a single pass over the text instead of a
# per-character membership test.
_PUNCT_TABLE = str.maketrans('', '', st.punctuation)


def remove_punct(text):
    """Return ``text`` with all ``string.punctuation`` characters removed.

    Args:
        text: The string to clean.

    Returns:
        A new string containing the remaining characters in their original
        order. Characters outside ``string.punctuation`` (letters, digits,
        whitespace, non-ASCII symbols) are kept unchanged.
    """
    return text.translate(_PUNCT_TABLE)

# Store the punctuation-free version of every article in a new column
data['removed_punc'] = data['Text'].apply(remove_punct)
data.head()

In [None]:
# Convert text to lower case tokens
import re

def tokenize(text):
    """Split ``text`` on runs of whitespace and lower-case every token.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lower-cased tokens. Leading or trailing whitespace does
        not produce empty-string tokens, and an empty or whitespace-only
        input yields an empty list.
    """
    # Raw string: '\s' in a normal literal is an invalid escape sequence.
    # The `if piece` filter drops the '' entries re.split emits at the edges.
    return [piece.lower() for piece in re.split(r'\s+', text) if piece]

# Tokenize the punctuation-free text of every article
data['tokens'] = data['removed_punc'].apply(tokenize)
data.head()

In [None]:
# Remove tokens of length less than 3
def remove_small_words(text, min_length=3):
    """Drop tokens shorter than ``min_length`` characters.

    The previous condition (``len(x) > 3``) also discarded three-letter
    tokens, contradicting the stated intent and removing words such as
    "tax" and "art" that appear in the topic keyword lists.

    Args:
        text: Iterable of string tokens.
        min_length: Minimum number of characters a token needs to be kept.

    Returns:
        A list of the tokens with ``len(token) >= min_length``, in order.
    """
    return [token for token in text if len(token) >= min_length]

# Keep only the tokens that pass the minimum-length filter
data['larger_tokens'] = data['tokens'].apply(remove_small_words)
data.head()

In [None]:
# Remove stopwords by using NLTK corpus list
def remove_stopwords(text):
    """Return the tokens of ``text`` that are not English stop words.

    Args:
        text: Iterable of string tokens.

    Returns:
        A list of the tokens absent from NLTK's English stop-word list,
        in their original order.
    """
    # Load the list once per call and keep it in a set. The original called
    # stopwords.words('english') again for every single token and searched
    # the returned list linearly.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    return [word for word in text if word not in stop_words]

# Strip English stop words from every token list
data['clean_tokens'] = data['larger_tokens'].apply(remove_stopwords)
data.head()

In [None]:
# Apply lemmatization on tokens
from nltk import WordNetLemmatizer

def lemmatize(text):
    """Lemmatize every token in ``text`` with NLTK's WordNetLemmatizer.

    Args:
        text: Iterable of string tokens.

    Returns:
        A list with the lemmatized form of each token, in the same order.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in text:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

# Lemmatize the stop-word-free tokens of every article
data['lemma_words'] = data['clean_tokens'].apply(lemmatize)
data.head()

In [None]:
# Create sentences to get clean text as input for vectors
def return_sentences(tokens):
    """Join ``tokens`` into one string, separated by single spaces.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        The tokens concatenated with a space between consecutive items;
        an empty string when there are no tokens.
    """
    separator = " "
    return separator.join(tokens)

# Rebuild a plain-text string from the lemmatized tokens of every article
data['clean_text'] = data['lemma_words'].apply(return_sentences)
data.head()

### Model and Evaluation Phase

In [None]:
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Balancing the dataset to have the same number of documents for each query
from sklearn.utils import resample

def balance_data(data, category_col):
    """Down-sample every category to the size of the smallest category.

    Rows whose ``category_col`` value is missing are ignored: they belong to
    no category, and treating NaN as a category produces an empty subset
    that cannot be sampled.

    Args:
        data: DataFrame containing the documents.
        category_col: Name of the column holding the category label.

    Returns:
        A new DataFrame with the same number of rows for each category,
        grouped by category in order of first appearance. Sampling is
        without replacement and uses a fixed ``random_state`` of 42. When
        ``data`` has no labelled rows, an empty frame with the same columns
        is returned.
    """
    labelled = data[data[category_col].notna()]
    if labelled.empty:
        return labelled

    categories = labelled[category_col].unique()
    min_category_count = labelled[category_col].value_counts().min()

    balanced_parts = [
        resample(
            labelled[labelled[category_col] == category],
            replace=False,
            n_samples=min_category_count,
            random_state=42,
        )
        for category in categories
    ]

    return pd.concat(balanced_parts)

In [None]:
# Balance the classes, then hold out 20% of the balanced documents for testing
balanced_data = balance_data(data, 'Category')
X_train, X_test, y_train, y_test = train_test_split(
    balanced_data['clean_text'],
    balanced_data['Category'],
    test_size=0.2,
    random_state=42,
)
balanced_counts = balanced_data.groupby(['Category']).size()
balanced_counts.sort_values(ascending=True)

In [None]:
# Pie chart of the per-category document counts after balancing
balanced_category_sizes = balanced_data.groupby(['Category']).size()
balanced_category_sizes.plot(kind='pie', figsize=(10, 6))

In [None]:
# Horizontal bar chart of the balanced per-category counts, sorted ascending
sorted_balanced_sizes = balanced_data.groupby(['Category']).size().sort_values(ascending=True)
sorted_balanced_sizes.plot(kind='barh', figsize=(10, 6))

In [None]:
# Fit a TF-IDF vectorizer (default settings) on the cleaned text of the
# balanced dataset. document_vectors has one row per balanced_data row,
# in the same order as balanced_data.
vectorizer = TfidfVectorizer()
document_vectors = vectorizer.fit_transform(balanced_data['clean_text'])

In [None]:
# Hand-written keyword list for each topic a user can be interested in.
# Keys are the interest names used in `users`; each list is joined into one
# pseudo-document when a topic vector is built.
# NOTE(review): the keywords are used as written here (plural forms, hyphenated
# terms) while the documents are lemmatized and stripped of punctuation first —
# confirm the keywords still match the fitted vocabulary.
topics = {
    # Sports
    'sports': ["sports", "championship", "soccer", "race", "football", "tennis", "baseball", "hockey", "basketball", "athletics", "rugby", "swimming", "golf", "cycling", "cricket", "marathon", "gymnastics", "boxing", "volleyball", "badminton", "fencing", "wrestling", "snowboarding", "skiing", "horse-racing", "archery", "table-tennis", "e-sports", "fitness", "olympics"],
    
    # Business
    'business': ["business", "finance", "stocks", "economy", "investment", "entrepreneurship", "corporation", "market", "trade", "revenue", "profit", "startup", "loss", "growth", "acquisition", "tax", "debt", "funding", "venture", "capital", "inflation", "interest", "dividends", "corporate", "management", "banking", "insurance", "real-estate", "franchise", "supply-chain"],
    
    # Entertainment
    'entertainment': ["entertainment", "movies", "music", "television", "celebrities", "awards", "festivals", "concert", "theater", "comedy", "drama", "action", "romance", "animation", "documentary", "dance", "art", "literature", "photography", "sculpture", "painting", "opera", "magic", "circus", "museum", "exhibition", "actor", "actress", "singer", "culture"],
    
    # Politics
    'politics': ["politics", "government", "elections", "policy", "democracy", "president", "parliament", "vote", "prime-minister", "congress", "senate", "international", "relations", "diplomacy", "referendum", "constitution", "legislation", "political-party", "campaign", "debate", "rights", "protest", "activism", "military", "intelligence", "treaty", "embassy", "visa", "immigration", "trade-agreements"],
    
    # Tech
    'tech': ["tech", "technology", "innovation", "gadgets", "smartphone", "artificial-intelligence", "robotics", "software", "hardware", "computer", "internet", "cybersecurity", "virtual-reality", "augmented-reality", "machine-learning", "data-science", "blockchain", "cryptocurrency", "internet-of-things", "cloud-computing", "big-data", "quantum-computing", "networking", "operating-system", "mobile-apps", "programming", "research", "drones", "3D-printing", "wearables"]
}

In [None]:
# Simulated user profiles: each user has a numeric id and an ordered list of
# interests. Every interest name is a key of `topics`.
users = [
    {'id': 1, 'interests': ['sports']},
    {'id': 2, 'interests': ['business']},
    {'id': 3, 'interests': ['entertainment']},
    {'id': 4, 'interests': ['politics']},
    {'id': 5, 'interests': ['tech']},
    {'id': 6, 'interests': ['sports', 'business']},
    {'id': 7, 'interests': ['entertainment', 'politics']},
    {'id': 8, 'interests': ['tech', 'sports']},
    {'id': 9, 'interests': ['business', 'entertainment']},
    {'id': 10, 'interests': ['politics', 'tech', 'business']}
]

In [None]:
# Step 1: Create a dictionary with user interests as keys and user IDs as values
interests_users = {}
for user in users:
    user_id = user['id']
    for interest in user['interests']:
        # setdefault creates the list the first time an interest is seen
        interests_users.setdefault(interest, []).append(user_id)

# Step 2: Convert user interests into interest vectors using the vectorizer
# Each topic's keywords are joined into one pseudo-document and projected
# into the feature space of the already-fitted vectorizer.
interest_vectors = {
    interest: vectorizer.transform([' '.join(keywords)])
    for interest, keywords in topics.items()
}

# Step 3: Calculate cosine similarity between the document and interest vectors
def recommend_users(doc_vector, interest_vectors, interests_users, threshold=0.1):
    """Return the ids of users who should receive a document.

    Args:
        doc_vector: Single-row vector of the document, in the same feature
            space as the interest vectors.
        interest_vectors: Mapping of interest name to its single-row vector.
        interests_users: Mapping of interest name to the list of user ids
            subscribed to that interest.
        threshold: Minimum cosine similarity between the document and an
            interest for that interest's users to be recommended.

    Returns:
        A set with the ids of every user subscribed to at least one interest
        whose similarity to the document is ``>= threshold``.
    """
    recommended_users = set()
    for interest, interest_vector in interest_vectors.items():
        # cosine_similarity returns a 2-D array; [0, 0] is the one
        # document-vs-interest score, so the comparison is on a scalar.
        similarity = cosine_similarity(doc_vector, interest_vector)[0, 0]
        if similarity >= threshold:
            # .get: an interest no user subscribes to adds nobody instead of
            # raising KeyError.
            recommended_users.update(interests_users.get(interest, ()))
    return recommended_users

# Step 4: Recommend documents to users based on their cosine similarity scores
# All documents are vectorized in one call instead of one transform() per row;
# row i of all_doc_vectors corresponds to the i-th row of `data`.
all_doc_vectors = vectorizer.transform(data['clean_text'])

# Keyed by the original article text; if the same text occurs more than once
# the last occurrence wins, as before.
doc_recommendations = {}
for position, text in enumerate(data['Text']):
    doc_recommendations[text] = recommend_users(
        all_doc_vectors[position], interest_vectors, interests_users
    )

In [None]:
# Calculate Precision, Recall, and F1 score
def evaluate_performance(doc_recommendations, data, users, topics):
    """Score document-to-user recommendations against the category labels.

    A (document, user) pair is relevant when the document's category is one
    of the user's interests. The original tested relevance by intersecting
    the user's interest names with the category's keyword list, which only
    worked because each keyword list happens to contain its own topic name.

    Args:
        doc_recommendations: Mapping of document text to the set of user ids
            the document was recommended to.
        data: DataFrame with 'Text' and 'Category' columns.
        users: List of dicts with 'id' and 'interests' keys.
        topics: Mapping of known category name to its keyword list; used to
            validate the category of each document.

    Returns:
        A ``(precision, recall, f1_score)`` tuple of floats. A metric whose
        denominator is zero is reported as 0.0.

    Raises:
        KeyError: If a document is not present in ``data`` or its category
            is not a key of ``topics``.
    """
    # One pass over the frame, keeping the first category seen per text
    # (the original filtered the whole frame once per document).
    category_by_text = {}
    for text, category in zip(data['Text'], data['Category']):
        category_by_text.setdefault(text, category)

    true_positives = 0
    false_positives = 0
    false_negatives = 0

    for doc, recommended_users in doc_recommendations.items():
        true_category = category_by_text[doc]
        if pd.isna(true_category):
            # No ground-truth label for this document, so it cannot be scored.
            continue
        if true_category not in topics:
            raise KeyError(true_category)

        for user in users:
            recommended = user['id'] in recommended_users
            relevant = true_category in user['interests']

            if recommended and relevant:
                true_positives += 1
            elif recommended:
                false_positives += 1
            elif relevant:
                false_negatives += 1

    predicted_total = true_positives + false_positives
    relevant_total = true_positives + false_negatives

    precision = true_positives / predicted_total if predicted_total else 0.0
    recall = true_positives / relevant_total if relevant_total else 0.0
    if precision + recall:
        f1_score = 2 * (precision * recall) / (precision + recall)
    else:
        f1_score = 0.0

    return precision, recall, f1_score

# Evaluate the recommendations built above and print each metric on its own line
precision, recall, f1_score = evaluate_performance(doc_recommendations, data, users, topics)
for label, value in (("Precision:", precision), ("Recall:", recall), ("F1 Score:", f1_score)):
    print(label, value)

In [None]:
# Attach a TF-IDF vector to every user, built from the space-joined names of
# the user's interests (not from the topic keyword lists).
for user in users:
    interests_text = " ".join(user['interests'])
    user['vector'] = vectorizer.transform([interests_text])

# Collect the vectors in user order
user_list = []
for user in users:
    user_list.append(user['vector'])

In [None]:
import random

# Simulate ten incoming documents and show which users would receive each one.
for i in range(10):
    # Random row *position* in balanced_data. document_vectors was built from
    # balanced_data['clean_text'], so position k is the same document in both.
    random_index = random.randrange(len(balanced_data))
    incoming_doc_vector = document_vectors[random_index]
    list_sim = []

    profiles = []

    for j, user_vector in enumerate(user_list):
        # Cosine similarity between the document and this user's vector
        similarities = cosine_similarity(incoming_doc_vector, user_vector)
        if similarities[0][0] > 0.0:  # keep every user with a non-zero score
            profiles.append(j + 1)
            list_sim.append(similarities[0][0])

    # .iloc is required: random_index is a position, while balanced_data keeps
    # the row labels of the original frame, so label-based lookup would fetch
    # a different row (or fail).
    doc_row = balanced_data.iloc[random_index]

    # Report the document, its category and the users who will receive it
    print("For document",i+1, ":", doc_row['clean_text'])
    print()
    print("Categorized as: "+ doc_row['Category']+ ' topic.')
    print()
    print("The user who are interested in this document are", profiles)
    print()

    print("RANKING")
    print()
    ranking = pd.DataFrame()
    ranking["Users"] = profiles
    ranking["Score"] = list_sim
    ranking = ranking.sort_values('Score', ascending=False)
    print(ranking)
    print()

In [None]:
# Use Term Frequency-Inverse Document Frequency (TF-IDF) to assign weights to words in each topic.
def create_topic_vectors(topics):
    """Build one dense TF-IDF vector per topic from its keyword list.

    Args:
        topics: Mapping of topic name to a list of keyword strings.

    Returns:
        A ``(topic_vectors, tfidf_vectorizer)`` tuple: ``topic_vectors`` maps
        each topic name to a 1-D array, and ``tfidf_vectorizer`` is the
        vectorizer fitted on the space-joined keyword lists.
    """
    tfidf_vectorizer = TfidfVectorizer()

    # One pseudo-document per topic, in the iteration order of `topics`
    keyword_documents = []
    for keywords in topics.values():
        keyword_documents.append(' '.join(keywords))
    tfidf_matrix = tfidf_vectorizer.fit_transform(keyword_documents)

    # Row i of the matrix belongs to the i-th topic
    topic_vectors = {
        topic: tfidf_matrix[row].toarray()[0]
        for row, topic in enumerate(topics)
    }
    return topic_vectors, tfidf_vectorizer

In [None]:
def create_user_profiles(users, topic_vectors, decay_exponent):
    """Build one profile vector per user as a weighted mean of topic vectors.

    The interest at 1-based position ``p`` in a user's list receives the
    weight ``1 / p ** decay_exponent``:

    * ``decay_exponent == 0``: every interest has the same weight, regardless
      of its position in the list.
    * ``decay_exponent == 1``: weights are 1, 1/2, 1/3, ...
    * ``decay_exponent > 1``: weights fall off faster, emphasising the first
      interest; very large values all but ignore the later interests.
    * ``0 < decay_exponent < 1``: weights fall off more slowly, so they are
      spread more evenly across the interests.

    Args:
        users: List of dicts with 'id' and 'interests' keys.
        topic_vectors: Mapping of interest name to a 1-D numpy array.
        decay_exponent: Exponent controlling how fast the weights decay.

    Returns:
        A list of ``{'id': ..., 'profile': ...}`` dicts, one per user, where
        'profile' is the weighted sum of the user's topic vectors divided by
        the total weight. A user without interests gets an all-zero profile.

    Raises:
        KeyError: If a user lists an interest missing from ``topic_vectors``.
    """
    # Profile length equals the length of the first topic vector
    vector_size = len(next(iter(topic_vectors.values()))) if topic_vectors else 0
    user_profiles = []

    for user in users:
        aggregated_vector = np.zeros(vector_size, dtype=float)
        total_weight = 0.0

        for position, interest in enumerate(user['interests'], start=1):
            weight = 1 / position ** decay_exponent
            total_weight += weight
            aggregated_vector += weight * topic_vectors[interest]

        # Normalize the aggregated vector; skipped for a user with no
        # interests, where it would be a 0/0 division yielding NaNs.
        if total_weight:
            aggregated_vector /= total_weight

        user_profiles.append({
            'id': user['id'],
            'profile': aggregated_vector
        })

    return user_profiles

In [None]:
def test_cosine_similarity_with_profiles(text, vectorizer, user_profile):
    """Return the cosine similarity between a text and one user profile.

    Note: a later cell of this notebook defines another function with this
    same name but a different third argument and return value; whichever
    cell ran last determines what the name refers to.

    Args:
        text: Raw document text.
        vectorizer: Fitted vectorizer used to transform ``text``.
        user_profile: Dict whose 'profile' entry is a 1-D numpy array.

    Returns:
        The single similarity score between the vectorized text and the
        profile.
    """
    document_row = vectorizer.transform([text])
    # cosine_similarity needs 2-D inputs, so the profile becomes one row
    profile_row = user_profile['profile'].reshape(1, -1)
    scores = cosine_similarity(document_row, profile_row)
    return scores[0][0]

In [None]:
# Step 1: Create a dictionary with user interests as keys and user IDs as values
# (this cell repeats Steps 1-4 from earlier in the notebook and rebuilds the same names)
interests_users = {}
for user in users:
    user_id = user['id']
    for interest in user['interests']:
        # setdefault creates the list the first time an interest is seen
        interests_users.setdefault(interest, []).append(user_id)

# Step 2: Convert user interests into interest vectors using the vectorizer
# Each topic's keywords are joined into one pseudo-document and projected
# into the feature space of the already-fitted vectorizer.
interest_vectors = {
    interest: vectorizer.transform([' '.join(keywords)])
    for interest, keywords in topics.items()
}

# Step 3: Calculate cosine similarity between the document and interest vectors
def recommend_users(doc_vector, interest_vectors, interests_users, threshold=0.1):
    """Return the ids of users who should receive a document.

    Args:
        doc_vector: Single-row vector of the document, in the same feature
            space as the interest vectors.
        interest_vectors: Mapping of interest name to its single-row vector.
        interests_users: Mapping of interest name to the list of user ids
            subscribed to that interest.
        threshold: Minimum cosine similarity between the document and an
            interest for that interest's users to be recommended.

    Returns:
        A set with the ids of every user subscribed to at least one interest
        whose similarity to the document is ``>= threshold``.
    """
    recommended_users = set()
    for interest, interest_vector in interest_vectors.items():
        # cosine_similarity returns a 2-D array; [0, 0] is the one
        # document-vs-interest score, so the comparison is on a scalar.
        similarity = cosine_similarity(doc_vector, interest_vector)[0, 0]
        if similarity >= threshold:
            # .get: an interest no user subscribes to adds nobody instead of
            # raising KeyError.
            recommended_users.update(interests_users.get(interest, ()))
    return recommended_users

# Step 4: Recommend documents to users based on their cosine similarity scores
# All documents are vectorized in one call instead of one transform() per row;
# row i of all_doc_vectors corresponds to the i-th row of `data`.
all_doc_vectors = vectorizer.transform(data['clean_text'])

# Keyed by the original article text; if the same text occurs more than once
# the last occurrence wins, as before.
doc_recommendations = {}
for position, text in enumerate(data['Text']):
    doc_recommendations[text] = recommend_users(
        all_doc_vectors[position], interest_vectors, interests_users
    )

print(doc_recommendations)

In [None]:
def should_deliver_document(text, vectorizer, user_profile, threshold):
    """Decide whether a document is similar enough to a user profile.

    Args:
        text: Raw document text.
        vectorizer: Fitted vectorizer passed through to
            ``test_cosine_similarity_with_profiles``.
        user_profile: Dict describing the user, passed through unchanged.
        threshold: Minimum similarity required for delivery.

    Returns:
        The result of comparing the similarity score with ``threshold``
        using ``>=``.
    """
    return test_cosine_similarity_with_profiles(text, vectorizer, user_profile) >= threshold

In [None]:
# Build the TF-IDF topic vectors, then one weighted profile per user
topic_vectors, tfidf_vectorizer = create_topic_vectors(topics)
user_profiles = create_user_profiles(users, topic_vectors, decay_exponent=1)

# Example document
document = "Today the stock market experienced significant growth, with several companies reporting increased profits."

# Threshold for cosine similarity
threshold = 0.5

# Check if the document should be delivered to each user
for user_profile in user_profiles:
    deliver = should_deliver_document(document, tfidf_vectorizer, user_profile, threshold)
    if not deliver:
        print(f"Do not deliver the document to user {user_profile['id']}")
        continue
    print(f"Deliver the document to user {user_profile['id']}")

In [None]:
# Alternative feature extraction (raw term counts), kept for experimentation:
#vectorizer = CountVectorizer()
#X_train_vec = vectorizer.fit_transform(X_train)
#X_test_vec = vectorizer.transform(X_test)

# OR

# TF-IDF over unigrams and bigrams, ignoring terms that occur in more than 90%
# of the training documents or in fewer than 5 of them. The vocabulary is
# learned from X_train only; X_test is merely transformed.
# Note: this rebinds the global `vectorizer` used by the earlier cells.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_df=0.9, min_df=5)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [None]:
# Average TF-IDF vector (centroid) of the training documents of each category
categories = y_train.unique()
category_avg_vecs = {}

for category in categories:
    # Row *positions* of this category inside y_train. X_train_vec rows follow
    # the order of X_train / y_train, so they must be addressed by position;
    # y_train.index holds the row labels of the source DataFrame, which do not
    # correspond to matrix rows after balancing and splitting.
    category_indices = np.flatnonzero((y_train == category).to_numpy())
    category_vectors = X_train_vec[category_indices, :]
    category_avg_vec = np.mean(category_vectors, axis=0)
    category_avg_vecs[category] = category_avg_vec

In [None]:
def predict_category(text, vectorizer, category_avg_vecs):
    """Return the category whose average vector is most similar to ``text``.

    Args:
        text: Document text to classify.
        vectorizer: Fitted vectorizer used to transform ``text``.
        category_avg_vecs: Mapping of category name to its average vector.

    Returns:
        The category with the highest cosine similarity (the first one in
        iteration order on ties), or ``None`` when ``category_avg_vecs`` is
        empty.
    """
    text_vec = vectorizer.transform([text])
    best_category, best_score = None, -1

    for label, centroid in category_avg_vecs.items():
        score = cosine_similarity(text_vec, np.asarray(centroid))
        if not score > best_score:
            continue
        best_category, best_score = label, score

    return best_category

In [None]:
# Classify every held-out document with the category-average vectors
y_pred = []
for text in X_test:
    y_pred.append(predict_category(text, vectorizer, category_avg_vecs))

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)


In [None]:
def train_and_evaluate(classifier, X_train_vec, y_train, X_test_vec, y_test):
    """Fit a classifier, print its test-set results and return summary metrics.

    Args:
        classifier: Estimator exposing ``fit`` and ``predict``.
        X_train_vec: Training feature matrix.
        y_train: Training labels.
        X_test_vec: Test feature matrix.
        y_test: Test labels.

    Returns:
        A dict with 'accuracy' and the macro-averaged 'precision', 'recall'
        and 'f1_score' taken from the classification report.
    """
    classifier.fit(X_train_vec, y_train)
    y_pred = classifier.predict(X_test_vec)

    accuracy = accuracy_score(y_test, y_pred)
    # Dict form of the report, used only for the macro averages returned below
    macro_avg = classification_report(y_test, y_pred, output_dict=True)["macro avg"]

    print(classifier.__class__.__name__)
    print("Accuracy:", accuracy)
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("\n")

    return {
        "accuracy": accuracy,
        "precision": macro_avg["precision"],
        "recall": macro_avg["recall"],
        "f1_score": macro_avg["f1-score"],
    }

In [None]:
# Baseline classifiers, all with their default hyper-parameters
classifiers = [
    MultinomialNB(),
    LogisticRegression(),
    LinearSVC(),
    RandomForestClassifier(),
    KNeighborsClassifier()
]

# Train and evaluate each classifier; results are keyed by the class name
performance_metrics = {
    classifier.__class__.__name__: train_and_evaluate(classifier, X_train_vec, y_train, X_test_vec, y_test)
    for classifier in classifiers
}

In [None]:
# Print the summary metrics of every classifier, four decimals each
for model_name, scores in performance_metrics.items():
    print(f"{model_name}:")
    for metric_name, score in scores.items():
        print(f"  {metric_name}: {score:.4f}")
    print("\n")

In [None]:
from sklearn.base import clone
from sklearn.model_selection import cross_val_score

def evaluate_with_cross_val(classifier, X, y, n_splits=5):
    """Return the mean cross-validation score of ``classifier`` on ``(X, y)``.

    Args:
        classifier: Estimator to evaluate.
        X: Feature matrix.
        y: Target labels.
        n_splits: Value passed to ``cross_val_score`` as ``cv``.

    Returns:
        The mean of the scores returned by ``cross_val_score``.
    """
    scores = cross_val_score(classifier, X, y, cv=n_splits)
    return np.mean(scores)

# Only rows that have a category label can be used for supervised scoring
labelled_data = data[data['Category'].notna()]

# Fit an unfitted copy with the same settings. Calling fit_transform on
# `vectorizer` itself would replace the vocabulary that X_train_vec and
# X_test_vec were built with, so later cells pairing those matrices with
# `vectorizer` would read the wrong feature names.
X_vec = clone(vectorizer).fit_transform(labelled_data['clean_text'])
y = labelled_data['Category']

for classifier in classifiers:
    mean_score = evaluate_with_cross_val(classifier, X_vec, y)
    print(f"{classifier.__class__.__name__}: {mean_score:.4f}")

In [None]:
import seaborn as sns
import matplotlib.pylab as plt
from sklearn.metrics import confusion_matrix


def plot_cm(y_true, y_pred, class_names):
    """Plot a row-normalized confusion matrix as a heatmap.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        class_names: Labels to show, in display order. Missing values (NaN)
            are ignored. Samples whose label is not listed are left out of
            the matrix.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    labels = [name for name in class_names if not pd.isna(name)]

    # Pass the labels explicitly so that row/column i of the matrix is
    # labels[i]. Without `labels`, confusion_matrix orders the classes by
    # sorted value, which need not match the order of `class_names` used for
    # the tick labels.
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    # Divide each row by its total; rows without any true sample stay at 0
    # instead of producing NaN from a 0/0 division.
    row_totals = cm.sum(axis=1, keepdims=True)
    cm_normalized = np.divide(
        cm.astype('float'),
        row_totals,
        out=np.zeros(cm.shape, dtype=float),
        where=row_totals != 0,
    )

    plt.figure(figsize=(8, 8))
    sns.heatmap(cm_normalized, annot=True, cmap='Blues', xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    plt.show()

# Category values in order of first appearance in the full dataset
class_names = data['Category'].unique()

# Refit every classifier on the training split and plot its confusion matrix
for classifier in classifiers:
    model_name = classifier.__class__.__name__
    classifier.fit(X_train_vec, y_train)
    y_pred = classifier.predict(X_test_vec)
    print(f"Confusion Matrix for {model_name}:")
    plot_cm(y_test, y_pred, class_names)

### Cosine similarity

In [None]:
# Extract top N keywords for each category
def top_n_keywords_by_category(X_train_vec, y_train, vectorizer, n=20):
    """Return the ``n`` highest-weighted features of each category.

    Args:
        X_train_vec: Sparse feature matrix whose rows are in the same order
            as ``y_train``.
        y_train: pandas Series with the category label of each row.
        vectorizer: Fitted vectorizer providing ``get_feature_names_out()``
            for the columns of ``X_train_vec``.
        n: Number of keywords to keep per category.

    Returns:
        A dict mapping each category to a flat list of at most ``n`` feature
        names, ordered by descending summed weight over the category's rows.
    """
    categories = y_train.unique()
    feature_names = np.array(vectorizer.get_feature_names_out())

    top_keywords = {}

    for category in categories:
        # Row positions, not index labels: y_train keeps the labels of the
        # frame it was split from, which do not address matrix rows.
        category_positions = np.flatnonzero((y_train == category).to_numpy())
        category_vectors = X_train_vec[category_positions, :]

        # Summing a sparse matrix along axis 0 yields a 1 x n_features
        # np.matrix. It is flattened to a true 1-D array first: on the matrix
        # itself, flatten() stays 2-D, so the reversal and the [:n] slice act
        # on the single row and every feature is returned in a nested list.
        category_scores = np.asarray(category_vectors.sum(axis=0)).ravel()

        # Stable sort on the negated scores: highest first, ties keep the
        # column order.
        top_n_indices = np.argsort(-category_scores, kind='stable')[:n]

        top_keywords[category] = feature_names[top_n_indices].tolist()

    return top_keywords

# Test the performance using the keyword profiles
def test_cosine_similarity_with_profiles(text, vectorizer, category_profiles):
    """Return the category whose keyword profile is most similar to ``text``.

    Note: this definition reuses the name of an earlier function in this
    notebook that takes a single user profile and returns a similarity
    score; once this cell has run, the name refers to this version.

    Args:
        text: Document text to classify.
        vectorizer: Fitted vectorizer used for the text and the profiles.
        category_profiles: Mapping of category name to its keywords.

    Returns:
        The category with the highest cosine similarity (the first one in
        iteration order on ties), or ``None`` when ``category_profiles`` is
        empty.
    """
    text_vec = vectorizer.transform([text])
    best_category, best_score = None, -1

    for label, keywords in category_profiles.items():
        # The keywords are turned into one space-separated pseudo-document
        keyword_text = ' '.join(map(str, keywords))
        profile_vec = vectorizer.transform([keyword_text])
        score = cosine_similarity(text_vec, profile_vec)
        if not score > best_score:
            continue
        best_category, best_score = label, score

    return best_category

# Create better category profiles
category_profiles = top_n_keywords_by_category(X_train_vec, y_train, vectorizer, n=20)

# Test the performance
y_pred_profiles = []
for text in X_test:
    y_pred_profiles.append(test_cosine_similarity_with_profiles(text, vectorizer, category_profiles))

# Calculate accuracy and display results
accuracy_profiles = accuracy_score(y_test, y_pred_profiles)
print(f"Accuracy with keyword profiles: {accuracy_profiles:.4f}")

report_profiles = classification_report(y_test, y_pred_profiles)
print("Classification Report with keyword profiles:\n", report_profiles)

cm_profiles = confusion_matrix(y_test, y_pred_profiles)
print("Confusion Matrix with keyword profiles:\n", cm_profiles)

In [None]:
# Test the performance using the keyword profiles
# (same evaluation as the previous cell, re-run with the existing category_profiles)
def _predict_with_profiles(text):
    return test_cosine_similarity_with_profiles(text, vectorizer, category_profiles)

y_pred_profiles = list(map(_predict_with_profiles, X_test))

# Calculate accuracy and display results
accuracy_profiles = accuracy_score(y_test, y_pred_profiles)
print(f"Accuracy with keyword profiles: {accuracy_profiles:.4f}")

print(
    "Classification Report with keyword profiles:\n",
    classification_report(y_test, y_pred_profiles),
)

cm_profiles = confusion_matrix(y_test, y_pred_profiles)
print("Confusion Matrix with keyword profiles:\n", cm_profiles)