In [None]:
import pandas as pd
import numpy as np
import random

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import roc_auc_score
from tqdm import tqdm

Load all data and train TF-IDF

In [11]:
def load_news(news_path):
    df = pd.read_csv(news_path, sep='\t', header=None,
                     names=['news_id', 'category', 'subcategory', 'title', 'abstract', 'url', 'title_entities', 'abstract_entities'])
    df['content'] = df['title'].fillna('') + ' ' + df['abstract'].fillna('')
    return df[['news_id', 'content']]

train_news = load_news('MINDsmall_train/news.tsv')
dev_news = load_news('MINDsmall_dev/news.tsv')
all_news = pd.concat([train_news, dev_news]).drop_duplicates('news_id')

# TF-IDF vectorization
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
news_vectors = vectorizer.fit_transform(train_news['content'])  # only train content for training

# Map news_id to index
news_id_to_index = dict(zip(train_news['news_id'], range(len(train_news))))

Load Training Behaviors and Build User Profiles

In [12]:
train_behaviors = pd.read_csv('MINDsmall_train/behaviors.tsv', sep='\t', header=None,
                              names=['impression_id', 'user_id', 'time', 'history', 'impressions'])

user_profiles = {}

for _, row in train_behaviors.iterrows():
    user = row['user_id']
    history = row['history']
    if pd.isna(history): continue
    history_ids = [nid for nid in history.split() if nid in news_id_to_index]
    if not history_ids: continue
    indices = [news_id_to_index[nid] for nid in history_ids]
    profile_vector = news_vectors[indices].mean(axis=0)
    user_profiles[user] = profile_vector

Evaluate the model on the dev set

In [13]:
dev_behaviors = pd.read_csv('MINDsmall_dev/behaviors.tsv', sep='\t', header=None,
                            names=['impression_id', 'user_id', 'time', 'history', 'impressions'])


def mrr(scores, labels):
    sorted_labels = [l for _, l in sorted(zip(scores, labels), reverse=True)]
    for i, label in enumerate(sorted_labels):
        if label == 1:
            return 1.0 / (i + 1)
    return 0.0

def ndcg(scores, labels, k):
    sorted_labels = [l for _, l in sorted(zip(scores, labels), reverse=True)][:k]
    dcg = sum([1.0 / np.log2(i + 2) if l == 1 else 0 for i, l in enumerate(sorted_labels)])
    ideal_dcg = sum([1.0 / np.log2(i + 2) for i in range(min(sum(labels), k))])
    return dcg / ideal_dcg if ideal_dcg > 0 else 0.0

def evaluate(dev_behaviors, news_id_to_index, news_vectors, user_profiles):
    aucs, mrrs, ndcg5s, ndcg10s = [], [], [], []

    for _, row in tqdm(dev_behaviors.iterrows(), total=len(dev_behaviors)):
        user = row['user_id']
        impressions = [imp.split('-') for imp in row['impressions'].split()]
        news_ids = [nid for nid, _ in impressions if nid in news_id_to_index]
        labels = [int(label) for nid, label in impressions if nid in news_id_to_index]

        if not news_ids or user not in user_profiles or len(set(labels)) == 1:
            continue

        profile_vector = np.asarray(user_profiles[user])  # Convert to numpy array
        candidate_idxs = [news_id_to_index[nid] for nid in news_ids]
        candidate_vectors = news_vectors[candidate_idxs]
        sims = cosine_similarity(profile_vector, candidate_vectors).flatten()

        aucs.append(roc_auc_score(labels, sims, multi_class='ovr'))
        mrrs.append(mrr(sims, labels))
        ndcg5s.append(ndcg(sims, labels, 5))
        ndcg10s.append(ndcg(sims, labels, 10))

    return {
        'AUC': np.mean(aucs),
        'MRR': np.mean(mrrs),
        'nDCG@5': np.mean(ndcg5s),
        'nDCG@10': np.mean(ndcg10s)
    }

Evaluate a baseline system (random reccomender)

In [14]:
def evaluate_random(dev_behaviors, news_id_to_index):
    aucs, mrrs, ndcg5s, ndcg10s = [], [], [], []

    for _, row in tqdm(dev_behaviors.iterrows(), total=len(dev_behaviors)):
        impressions = [imp.split('-') for imp in row['impressions'].split()]
        news_ids = [nid for nid, _ in impressions if nid in news_id_to_index]
        labels = [int(label) for nid, label in impressions if nid in news_id_to_index]

        if not news_ids or len(set(labels)) == 1:
            continue

        random_scores = [random.random() for _ in labels]

        aucs.append(roc_auc_score(labels, random_scores))
        mrrs.append(mrr(random_scores, labels))
        ndcg5s.append(ndcg(random_scores, labels, 5))
        ndcg10s.append(ndcg(random_scores, labels, 10))

    return {
        'AUC': np.mean(aucs),
        'MRR': np.mean(mrrs),
        'nDCG@5': np.mean(ndcg5s),
        'nDCG@10': np.mean(ndcg10s)
    }

Compare results

In [15]:
cb_results = evaluate(dev_behaviors, news_id_to_index, news_vectors, user_profiles)
rand_results = evaluate_random(dev_behaviors, news_id_to_index)

# Display comparison
print("Content-Based Recommender Performance:")
for metric, score in cb_results.items():
    print(f"{metric}: {score:.4f}")

print("\nRandom Recommender Performance:")
for metric, score in rand_results.items():
    print(f"{metric}: {score:.4f}")

100%|██████████| 73152/73152 [00:10<00:00, 7178.55it/s]
100%|██████████| 73152/73152 [00:41<00:00, 1774.22it/s]

Content-Based Recommender Performance:
AUC: 0.5737
MRR: 0.3466
nDCG@5: 0.3341
nDCG@10: 0.3948

Random Recommender Performance:
AUC: 0.4980
MRR: 0.2586
nDCG@5: 0.2432
nDCG@10: 0.3064



