## Step 0: load and prepare data

In [1]:
import pandas as pd
import numpy as np
from pandas import DataFrame

from scipy.sparse import csr_matrix


In [2]:

raw_impressions = pd.read_csv(
    "../MIND/train/behaviors.tsv",
    sep='\t',
    header=None,
    names=["impressionId", "userId", "time", "history", "impressions"],
    usecols=["userId", "impressions"]
  )

raw_news = pd.read_csv(
  "../MIND/train/news.tsv",
  sep="\t", 
  header=None,
  names=["newsId", "category", "subcategory", "title", "abstract", "url", "titleEntities", "abstractEntities"],
  usecols=["newsId", "title"]
)


In [3]:
import re

impressions = raw_impressions

def str2int(x):
  m = re.search(r"\d+", x)
  if not m:
    raise Exception("didn't find news id")
  return int(m.group())

def str2click(x):
  m = re.search(r"-(\d)", x)
  if not m: 
    raise Exception("didnt find click info")
  return int(m.group(1))

impressions = impressions.dropna()

impressions["click"] = impressions["impressions"].apply(lambda x: x.split(" "))
impressions = impressions.explode("click").reset_index()
impressions = impressions.head(100_000)

impressions["newsId"] = impressions["click"].apply(str2int)
impressions["userId"] = impressions["userId"].apply(str2int)
impressions["click"] = impressions["click"].apply(str2click)

impressions = impressions[["userId", "newsId", "click"]]
impressions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   userId  100000 non-null  int64
 1   newsId  100000 non-null  int64
 2   click   100000 non-null  int64
dtypes: int64(3)
memory usage: 2.3 MB


## Step 1: Make user-item matrix

In [4]:
user_ids = impressions["userId"].unique()
news_ids = impressions["newsId"].unique()

M = len(user_ids)
N = len(news_ids)

uid2index = {id: i for i, id in enumerate(user_ids)}
nid2index = {id: i for i, id in enumerate(news_ids)}

def make_X(df: DataFrame):
    

  rows = [uid2index[uid] for uid in df["userId"]]
  cols = [nid2index[nid] for nid in df["newsId"]]

  data = df["click"]

  X = csr_matrix((data, (rows, cols)), shape=(M, N))

  return X

X = make_X(impressions)

## Step 2: Collaborative filtering

In [5]:
from sklearn.neighbors import NearestNeighbors

# Item-based CF
item_knn = NearestNeighbors(metric='cosine', algorithm='brute')
item_knn.fit(X.T)

# User-based CF
user_knn = NearestNeighbors(metric='cosine', algorithm='brute')
user_knn.fit(X)

def recommend_items_item_based(user_index, X, item_knn, n_recommendations=5):
    # Find items this user has interacted with
    user_items = X[user_index].nonzero()[1]
    
    # Initialize recommendation scores
    recommendations = np.zeros(X.shape[1])
    
    for item in user_items:
        # Find similar items
        distances, indices = item_knn.kneighbors(
            X.T[item].reshape(1, -1), 
            n_neighbors=11  # Including itself
        )
        
        # Convert distances to similarities and remove the item itself
        similarities = 1 - distances.flatten()
        similar_items = indices.flatten()
        
        # Skip the first one as it's the item itself
        for i, similar_item in enumerate(similar_items[1:]):
            # Weight by similarity
            recommendations[similar_item] += similarities[i+1]
    
    # Filter out items the user has already interacted with
    recommendations[user_items] = 0
    
    # Get top recommendations
    top_recommendations = recommendations.argsort()[-n_recommendations:][::-1]
    
    return top_recommendations

In [6]:
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Number of latent factors
n_factors = 50

# Initialize NMF model
nmf_model = NMF(n_components=n_factors, init='random', random_state=42)

# Fit the model to the user-item matrix
user_factors = nmf_model.fit_transform(X)
item_factors = nmf_model.components_

# Generate recommendations for a user
def recommend_items_mf(user_index, user_factors, item_factors, X, n_recommendations=5):
    # Predict ratings for all items
    user_vector = user_factors[user_index].reshape(1, -1)
    predicted_ratings = np.dot(user_vector, item_factors)
    
    # Set already interacted items to zero
    user_items = X[user_index].nonzero()[1]
    predicted_ratings[0, user_items] = 0
    
    # Get top recommendations
    top_recommendations = predicted_ratings[0].argsort()[-n_recommendations:][::-1]
    
    return top_recommendations

## Step 3: Evaluate model

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score
import time
import numpy as np
from tqdm import tqdm

def evaluate_recommender_optimized(interactions_df, method='mf', test_size=0.2, k=5):
    train_df, test_df = train_test_split(interactions_df, test_size=test_size, random_state=42)
    
    train_matrix = make_X(train_df)
    
    test_users = np.array([uid2index[uid] for uid in test_df['userId']])
    test_items = np.array([nid2index[nid] for nid in test_df['newsId']])
    
    start_time = time.time()
    if method == 'mf':
        print("Fitting NMF model once...")
        nmf_model = NMF(n_components=50, init='random', random_state=42)
        user_factors = nmf_model.fit_transform(train_matrix)
        item_factors = nmf_model.components_
    else:
        print("Fitting item KNN model once...")
        item_knn = NearestNeighbors(metric='cosine', algorithm='brute')
        item_knn.fit(train_matrix.T)
    
    print(f"Model fitting time: {time.time() - start_time:.2f} seconds")
    
    precision_at_k = []
    recall_at_k = []

    unique_test_users = np.unique(test_users)
    
    for user_idx in tqdm(unique_test_users, desc="Evaluating users"):
        if method == 'mf':
            recommendations = recommend_items_mf(user_idx, user_factors, item_factors, train_matrix, n_recommendations=k)
        else:
            recommendations = recommend_items_item_based(user_idx, train_matrix, item_knn, n_recommendations=k)
        
        actual_items = test_items[test_users == user_idx]
        
        common_items = len(set(recommendations).intersection(set(actual_items)))
        precision = common_items / len(recommendations) if recommendations.size > 0 else 0
        recall = common_items / len(actual_items) if len(actual_items) > 0 else 0
        
        precision_at_k.append(precision)
        recall_at_k.append(recall)
    
    mean_precision = np.mean(precision_at_k)
    mean_recall = np.mean(recall_at_k)
    
    print(f"Mean Precision@{k}: {mean_precision:.4f}")
    print(f"Mean Recall@{k}: {mean_recall:.4f}")
    
    return mean_precision, mean_recall



precision, recall = evaluate_recommender_optimized(impressions)

Fitting NMF model once...
Model fitting time: 6.70 seconds


Evaluating users: 100%|██████████| 2237/2237 [00:04<00:00, 459.89it/s]

Mean Precision@5: 0.0254
Mean Recall@5: 0.0211



