## Step 0: load and prepare data

In [1]:
import pandas as pd
import numpy as np
from pandas import DataFrame

from scipy.sparse import csr_matrix

In [2]:

raw_impressions = pd.read_csv(
    "../MIND/train/behaviors.tsv",
    sep='\t',
    header=None,
    names=["impressionId", "userId", "time", "history", "impressions"],
    usecols=["userId", "impressions"]
  )

raw_news = pd.read_csv(
  "../MIND/train/news.tsv",
  sep="\t", 
  header=None,
  names=["newsId", "category", "subcategory", "title", "abstract", "url", "titleEntities", "abstractEntities"],
  usecols=["newsId", "title"]
)


In [3]:
import re

impressions = raw_impressions

def _split_clicked(x):
  return re.findall(r"(\w+)-1", x)

impressions = impressions.dropna()

impressions["newsId"] = impressions["impressions"].apply(_split_clicked)
impressions = impressions.explode("newsId").reset_index()
impressions = impressions.head(100_000)

impressions["click"] = 1

impressions = impressions[["userId", "newsId", "click"]]
print(impressions.info())
print(impressions.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   userId  100000 non-null  object
 1   newsId  100000 non-null  object
 2   click   100000 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 2.3+ MB
None
   userId  newsId  click
0  U13740  N55689      1
1  U91836  N17059      1
2  U73700  N23814      1
3  U34670  N49685      1
4   U8125   N8400      1


## Step 1: Make user-item matrix

In [4]:
from collab_utils import create_x

X, uid2index, nid2index, index2uid, index2nid = create_x(impressions)

print(X.shape)

(34324, 5617)


## Step 2: Collaborative filtering

In [5]:
from sklearn.neighbors import NearestNeighbors

# Item-based CF
item_knn = NearestNeighbors(metric='cosine', algorithm='brute')
item_knn.fit(X.T)

# User-based CF
user_knn = NearestNeighbors(metric='cosine', algorithm='brute')
user_knn.fit(X)

def recommend_items_item_based(user_index, X, item_knn, n_recommendations=5):
    # Find items this user has interacted with
    user_items = X[user_index].nonzero()[1]
    
    # Initialize recommendation scores
    recommendations = np.zeros(X.shape[1])
    
    for item in user_items:
        # Find similar items
        distances, indices = item_knn.kneighbors(
            X.T[item].reshape(1, -1), 
            n_neighbors=11  # Including itself
        )
        
        # Convert distances to similarities and remove the item itself
        similarities = 1 - distances.flatten()
        similar_items = indices.flatten()
        
        # Skip the first one as it's the item itself
        for i, similar_item in enumerate(similar_items[1:]):
            # Weight by similarity
            recommendations[similar_item] += similarities[i+1]
    
    # Filter out items the user has already interacted with
    recommendations[user_items] = 0
    
    # Get top recommendations
    top_recommendations = recommendations.argsort()[-n_recommendations:][::-1]
    
    return top_recommendations

In [6]:
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Number of latent factors
n_factors = 50

# Initialize NMF model
nmf_model = NMF(n_components=n_factors, init='random', random_state=42)

# Fit the model to the user-item matrix
user_factors = nmf_model.fit_transform(X)
item_factors = nmf_model.components_

# Generate recommendations for a user
def recommend_items_mf(user_index, user_factors, item_factors, X, n_recommendations=5):
    # Predict ratings for all items
    user_vector = user_factors[user_index].reshape(1, -1)
    predicted_ratings = np.dot(user_vector, item_factors)
    
    # Set already interacted items to zero
    user_items = X[user_index].nonzero()[1]
    predicted_ratings[0, user_items] = 0
    
    # Get top recommendations
    top_recommendations = predicted_ratings[0].argsort()[-n_recommendations:][::-1]
    
    return top_recommendations

## Step 3: Evaluate model

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score
import time
import numpy as np
from tqdm import tqdm

def evaluate_recommender_optimized(interactions_df, method='mf', test_size=0.2, k=5):
    train_df, test_df = train_test_split(interactions_df, test_size=test_size, random_state=42)

    train_matrix, _, _, _, _ = create_x(train_df)
    
    test_users = np.array([uid2index[uid] for uid in test_df['userId']])
    test_items = np.array([nid2index[nid] for nid in test_df['newsId']])
    
    start_time = time.time()

    print("Fitting NMF model")

    nmf_model = NMF(n_components=50, init='random', random_state=42)
    user_factors = nmf_model.fit_transform(train_matrix)
    item_factors = nmf_model.components_

    print(f"Model fitting time: {time.time() - start_time:.2f} seconds")
    
    precision_at_k = []
    recall_at_k = []

    unique_test_users = np.unique(test_users)

    skipped = 0
    
    for user_idx in tqdm(unique_test_users, desc="Evaluating users"):
        try: 
            recommendations = recommend_items_mf(user_idx, user_factors, item_factors, train_matrix, n_recommendations=k)
        except: 
            skipped += 1
            continue

        actual_items = test_items[test_users == user_idx]
        
        common_items = len(set(recommendations).intersection(set(actual_items)))
        precision = common_items / len(recommendations) if recommendations.size > 0 else 0
        recall = common_items / len(actual_items) if len(actual_items) > 0 else 0
        
        precision_at_k.append(precision)
        recall_at_k.append(recall)
    
    mean_precision = np.mean(precision_at_k)
    mean_recall = np.mean(recall_at_k)

    print(f"Skipped {skipped}") 
    print(f"Mean Precision@{k}: {mean_precision:.4f}")
    print(f"Mean Recall@{k}: {mean_recall:.4f}")
    
    return mean_precision, mean_recall



precision, recall = evaluate_recommender_optimized(impressions)

Fitting NMF model




Model fitting time: 57.91 seconds


Evaluating users: 100%|██████████| 13761/13761 [00:36<00:00, 379.39it/s]

Skipped 851
Mean Precision@5: 0.0018
Mean Recall@5: 0.0060





Model fitting time: 92.77 seconds
Evaluating users: 100%|██████████| 25538/25538 [01:19<00:00, 320.65it/s]
Mean Precision@5: 0.0133
Mean Recall@5: 0.0370

10_000 rows:
Fitting NMF model
Model fitting time: 5.65 seconds
Evaluating users: 100%|██████████| 1703/1703 [00:05<00:00, 314.24it/s]
Mean Precision@5: 0.0011
Mean Recall@5: 0.0047

100_000 rows: