# Collaborative Filtering
Collaborative Filtering (CF) is a popular technique used in recommender systems. It makes recommendations by looking at the interactions between users and items, such as clicks, views, or ratings.

The main idea behind collaborative filtering is simple: Users who have interacted with the same items in the past are likely to have similar preferences. For example:

* If two users both clicked on the same news articles,

* and one of them clicked on a new article,

* we can recommend that new article to the other user.

## Types of Collaborative Filtering
* **User-Based CF:** Recommends items to a user based on the preferences of similar users.

* **Item-Based CF:** Recommends items similar to the ones the user already liked.

* **Model-Based CF:** Uses machine learning models (e.g., matrix factorization, deep learning) to predict user-item interactions.

## Dataset Description
We'll use MIND (the Microsoft News Dataset), a large-scale dataset collected from Microsoft News, to implement and test our Collaborative Filtering (CF) approach. It includes:

* User behavior logs (behaviors.tsv): Each row corresponds to an impression event, containing:

  * User ID

  * History of clicked news

  * Impressions: list of candidate news articles with click labels (1 or 0)

* News metadata (news.tsv): Each article includes:

  * News ID

  * Category and Subcategory

  * Title and Abstract

For CF, we'll focus on the interaction matrix between users and news articles based on click behavior, treating each (user, news) click as implicit feedback.

In [None]:
import kagglehub

# Download the latest version of the MIND dataset from Kaggle via kagglehub.
# `path` is the local location of the downloaded files; later cells join it
# with "MINDsmall_train" to find behaviors.tsv.
path = kagglehub.dataset_download("arashnic/mind-news-dataset")

# Show where the files ended up.
print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/arashnic/mind-news-dataset?dataset_version_number=2...


100%|██████████| 61.7M/61.7M [00:00<00:00, 155MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/arashnic/mind-news-dataset/versions/2


In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
from scipy.sparse import csr_matrix

In [None]:
# Load behaviors.tsv file into a DataFrame
def load_behaviors(filepath, n_users=10000, random_state=42):
    """Load a MIND ``behaviors.tsv`` file, keeping a random sample of users.

    Args:
        filepath: Path to the tab-separated behaviors file (it has no header
            row; column names are assigned here).
        n_users: Maximum number of distinct users to keep. If the file
            contains fewer users, all of them are kept. ``None`` disables
            sampling.
        random_state: Seed for the user sample, so repeated loads pick the
            same users.

    Returns:
        DataFrame with columns ``Impression_ID``, ``User_ID``, ``Time``,
        ``History`` and ``Impressions``, restricted to the sampled users and
        with missing values replaced by the empty string.
    """
    behaviors = pd.read_csv(filepath, sep='\t', header=None,
                            names=['Impression_ID', 'User_ID', 'Time', 'History', 'Impressions'])

    if n_users is not None:
        unique_users = behaviors["User_ID"].drop_duplicates()
        # Series.sample raises ValueError when asked for more rows than exist,
        # so clamp the request to the number of distinct users.
        sample_size = min(n_users, len(unique_users))
        sampled_users = unique_users.sample(n=sample_size, random_state=random_state)
        behaviors = behaviors[behaviors["User_ID"].isin(sampled_users)]

    # Non-inplace fillna returns a fresh frame instead of mutating a filtered
    # slice of the original (which triggers SettingWithCopyWarning).
    return behaviors.fillna("")

# Build user-item interaction matrix from behaviors data
def build_interaction_matrix(behaviors):
    """Build a binary user x news click matrix from impression logs.

    Args:
        behaviors: DataFrame with a ``User_ID`` column and an ``Impressions``
            column of whitespace-separated ``<news_id>-<label>`` tokens, where
            label ``1`` means clicked. ``Impressions`` values must be strings
            (``load_behaviors`` replaces missing values with ``""``).

    Returns:
        Tuple ``(interaction_matrix, users, news_list)``:
        ``interaction_matrix`` is a ``csr_matrix`` of shape
        ``(len(users), len(news_list))`` holding 1 where the user clicked the
        article at least once; ``users`` and ``news_list`` list the IDs in
        row / column order (order of first appearance). Articles that were
        shown but never clicked still get a column.
    """
    user_index, news_index = {}, {}
    clicked_pairs = set()

    # Iterating the two columns directly avoids the per-row Series that
    # DataFrame.iterrows builds.
    for user, impressions in zip(behaviors['User_ID'], behaviors['Impressions']):
        user_idx = user_index.setdefault(user, len(user_index))

        # Process each impression to extract news ID and whether it was clicked
        for imp in impressions.split():
            # partition tolerates tokens without a "-<label>" suffix; such
            # tokens are registered as news but treated as not clicked.
            news_id, _, clicked = imp.partition('-')
            news_idx = news_index.setdefault(news_id, len(news_index))

            if clicked == '1':  # Store only clicked interactions
                # A set keeps one entry per (user, news) pair. csr_matrix sums
                # duplicate coordinates, which would make the matrix
                # non-binary and could overflow int8.
                clicked_pairs.add((user_idx, news_idx))

    # reshape keeps the (0, 2) shape valid when there are no clicks at all.
    pairs = np.array(sorted(clicked_pairs), dtype=np.int64).reshape(-1, 2)
    data = np.ones(len(pairs), dtype=np.int8)  # Use ones for interactions
    interaction_matrix = csr_matrix((data, (pairs[:, 0], pairs[:, 1])),
                                    shape=(len(user_index), len(news_index)))
    return interaction_matrix, list(user_index), list(news_index)

In [None]:
class CollaborativeFilteringRecommender:
    """Memory-based collaborative filtering on a sparse user x news click matrix.

    ``method="user"`` scores articles by the similarity-weighted clicks of the
    most similar users; ``method="item"`` scores articles by their similarity
    to the articles the user already clicked. Similarities are cosine
    similarities computed once at construction time.
    """

    def __init__(self, behaviors, interaction_matrix, users, news_list, method="user", top_n=5,
                 n_neighbors=5):
        """Store the data and precompute the similarity matrix.

        Args:
            behaviors: Accepted for interface compatibility; not used.
            interaction_matrix: Sparse matrix, rows = users, columns = news.
            users: User IDs in row order.
            news_list: News IDs in column order.
            method: ``"user"`` or ``"item"``.
            top_n: Number of recommendations returned by ``recommend``.
            n_neighbors: Number of most-similar users consulted by the
                user-based method.

        Raises:
            ValueError: If ``method`` is not ``"user"`` or ``"item"``.
        """
        self.interaction_matrix = interaction_matrix
        self.users = users
        self.news_list = news_list
        self.method = method
        self.top_n = top_n
        self.n_neighbors = n_neighbors
        # O(1) user lookup; setdefault keeps the first position of an ID,
        # matching what list.index would return.
        self._user_index = {}
        for idx, user in enumerate(users):
            self._user_index.setdefault(user, idx)
        self.similarity_matrix = self.compute_similarity()

    # Compute similarity matrix
    def compute_similarity(self):
        """Return the dense cosine-similarity matrix for the configured method.

        Raises:
            ValueError: If ``self.method`` is not ``"user"`` or ``"item"``.
        """
        if self.method == "user":
            # Compute user similarity
            return cosine_similarity(self.interaction_matrix)
        if self.method == "item":
            # Transpose so rows are items, giving item-item similarity
            return cosine_similarity(self.interaction_matrix.T)
        raise ValueError(f"Unknown method {self.method!r}; expected 'user' or 'item'")

    def _user_based_scores(self, user_idx):
        """Score every item by the similarity-weighted clicks of the nearest users."""
        sims = np.array(self.similarity_matrix[user_idx], dtype=float).ravel()
        # Exclude the user explicitly: with ties (identical users) or an
        # all-zero row, the user is not guaranteed to sort first.
        sims[user_idx] = -np.inf
        nearest = np.argsort(-sims, kind="stable")[:self.n_neighbors]
        # Users with zero similarity carry no signal about this user's taste.
        neighbours = [j for j in nearest if sims[j] > 0]
        if not neighbours:
            return np.zeros(self.interaction_matrix.shape[1])
        weights = sims[neighbours]
        return np.asarray(self.interaction_matrix[neighbours].T.dot(weights)).ravel()

    def _rank_unseen(self, scores, seen, positive_only):
        """Return up to ``top_n`` news IDs by descending score, skipping ``seen``."""
        picks = []
        for item_idx in np.argsort(-scores, kind="stable"):
            if len(picks) >= self.top_n:
                break
            if positive_only and scores[item_idx] <= 0:
                break  # sorted descending, so nothing useful remains
            if item_idx in seen:
                continue
            picks.append(self.news_list[item_idx])
        return picks

    # Generate recommendations for a user
    def recommend(self, user_id):
        """Return up to ``top_n`` news IDs the user has not clicked yet.

        Args:
            user_id: ID of the user to recommend for.

        Returns:
            List of news IDs, best first. Empty if the user is unknown or has
            no row in the interaction matrix. The user-based method only
            returns articles clicked by at least one similar user.
        """
        user_idx = self._user_index.get(user_id)
        if user_idx is None or user_idx >= self.interaction_matrix.shape[0]:
            return []

        user_interactions = self.interaction_matrix[user_idx, :]  # Get interactions of the user
        interacted_items = set(user_interactions.indices)

        if self.method == "user":
            scores = self._user_based_scores(user_idx)
            return self._rank_unseen(scores, interacted_items, positive_only=True)

        # Item-based: sum the similarity rows of the items the user clicked.
        scores = np.asarray(user_interactions.dot(self.similarity_matrix)).ravel()
        return self._rank_unseen(scores, interacted_items, positive_only=False)


In [None]:
# Load the impression logs of the MINDsmall training split, then turn them
# into a sparse user x news click matrix. `users` and `news_list` hold the
# IDs that correspond to the matrix rows and columns.
behaviors = load_behaviors(os.path.join(path, "MINDsmall_train", "behaviors.tsv"))
interaction_matrix, users, news_list = build_interaction_matrix(behaviors)

In [None]:
# Initialize recommender for user-based and item-based filtering.
# Each constructor computes its cosine-similarity matrix immediately, so this
# cell does the expensive work; top_n is left at its default of 5.
user_cf = CollaborativeFilteringRecommender(behaviors, interaction_matrix, users, news_list, method="user")
item_cf = CollaborativeFilteringRecommender(behaviors, interaction_matrix, users, news_list, method="item")

In [None]:
# Get recommendations for a sample user: the first user in the recommender's
# ID list (row 0 of the interaction matrix), compared across both methods.
sample_user_id = user_cf.users[0]
print(f"User-Based CF Recommendations for user {sample_user_id}: {user_cf.recommend(sample_user_id)}")
print(f"Item-Based CF Recommendations for user {sample_user_id}:  {item_cf.recommend(sample_user_id)}")

User-Based CF Recommendations for user U8125: ['N8595', 'N8400', 'N60445', 'N10859', 'N3663']
Item-Based CF Recommendations for user U8125:  ['N10859', 'N9836', 'N58549', 'N64858', 'N8595']


## Matrix Factorization

Matrix Factorization (MF) is one of the most widely used techniques in Collaborative Filtering for recommendation systems. It decomposes the large, sparse user-item interaction matrix into the product of two lower-dimensional matrices:

$$
R \approx U \cdot V^T
$$

- **$R$**: Original user-item interaction matrix (e.g., clicks, ratings).
- **$U \in \mathbb{R}^{n \times k}$**: Latent matrix representing *users*.
- **$V \in \mathbb{R}^{m \times k}$**: Latent matrix representing *items* (e.g., news articles).
- **$k$**: Number of latent features (a tunable hyperparameter).

The goal is to learn latent vectors for users and items such that the **dot product** between a user's vector and an item's vector approximates the observed interaction. These latent representations capture hidden preferences and properties that aren't explicitly available in the dataset.

#### 🔧 Training Objective

We optimize the latent vectors by minimizing the difference between predicted and actual interactions:

$$
\mathcal{L} = \sum_{(u,i) \in \text{observed}} \left[ (R_{ui} - \hat{R}_{ui})^2 + \lambda\left(\|U_u\|^2 + \|V_i\|^2\right) \right]
$$

- **$\hat{R}_{ui} = U_u \cdot V_i^T$**: Predicted interaction score.
- **$\lambda$**: Regularization parameter to prevent overfitting.

In this notebook, we implement a matrix factorization recommender class that supports two popular techniques:

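- **SVD (Singular Value Decomposition)**: A linear dimensionality reduction method for decomposing the interaction matrix. The implementation below uses scikit-learn's `TruncatedSVD`, which factorizes the whole matrix (unclicked entries count as zeros) and does not apply the regularization term above.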
- **ALS (Alternating Least Squares)**: A scalable algorithm particularly suited for implicit feedback like clicks.


In [None]:
!pip install implicit



In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares

In [None]:
class MatrixFactorizationRecommender:
    """Matrix-factorization recommender backed by TruncatedSVD or implicit ALS.

    The model is fitted at construction time on a sparse user x news click
    matrix. The ALS path targets the ``implicit`` >= 0.5 API, where ``fit``
    takes a user x item matrix and ``recommend`` returns ``(ids, scores)``.
    """

    def __init__(self, behaviors, interaction_matrix, users, news_list, method="SVD", n_components=20,
                 iterations=15, random_state=42):
        """Store the data and fit the chosen model.

        Args:
            behaviors: Accepted for interface compatibility; not used.
            interaction_matrix: Sparse matrix, rows = users, columns = news.
            users: User IDs in row order.
            news_list: News IDs in column order.
            method: ``"SVD"`` or ``"ALS"``.
            n_components: Number of latent factors.
            iterations: Number of ALS iterations (ignored for SVD).
            random_state: Seed passed to the model so results are repeatable.

        Raises:
            ValueError: If ``method`` is not ``"SVD"`` or ``"ALS"``.
        """
        self.interaction_matrix = interaction_matrix
        self.users = users
        self.news_list = news_list
        self.method = method
        self.n_components = n_components
        self.iterations = iterations
        self.random_state = random_state
        # O(1) user lookup; setdefault keeps the first position of an ID,
        # matching what list.index would return.
        self._user_index = {}
        for idx, user in enumerate(users):
            self._user_index.setdefault(user, idx)
        self._user_items = None  # float32 CSR copy, only built for ALS
        self.model = None
        self.fit()

    # Fit the selected matrix factorization model
    def fit(self):
        """Fit ``self.model`` on the interaction matrix.

        Raises:
            ValueError: If ``self.method`` is not ``"SVD"`` or ``"ALS"``.
        """
        if self.method == "SVD":
            self.model = TruncatedSVD(n_components=self.n_components, random_state=self.random_state)
            self.model.fit(self.interaction_matrix)
        elif self.method == "ALS":
            # implicit >= 0.5 expects users as rows (no transpose) and works
            # on floating-point confidence values.
            self._user_items = csr_matrix(self.interaction_matrix, dtype=np.float32)
            self.model = AlternatingLeastSquares(factors=self.n_components, iterations=self.iterations,
                                                 random_state=self.random_state)
            self.model.fit(self._user_items)
        else:
            raise ValueError(f"Unknown method {self.method!r}; expected 'SVD' or 'ALS'")

    # Generate recommendations for a given user
    def recommend(self, user_id, top_n=5):
        """Return up to ``top_n`` news IDs the user has not clicked yet.

        Args:
            user_id: ID of the user to recommend for.
            top_n: Maximum number of recommendations.

        Returns:
            List of news IDs, best first. Empty if the user is unknown or has
            no row in the interaction matrix.
        """
        user_idx = self._user_index.get(user_id)
        if user_idx is None or user_idx >= self.interaction_matrix.shape[0]:
            return []

        if self.method == "ALS":
            # The second argument is the user's own row; implicit uses it to
            # filter out items the user already clicked.
            item_ids, _ = self.model.recommend(user_idx, self._user_items[user_idx], N=top_n,
                                               filter_already_liked_items=True)
            return [self.news_list[i] for i in item_ids]

        user_row = self.interaction_matrix[user_idx]
        # Project only this user's row instead of the whole matrix, then map
        # back to item space to get a score for every item.
        latent = self.model.transform(user_row)
        scores = np.asarray(latent @ self.model.components_).ravel()

        # Exclude already interacted items
        interacted_items = set(user_row.indices)
        recommendations = []
        for item_idx in np.argsort(-scores, kind="stable"):
            if len(recommendations) >= top_n:
                break
            if item_idx in interacted_items:
                continue
            recommendations.append(self.news_list[item_idx])
        return recommendations


In [None]:
# Same load-and-build steps as in the collaborative-filtering section:
# re-running them rebinds `behaviors`, `interaction_matrix`, `users` and
# `news_list` for the matrix-factorization cells below.
behaviors = load_behaviors(os.path.join(path, "MINDsmall_train", "behaviors.tsv"))
interaction_matrix, users, news_list = build_interaction_matrix(behaviors)

In [None]:
# Initialize recommender on the top-left 1000 x 1000 corner of the matrix.
# The ID lists are truncated to the same length so that every user and news
# ID the recommender knows about has a matching row / column in the slice.
MFrecommender = MatrixFactorizationRecommender(
    behaviors,
    interaction_matrix[:1000, :1000],
    users[:1000],
    news_list[:1000],
    method="SVD",
)

In [None]:
# Get recommendations for a sample user: the first user in the recommender's
# ID list (row 0 of the interaction matrix). top_n is left at its default of 5.
sample_user_id = MFrecommender.users[0]
print(f"Matrix Factorization Recommendations for user {sample_user_id}: {MFrecommender.recommend(sample_user_id)}")

Matrix Factorization Recommendations for user U13740: ['N52622', 'N18708', 'N43502', 'N7821', 'N13930']
