# CF Model
## Train implicit collaborative filtering model using weighted Alternating Least Squares


In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import scipy, implicit
print("SciPy:", scipy.__version__)
print("implicit:", implicit.__version__)

  from .autonotebook import tqdm as notebook_tqdm


SciPy: 1.11.3
implicit: 0.7.2


In [None]:
#import ratings 
usr = pd.read_csv('data/user_ratings.csv')
usr.head()

Unnamed: 0,BGGId,Rating,Username
0,213788,8.0,Tonydorrf
1,213788,8.0,tachyon14k
2,213788,8.0,Ungotter
3,213788,8.0,brainlocki3
4,213788,8.0,PPMP


In [3]:
ratings_per_users = usr.groupby('Username').size()
ratings_per_movies = usr.groupby('BGGId').size()

In [5]:
usr['MappedRating'] = np.where(usr['Rating'] <= 4, -1,
                       np.where(usr['Rating'] <= 7, 0, 1))

In [None]:
# only keep users with at least 5 ratings
keep = ratings_per_users[ratings_per_users >= 5].index
usr = usr[usr['Username'].isin(keep)]

In [None]:
# map user and games to indices
user_map = {u: i for i, u in enumerate(usr['Username'].unique())}
game_map = {g: j for j, g in enumerate(usr['BGGId'].unique())}

rows = usr['Username'].map(user_map)
cols = usr['BGGId'].map(game_map)


(18696365,)

In [None]:
from scipy.sparse import csr_matrix
from scipy.sparse import coo_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from implicit.als import AlternatingLeastSquares

#funcion to create the COO Sparse Matrix that will be used in matrix factorization
def create_A_matrix(users, items, ratings):
    
    # sort to ensure that alignment of Game IDs between CF, CBF, and LLM models.
    unique_users = np.sort(users.unique())
    unique_items = np.sort(items.unique())
    user_map = {user: i for i, user in enumerate(unique_users)}
    item_map = {item: i for i, item in enumerate(unique_items)}
    rows = users.map(user_map)
    cols = items.map(item_map)

    #mask = ratings.notna()
    A = coo_matrix((ratings, (rows,cols)), shape=(len(user_map), len(item_map)))

    return A, user_map, item_map


A, user_map, item_map = create_A_matrix(usr['Username'], usr['BGGId'], usr['Rating'])
rows, cols, data = A.row, A.col, A.data
indices = np.arange(len(data))

#create train and test sets by data index, not by row.
train, test = train_test_split(indices, test_size=0.05, random_state=42)

train_rows, train_cols, train_data = rows[train], cols[train], data[train]
#val_rows, val_cols, val_data = rows[test_index], cols[test_index], data[test_index]

# create training matrix
A_train = coo_matrix((train_data, (train_rows, train_cols)), shape=A.shape)


In [None]:
# my own implementation of ALS, did not end up using because it ran too slowly, and we decided to go with implicit CF instead of explicit
def als(R, k=2, W= None, lambda_=0.1, n_iters=10 ):
    m, n = R.shape

    #Global mean
    mu = R.data.mean()
    
    # Initialize parameters
    U = np.random.normal(scale=1./k, size=(m, k))
    V = np.random.normal(scale=1./k, size=(n, k))
    b_user = np.zeros(m)
    b_item = np.zeros(n)

    #create two versions for row and column operations
    R_csr = R.tocsr()
    R_csc = R.tocsc()


    if W is None:
        W = csr_matrix(R_csr.copy())
        W.data[:] = 1.0
    W_csr = W.tocsr()
    W_csc = W.tocsc()

    nonzero_users = np.unique(R_csr.nonzero()[0])
    nonzero_items = np.unique(R_csc.nonzero()[1])
    for _ in range(n_iters):
        #print(f'Iteration {_+1}/{n_iters}')
        # Update user features
        for i in nonzero_users:
            start, end = R_csr.indptr[i], R_csr.indptr[i+1]
            idx  = R_csr.indices[start:end]
            r_i = R_csr.data[start:end] 
            w_i = W_csr.data[start:end]
            if len(idx) > 0:
                V_i = V[idx, :]
                W_i = np.diag(w_i)
                r_i_centered = r_i - mu - b_user[i] - b_item[idx]
                A = V_i.T @ W_i @ V_i + lambda_ * np.eye(k)
                b = V_i.T @ (W_i @ r_i_centered)
                U[i] = np.linalg.solve(A,b)

                #update user bias
                pred = V_i @ U[i]
                b_user[i] = np.average(r_i - pred - mu - b_item[idx], weights = w_i)

        # Update item features
        for j in nonzero_items:
            start, end = R_csc.indptr[j], R_csc.indptr[j+1]
            idx  = R_csc.indices[start:end]            
            r_j = R_csc.data[start:end]
            w_j = W_csc.data[start:end]
            if len(idx) > 0:
                U_j = U[idx,:]
                W_j = np.diag(w_j)
                r_j_centered = r_j - mu - b_user[idx] - b_item[j]
                A = U_j.T @  W_j @ U_j + lambda_ * np.eye(k)
                b = U_j.T @ (W_j @ r_j_centered)
                V[j] = np.linalg.solve(A,b)

                #udate item bias    
                pred = U_j @ V[j]
                b_item[j] = np.average(r_j - pred - mu - b_user[idx], weights = w_j)
    return U, V, b_user, b_item, mu

def rmse(y_true, y_pred):
    return np.sqrt(np.mean((y_true - y_pred) ** 2))

def mae(y_true, y_pred):
    return np.mean(np.abs(y_true - y_pred))

In [16]:
def weight_matrix(weight_dict, A):
    W = csr_matrix(A.copy())
    W.data = np.array([weight_dict.get(rating, 1.0) for rating in A.data])
    return W

In [61]:
def build_confidence_matrix(R, alpha = 20, r_min = 1, r_max = 10, gamma = 1.0):
    R_scaled = R.copy().astype(np.float32)
    R_scaled.data = np.clip((R_scaled.data - r_min) / (r_max - r_min), 0, 1)
    if gamma != 1.0:
        R_scaled.data = R_scaled.data ** gamma

    C = R_scaled.tocsr()
    C.data = 1 + alpha * C.data
    return C 

In [None]:
# first round of grid search for hyperparameter tuning
from implicit.als import AlternatingLeastSquares 
from implicit.evaluation import mean_average_precision_at_k, ndcg_at_k , precision_at_k, ranking_metrics_at_k, leave_k_out_split
import random 
np.random.seed(42)
import itertools
import time 

k_values = [32,64, 128]
lambda_values = [0.0001, 0.01, 1.0]
n_iters_values = [20]
best_score = -np.inf
alphas = [50, 100]
gammas = [1.0,2.0]
best_params = None
results = []
keep = ratings_per_users[ratings_per_users >= 10].index

sample_user_indices = [user_map[u] for u in keep.unique()]

# For faster computation during hyperparameter tuning, we can sample a subset of users
R = A_train.tocsr()[sample_user_indices, :].tocoo()

#R.data = np.where(R.data > 0, 1.0, 0.0)  # implicit feedback: liked or not
grid = random.sample(list(itertools.product(k_values, lambda_values, n_iters_values, alphas)) , 20)

results = []
# Grid search loop
for k, lambda_, n_iters, alpha in grid:
    print(f"\nEvaluating for k={k}, lambda={lambda_}, n_iters={n_iters}, alpha={alpha}")
    t0 = time.time()
    map_at_ks = []
    ndcg_ks = []
    precision_ks = []
    ranking_ks = []
    for seed in range(3):
        train, val = leave_k_out_split(R, K=1, random_state=42+seed)
        model = AlternatingLeastSquares(factors = k,
                                        regularization=lambda_,
                                        iterations=n_iters)
        C = build_confidence_matrix(train, alpha=alpha)
        model.fit(C.T)
        ranking_k = ranking_metrics_at_k(model, train.T, val.T, K=10)
        map_at_ks.append(ranking_k['map'])
        ndcg_ks.append(ranking_k['ndcg'])
        precision_ks.append(ranking_k['precision'])
    
    results.append({'k': k, 'lambda': lambda_, 'n_iters': n_iters, 'alpha': alpha, 'map_at_10': np.mean(map_at_ks),
                    'ndcg_at_10': np.mean(ndcg_ks), 'precision_at_10': np.mean(precision_ks),
                    'runtime_min': (time.time() - t0) / 60})
    print(f"Mean MAP@10: {np.mean(map_at_ks):.4f}, NDCG@10: {np.mean(ndcg_ks):.4f}, Precision@10: {np.mean(precision_ks):.4f} (took {(time.time() - t0) / 60:.2f} min)")

results_df = pd.DataFrame(results).sort_values('map_at_10', ascending=False)
    

In [None]:
# second round of grid search
from implicit.nearest_neighbours import bm25_weight

# Re-weight implicit feedback
train, val = leave_k_out_split(R, K=5, random_state=42+seed)

X = bm25_weight(train, K1=1.2, B=0.75)

results = []
param_grid = [
    (128, 0.01, 25, 160, 3.0),
    (128, 0.03, 25, 160, 3.0),
    (128, 0.01, 25, 200, 3.0),
    (128, 0.01, 25, 160, 4.0),
    (192, 0.01, 20, 160, 3.0),
    (192, 0.03, 20, 160, 3.0),
    (192, 0.01, 20, 200, 3.0),
    (256, 0.03, 20, 160, 3.0),
    (256, 0.03, 20, 200, 3.0),
    (128, 0.01, 20, 160, 3.0)  # cg_steps=4
]

for k, lambda_, n_iters, alpha, gamma in param_grid:
    print(f"Evaluating k={k}, λ={lambda_}, iters={n_iters}, α={alpha}, gamma={gamma}")
    C = build_confidence_matrix(X, alpha=alpha, gamma = gamma)
    model = AlternatingLeastSquares(factors=k,
                                    regularization=lambda_,
                                    iterations=n_iters,
                                    random_state=42)
    model.fit(C.T)

    # Evaluation (user×item)
    train_ui = train.T.tocsr().astype(np.float32)
    val_ui   = val.T.tocsr().astype(np.float32)
    ranking_k = ranking_metrics_at_k(model, train_ui, val_ui, K=10)
    results.append((k, lambda_, n_iters, alpha,gamma, ranking_k['precision'], ranking_k['ndcg'], ranking_k['map']))

results_df = pd.DataFrame(results, columns = ['k','lambda','n_iters','alpha','gamma','precision@K','NDCG@K','MAP@K']).sort_values('MAP@K', ascending=False)


Evaluating k=128, λ=0.01, iters=25, α=160, gamma=3.0


100%|██████████| 25/25 [05:52<00:00, 14.08s/it]
100%|██████████| 20577/20577 [00:16<00:00, 1283.39it/s]


Evaluating k=128, λ=0.03, iters=25, α=160, gamma=3.0


100%|██████████| 25/25 [05:45<00:00, 13.82s/it]
100%|██████████| 20577/20577 [00:17<00:00, 1199.91it/s]


Evaluating k=128, λ=0.01, iters=25, α=200, gamma=3.0


100%|██████████| 25/25 [06:18<00:00, 15.14s/it]
100%|██████████| 20577/20577 [00:18<00:00, 1130.79it/s]


Evaluating k=128, λ=0.01, iters=25, α=160, gamma=4.0


100%|██████████| 25/25 [06:10<00:00, 14.83s/it]
100%|██████████| 20577/20577 [00:19<00:00, 1080.84it/s]


Evaluating k=192, λ=0.01, iters=20, α=160, gamma=3.0


100%|██████████| 20/20 [05:44<00:00, 17.21s/it]
100%|██████████| 20577/20577 [00:20<00:00, 996.42it/s] 


Evaluating k=192, λ=0.03, iters=20, α=160, gamma=3.0


100%|██████████| 20/20 [05:43<00:00, 17.17s/it]
100%|██████████| 20577/20577 [00:20<00:00, 993.60it/s] 


Evaluating k=192, λ=0.01, iters=20, α=200, gamma=3.0


100%|██████████| 20/20 [05:40<00:00, 17.03s/it]
100%|██████████| 20577/20577 [00:20<00:00, 1014.51it/s]


Evaluating k=256, λ=0.03, iters=20, α=160, gamma=3.0


100%|██████████| 20/20 [07:40<00:00, 23.02s/it]
100%|██████████| 20577/20577 [00:23<00:00, 879.86it/s]


Evaluating k=256, λ=0.03, iters=20, α=200, gamma=3.0


100%|██████████| 20/20 [07:35<00:00, 22.79s/it]
100%|██████████| 20577/20577 [00:22<00:00, 900.18it/s]


Evaluating k=128, λ=0.01, iters=20, α=160, gamma=3.0


100%|██████████| 20/20 [04:41<00:00, 14.06s/it]
100%|██████████| 20577/20577 [00:17<00:00, 1145.71it/s]


In [None]:
#here were the best parameters found through hyperparameter tuning:
best_k, best_lambda, best_n_iters, best_alpha, best_gamma = 256, 0.03, 25, 160, 3.0

A_test = coo_matrix((data[test], (rows[test], cols[test])), shape=A.shape)

# BM25 weighting for popularity boosted the precision@10 by 100%
X = bm25_weight(A_train, K1=1.2, B=0.75)

C_final = build_confidence_matrix(X, alpha=best_alpha, gamma=best_gamma)
model_final = AlternatingLeastSquares(factors=best_k,
                                      regularization=best_lambda,
                                      iterations=best_n_iters,
                                      random_state=42)
model_final.fit(C_final.T)
U_final_implicit, V_final_implicit = model_final.user_factors, model_final.item_factors

#validate on test
ranking = ranking_metrics_at_k(model_final, A_train.T.tocsr().astype(np.float32), A_test.T.tocsr().astype(np.float32), K=10)



100%|██████████| 25/25 [10:26<00:00, 25.05s/it]
100%|██████████| 21036/21036 [00:24<00:00, 869.54it/s]


In [None]:
# 0.05 on Precision@K, which means that 1/20 or so recommendations in the top 10 are relevant.
ranking

{'precision': 0.05451895475126513,
 'map': 0.02336318942389659,
 'ndcg': 0.04980932632808593,
 'auc': 0.5202069428600354}

In [None]:
# incorporate likes from a new user and recommend games to them.
def fold_in_implicit_user(V, liked_items, alpha=5, lambda_=0.03):
    """
    Compute a new user vector given items they've liked (implicit feedback).
    """
    V_i = V[liked_items]
    # confidence weights
    C_i = 1 + alpha * np.ones(len(liked_items), dtype=np.float32)
    
    A = V_i.T @ (C_i[:, None] * V_i) + lambda_ * np.eye(V.shape[1])
    b = V_i.T @ (C_i * np.ones(len(liked_items), dtype=np.float32))
    
    u_new = np.linalg.solve(A, b)
    return u_new

# calculates a score for every game and normalizes between 0 and 1. to be combined with
# CBF and LLM scores into a composite score.
def recommend_new_user(V, liked_items,alpha=5, lambda_=0.03):
    u = fold_in_implicit_user(V,liked_items=liked_items, alpha=alpha, lambda_=lambda_)

    #calculate scores
    scores = V.dot(u)

    #normalize between 0 and 1 
    scores = (scores - min(scores)) / (max(scores) - min(scores))

    return scores 


In [None]:
liked_items = np.array([25, 122, 562])
scores = recommend_new_user(V_final_implicit,liked_items)

scores 

(0.0, 1.0)

In [None]:
#Save final user and item embeddings into numpy file.
np.save('data/U_final.npy', U_final_implicit)
np.save('data/V_final.npy', V_final_implicit)