## Matrix Factorization

In [73]:
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import accuracy
from collections import defaultdict
from surprise.model_selection import GridSearchCV
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
import pickle


In [47]:
ratings_df = pd.read_csv('../data/ratings.csv')
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [48]:
ratings_df['rating'] = ratings_df['rating']*2
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,8.0,964982703
1,1,3,8.0,964981247
2,1,6,8.0,964982224
3,1,47,10.0,964983815
4,1,50,10.0,964982931


In [49]:
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

In [50]:
model = SVD()
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1a9e29259d0>

In [51]:
predictions = model.test(testset)
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

RMSE: 1.7666
MAE:  1.3581


In [53]:
param_grid = {
    'n_factors': [50, 100, 200, 300],
    'lr_all': [0.002, 0.005, 0.01],
    'reg_all': [0.02, 0.1]
}

gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=10)
gs.fit(data)

print(gs.best_score)
print(gs.best_params)

{'rmse': 1.6861607202169986, 'mae': 1.2918686000737443}
{'rmse': {'n_factors': 300, 'lr_all': 0.01, 'reg_all': 0.1}, 'mae': {'n_factors': 300, 'lr_all': 0.01, 'reg_all': 0.1}}


In [56]:
def accuracy_within_threshold(predictions, threshold=1.0):
    y_true = np.array([pred.r_ui for pred in predictions])
    y_pred = np.array([pred.est for pred in predictions])
    return np.mean(np.abs(y_true - y_pred) <= threshold)

# After model.test()
acc_within_2 = accuracy_within_threshold(predictions, threshold=2.0)
print(f"Accuracy within 2.0: {acc_within_2:.4f}")

acc_within_1 = accuracy_within_threshold(predictions, threshold=1.0)
print(f"Accuracy within 1.0: {acc_within_1:.4f}")

acc_within_0_5 = accuracy_within_threshold(predictions, threshold=0.5)
print(f"Accuracy within 0.5: {acc_within_0_5:.4f}")


Accuracy within 2.0: 0.7717
Accuracy within 1.0: 0.4659
Accuracy within 0.5: 0.2469


In [69]:
user_id = 7

# All movie IDs
all_movie_ids = ratings_df['movieId'].unique()

# Movies the user has already rated
rated_movie_ids = ratings_df[ratings_df['userId'] == user_id]['movieId'].values

# Movies not rated by the user
unrated_movie_ids = [mid for mid in all_movie_ids if mid not in rated_movie_ids]


In [70]:
# Ensure user_id and movieId are passed as strings (Surprise expects strings)
predictions = [
    model.predict(str(user_id), str(movie_id))
    for movie_id in unrated_movie_ids
]


In [72]:
predictions[:10]

[Prediction(uid='7', iid='3', r_ui=None, est=7.006458570932712, details={'was_impossible': False}),
 Prediction(uid='7', iid='6', r_ui=None, est=7.006458570932712, details={'was_impossible': False}),
 Prediction(uid='7', iid='47', r_ui=None, est=7.006458570932712, details={'was_impossible': False}),
 Prediction(uid='7', iid='70', r_ui=None, est=7.006458570932712, details={'was_impossible': False}),
 Prediction(uid='7', iid='101', r_ui=None, est=7.006458570932712, details={'was_impossible': False}),
 Prediction(uid='7', iid='110', r_ui=None, est=7.006458570932712, details={'was_impossible': False}),
 Prediction(uid='7', iid='151', r_ui=None, est=7.006458570932712, details={'was_impossible': False}),
 Prediction(uid='7', iid='157', r_ui=None, est=7.006458570932712, details={'was_impossible': False}),
 Prediction(uid='7', iid='163', r_ui=None, est=7.006458570932712, details={'was_impossible': False}),
 Prediction(uid='7', iid='216', r_ui=None, est=7.006458570932712, details={'was_impossib

In [74]:
with open("svd_model.pkl", "wb") as f:
    pickle.dump(model, f)
