## Collaborative Filtering with Cold Start
### Model inspried by: Ibtesam Ahmed
#### Link: https://www.kaggle.com/code/ibtesama/getting-started-with-a-movie-recommendation-system/notebook


##### Using SVD learning model and including user cold start scenario
##### Change cold start user to 2
##### Gridsearch

In [19]:
import pandas as pd
from surprise import Reader, Dataset, SVD, accuracy
from surprise.model_selection import GridSearchCV

In [20]:
# Load dataset
ratings = pd.read_csv('../tmdb/ratings_small.csv')
reader = Reader(rating_scale=(0.5, 5.0))

In [21]:
# Choose a user to simulate cold start
user_id = 2
user_ratings = ratings[ratings['userId'] == user_id]

# Keep only 5 ratings for the user, remove the rest
keep_ratings = user_ratings.sample(n=5, random_state=42)   # the 5 ratings we keep
remove_ratings = user_ratings.drop(keep_ratings.index)      # the rest we remove

# Remove the other ratings from the dataset for training
train_ratings = ratings.drop(remove_ratings.index)

In [22]:
# Number of movies user rated
print(len(user_ratings))

76


In [23]:
# cold start information
keep_ratings

Unnamed: 0,userId,movieId,rating,timestamp
24,2,50,4.0,835355586
55,2,349,4.0,835355441
30,2,153,4.0,835355441
20,2,10,4.0,835355493
65,2,382,3.0,835356165


In [26]:
# Load the training set into Surprise
train_data = Dataset.load_from_df(train_ratings[['userId','movieId','rating']], reader)
trainset = train_data.build_full_trainset()

In [27]:
# Grid search for hyperparameter tuning
param_grid = {
    'n_factors': [20, 50, 100],
    'n_epochs': [20, 30],
    'lr_all': [0.002, 0.005],
    'reg_all': [0.02, 0.05]
}

gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)


gs.fit(trainset)

print("Best RMSE:", gs.best_score['rmse'])
print("Best MAE:", gs.best_score['mae'])
print("Best hyperparameters:", gs.best_params['rmse'])

AttributeError: 'Trainset' object has no attribute 'raw_ratings'

In [None]:
# Train the SVD model
best_params = gs.best_params['rmse']
svd = SVD(
    n_factors=best_params['n_factors'],
    n_epochs=best_params['n_epochs'],
    lr_all=best_params['lr_all'],
    reg_all=best_params['reg_all']
)

trainset = trainset.build_full_trainset()
svd.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7c78c8488260>

In [17]:
# Predict the removed ratings for the user
predictions = []
for movie_id in remove_ratings['movieId']:
    true_rating = ratings[(ratings['userId'] == user_id) & (ratings['movieId'] == movie_id)]['rating'].values[0]
    pred = svd.predict(user_id, movie_id, r_ui=true_rating)
    predictions.append(pred)
    print(pred)

user: 2          item: 17         r_ui = 5.00   est = 3.92   {'was_impossible': False}
user: 2          item: 39         r_ui = 5.00   est = 3.55   {'was_impossible': False}
user: 2          item: 47         r_ui = 4.00   est = 4.18   {'was_impossible': False}
user: 2          item: 52         r_ui = 3.00   est = 3.65   {'was_impossible': False}
user: 2          item: 62         r_ui = 3.00   est = 4.06   {'was_impossible': False}
user: 2          item: 110        r_ui = 4.00   est = 4.31   {'was_impossible': False}
user: 2          item: 144        r_ui = 3.00   est = 3.65   {'was_impossible': False}
user: 2          item: 150        r_ui = 5.00   est = 4.01   {'was_impossible': False}
user: 2          item: 161        r_ui = 3.00   est = 4.04   {'was_impossible': False}
user: 2          item: 165        r_ui = 3.00   est = 3.61   {'was_impossible': False}
user: 2          item: 168        r_ui = 3.00   est = 3.53   {'was_impossible': False}
user: 2          item: 185        r_ui = 3.

In [18]:
# Evaluate performance on these predictions
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

RMSE: 0.9485
MAE:  0.7538
