<h4>Imports</h4>

In [1]:
import pandas as pd
from surprise import SVD, Dataset, Reader, accuracy
from surprise.model_selection import train_test_split
import pickle

<h4>Data Loading</h4>

In [2]:
movies = pd.read_csv(r"D:\Class\Data-606\Movie-Recommendation-System\Data\ml-latest-small\movies.csv")
ratings = pd.read_csv(r"D:\Class\Data-606\Movie-Recommendation-System\Data\ml-latest-small\ratings.csv")

In [3]:
movies

Unnamed: 0,movieid,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [4]:
ratings

Unnamed: 0,userId,movieid,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [5]:
# Merge the two dataframes on movieId column
data = pd.merge(movies, ratings, on="movieid")

In [6]:
reader = Reader(rating_scale=(0.5, 5))

In [7]:
data

Unnamed: 0,movieid,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483
...,...,...,...,...,...,...
100831,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,184,4.0,1537109082
100832,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,184,3.5,1537109545
100833,193585,Flint (2017),Drama,184,3.5,1537109805
100834,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,184,3.5,1537110021


In [8]:
 dataset = Dataset.load_from_df(data[["userId", "movieid", "rating"]], reader)

In [9]:
trainset, testset = train_test_split(dataset, test_size=0.25)

In [10]:
algo = SVD()

In [11]:
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x177e1740eb0>

In [12]:
predictions = algo.test(testset)

In [13]:
print(f"RMSE: {accuracy.rmse(predictions)}")

RMSE: 0.8753
RMSE: 0.8752844205269817


In [14]:
predictions

[Prediction(uid=558, iid=5419, r_ui=5.0, est=3.5310694589846414, details={'was_impossible': False}),
 Prediction(uid=558, iid=296, r_ui=3.0, est=4.616718763203575, details={'was_impossible': False}),
 Prediction(uid=142, iid=344, r_ui=4.0, est=3.086818656398536, details={'was_impossible': False}),
 Prediction(uid=308, iid=527, r_ui=1.0, est=3.022475074201952, details={'was_impossible': False}),
 Prediction(uid=45, iid=588, r_ui=5.0, est=4.080105733309869, details={'was_impossible': False}),
 Prediction(uid=448, iid=33830, r_ui=2.0, est=3.2214794218705904, details={'was_impossible': False}),
 Prediction(uid=219, iid=3510, r_ui=1.0, est=2.9820729792917606, details={'was_impossible': False}),
 Prediction(uid=338, iid=190215, r_ui=1.5, est=2.6789939543682704, details={'was_impossible': False}),
 Prediction(uid=280, iid=4973, r_ui=4.0, est=4.418151187891, details={'was_impossible': False}),
 Prediction(uid=603, iid=1673, r_ui=5.0, est=4.08899566460861, details={'was_impossible': False}),
 P

In [15]:
algo.predict(1, 1)

Prediction(uid=1, iid=1, r_ui=None, est=4.058286890587489, details={'was_impossible': False})

In [16]:
from surprise.model_selection import GridSearchCV
# Define the parameter grid to search over
param_grid = {'n_factors': [50, 100, 200],
              'n_epochs': [20, 30, 40],
              'lr_all': [0.002, 0.005, 0.01],
              'reg_all': [0.02, 0.1, 0.4]}
 
# Creating a GridSearchCV object and fit it to the data
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=5)
gs.fit(dataset)

# Printing the best RMSE score and corresponding hyperparameters
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

0.8514720336514635
{'n_factors': 200, 'n_epochs': 40, 'lr_all': 0.01, 'reg_all': 0.1}


In [17]:
# Pickle the model object
with open('model.pkl', 'wb') as f:
    pickle.dump(algo, f)