In [2]:
pip install surprise


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
import json
import os

import pandas as pd
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise import \
SVD, \
NMF, \
CoClustering

In [4]:
# Read the training split and keep only the user, movie and rating columns.
train_columns = ['userId', 'movieId', 'rating']
df_train = pd.read_csv('dataset_split/training_set.csv').loc[:, train_columns]

# Preview the first rows.
df_train.head()


Unnamed: 0,userId,movieId,rating
0,1,3578,5.0
1,1,2268,4.0
2,1,5060,5.0
3,1,2944,5.0
4,1,2137,5.0


In [5]:
# Declare the rating scale handed to Surprise.
# NOTE(review): confirm the data contains no ratings below 1 (e.g. 0.5 steps);
# presumably Surprise bounds its predictions to this range - verify.
rating_bounds = (1, 5)
reader = Reader(rating_scale=rating_bounds)

In [6]:
# Wrap the training frame in a Surprise dataset, then build a trainset from all of it.
train_data = Dataset.load_from_df(df_train, reader)
trainset = train_data.build_full_trainset()

In [7]:
# pick algorithm for training

# Fixed seed so the randomly initialised models give the same result on every
# Restart & Run All.
SEED = 42

model_svd = SVD(random_state=SEED)
model_nmf = NMF(random_state=SEED)
model_cocluster = CoClustering(random_state=SEED)

In [8]:
# training the models

# Fit SVD on the trainset built above; fit() hands back the estimator, whose
# repr is displayed as this cell's output.
model_svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x127aeac10>

In [9]:
# Fit the NMF model on the same trainset; the returned estimator is the cell output.
model_nmf.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x127aeab90>

In [10]:
# Fit the CoClustering model on the same trainset; the returned estimator is the cell output.
model_cocluster.fit(trainset)

<surprise.prediction_algorithms.co_clustering.CoClustering at 0x122a41890>

In [11]:
# Read the held-out split (stored as testing_set.csv) that is used for validation,
# keeping only the user, movie and rating columns.
validation_columns = ['userId', 'movieId', 'rating']
df_validate = pd.read_csv('dataset_split/testing_set.csv').loc[:, validation_columns]

# Preview the first rows.
df_validate.head()

Unnamed: 0,userId,movieId,rating
0,1,157,5.0
1,1,362,5.0
2,1,457,5.0
3,1,543,4.0
4,1,590,4.0


In [13]:
# Per-algorithm accumulators; every entry is [userId, movieId, predicted_rating].
predicted_attributes_svd = []
predicted_attributes_nmf = []
predicted_attributes_cocluster = []


def get_predicted_ratings(x, algo):
  """Predict one rating with the chosen trained model and record it.

  Args:
    x: One row of the validation data. Position 0 holds the user id and
      position 1 the movie id. May be a pandas Series (as produced by
      ``DataFrame.apply(..., axis=1)``) or any plain sequence.
    algo: Name of the model to use: "SVD", "NMF" or "CoClustering".

  Returns:
    None. The record ``[userId, movieId, predicted_rating]`` is appended to
    the module-level list that matches ``algo``.

  Raises:
    ValueError: If ``algo`` is not one of the supported names.
  """
  # Resolve only the requested model so the others need not exist yet.
  if algo == "SVD":
    model, sink = model_svd, predicted_attributes_svd
  elif algo == "NMF":
    model, sink = model_nmf, predicted_attributes_nmf
  elif algo == "CoClustering":
    model, sink = model_cocluster, predicted_attributes_cocluster
  else:
    raise ValueError(
        f"Unknown algorithm {algo!r}; expected 'SVD', 'NMF' or 'CoClustering'")

  # Integer keys on a label-indexed Series are deprecated positional access,
  # so go through .iloc when it exists; plain sequences are indexed directly.
  row = getattr(x, "iloc", x)
  prediction = model.predict(row[0], row[1])

  # Ids are cast to int because a row taken from a mixed int/float frame
  # carries them as floats.
  sink.append([int(prediction.uid), int(prediction.iid), prediction.est])

In [14]:
# predict ratings for the validation set using the trained SVD model

# Empty the accumulator first so re-running this cell does not append a second
# copy of every prediction.
predicted_attributes_svd.clear()
df_validate.apply(lambda x: get_predicted_ratings(x, "SVD"), axis=1)

df_result_svd = pd.DataFrame.from_records(predicted_attributes_svd,
                                      columns=['userId', 'movieId', 'predicted_rating'])
# Assign by position: a length mismatch then raises instead of being hidden as
# NaN by index alignment.
df_result_svd['rating'] = df_validate['rating'].to_numpy()

df_result_svd.head()

Unnamed: 0,userId,movieId,predicted_rating,rating
0,1,157,3.900461,5.0
1,1,362,4.216504,5.0
2,1,457,4.831566,5.0
3,1,543,4.320072,4.0
4,1,590,4.252876,4.0


In [15]:
# predict ratings for the validation set using the trained NMF model

# Empty the accumulator first so re-running this cell does not append a second
# copy of every prediction.
predicted_attributes_nmf.clear()
df_validate.apply(lambda x: get_predicted_ratings(x, "NMF"), axis=1)

df_result_nmf = pd.DataFrame.from_records(predicted_attributes_nmf,
                                      columns=['userId', 'movieId', 'predicted_rating'])
# Assign by position: a length mismatch then raises instead of being hidden as
# NaN by index alignment.
df_result_nmf['rating'] = df_validate['rating'].to_numpy()

df_result_nmf.head()

Unnamed: 0,userId,movieId,predicted_rating,rating
0,1,157,3.201665,5.0
1,1,362,3.930156,5.0
2,1,457,4.612365,5.0
3,1,543,4.011487,4.0
4,1,590,4.099457,4.0


In [16]:
# predict ratings for the validation set using the trained CoClustering model

# Empty the accumulator first so re-running this cell does not append a second
# copy of every prediction.
predicted_attributes_cocluster.clear()
df_validate.apply(lambda x: get_predicted_ratings(x, "CoClustering"), axis=1)

df_result_cocluster = pd.DataFrame.from_records(predicted_attributes_cocluster,
                                      columns=['userId', 'movieId', 'predicted_rating'])
# Assign by position: a length mismatch then raises instead of being hidden as
# NaN by index alignment.
df_result_cocluster['rating'] = df_validate['rating'].to_numpy()

df_result_cocluster.head()

Unnamed: 0,userId,movieId,predicted_rating,rating
0,1,157,3.381599,5.0
1,1,362,4.267873,5.0
2,1,457,4.785016,5.0
3,1,543,4.015373,4.0
4,1,590,4.784376,4.0


In [17]:
# save predictions to file

# Create the output folder first; to_csv does not create missing directories.
os.makedirs('predictions', exist_ok=True)

# index=False keeps the row index out of the written files.
df_result_svd.to_csv('predictions/prediction_svd.csv', index=False)

df_result_nmf.to_csv('predictions/prediction_nmf.csv', index=False)

df_result_cocluster.to_csv('predictions/prediction_cocluster.csv', index=False)