In [25]:
import pandas as pd
from utils.load_dataset import load_dataset
from surprise import Dataset
from surprise import Reader
from pathlib import Path
from surprise import SVD, NMF, KNNBasic
from surprise import accuracy
from surprise.model_selection import KFold, train_test_split
from surprise.model_selection import cross_validate

In [9]:
# Seed handed to the user sampling step so the same subset is drawn on every run.
RANDOM_STATE = 42
# Number of distinct users to keep when subsampling the ratings table.
USERS_NUMBER = 10_000

In [10]:
# Load the dataset bundle and keep a handle on its ratings table
# (presumably a mapping of table name -> DataFrame; verify against load_dataset).
ds_df = load_dataset()
ratings_all = ds_df['ratings']

# Draw USERS_NUMBER distinct user ids with a fixed seed, then keep only the
# ratings that belong to those users.
users = ratings_all['UserID'].drop_duplicates().sample(n=USERS_NUMBER, random_state=RANDOM_STATE)
ds_df_filter = ratings_all.loc[ratings_all['UserID'].isin(users)]

# Fraction of all ratings that survives the user subsampling (shown as the cell output).
len(ds_df_filter) / len(ratings_all)

0.14446593433054805

In [11]:
# Show the subsampled ratings; the recorded output below is an abbreviated view
# (first and last rows with an elided middle), not the whole frame.
ds_df_filter

Unnamed: 0,UserID,MovieID,Rating,Timestamp
4435,40,34,5.0,945889233
4436,40,36,4.0,945889346
4437,40,50,5.0,945889117
4438,40,150,3.0,945889313
4439,40,174,3.0,945876902
...,...,...,...,...
9999946,71565,3789,4.0,974295682
9999947,71565,3808,4.0,974295467
9999948,71565,3811,4.0,974295234
9999949,71565,3812,4.0,974294786


In [12]:
# Take the rating bounds from the data instead of hard-coding (1, 5).
# Surprise clips every prediction to this interval, so a scale narrower than the
# ratings actually present would make some true ratings impossible to predict
# and bias the reported RMSE. When the data really spans 1..5 this is identical
# to the previous hard-coded value.
rating_min = float(ds_df_filter['Rating'].min())
rating_max = float(ds_df_filter['Rating'].max())
reader = Reader(rating_scale=(rating_min, rating_max))

In [15]:
# Select the user / item / rating columns, in that order, and wrap them in a
# Surprise Dataset using the reader configured earlier.
rating_triples = ds_df_filter[['UserID', 'MovieID', 'Rating']]
data = Dataset.load_from_df(df=rating_triples, reader=reader)

## SVD

In [18]:
# Hold out 1% of the ratings for evaluation. The split is seeded with
# RANDOM_STATE so every model below is trained and scored on the same partition
# and the RMSE figures can be reproduced after a kernel restart.
trainset, testset = train_test_split(data, test_size=0.01, random_state=RANDOM_STATE)

In [20]:
# SVD starts from randomly initialised factors; seeding it makes the fitted
# model (and therefore the reported RMSE) reproducible across runs.
algo = SVD(random_state=RANDOM_STATE)
# fit() returns the estimator, so the cell still echoes the fitted object.
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f8c309bec88>

In [21]:
# Predict every held-out rating with the model currently bound to `algo`.
predictions = algo.test(testset)
# Prints the RMSE line and, as the last expression, also echoes the value.
accuracy.rmse(predictions=predictions, verbose=True)

RMSE: 0.8111


0.8111153385081812

## NMF

In [26]:
# NMF also starts from random factors; seed it so the result is reproducible.
algo = NMF(random_state=RANDOM_STATE)
# The original call was truncated mid-identifier (a syntax error); it must fit
# on the same training set used by the other models.
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x7f8c307f5b70>

In [27]:
# Re-score the held-out ratings; `algo` must have been refit in the cell just above.
predictions = algo.test(testset)
# Report RMSE (printed because verbose=True, and echoed as the cell result).
accuracy.rmse(predictions=predictions, verbose=True)

RMSE: 0.8652


0.8651760581957743

## KNNBasic

In [29]:
# Neighbourhood baseline using KNNBasic with its library-default settings.
# Per the recorded output, fitting computes an "msd" similarity matrix first.
algo = KNNBasic()
# Left as the final expression so the fitted estimator is echoed by the cell.
algo.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7f8c30834be0>

In [30]:
# Generate predictions for the held-out ratings with the model bound to `algo`.
predictions = algo.test(testset)
# RMSE over those predictions; printed and returned as the cell's output.
accuracy.rmse(predictions=predictions, verbose=True)

RMSE: 0.8746


0.8745911359741265