In [49]:
import surprise
import pandas as pd

In [50]:
from zipfile import ZipFile

with ZipFile('ml-1m.zip', 'r') as f:

    f.extractall()

In [51]:
movies = pd.read_csv('movies.dat', sep='::', engine='python', header=None, encoding = "ISO-8859-1")
ratings = pd.read_csv('ratings.dat', sep='::', engine='python', header=None, encoding = "ISO-8859-1")
movies.columns = ['MovieID', 'Title', 'Genres']
ratings.columns = ['UserID', 'MovieID', 'Rating', 'Timestamp']

In [52]:
movies_with_ratings = movies.merge(ratings, on='MovieID').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)
movies_with_ratings = movies_with_ratings[['UserID','Title', 'Rating']]

In [59]:
dataset = pd.DataFrame({
    'uid': movies_with_ratings.UserID,
    'iid': movies_with_ratings.Title,
    'rating': movies_with_ratings.Rating
})

In [62]:
dataset['rating'].min(), dataset['rating'].max()

(1, 5)

In [65]:
dataset['uid'].nunique(), dataset['iid'].nunique()

(6040, 3706)

In [69]:
from surprise import KNNWithMeans, KNNBasic, SVD
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split, cross_validate


reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(dataset, reader)

# SVD cross-validation

In [70]:
trainset, testset = train_test_split(data, test_size=0.2, random_state=1)

In [73]:
algo1 = SVD()

cross_validate(algo1, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8722  0.8749  0.8716  0.8761  0.8731  0.8736  0.0017  
MAE (testset)     0.6850  0.6868  0.6838  0.6873  0.6854  0.6857  0.0013  
Fit time          10.09   10.04   12.80   12.32   11.91   11.43   1.15    
Test time         1.61    1.26    1.32    1.55    1.27    1.40    0.15    


{'test_rmse': array([0.8721896 , 0.87488756, 0.87155727, 0.87611247, 0.87312577]),
 'test_mae': array([0.68500719, 0.68684485, 0.68381713, 0.68725204, 0.68536284]),
 'fit_time': (10.091252088546753,
  10.039140462875366,
  12.797046661376953,
  12.317123889923096,
  11.911550998687744),
 'test_time': (1.6121478080749512,
  1.2629401683807373,
  1.3204755783081055,
  1.5548615455627441,
  1.2714860439300537)}

Выберем лучший алгоритм для последующего его улучшения.

# KNN with means cross-validation

In [74]:
algo2 = KNNWithMeans()

cross_validate(algo2, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9289  0.9298  0.9298  0.9273  0.9302  0.9292  0.0010  
MAE (testset)     0.7384  0.7390  0.7383  0.7373  0.7397  0.7385  0.0008  
Fit time          20.02   20.67   22.09   22.79   19.58   21.03   1.22    
Test time         74.23   72.26   75.86   71.80   79.36   74.70   2.74    


{'test_rmse': array([0.92894292, 0.92976239, 0.9298408 , 0.92727388, 0.93020337]),
 'test_mae': array([0.73843739, 0.73897881, 0.73828577, 0.73728757, 0.73970489]),
 'fit_time': (20.021695852279663,
  20.671433210372925,
  22.08785390853882,
  22.786900997161865,
  19.581608295440674),
 'test_time': (74.22834134101868,
  72.26439070701599,
  75.85965371131897,
  71.80454349517822,
  79.35630106925964)}

# KNN Basic cross-validation

In [75]:
algo3 = KNNBasic()
cross_validate(algo3, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9227  0.9210  0.9251  0.9216  0.9240  0.9229  0.0015  
MAE (testset)     0.7272  0.7264  0.7292  0.7268  0.7274  0.7274  0.0010  
Fit time          20.73   20.88   21.30   19.64   20.74   20.66   0.55    
Test time         69.61   71.64   72.21   70.79   74.64   71.78   1.68    


{'test_rmse': array([0.92269208, 0.92098741, 0.92508573, 0.92161726, 0.92397141]),
 'test_mae': array([0.72715545, 0.72638205, 0.72921525, 0.72684481, 0.72739715]),
 'fit_time': (20.731898546218872,
  20.880713939666748,
  21.29983139038086,
  19.635774612426758,
  20.740044832229614),
 'test_time': (69.60802626609802,
  71.6352813243866,
  72.2135910987854,
  70.78700590133667,
  74.63641166687012)}

# SVD++ cross-validation

In [76]:
from surprise import SVDpp
algo4 = SVDpp()
cross_validate(algo4, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8622  0.8595  0.8603  0.8626  0.8613  0.8612  0.0012  
MAE (testset)     0.6714  0.6701  0.6701  0.6730  0.6716  0.6712  0.0011  
Fit time          263.96  268.71  274.46  278.47  278.87  272.89  5.77    
Test time         54.37   71.53   59.95   57.99   61.65   61.10   5.75    


{'test_rmse': array([0.86222663, 0.85946105, 0.86027642, 0.86258737, 0.86126311]),
 'test_mae': array([0.67139707, 0.6701001 , 0.67012556, 0.67301672, 0.67156811]),
 'fit_time': (263.95687079429626,
  268.71485900878906,
  274.46264386177063,
  278.46577525138855,
  278.8734073638916),
 'test_time': (54.36972737312317,
  71.5268144607544,
  59.952635765075684,
  57.9886519908905,
  61.65486717224121)}

SVD++ дала требуемый результат. 