In [None]:
import pandas as pd
from surprise import Dataset, accuracy, Reader
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV
from surprise import KNNWithMeans, KNNBasic, KNNWithZScore, KNNBaseline

In [None]:
# Read both source tables (paths are relative to the working directory).
movies, ratings = (pd.read_csv(csv_name) for csv_name in ('movies.csv', 'ratings.csv'))

In [None]:
# Attach every rating to its movie's metadata; the inner join keeps only
# movieIds that are present in both tables.
# Rows with missing values are dropped *before* the index is rebuilt, so the
# final index is a gap-free 0..n-1 range (resetting first, as before, left
# holes whenever dropna removed a row). Chaining also avoids in-place mutation.
mr = (
    movies
    .merge(ratings, how='inner', on='movieId')
    .dropna()
    .reset_index(drop=True)
)
mr.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483


In [None]:
# Smallest and largest rating value present in the merged table.
mr['rating'].min(), mr['rating'].max()

(0.5, 5.0)

In [None]:
# The rating scale mirrors the observed min/max of mr.rating (0.5 and 5.0).
reader = Reader(rating_scale=(0.5, 5))

In [None]:
# Keep only the three columns used to build the rating dataset, exposing
# userId under the name 'uid'.
# NOTE(review): 'title' acts as the item identifier here; if two different
# movieIds share a title their ratings get merged — confirm this is intended.
df = mr[['userId', 'title', 'rating']].rename(columns={'userId': 'uid'})

In [None]:
# Build the Surprise dataset, passing the columns explicitly in
# (user, item, rating) order together with the reader's rating scale.
data = Dataset.load_from_df(df[['uid', 'title', 'rating']], reader)

In [None]:
# Two k-NN configurations that differ only in sim_options['user_based'].
params = {
    'k': 50,
    'min_k': 5,
    'sim_options': {'name': 'cosine', 'user_based': True},
}

# Same settings as `params`, with user_based switched to False.
params_2 = {
    **params,
    'sim_options': {**params['sim_options'], 'user_based': False},
}

# Algorithm classes (not instances) that are evaluated with each configuration.
algo = [
    KNNWithMeans,
    KNNBasic,
    KNNWithZScore,
    KNNBaseline,
]

## Baseline: evaluation on a single train/test split

In [None]:
def tt_algo(algo, params, test_size=0.2, random_state=None):
  """Fit each algorithm on one shared train/test split and report its RMSE.

  The split of the notebook-level ``data`` is made once, before the loop, so
  every algorithm is trained and scored on exactly the same partition and the
  resulting RMSE values are directly comparable.

  Args:
    algo: iterable of algorithm classes; each is instantiated as
      ``cls(**params)``.
    params: keyword arguments passed to every algorithm constructor.
    test_size: forwarded to ``train_test_split`` (default 0.2, as before).
    random_state: forwarded to ``train_test_split``; pass an int to make the
      split reproducible across runs. ``None`` keeps the split random.

  Returns:
    dict mapping each algorithm class to its RMSE on the test part.
  """
  # One split for all algorithms: splitting inside the loop would score each
  # algorithm on a different random partition.
  train, test = train_test_split(data, test_size=test_size, random_state=random_state)
  dict_ = {}
  for al in algo:
    a = al(**params)
    a.fit(train)
    # Predict exactly once; the predictions are only needed for the RMSE.
    predictions = a.test(test)
    dict_[al] = accuracy.rmse(predictions, verbose=True)
  return dict_

In [None]:
# Score every algorithm using `params` (sim_options['user_based'] is True).
print(tt_algo(algo, params))

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9012
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9777
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.8927
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.8648
{<class 'surprise.prediction_algorithms.knns.KNNWithMeans'>: 0.9011700930061269, <class 'surprise.prediction_algorithms.knns.KNNBasic'>: 0.9776822748178199, <class 'surprise.prediction_algorithms.knns.KNNWithZScore'>: 0.8926903447836868, <class 'surprise.prediction_algorithms.knns.KNNBaseline'>: 0.8648253351695603}


In [None]:
# Score every algorithm using `params_2` (sim_options['user_based'] is False).
print(tt_algo(algo, params_2))

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.8976
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9760
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9138
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.8882
{<class 'surprise.prediction_algorithms.knns.KNNWithMeans'>: 0.8975770023920917, <class 'surprise.prediction_algorithms.knns.KNNBasic'>: 0.9760449334772625, <class 'surprise.prediction_algorithms.knns.KNNWithZScore'>: 0.913788298222136, <class 'surprise.prediction_algorithms.knns.KNNBaseline'>: 0.8882051000416672}


## Cross-validation of the algorithms

In [None]:
def knn_baseline(data, algo: list, params: dict):
  """Print 5-fold cross-validation RMSE results for each algorithm class.

  For every class in ``algo`` this prints the class itself, then the dict
  returned by ``cross_validate`` for an instance built with ``**params``,
  then a separator line. Nothing is returned.

  Args:
    data: dataset handed to ``cross_validate``.
    algo: list of algorithm classes to evaluate.
    params: keyword arguments passed to each algorithm constructor.
  """
  separator = '----' * 20
  for algo_cls in algo:
    print(algo_cls)
    model = algo_cls(**params)
    cv_results = cross_validate(model, data, measures=["RMSE"], cv=5, verbose=False)
    print(cv_results)
    print(separator)

In [None]:
# Cross-validate every algorithm using `params` (sim_options['user_based'] is True).
knn_baseline(data, algo, params)

<class 'surprise.prediction_algorithms.knns.KNNWithMeans'>
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
{'test_rmse': array([0.89131824, 0.89414596, 0.89751729, 0.89234154, 0.90581838]), 'fit_time': (0.1948685646057129, 0.20676779747009277, 0.330747127532959, 0.20074892044067383, 0.2164161205291748), 'test_time': (2.1845476627349854, 2.681842088699341, 2.068533420562744, 2.1635704040527344, 1.978421688079834)}
--------------------------------------------------------------------------------
<class 'surprise.prediction_algorithms.knns.KNNBasic'>
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix.

## KNNBaseline алгоритм показал нужный результат, но все равно ошибка очень большая

In [None]:
# Cross-validate every algorithm using `params_2` (sim_options['user_based'] is False).
knn_baseline(data, algo, params_2)

<class 'surprise.prediction_algorithms.knns.KNNWithMeans'>
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
{'test_rmse': array([0.90318356, 0.90962903, 0.89294488, 0.90474772, 0.89731849]), 'fit_time': (7.909477949142456, 9.631837368011475, 7.519832372665405, 6.874406814575195, 6.862316370010376), 'test_time': (17.283974409103394, 14.561537504196167, 13.278869390487671, 14.538656949996948, 13.201172113418579)}
--------------------------------------------------------------------------------
<class 'surprise.prediction_algorithms.knns.KNNBasic'>
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
