## Получаем данные

In [2]:
import pandas as pd

In [24]:
import numpy as np
from tqdm import tqdm_notebook
from surprise import Dataset, Reader, KNNBasic, KNNBaseline
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise.model_selection import KFold

In [4]:
df_ratings = pd.read_csv('./ml-1m/ratings.dat', sep='::', header=None)
df_movies = pd.read_csv('./ml-1m/movies.dat', sep='::', header=None)

  """Entry point for launching an IPython kernel.
  


In [5]:
df_ratings.columns=['userId','movieId','rating','timestamp']
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [6]:
df_movies.columns=['movieId','title','genres']
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
df = pd.merge(df_ratings, df_movies, on='movieId')

In [8]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,978199279,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,978158471,One Flew Over the Cuckoo's Nest (1975),Drama


In [9]:
df_for_surpise = df_ratings[['userId', 'movieId', 'rating']]

In [10]:
df_for_surpise.columns = ['uid', 'iid', 'rating']

In [11]:
df_for_surpise.head()

Unnamed: 0,uid,iid,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


In [12]:
reader = Reader(rating_scale=(0.5, 5))

In [13]:
dataset = Dataset.load_from_df(df_for_surpise, reader)

In [14]:
trainset, testset = train_test_split(dataset, test_size=0.2)

In [25]:
algo = KNNBaseline(k=40, min_k=5, sim_options={'name': 'pearson_baseline', 'user_based': False})

In [26]:
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x16d494a0828>

In [27]:
predictions = algo.test(testset)

In [28]:
accuracy.rmse(predictions)

RMSE: 0.8584


0.8584285766651266

In [29]:
kfold = KFold(5)

In [30]:
scores = []
for trainset, testset in tqdm_notebook(kfold.split(dataset)):
    algo = KNNBaseline(k=40, min_k=5, sim_options={'name': 'pearson_baseline', 'user_based': False})
    algo.fit(trainset)
    predictions = algo.test(testset)
    scores.append(accuracy.rmse(predictions))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8560
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8583
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8590
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8566
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8600



In [31]:
scores

[0.856029282919974,
 0.8583298514534555,
 0.8590013577632791,
 0.8565746461471356,
 0.8600308213557795]