## Задание по теме «Коллаборативная фильтрация»

ПАКЕТ SURPRISE

- используйте данные MovieLens 1M
- можно использовать любые модели из пакета
- получите RMSE на тестовом сете 0.87 и ниже

Комментарий преподавателя :
- В ДЗ на датасет 1М может не хватить RAM. Можно сделать на 100K. 
- Качество RMSE предлагаю считать на основе CrossValidation (5 фолдов), а не отложенном датасете.

In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm_notebook
from surprise import KNNWithMeans, KNNBasic, KNNWithZScore, SVD
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV

In [2]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

In [3]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
movies_with_ratings = movies.join(ratings.set_index('movieId'), on='movieId').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)

In [5]:
dataset = pd.DataFrame({
    'uid': movies_with_ratings.userId,
    'iid': movies_with_ratings.title,
    'rating': movies_with_ratings.rating
})
dataset.head()

Unnamed: 0,uid,iid,rating
0,1.0,Toy Story (1995),4.0
1,5.0,Toy Story (1995),4.0
2,7.0,Toy Story (1995),4.5
3,15.0,Toy Story (1995),2.5
4,17.0,Toy Story (1995),4.5


In [6]:
ratings.rating.min()

0.5

In [7]:
ratings.rating.max()

5.0

In [8]:
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)

In [9]:
algo_knn_wm  = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': True})
algo_knn_basic = KNNBasic(k=50, sim_options={'name': 'pearson_baseline', 'user_based': True})
algo_knn_z = KNNWithZScore(k=50, sim_options={'name': 'pearson_baseline', 'user_based': True})
algo_svd = SVD()


model = [['KNNWithMeans', algo_knn_wm], ['KNNBasic', algo_knn_basic], ['KNNWithZScore', algo_knn_z], ['SVD', algo_svd]]
for i in model:
    cr = cross_validate(i[1], data, measures=['RMSE'], cv=5, verbose=False, n_jobs=-1)
    print (i[0])
    print('RMSE =', cr['test_rmse'].min())

KNNWithMeans
RMSE = 0.8915291746300547
KNNBasic
RMSE = 0.9665672286843022
KNNWithZScore
RMSE = 0.8928079033076487
SVD
RMSE = 0.8677883766309333
