In [1]:
from surprise import KNNWithMeans, KNNBasic
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split

import pandas as pd

In [2]:
movies = pd.read_csv('data/movies.csv')
ratings = pd.read_csv('data/ratings.csv')

In [3]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
movies_with_ratings = movies.join(ratings.set_index('movieId'), on='movieId')
movies_with_ratings.dropna(inplace=True)

In [5]:
movies_with_ratings.head(10)

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,964982700.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,847435000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7.0,4.5,1106636000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15.0,2.5,1510578000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17.0,4.5,1305696000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,18.0,3.5,1455210000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,19.0,4.0,965705600.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,21.0,3.5,1407619000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,27.0,3.0,962685300.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,31.0,5.0,850466600.0


In [6]:
movies_with_ratings[movies_with_ratings.userId == 2.0].movieId.unique()

array([   318,    333,   1704,   3578,   6874,   8798,  46970,  48516,
        58559,  60756,  68157,  71535,  74458,  77455,  79132,  80489,
        80906,  86345,  89774,  91529,  91658,  99114, 106782, 109487,
       112552, 114060, 115713, 122882, 131724])

In [7]:
dataset = pd.DataFrame({
    'uid': movies_with_ratings.userId,
    'iid': movies_with_ratings.movieId,
    'rating': movies_with_ratings.rating
})

In [8]:
dataset.head(20)

Unnamed: 0,uid,iid,rating
0,1.0,1,4.0
0,5.0,1,4.0
0,7.0,1,4.5
0,15.0,1,2.5
0,17.0,1,4.5
0,18.0,1,3.5
0,19.0,1,4.0
0,21.0,1,3.5
0,27.0,1,3.0
0,31.0,1,5.0


In [9]:
ratings.rating.min()

0.5

In [10]:
ratings.rating.max()

5.0

In [11]:
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)

In [12]:
trainset, testset = train_test_split(data, test_size=.15)

In [13]:
algo = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': True})
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7f135ed29d30>

In [14]:
test_pred = algo.test(testset)

In [15]:
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.8942


0.894194861869477

In [16]:
algo.predict(uid=2, iid=1)

Prediction(uid=2, iid=1, r_ui=None, est=4.0804368953471375, details={'actual_k': 34, 'was_impossible': False})

In [17]:
algo = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': False})
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7f135c6fa220>

In [18]:
test_pred = algo.test(testset)

In [19]:
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.8820


0.8820067292632042

In [20]:
algo.predict(uid=2, iid=6)

Prediction(uid=2, iid=6, r_ui=None, est=3.8448304915304283, details={'actual_k': 14, 'was_impossible': False})