In [50]:
from surprise import KNNWithMeans, KNNBasic, KNNBaseline, SVD
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split

import pandas as pd

In [2]:
# Read the movie table and the ratings table from CSV files in the working directory.
movies, ratings = (
    pd.read_csv(csv_name) for csv_name in ('movies1M.csv', 'ratings1M.csv')
)

In [3]:
# Peek at the first five rating rows to check the column layout.
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [4]:
# Attach every rating to its movie row (left join on movieId), drop rows that
# contain any missing value (e.g. movies that received no rating), and only
# then renumber the index so it stays contiguous after the drop.
# Built as a single chain without inplace=True so the cell is idempotent.
# Note: the NaNs introduced by the left join turn userId into a float column.
movies_with_ratings = (
    movies
    .join(ratings.set_index('movieId'), on='movieId')
    .dropna()
    .reset_index(drop=True)
)

In [5]:
# Keep only the three columns needed for the recommender, in the order
# user / item / rating, and rename them to uid / iid / rating.
# The movie title (not movieId) is used as the item identifier.
dataset = (
    movies_with_ratings[['userId', 'title', 'rating']]
    .rename(columns={'userId': 'uid', 'title': 'iid'})
)

In [6]:
# Show the first five rows of the (uid, iid, rating) frame.
dataset.iloc[:5]

Unnamed: 0,uid,iid,rating
0,7.0,Toy Story (1995),3.0
1,9.0,Toy Story (1995),4.0
2,13.0,Toy Story (1995),5.0
3,15.0,Toy Story (1995),2.0
4,19.0,Toy Story (1995),3.0


In [7]:
# Lowest rating value present; used below as the lower bound of the rating scale.
ratings['rating'].min()

0.5

In [8]:
# Highest rating value present; used below as the upper bound of the rating scale.
ratings['rating'].max()

5.0

In [9]:
# Rating scale matches the min/max observed in the ratings table (0.5 .. 5.0).
reader = Reader(rating_scale=(0.5, 5.0))
# The columns are passed explicitly in user, item, rating order.
data = Dataset.load_from_df(
    df=dataset[['uid', 'iid', 'rating']],
    reader=reader,
)

In [133]:
# Hold out 20% of the ratings for evaluation; the fixed seed keeps the split reproducible.
trainset, testset = train_test_split(
    data,
    test_size=0.2,
    random_state=20,
)

In [15]:
# SVD initialises its latent factors randomly, so pin the seed (same value as
# the train/test split) to make the fit and the reported RMSE reproducible
# under Restart & Run All.
algo = SVD(random_state=20)
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x11fc98550>

In [16]:
# Run the fitted SVD model over the held-out test set to get its predictions.
test_pred = algo.test(testset)

In [17]:
# RMSE of the SVD predictions; verbose=True prints it and the value is also returned.
accuracy.rmse(predictions=test_pred, verbose=True)

RMSE: 0.8948


0.8947884535969212

In [19]:
# Estimate the rating user 6 would give to 'Fight Club (1999)' with the fitted model.
# No true rating is supplied, so the returned Prediction carries r_ui=None.
algo.predict(uid=6, iid='Fight Club (1999)')

Prediction(uid=6, iid='Fight Club (1999)', r_ui=None, est=3.7053819322406434, details={'was_impossible': False})

In [42]:
# Plain k-NN model: up to 20 neighbours, at least 5 required (min_k).
# fit() hands back the fitted estimator, so it is bound directly and then
# echoed as the cell's output.
algo2 = KNNBasic(k=20, min_k=5).fit(trainset)
algo2

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x1218e5588>

In [43]:
# Predictions of the KNNBasic model on the held-out test set.
test_pred2 = algo2.test(testset)

In [44]:
# RMSE of the KNNBasic predictions; printed and returned.
accuracy.rmse(predictions=test_pred2, verbose=True)

RMSE: 0.9705


0.9705117793901107

In [45]:
# User-based k-NN with mean-centred ratings: 'user_based': True makes the
# pearson_baseline similarity compare users with each other (False would
# compare items instead).
sim_options = dict(name='pearson_baseline', user_based=True)
algo3 = KNNWithMeans(k=20, min_k=5, sim_options=sim_options)
algo3.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x1218e5c50>

In [46]:
# Predictions of the user-based KNNWithMeans model on the held-out test set.
test_pred3 = algo3.test(testset)

In [47]:
# RMSE of the KNNWithMeans predictions; printed and returned.
accuracy.rmse(predictions=test_pred3, verbose=True)

RMSE: 0.9246


0.9245705438082592

In [51]:
# Item-based k-NN on top of baseline estimates: 'user_based': False makes the
# pearson_baseline similarity compare items. k / min_k are left at the
# library defaults here.
sim_options = dict(name='pearson_baseline', user_based=False)
algo4 = KNNBaseline(sim_options=sim_options)
algo4.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x1288f5400>

In [52]:
# Predictions of the item-based KNNBaseline model on the held-out test set.
test_pred4 = algo4.test(testset)

In [59]:
# RMSE of the item-based KNNBaseline predictions; printed and returned.
accuracy.rmse(predictions=test_pred4, verbose=True)

RMSE: 0.8865


0.8865450496386085

In [191]:
# Item-based KNNBaseline again, now with an explicit shrinkage of 20 for the
# pearson_baseline similarity and a wider neighbourhood (k=150, min_k=11).
# NOTE(review): these values look hand-tuned; how they were chosen is not
# recorded in the notebook.
sim_options = {
    'name': 'pearson_baseline',
    'user_based': False,
    'shrinkage': 20,
}
algo5 = KNNBaseline(k=150, min_k=11, sim_options=sim_options)
algo5.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x129975fd0>

In [192]:
# Predictions of the tuned item-based KNNBaseline model on the held-out test set.
test_pred5 = algo5.test(testset)

In [193]:
# RMSE of the tuned KNNBaseline predictions; printed and returned.
accuracy.rmse(predictions=test_pred5, verbose=True)

RMSE: 0.8762


0.876242836615142