In [1]:
import numpy as np
import pandas as pd
from surprise import SVD
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline
from surprise import Dataset, Reader
from surprise import accuracy
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV
import pickle

In [2]:
df = pd.read_csv('cleaned_movie_ratings.csv')
df.drop('Unnamed: 0', axis=1, inplace=True)
df.head()

Unnamed: 0,userId,movieId,rating,title,genres,year
0,1,1,4.0,Toy Story (1995),Adventure Animation Children Comedy Fantasy,1995
1,1,3,4.0,Grumpier Old Men (1995),Comedy Romance,1995
2,1,6,4.0,Heat (1995),Action Crime Thriller,1995
3,1,47,5.0,Seven (a.k.a. Se7en) (1995),Mystery Thriller,1995
4,1,50,5.0,"Usual Suspects, The (1995)",Crime Mystery Thriller,1995


In [3]:
modeling_df = df[['userId','movieId','rating']]
modeling_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [4]:
reader = Reader(line_format='user item rating', sep=',')
data = Dataset.load_from_df(modeling_df, reader)
data.raw_ratings


[(1, 1, 4.0, None),
 (1, 3, 4.0, None),
 (1, 6, 4.0, None),
 (1, 47, 5.0, None),
 (1, 50, 5.0, None),
 (1, 70, 3.0, None),
 (1, 101, 5.0, None),
 (1, 110, 4.0, None),
 (1, 151, 5.0, None),
 (1, 157, 5.0, None),
 (1, 163, 5.0, None),
 (1, 216, 5.0, None),
 (1, 223, 3.0, None),
 (1, 231, 5.0, None),
 (1, 235, 4.0, None),
 (1, 260, 5.0, None),
 (1, 296, 3.0, None),
 (1, 316, 3.0, None),
 (1, 333, 5.0, None),
 (1, 349, 4.0, None),
 (1, 356, 4.0, None),
 (1, 362, 5.0, None),
 (1, 367, 4.0, None),
 (1, 423, 3.0, None),
 (1, 441, 4.0, None),
 (1, 457, 5.0, None),
 (1, 480, 4.0, None),
 (1, 500, 3.0, None),
 (1, 527, 5.0, None),
 (1, 543, 4.0, None),
 (1, 552, 4.0, None),
 (1, 553, 5.0, None),
 (1, 590, 4.0, None),
 (1, 592, 4.0, None),
 (1, 593, 4.0, None),
 (1, 596, 5.0, None),
 (1, 608, 5.0, None),
 (1, 648, 3.0, None),
 (1, 661, 5.0, None),
 (1, 673, 3.0, None),
 (1, 733, 4.0, None),
 (1, 736, 3.0, None),
 (1, 780, 3.0, None),
 (1, 804, 4.0, None),
 (1, 919, 5.0, None),
 (1, 923, 5.0, None

In [5]:
trainset, testset = train_test_split(data, test_size=.20)

In [6]:
knn = KNNBaseline()


In [7]:
knn.fit(trainset)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x11d479b90>

In [8]:
predictions = knn.test(testset)

In [9]:
accuracy.rmse(predictions)
accuracy.mae(predictions)

RMSE: 0.8760
MAE:  0.6705


0.6705206880833393

In [10]:
knn_params = {'name':["cosine","pearson_baseline"],
'user_based': [True,False],
'min_support':[True, False]
}

In [11]:
knn_grid = GridSearchCV(KNNBaseline,
 param_grid = knn_params,
measures=['rmse', 'mae'],refit=True,
 cv=3)

knn_grid.fit(data)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matr

In [12]:
knn_grid.best_score

{'rmse': 0.8822860174855808, 'mae': 0.6747914712931729}

In [13]:
knn_grid.best_params

{'rmse': {'name': 'cosine', 'user_based': True, 'min_support': True},
 'mae': {'name': 'cosine', 'user_based': True, 'min_support': True}}

In [14]:
prediction = knn_grid.test(testset)

In [15]:
accuracy.rmse(predictions)
accuracy.mae(prediction)

RMSE: 0.8760
MAE:  0.5144


0.5143839464695258

In [16]:

knn_best_params = {'name':['cosine'],
'user_based': [True],
'min_support': [True]}

In [17]:
knn_tuned_grid = GridSearchCV(KNNBaseline, param_grid= knn_best_params, refit=True,cv=3)
knn_tuned_grid.fit(data)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.


In [18]:
prediction_tuned = knn_tuned_grid.test(testset)

In [19]:
knn_tuned_grid.best_params

{'rmse': {'name': 'cosine', 'user_based': True, 'min_support': True},
 'mae': {'name': 'cosine', 'user_based': True, 'min_support': True}}

In [20]:
tuned_knnb= KNNBaseline(sim_options={'name':'cosine','user_based':True, "min_support":True})
tuned_knnb_cv=cross_validate(tuned_knnb,data,measures=['RMSE', 'MAE'],cv=4)

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.


In [21]:
accuracy.rmse(prediction_tuned)
accuracy.mse(prediction_tuned)

RMSE: 0.6884
MSE: 0.4739


0.4739408858308277

In [22]:
for i in tuned_knnb_cv.items():
    print(i)

('test_rmse', array([0.87764427, 0.88419556, 0.88010416, 0.88052011]))
('test_mae', array([0.6724537 , 0.67606291, 0.67388142, 0.67453801]))
('fit_time', (0.7019989490509033, 0.5337748527526855, 0.6237561702728271, 0.7576398849487305))
('test_time', (2.733794927597046, 3.010209083557129, 2.760957956314087, 3.33502197265625))


In [23]:
np.mean(tuned_knnb_cv['test_rmse'])

0.8806160246261865

In [24]:
def get_top_n(predictions, n=5):

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, r_ui, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [25]:
prediction_set = trainset.build_anti_testset() # Combinations from the train set that the model hasnt made predictions on

In [26]:
pred_anti_test = knn_tuned_grid.test(prediction_set)

In [27]:
from collections import defaultdict
top_n = get_top_n(pred_anti_test, n=5)

In [28]:
top10recs =pd.DataFrame(top_n[10], 
             columns = [
                 'movieId','estimated_rating'])

In [35]:
df.loc[df['userId']==10].head()

Unnamed: 0,userId,movieId,rating,title,genres,year
1119,10,296,1.0,Pulp Fiction (1994),Comedy Crime Drama Thriller,1994
1120,10,356,3.5,Forrest Gump (1994),Comedy Drama Romance War,1994
1121,10,588,4.0,Aladdin (1992),Adventure Animation Children Comedy Musical,1992
1122,10,597,3.5,Pretty Woman (1990),Comedy Romance,1990
1123,10,912,4.0,Casablanca (1942),Drama Romance,1942


In [29]:
top10recs

Unnamed: 0,movieId,estimated_rating
0,3567,5
1,187717,5
2,100556,5
3,6818,5
4,3379,5


In [30]:
df_2 = pd.read_csv('movies.csv')
df_2

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [31]:
pd.merge(top10recs,df_2, on='movieId', how='inner')

Unnamed: 0,movieId,estimated_rating,title,genres
0,3567,5,Bossa Nova (2000),Comedy|Drama|Romance
1,187717,5,Won't You Be My Neighbor? (2018),Documentary
2,100556,5,"Act of Killing, The (2012)",Documentary
3,6818,5,Come and See (Idi i smotri) (1985),Drama|War
4,3379,5,On the Beach (1959),Drama


#### Tuned KNNBaseline Results

RMSE: 0.6802


MSE: 0.4626

In [32]:
filename='KNNBaseline_tuned'
pickle.dump(knn_tuned_grid, open(filename, 'wb'))