In [2]:
import pandas as pd
import numpy as np 
import pickle

In [3]:
df = pd.read_csv('cleaned_movie_ratings.csv')
df.drop('Unnamed: 0', axis=1, inplace=True)
df.head()

Unnamed: 0,userId,movieId,rating,title,genres,year
0,1,1,4.0,Toy Story (1995),Adventure Animation Children Comedy Fantasy,1995
1,1,3,4.0,Grumpier Old Men (1995),Comedy Romance,1995
2,1,6,4.0,Heat (1995),Action Crime Thriller,1995
3,1,47,5.0,Seven (a.k.a. Se7en) (1995),Mystery Thriller,1995
4,1,50,5.0,"Usual Suspects, The (1995)",Crime Mystery Thriller,1995


In [4]:
data = df[['userId', 'movieId', 'rating']]
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100775 entries, 0 to 100774
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   userId   100775 non-null  int64  
 1   movieId  100775 non-null  int64  
 2   rating   100775 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 2.3 MB


In [5]:
from surprise.prediction_algorithms import KNNBasic
from surprise import Dataset, Reader
from surprise import accuracy
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV
from collections import defaultdict

In [6]:
reader = Reader(line_format = 'user item rating', sep=',')
dataset = Dataset.load_from_df(data, reader)
dataset

<surprise.dataset.DatasetAutoFolds at 0x119d2d3d0>

In [7]:
trainset, testset = train_test_split(dataset, test_size=.20)

In [8]:
knn = KNNBasic()
knn.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x11be2e5d0>

In [9]:
prediction = knn.test(testset)

In [10]:
accuracy.rmse(prediction)
accuracy.mae(prediction)

RMSE: 0.9434
MAE:  0.7223


0.7222640373327587

#### KNNBasic With Model Tuning LETSSS GOOOO

In [11]:
params = {'min_k':[1,2,5],
'name':['cosine','pearson','pearson_baseline'],
'user_based':[True,False],
'min_support':[True, False]}

In [12]:
knnbasic_tuned_grid = GridSearchCV(KNNBasic, param_grid= params, refit=True,cv=3)

In [13]:
knnbasic_tuned_grid.fit(dataset)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

In [14]:
knnbasic_tuned_grid.best_score

{'rmse': 0.9504093090129103, 'mae': 0.7304022186765268}

In [15]:
knnbasic_tuned_grid.best_params

{'rmse': {'min_k': 2,
  'name': 'cosine',
  'user_based': True,
  'min_support': True},
 'mae': {'min_k': 2,
  'name': 'cosine',
  'user_based': True,
  'min_support': True}}

In [16]:
knn_tuned = KNNBasic(sim_options={'min_k': 2,
  'name': 'cosine',
  'user_based': True,
  'min_support': True})

In [17]:
knn_tuned.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x11e84d450>

In [18]:
knn_tuned.test(testset)

[Prediction(uid=80, iid=6539, r_ui=4.0, est=4.012430079783989, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=50, iid=78499, r_ui=2.0, est=4.244943138597086, details={'actual_k': 39, 'was_impossible': False}),
 Prediction(uid=177, iid=62956, r_ui=3.0, est=3.3356084333268314, details={'actual_k': 6, 'was_impossible': False}),
 Prediction(uid=122, iid=112852, r_ui=4.0, est=4.100316738966452, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=111, iid=3175, r_ui=3.5, est=3.4756782685457384, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=194, iid=784, r_ui=3.0, est=2.5787893566866296, details={'actual_k': 30, 'was_impossible': False}),
 Prediction(uid=608, iid=4343, r_ui=4.0, est=3.029078724937814, details={'actual_k': 22, 'was_impossible': False}),
 Prediction(uid=597, iid=2988, r_ui=4.0, est=4.0, details={'actual_k': 1, 'was_impossible': False}),
 Prediction(uid=600, iid=1302, r_ui=2.0, est=3.522142749750991, details={'actual_k':

In [19]:
tuned_predictions = knn_tuned.test(testset)

In [20]:
accuracy.rmse(tuned_predictions)
accuracy.mae(tuned_predictions)

RMSE: 0.9680
MAE:  0.7457


0.7456868276936653

#### Tuned KNNBasic Results:
RMSE: 0.9727


MAE:  0.7507

In [22]:
filename='KNNBasic_tuned'
pickle.dump(knnbasic_tuned_grid, open(filename,'wb'))