In [1]:
import pandas as pd
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import train_test_split, cross_validate
from surprise import SVD, accuracy
from surprise import KNNWithMeans
import random

In [51]:
# Load the user/anime ratings table from the case-study folder.
ratings_csv = 'CaseStudy1/rating.csv'
df = pd.read_csv(ratings_csv)

In [52]:
# Peek at the first five rows to check the columns and value formats.
df.head(n=5)

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [53]:
# (rows, columns) of the freshly loaded ratings frame.
df.shape

(7813737, 3)

In [54]:
# Keep only rows carrying a usable score: -1 lies outside the 1-10 scale the
# Reader is configured with, so those rows cannot be fed to Surprise as ratings.
is_rated = df['rating'].ne(-1)
df = df[is_rated]

In [6]:
# Work on the first 3.5 million rows only (positional slice, every column kept).
# NOTE(review): presumably a cap to keep the KNN similarity step tractable - confirm.
row_cap = 3_500_000
df = df.iloc[:row_cap]

In [7]:
# Confirm the row count after truncation.
df.shape

(3500000, 3)

In [8]:
# Declare the rating range (lowest 1, highest 10) for the Surprise reader.
RATING_SCALE = (1, 10)
reader = Reader(rating_scale=RATING_SCALE)

In [55]:
# Surprise reads the frame positionally as (user id, item id, rating), so select
# the three columns by name rather than relying on the CSV's column order or on
# the frame having no extra columns.
data = Dataset.load_from_df(df[['user_id', 'anime_id', 'rating']], reader)

# Memory-based collaborative filtering

## Item-based

In [10]:
# Neighbourhood settings: cosine similarity, computed between items
# (user_based=False) rather than between users.
sim_options = dict(name='cosine', user_based=False)

In [11]:
# Item-based KNNWithMeans: every setting other than `sim_options`
# (cosine similarity, user_based=False) is left at the library default.
algo = KNNWithMeans(sim_options=sim_options)

In [12]:
# Build a trainset from the whole of `data`; nothing is held out at this point.
train = data.build_full_trainset()

In [13]:
# Fit on the full trainset (this computes the cosine similarity matrix, as logged
# below). The call is the cell's last expression, so the estimator repr is echoed.
algo.fit(train)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7f2cfee3a4d0>

In [26]:
pred = algo.predict(df.iloc[[rand_indx]]['user_id'].values[0],df.iloc[[rand_indx]]['anime_id'].values[0])
pred.est

7.966723730190104

In [16]:
rand_indx = random.randint(0,len(df))

In [25]:
df.iloc[[rand_indx]]

Unnamed: 0,user_id,anime_id,rating
1406179,13411,10321,8


In [27]:
# 3-fold cross-validation of the item-based model, reporting RMSE and MAE.
# The log below shows the similarity matrix being computed once per fold.
eval_measures = ['RMSE', 'MAE']
cross_validate(algo, data, measures=eval_measures, cv=3, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    1.1960  1.1968  1.1954  1.1961  0.0006  
MAE (testset)     0.8984  0.9000  0.8981  0.8988  0.0008  
Fit time          32.15   32.03   32.52   32.23   0.21    
Test time         162.16  162.83  160.94  161.98  0.78    


{'test_rmse': array([1.19601929, 1.19683296, 1.19544854]),
 'test_mae': array([0.89837697, 0.89995083, 0.89805149]),
 'fit_time': (32.145565032958984, 32.03176164627075, 32.521087646484375),
 'test_time': (162.1636815071106, 162.82727479934692, 160.9365668296814)}

## User-based

In [43]:
# Neighbourhood settings: cosine similarity, computed between users
# (user_based=True).
sim_options = dict(name='cosine', user_based=True)

In [44]:
# User-based KNNWithMeans: rebinds `algo`, replacing the item-based model.
# Every setting other than `sim_options` is left at the library default.
algo = KNNWithMeans(sim_options=sim_options)

In [45]:
# Build the full trainset from `data` again (same call as in the item-based section).
train = data.build_full_trainset()

In [46]:
# Fit the user-based model on the full trainset; the estimator repr is echoed
# because the call is the cell's last expression.
algo.fit(train)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7f2c24043820>

In [47]:
# Draw a random row position in [0, len(df)). random.randint(0, len(df)) is
# inclusive at both ends and can return len(df), which is out of range for df.iloc.
rand_indx = random.randrange(len(df))

In [48]:
# Show the sampled row; the double brackets keep the result a one-row DataFrame
# so it renders as a table with its actual rating.
df.iloc[[rand_indx]]

Unnamed: 0,user_id,anime_id,rating
38802,427,24415,9


In [49]:
# Look up the sampled row's raw user and anime ids, then ask the fitted
# user-based model for its rating estimate for that pair.
uid = df['user_id'].iloc[rand_indx]
iid = df['anime_id'].iloc[rand_indx]
pred = algo.predict(uid, iid)
pred.est

8.387617800008092

In [35]:
# 3-fold cross-validation of the user-based model, reporting RMSE and MAE.
# The log below shows the similarity matrix being computed once per fold.
eval_measures = ['RMSE', 'MAE']
cross_validate(algo, data, measures=eval_measures, cv=3, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    1.1966  1.1959  1.1955  1.1960  0.0004  
MAE (testset)     0.8996  0.8986  0.8979  0.8987  0.0007  
Fit time          33.74   35.17   35.50   34.80   0.76    
Test time         171.73  172.37  175.36  173.16  1.58    


{'test_rmse': array([1.19655322, 1.19594153, 1.19553995]),
 'test_mae': array([0.89959171, 0.89855232, 0.89791945]),
 'fit_time': (33.73641538619995, 35.17265558242798, 35.496763706207275),
 'test_time': (171.72997379302979, 172.3741738796234, 175.36313486099243)}

# Collaborative filtering with matrix factorization

In [50]:
# Matrix-factorization model: SVD with no arguments, i.e. library defaults.
# Rebinds `algo`, replacing the KNN model from the previous section.
algo = SVD()

In [58]:
# Build the full trainset from `data` for the SVD model.
# NOTE(review): the saved execution counts show `data` was last rebuilt at In [55],
# after `df` was re-read and re-filtered (In [51]-[54]), while the truncation cell
# last ran earlier (In [6]). SVD may therefore have seen a different number of rows
# than the KNN models - re-run top to bottom before comparing scores.
train = data.build_full_trainset()

In [78]:
# Train SVD on the full trainset; the estimator repr is echoed because the call
# is the cell's last expression.
algo.fit(train)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f2c23d64640>

In [79]:
# Draw a random row position in [0, len(df)). random.randint(0, len(df)) is
# inclusive at both ends and can return len(df), which is out of range for df.iloc.
rand_indx = random.randrange(len(df))

In [80]:
# Show the sampled row; the double brackets keep the result a one-row DataFrame
# so it renders as a table with its actual rating.
df.iloc[[rand_indx]]

Unnamed: 0,user_id,anime_id,rating
5490100,51734,31043,9


In [83]:
# Look up the sampled row's raw user and anime ids, then ask the fitted SVD
# model for its rating estimate for that pair.
uid = df['user_id'].iloc[rand_indx]
iid = df['anime_id'].iloc[rand_indx]
pred = algo.predict(uid, iid)
pred.est

8.550904234837699

In [86]:
# 3-fold cross-validation of the SVD model, reporting RMSE and MAE.
eval_measures = ['RMSE', 'MAE']
cross_validate(algo, data, measures=eval_measures, cv=3, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    1.2068  1.2208  1.2172  1.2149  0.0059  
MAE (testset)     0.9138  0.9226  0.9266  0.9210  0.0053  
Fit time          0.61    0.62    0.62    0.62    0.01    
Test time         0.21    0.21    0.22    0.21    0.00    


{'test_rmse': array([1.2068498 , 1.22081777, 1.21715802]),
 'test_mae': array([0.91377368, 0.92257305, 0.92657716]),
 'fit_time': (0.609809160232544, 0.6211960315704346, 0.6231136322021484),
 'test_time': (0.21317076683044434, 0.21088075637817383, 0.2195425033569336)}