# DSA4212 Group 2 Assignment 2

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read in the data

anime_data = pd.read_csv('assignment_2_anime.csv')
ratings_test = pd.read_csv('assignment_2_ratings_test.csv')
ratings_train = pd.read_csv('assignment_2_ratings_train.csv')

In [3]:
from surprise import Dataset, Reader, accuracy

In [4]:
# Find the minimum and maximum rating for the Reader object

min_rating = ratings_train.rating.min()
max_rating = ratings_train.rating.max()

In [5]:
# Train data
reader = Reader(rating_scale = (min_rating, max_rating))

train_data = Dataset.load_from_df(ratings_train[['user_id', 'anime_id', 'rating']], reader)

trainset = train_data.build_full_trainset()

In [6]:
# Test data
reader = Reader(rating_scale = (min_rating, max_rating))

test_data = Dataset.load_from_df(ratings_test[['user_id', 'anime_id', 'rating']], reader)

testset = test_data.build_full_trainset().build_testset()

## 1. Model based Collaborative Filtering
### Matrix factorization based algorithm: NMF

In [7]:
from surprise import NMF

In [37]:
nmf = NMF()
nmf.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x1d4112209a0>

In [38]:
predictions = nmf.test(testset)
accuracy.mse(predictions)

MSE: 5.0285


5.028504428382759

### Matrix factorization based algorithm: SVD

In [8]:
from surprise import SVD
from surprise.model_selection import cross_validate

In [40]:
svd = SVD()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1d4bdedf3a0>

In [41]:
predictions = svd.test(testset)
accuracy.mse(predictions)

MSE: 1.3172


1.317221988049183

#### Cross validation

In [10]:
svd2 = SVD()
results = cross_validate(svd2, train_data, measures=['MSE'], cv=10, verbose=True)

Evaluating MSE of algorithm SVD on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
MSE (testset)     1.3434  1.3439  1.3382  1.3433  1.3426  1.3489  1.3417  1.3402  1.3487  1.3506  1.3442  0.0038  
Fit time          52.98   53.01   57.81   59.37   58.78   58.93   58.75   58.47   58.55   64.95   58.16   3.20    
Test time         2.82    4.29    4.48    2.68    2.66    4.45    4.46    4.44    4.45    4.44    3.92    0.79    


In [11]:
predictions2 = svd2.test(testset)
accuracy.mse(predictions2)

MSE: 1.3441


1.344138026055986

#### Hyperparameter Tuning

In [15]:
from surprise.model_selection import GridSearchCV
 
param_grid = {
  'n_factors': [20, 50, 100],
  'n_epochs': [5, 10, 20]
}
 
tuning = GridSearchCV(SVD, param_grid, measures=['mse'], cv=10)
tuning.fit(train_data)
print(tuning.best_mse['mse'])
print(tuning.best_params['mse'])

1.3234832570445836
{'n_factors': 20, 'n_epochs': 20}


## paramgrid test

In [None]:
from surprise.model_selection import GridSearchCV
param_grid = {
    'n_factors': [10, 100, 500],
    'n_epochs': [5, 20, 50], 
    'lr_all': [0.001, 0.005, 0.02],
    'reg_all': [0.005, 0.02, 0.1]}

tuning = GridSearchCV(SVD, param_grid, measures=['mse'], cv=10)
tuning.fit(train_data)

In [17]:
print(tuning.best_params['mse']['n_factors'])
print(tuning.best_params['mse']['n_epochs'])
print(tuning.best_params['mse']['lr_all'])
print(tuning.best_params['mse']['reg_all'])

500
50
0.005
0.1


In [18]:
params = tuning.best_params['mse']
svd = SVD(n_factors=params['n_factors'], n_epochs=params['n_epochs'],lr_all=params['lr_all'], reg_all=params['reg_all'])
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1197249ac10>

In [19]:
predictions = svd.test(testset)
accuracy.mse(predictions)

MSE: 1.2660


1.2660169206778658

## end test

In [17]:
# best hyperparameters
best_factor = tuning.best_params['mse']['n_factors']
best_epoch = tuning.best_params['mse']['n_epochs']

svd = SVD(n_factors=best_factor, n_epochs=best_epoch)
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1be76731b50>

In [18]:
predictions = svd.test(testset)
accuracy.mse(predictions)

MSE: 1.3059


1.3058940788466915

## Fast.ai

In [48]:
from fastai.collab import *
from fastai.tabular.all import *

In [59]:
anime = pd.read_csv('assignment_2_anime.csv', usecols = (0,1))

ratings_train = ratings_train.merge(anime)
ratings_test = ratings_test.merge(anime)

ratings_train.head()

Unnamed: 0,user_id,anime_id,rating,name
0,20170,10794,6,IS: Infinite Stratos Encore - Koi ni Kogareru Rokujuusou
1,40606,10794,7,IS: Infinite Stratos Encore - Koi ni Kogareru Rokujuusou
2,541,10794,9,IS: Infinite Stratos Encore - Koi ni Kogareru Rokujuusou
3,14637,10794,8,IS: Infinite Stratos Encore - Koi ni Kogareru Rokujuusou
4,24072,10794,8,IS: Infinite Stratos Encore - Koi ni Kogareru Rokujuusou


In [60]:
dls = CollabDataLoaders.from_df(ratings_train, item_name='name', bs=64)
dls.show_batch()

Unnamed: 0,user_id,name,rating
0,50720,IS: Infinite Stratos,5
1,68730,Gangsta.,8
2,35286,Sword Art Online II,7
3,1938,K-On!!,7
4,68271,Darkside Blues,6
5,45904,Dog Days,7
6,57447,Kamen no Maid Guy,6
7,22477,One Punch Man: Road to Hero,5
8,25209,Softenni,7
9,59735,Ookamikakushi,6


In [61]:
# Understanding pytorch definiton of latent factors

n_users  = len(dls.classes['user_id'])
n_movies = len(dls.classes['name'])
n_factors = 5

user_factors = torch.randn(n_users, n_factors)
movie_factors = torch.randn(n_movies, n_factors)

In [62]:
class DotProduct(Module):
    def __init__(self, n_users, n_movies, n_factors):
        self.user_factors = Embedding(n_users, n_factors)
        self.movie_factors = Embedding(n_movies, n_factors)
        
    def forward(self, x):
        users = self.user_factors(x[:,0])
        movies = self.movie_factors(x[:,1])
        return (users * movies).sum(dim=1)

In [64]:
model = DotProduct(n_users, n_movies, 50)
learn = Learner(dls, model, loss_func=MSELossFlat())

In [65]:
learn.fit_one_cycle(5, 5e-3)

epoch,train_loss,valid_loss,time
0,5.179986,5.164547,06:18
1,4.443039,4.559471,06:44
2,3.554875,3.603416,06:52
3,2.664872,2.692226,07:21
4,2.24542,2.39789,06:26


## TEST

In [110]:
from surprise import BaselineOnly

baseline = BaselineOnly()
baseline.fit(trainset)

predictions = baseline.test(testset)
accuracy.mse(predictions)

Estimating biases using als...
MSE: 1.4599


1.4598653587259132

In [9]:
from surprise import SVDpp

svdpp = SVDpp()
svdpp.fit(trainset)

predictions = svdpp.test(testset)
accuracy.mse(predictions)

MSE: 1.4204


1.4203963100495411

In [9]:
from surprise import SVDpp

svdpp = SVDpp()
results = cross_validate(svdpp, train_data, measures=['MSE'], cv=10, verbose=True)

predictions2 = svdpp.test(testset)
accuracy.mse(predictions2)

Evaluating MSE of algorithm SVDpp on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
MSE (testset)     1.4323  1.4465  1.4498  1.4308  1.4373  1.4375  1.4416  1.4502  1.4486  1.4504  1.4425  0.0072  
Fit time          958.72  954.14  975.88  1006.00 1014.02 974.10  966.35  973.68  975.92  969.92  976.87  18.04   
Test time         68.25   67.82   90.14   75.87   68.79   68.59   67.88   68.27   67.57   68.88   71.21   6.72    
MSE: 1.4446


1.4445884154098154