In [13]:
# =============================================================================
# PART 1 - Recommendation-System
# PART 1.1
# =============================================================================
'''
Part 1.1

Instructions for running the file in the cmd:
    Change working directory to:
        ./DMT4BaS_2019/HW_2
    run in the command line:
        python part_1/sw/part1.py
        
        
Goal of this part of the task is to try out all the algorithms from the surprise library.
'''

'''
LIBRARIES
'''
#Matrix Factorization-based algorithms
from surprise import SVD
from surprise import SVDpp
from surprise import NMF
#SlopeOne-collaborative filtering algorithm
from surprise import SlopeOne
#k-NN inspired algorithms
from surprise import KNNBasic
from surprise import KNNBaseline
from surprise import KNNWithMeans

#CoClustering - collaborative filtering algorithm based on co-clustering
from surprise import CoClustering

#Basic algorithms
#Algorithm predicting the baseline estimate for given user and item.
from surprise import BaselineOnly
#Algorithm predicting a random rating based on the distribution of the training set, which is assumed to be normal.
from surprise import NormalPredictor

from surprise import Reader
from surprise import Dataset


from surprise.model_selection import KFold
from surprise.model_selection import cross_validate

from tabulate import tabulate
import time
import datetime
import numpy as np
import os
########################################
alg_list=[SVD,SVDpp,NMF,SlopeOne,KNNBasic,KNNWithMeans,KNNBaseline,CoClustering,BaselineOnly,NormalPredictor]
alg_names_lst=['SVD','SVDpp','NMF','SlopeOne','KNNBasic','KNNWithMeans','KNNBaseline','CoClustering','BaselineOnly','NormalPredictor']

# path of dataset file
file_path = os.path.expanduser('./part_1/dataset/ratings.csv')

print("Loading Dataset...")
reader = Reader(line_format='user item rating', sep=',', rating_scale=[0.5, 5], skip_lines=1)
data = Dataset.load_from_file(file_path, reader=reader)
print("Done.")

print("Performing splits...")
kf = KFold(n_splits=5, random_state=0)
print("Done.")



'''
Print a table of mean RMSE for all the algs
'''
table = []
for idx,klass in enumerate(alg_list):
    print(alg_names_lst[idx],klass())
    start = time.time()
    out = cross_validate(klass(), data, ['rmse'], kf,n_jobs=12,verbose=True)
    cv_time = str(datetime.timedelta(seconds=int(time.time() - start)))
    mean_rmse = '{:.3f}'.format(np.mean(out['test_rmse']))
    new_line = [alg_names_lst[idx], mean_rmse, cv_time]
    table.append(new_line)
    print('Finished.')
header = ['RMSE','Time']
print(tabulate(table, header, tablefmt="pipe"))


Loading Dataset...
Done.
Performing splits...
Done.
SVD <surprise.prediction_algorithms.matrix_factorization.SVD object at 0x11bab0490>
Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9087  0.9085  0.9078  0.9110  0.9130  0.9098  0.0019  
Fit time          8.79    8.85    9.23    9.17    8.82    8.97    0.19    
Test time         0.29    0.31    0.26    0.22    0.23    0.26    0.03    
Finished.
SVDpp <surprise.prediction_algorithms.matrix_factorization.SVDpp object at 0x11879f590>
Evaluating RMSE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8916  0.8925  0.8881  0.8968  0.8986  0.8935  0.0038  
Fit time          722.92  723.28  717.12  725.32  729.16  723.56  3.91    
Test time         11.20   11.15   11.43   8.99    8.31    10.22   1.30    
Finished.
NMF <surprise.prediction_algorithms.matrix_factorization

In [15]:
# =============================================================================
# PART 1 - Recommendation-System
#           KNNBaseline optimization 
# =============================================================================
'''
Part 1.2
Instructions for running the file in the cmd:
    Change working directory to:
        ./DMT4BaS_2019/HW_2
    run in the command line:
        python part_1/sw/part1_2.py
'''

from surprise.model_selection import RandomizedSearchCV

#k-NN inspired algorithms
from surprise import KNNBaseline

from surprise import Reader
from surprise import Dataset

from surprise.model_selection import KFold
from surprise.model_selection import cross_validate

import os
import numpy as np
import datetime



# Reading the data
file_path = os.path.expanduser('./part_1/dataset/ratings.csv')

print("Loading Dataset...")
reader = Reader(line_format='user item rating', sep=',', rating_scale=[0.5, 5], skip_lines=1)#skip header
data = Dataset.load_from_file(file_path, reader=reader)


kf = KFold(n_splits=5, random_state=0)

start=datetime.datetime.now()
print('Optimizing hyperparameters of the KNNBaseline')
print("Start.....")
print(start)

'''
Optimizing hyperparameters of the KNNBaseline
In this case we used RandomSearchCV with 30 iterations;
For KNN algorithm the most important parameters, the one that have the biggest 
impact are value of k, which is recommended to be odd value, and simularity 
function
'''
'''
At first we use these parameters just to see if recommendation to use 
pearson_baseline as a similarity function is valid.
similarity_options={
        'name':['cosine','msd','pearson','pearson_baseline'],
        'user_based': [True,False],
        'shrinkage':[1,10,250,100,500,1000,1240]+list(range(50,150))
}
parameters_distributions = {
   
'k': np.arange(1,60,2),
              'min_k':[1,2,3,4,5,6,7,8,9,10,11],
              'sim_options':similarity_options}
'''
#And these were the results
#0.8865
#First iteration 
#{'k': 45, 'min_k': 11, 'sim_options': {'name': 'pearson_baseline', 'user_based': False}}

# =============================================================================
# 'Best found parameters for KNNBaseline in the first Iteration
alg=KNNBaseline(k= 45, min_k=11, sim_options= {'name':'pearson_baseline', 'user_based':False})
cross_validate(alg, data, measures=['RMSE'], cv=kf, verbose=True)
#54m18s was the TIME FOR EXECUTION 
#--->0.8865

print("sono qui --------")
'''
Then we tries just the pearson_baseline as a similarity function...
and tried to see the k value which is the best one.
'''


current_algo= KNNBaseline

similarity_options={
        'name':['pearson_baseline'], #it is recommended to use Pearson Baseline
        'user_based': [True,False]
        }
parameters_distributions = {
   
'k': np.arange(1,60,2),
              'min_k':[1,2,3,4,5,6,7,8,9,10,11],
              'sim_options':similarity_options}
searchCV = RandomizedSearchCV(current_algo,
							parameters_distributions,
							n_iter=30,
							measures=['rmse'],
                            n_jobs=12,
							cv=5)
searchCV.fit(data)
end=datetime.datetime.now()
print(end-start,"\nEnd.....")
print(searchCV.best_params['rmse'])
print("sono qui +++++++++++")
#Second iteration 
#0.8864
#{'k': 37, 'min_k': 11, 'sim_options': {'name': 'pearson_baseline', 'user_based': False}}
alg=KNNBaseline(k= 37, min_k=11, sim_options= {'name':'pearson_baseline', 'user_based':False})
cross_validate(alg, data, measures=['RMSE'], cv=kf, verbose=True)
#22m23s


# =============================================================================
# 'Best found parameters for KNNBaseline 
alg=KNNBaseline(k= 37, min_k=11, sim_options= {'name':'pearson_baseline', 'user_based':False})
cross_validate(alg, data, measures=['RMSE'], cv=kf, verbose=True)
#22m23s --->0.8864
# =============================================================================

Loading Dataset...
Optimizing hyperparameters of the KNNBaseline
Start.....
2020-05-09 11:21:57.404663
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNBaseline on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8839  0.8874  0.8832  0.8896  0.8886  0.8865  0.0025  
Fit time          6.15    6.76    6.33    5.97    6.07    6.26    0.28    
Test time         12.40   12.1

{'test_rmse': array([0.88385138, 0.88730123, 0.88292064, 0.88918504, 0.88883718]),
 'fit_time': (5.964932918548584,
  6.696568965911865,
  6.099676847457886,
  6.028443813323975,
  6.11378812789917),
 'test_time': (11.379655122756958,
  11.482139825820923,
  11.430302858352661,
  10.84785008430481,
  11.413607120513916)}

In [17]:
# =============================================================================
# PART 1 - Recommendation-System
#           SVD optimization 
# =============================================================================
'''
Part 1.2
Instructions for running the file in the cmd:
    Change working directory to:
        ./DMT4BaS_2019/HW_2
    run in the command line:
        python part_1/sw/part1_2_svd.py
        
In this scrypt we performed 2 Grid Search Cross Validations over 5 folds to try 
to find the best hyper parameters.
Since execution for first alg were slower we decided to choose parameters wiser.
In first GridSearchCV execution we choose init_mean, lr_all, reg_all
'''

from surprise.model_selection import GridSearchCV

#Matrix Factorization-based algorithm
from surprise import SVD

from surprise import Reader
from surprise import Dataset

from surprise.model_selection import KFold
from surprise.model_selection import cross_validate

import os
import datetime
'''
Since in the first part 
'''
# Reading the data
file_path = os.path.expanduser('./part_1/dataset/ratings.csv')

print("Loading Dataset...")
reader = Reader(line_format='user item rating', sep=',', rating_scale=[0.5, 5], skip_lines=1)#skip header
data = Dataset.load_from_file(file_path, reader=reader)



'''
Optimizing hyperparameters of the SVD
'''
kf = KFold(n_splits=5, random_state=0) #making folds for cross validation


start=datetime.datetime.now()
print('Optimizing hyperparameters of the SVD')
print("Start.....")
print(start)

 
#OPTIMIZATION OF SVD

param_grid = {'init_mean':[0.1,0.15],
              'lr_all':[0.005,0.01,0.025], #0.025 default
              'reg_all':[0.02,0.005,0.1]} #0.1 default
grid_search = GridSearchCV(SVD,param_grid,measures=['rmse'],
                           cv=5,n_jobs=3)
grid_search.fit(data)

  
end=datetime.datetime.now()
print(end-start,"\nEnd.....")
print(grid_search.best_params['rmse'])

#Execution time 0:06:17.595912 
# After first grid search  --> {'init_mean': 0.15, 'lr_all': 0.025, 'reg_all': 0.1}

'Best found paramteres for SVD'
# =============================================================================
# 0.8838
#{'init_mean': 0.15, 'lr_all': 0.025, 'reg_all': 0.1}
opt_svd_alg=SVD(init_mean=0.15,lr_all=0.025,reg_all=0.1)
cross_validate(opt_svd_alg,data,measures=['rmse'],cv=kf,n_jobs=3,verbose=True)
# =============================================================================

# =============================================================================
# Try to optimize number of factors by using optimized fixed values for 
# hyper parameter reg_all, lr_all and init_mean
#--> {'init_mean': 0.15, 'lr_all': 0.025, 'reg_all': 0.1}
# =============================================================================

 
'''
Additional optimization of just number of factors with other chosen fixed hyperparameters
'''

start=datetime.datetime.now()
print('Optimizing hyperparameters of the SVD')
print("Start.....")
print(start)
current_algo = SVD

#OPTIMIZATION OF SVD

param_grid = {'n_factors': [50,100,125,150,200],
              'init_mean':[0.15],
              'lr_all':[0.025],
              'reg_all':[0.1]}
grid_search = GridSearchCV(SVD,param_grid,measures=['rmse'],
                           cv=5,n_jobs=3)
grid_search.fit(data)

#0:02:55.428739 s
  
end=datetime.datetime.now()
print(end-start,"\nEnd.....")
print(grid_search.best_params['rmse'])

# =============================================================================
# 0.8835
opt_svd_alg=SVD(n_factors=150,lr_all=0.025,reg_all=0.1,init_mean=0.15)
cross_validate(opt_svd_alg,data,measures=['rmse'],cv=kf,n_jobs=12,verbose=True)
# =============================================================================
#{'n_factors': 150, 'init_mean': 0.15, 'lr_all': 0.025, 'reg_all': 0.1}
 
'''
Execution time(I GridSearch+ II GridSearch) ---> 9m13s
'''
##0:06:17.595912 + #0:02:55.428739 =9m13s
#{'init_mean': 0.15, 'lr_all': 0.025, 'reg_all': 0.1}#(n_factors=150}


Loading Dataset...
Optimizing hyperparameters of the SVD
Start.....
2020-05-09 12:22:03.519240
0:08:11.999527 
End.....
{'init_mean': 0.1, 'lr_all': 0.025, 'reg_all': 0.1}
Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8812  0.8837  0.8821  0.8878  0.8854  0.8840  0.0024  
Fit time          15.48   15.37   15.37   15.38   15.40   15.40   0.04    
Test time         0.45    0.46    0.45    0.44    0.45    0.45    0.01    
Optimizing hyperparameters of the SVD
Start.....
2020-05-09 12:30:58.282983
0:02:19.543966 
End.....
{'n_factors': 125, 'init_mean': 0.15, 'lr_all': 0.025, 'reg_all': 0.1}
Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8811  0.8839  0.8813  0.8867  0.8851  0.8836  0.0022  
Fit time          13.14   13.26   13.44   13.19   12.94   13.19   0.16    
Test time         0.31    0.27   

'\nExecution time(I GridSearch+ II GridSearch) ---> 9m13s\n'