In [2]:
import re

import pandas as pd
from scipy.stats import ttest_rel,probplot
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from wordcloud import WordCloud
#import math
#import copy as cp
#from collections import defaultdict


#from surprise
from surprise import KNNWithMeans,SVD,SVDpp,NMF,SlopeOne,CoClustering
from surprise import Reader, Dataset,accuracy
from surprise.model_selection import cross_validate, KFold, ShuffleSplit, GridSearchCV
from surprise.prediction_algorithms import PredictionImpossible
from surprise.model_selection import train_test_split as surprisesplit
from surprise import dump

#user defined functions
#from prec_recall import precision_recall_at_k,pr_eval
#from sigweight import KNNSigWeighting
#from Rank import *
from data_cleaning import *
#from hybrid import WeightedHybrid

In [3]:
# Load the raw rating and anime tables.
# Forward slashes are used as the separator so the relative paths resolve on
# both Windows and POSIX; the original backslash form only worked on Windows.
df_rating = pd.read_csv('archive (2)/rating.csv')
df_anime = pd.read_csv('archive (2)/anime.csv')

In [4]:
# Read the prepared ratings and split them 80/20, stratified on user_id
final_data = pd.read_csv('core_tv_comedy_rating.csv')
train, test = train_test_split(
    final_data,
    test_size=0.2,
    random_state=1,
    stratify=final_data['user_id'],
)

# Persist both partitions (no index column) so they can be reloaded from disk
train.to_csv('train.csv', index=False)
test.to_csv('test.csv', index=False)

In [5]:
# Load the CSV files into Surprise datasets.
# The reader skips the header row and expects "user,item,rating" on a 1-10 scale.
reader = Reader(
    line_format='user item rating',
    sep=',',
    rating_scale=(1, 10),
    skip_lines=1,
)
WholeSet = Dataset.load_from_file('core_tv_comedy_rating.csv', reader)
S_train = Dataset.load_from_file('train.csv', reader)
S_test = Dataset.load_from_file('test.csv', reader)

# Surprise trainsets: the training split, and the complete data set
trainset = S_train.build_full_trainset()
WholeSet_train = WholeSet.build_full_trainset()

# Testset for RMSE: keep only the first three fields of every raw test rating
testset = [raw[0:3] for raw in S_test.raw_ratings]

# Testset for relevance metrics (Precision, Recall, NDCG):
# every (user, item) pair without a rating in the whole set, filled with 1 ...
NoRatingSet = WholeSet_train.build_anti_testset(fill=1)

# ... of which a 20% sample, stratified on the user id, is kept
_, noRate_test = train_test_split(
    NoRatingSet,
    test_size=0.2,
    random_state=1,
    stratify=[entry[0] for entry in NoRatingSet],
)

# Held-out ratings plus the sampled unknown pairs
testPlusUnknown = testset + noRate_test

Configuration for model training

In [6]:
# Cross-validation splitter shared by every grid search below:
# five folds, seeded with random_state=1
kf_5 = KFold(
    n_splits=5,
    random_state=1,
)

In [7]:
#configuring best algo function
def Best_Algo(algoname,gs_object):
    """
    Summarise the RMSE-best configuration of a fitted grid search.

    Args:
        algoname: name of the algorithm, used only in the printed header
        gs_object: fitted GridSearchCV object; its ``cv_results``,
            ``best_index``, ``best_params``, ``best_estimator`` and
            ``best_score`` attributes are read under the ``'rmse'`` key

    return:
        best_rmses: a pandas Series holding the per-split test RMSE
            (columns named ``split<N>_test_rmse``) of the best row
        best_estimator: algo with best setting
        results_df: results dataframe built from ``gs_object.cv_results``

    print:
        best_rmses
        mean of Best RMSES
        param_grid setting of best_rmses
        index of best rmses in result table

    """
    print(f'{algoname} Best RMSES: ')
    print('')

    results_df = pd.DataFrame(gs_object.cv_results)

    # Keep only the per-split test RMSE columns
    split_rmse = re.compile(r'split\d+_test_rmse')
    matches = (split_rmse.match(col) for col in results_df.columns)
    cols = [m.group() for m in matches if m is not None]

    # Pull the values out first: reusing the same quote character inside an
    # f-string replacement field is a SyntaxError before Python 3.12.
    best_index = gs_object.best_index['rmse']
    best_score = gs_object.best_score['rmse']
    best_params = gs_object.best_params['rmse']
    best_estimator = gs_object.best_estimator['rmse']

    best_rmses = results_df[cols].iloc[best_index, :]

    print(best_rmses)
    print('==============================')
    print(f'mean of Best RMSES: {best_score:.3f}\n')
    # No trailing .format(): the rendered text contains the dict's braces,
    # which str.format would try to parse as replacement fields (KeyError).
    print(f'best params: {best_params}\n')
    print(f'Index: {best_index}\n')

    return best_rmses,best_estimator,results_df

In [8]:
# Model training: KNNWithMeans, item-based.
# Grid over the neighbourhood size k, the similarity measure and min_support.
par = {
    'k': [5, 19, 40, 95, 175],
    'sim_options': {
        'name': ['pearson', 'cosine'],
        'user_based': [False],
        'min_support': [10, 100, 200],
    },
}
gs_knn_item = GridSearchCV(
    KNNWithMeans,
    par,
    measures=['rmse'],
    cv=kf_5,
    return_train_measures=True,
)
gs_knn_item.fit(S_train)

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.

In [9]:
# KNNWithMeans, user-based: same kind of grid, with user_based switched on.
# NOTE: the recorded run of this cell stopped with a MemoryError while
# allocating a (32178, 32178) float64 array (7.71 GiB).
par = {
    'k': [19, 40, 95, 505],
    'sim_options': {
        'name': ['pearson', 'cosine'],
        'user_based': [True],
        'min_support': [10, 100, 500],
    },
}
gs_knn_u = GridSearchCV(
    KNNWithMeans,
    par,
    measures=['rmse'],
    cv=kf_5,
    return_train_measures=True,
)
gs_knn_u.fit(S_train)

Computing the pearson similarity matrix...


MemoryError: Unable to allocate 7.71 GiB for an array with shape (32178, 32178) and data type float64

In [None]:
# Report the RMSE-best setting of the item-based search ...
I_Knn_rmses, I_bestKnn, I_knn_result = Best_Algo(
    'KNNWithMeans (item based)', gs_knn_item
)
print('\n' * 2)
# ... and of the user-based search (needs gs_knn_u to have been fitted)
U_Knn_rmses, U_bestKnn, U_knn_result = Best_Algo(
    'KNNWithMeans (user based)', gs_knn_u
)

In [None]:
## item based
# Refit the best item-based KNN on the training split
I_bestKnn.fit(trainset)

# Biased estimate (training error): predict the ratings the model was fitted on
I_pre_knn = I_bestKnn.test(trainset.build_testset())
print('bias Rmse:', accuracy.rmse(I_pre_knn, verbose=False))

# Unbiased estimate (testing error): predict the held-out ratings.
# I_pre_knn ends up holding these test-set predictions.
I_pre_knn = I_bestKnn.test(testset)
print('unbias Rmse:', accuracy.rmse(I_pre_knn, verbose=False))

In [None]:
# Start the comparison table with the item-based KNN per-split RMSEs
best_rmses_df = pd.DataFrame({'Item_knn': I_Knn_rmses})

# Persist the full grid-search table and the comparison table
I_knn_result.to_csv('I_knn_result.csv')
best_rmses_df.to_csv('best_rmses_df.csv')

# Persist the fitted algorithm together with its latest predictions
dump.dump(
    'I_knnbestalgo',
    predictions=I_pre_knn,
    algo=I_bestKnn,
)

In [None]:
# Reload the saved predictions and fitted item-based KNN
I_pre_knn, I_bestKnn = dump.load('I_knnbestalgo')

# Reload the comparison table and the KNN grid-search table
# (the first CSV column is restored as the index)
I_knn_result_df = pd.read_csv('I_knn_result.csv', index_col=0)
best_rmses_df = pd.read_csv('best_rmses_df.csv', index_col=0)

In [None]:
#SVD
# Grid over latent factors, epochs, learning rate and regularisation
param_grid = {
    'n_factors': [5, 10, 300, 500],
    'n_epochs': [5, 20],
    'lr_all': [0.002, 0.02, 0.2],
    'reg_all': [0.002, 0.02, 0.2, 0.8],
    'verbose': [True],
    'random_state': [1],
}

# Named svd_gs (the original reused the name svdpp_gs for this plain-SVD search)
svd_gs = GridSearchCV(SVD, param_grid, measures=['rmse'], return_train_measures=True, cv=kf_5)

svd_gs.fit(S_train)

# get the best parameters by rmse and print the results; this also defines
# SVD_best and SVD_result_df, which the following SVD cells rely on
SVD_rmses, SVD_best, SVD_result_df = Best_Algo('SVD', svd_gs)

In [None]:
# Refit the best SVD on the training split
SVD_best.fit(trainset)

# Biased estimate (training error): predict the ratings the model was fitted on
bia_pre = SVD_best.test(trainset.build_testset())
print('bias Rmse:', accuracy.rmse(bia_pre, verbose=False))

# Unbiased estimate (testing error): predict the held-out ratings
unbias_pre = SVD_best.test(testset)
print('unbias Rmse:', accuracy.rmse(unbias_pre, verbose=False))

In [None]:
# Add SVD's cross-validation RMSEs to the comparison table:
# the 'rmses' column of the first five rows of svd_best_rmse.csv.
# NOTE(review): svd_best_rmse.csv is not written by any cell visible here;
# confirm where it is produced.
best_rmses_df['SVD'] = pd.read_csv('svd_best_rmse.csv', index_col=0)['rmses'].iloc[:5]

# Persist the SVD grid-search table and the updated comparison table
SVD_result_df.to_csv('svd_results.csv')
best_rmses_df.to_csv('best_rmses_df.csv')

# Persist the fitted SVD together with its test-set predictions
dump.dump(
    'svd_best',
    predictions=unbias_pre,
    algo=SVD_best,
)

In [None]:
# Reload the saved predictions and fitted SVD
SVD_pre, SVD_best = dump.load('svd_best')

# Reload the SVD grid-search table and the comparison table
# (the first CSV column is restored as the index)
SVD_result_df = pd.read_csv('svd_results.csv', index_col=0)
best_rmses_df = pd.read_csv('best_rmses_df.csv', index_col=0)

In [None]:
# Display the comparison table of RMSEs collected so far
best_rmses_df

In [None]:
#CoClustering
# Grid over the number of user clusters, item clusters and epochs
param_grid = {
    'n_cltr_u': [5, 10, 100, 300, 3000],
    'n_cltr_i': [5, 15, 20, 150],
    'n_epochs': [5, 20],
    'verbose': [True],
    'random_state': [1],
}

co_gs = GridSearchCV(CoClustering, param_grid, measures=['rmse'], return_train_measures=True, cv=kf_5)

co_gs.fit(S_train)

# get the best parameters by rmse and print the results.
# The best estimator is bound to co_best, the name the evaluation cell uses.
co_best_rmses, co_best, results_df = Best_Algo('CoClustering', co_gs)
# Keep the original name available for any code that still refers to it
co_estimator = co_best

In [None]:
# Refit the best CoClustering model on the training split
co_best.fit(trainset)

# Biased estimate (training error): predict the ratings the model was fitted on
bia_pre = co_best.test(trainset.build_testset())
print('bias Rmse:', accuracy.rmse(bia_pre, verbose=False))

# Unbiased estimate (testing error): predict the held-out ratings
unbias_pre = co_best.test(testset)
print('unbias Rmse:', accuracy.rmse(unbias_pre, verbose=False))

In [None]:
#add coclustering cv best rmse to best_rmses_df
# (the 'rmses' column of the first five rows of co_best_rmse.csv)
best_rmses_df['CoClusting'] = pd.read_csv('co_best_rmse.csv', index_col=0).iloc[:5, :]['rmses']

# save rmses
best_rmses_df.to_csv('best_rmses_df.csv')

# load rmses & coclustering_result_df
# The comparison table is reloaded from the file saved just above, under its
# real name; the original assigned co_best_rmse.csv to the misspelled, unused
# name best_rmse_df.
best_rmses_df = pd.read_csv('best_rmses_df.csv', index_col=0)

# NOTE(review): co_results.csv and the 'co_best' dump are not written by any
# cell visible here; confirm where they are produced.
co_results_df = pd.read_csv('co_results.csv', index_col=0)


# load prediction & algo
co_pre, co_best = dump.load('co_best')

In [None]:
# Display the comparison table of RMSEs collected so far
best_rmses_df

In [None]:
#SVDpp
# Grid over latent factors, epochs, learning rate and regularisation
param_grid = {
    'n_factors': [5, 10, 300, 500],
    'n_epochs': [5, 20],
    'lr_all': [0.002, 0.02, 0.2],
    'reg_all': [0.002, 0.02, 0.2, 0.8],
    'verbose': [True],
    'random_state': [1],
}

svdpp_gs = GridSearchCV(SVDpp, param_grid, measures=['rmse'], return_train_measures=True, cv=kf_5)

svdpp_gs.fit(S_train)

# get the best parameters by rmse and print the results.
# The original line summarised the CoClustering search (co_gs) again, so the
# SVDpp search fitted above was never reported.
svdpp_best_rmses, svdpp_best, svdpp_results_df = Best_Algo('SVDpp', svdpp_gs)

In [None]:
# WeightedHybrid built from the best SVD and the best item-based KNN.
# NOTE(review): `from hybrid import WeightedHybrid` is commented out in the
# import cell; confirm the class is in scope before running this cell.
hybrid = WeightedHybrid(
    [
        SVD_best,
        I_bestKnn,
    ]
)



# RMSE helper
def RMSE_caculator(algoname,algo,data):
    """
    Cross-validate ``algo`` on ``data`` and return the per-fold test RMSE.

    Args:
        algoname: label printed in the header line
        algo: algorithm handed to ``cross_validate``
        data: dataset handed to ``cross_validate``

    return:
        the ``'test_rmse'`` entry of the ``cross_validate`` result

    The folds come from the notebook-level ``kf_5`` splitter, and
    ``cross_validate`` is run verbosely.
    """
    print('## {} RMSES: '.format(algoname))
    cv_out = cross_validate(
        algo,
        data,
        measures=['RMSE'],
        cv=kf_5,
        verbose=True,
    )

    # Visual separator after the verbose cross-validation report
    print('=='*50)
    print('')

    return cv_out['test_rmse']
    


# Cross-validated RMSEs of the weighted hybrid on the training data
hybrid_rmses = RMSE_caculator(
    algoname='Weighted Hybrid',
    algo=hybrid,
    data=S_train,
)

In [None]:
# Average of the hybrid's per-fold RMSEs
np.mean(hybrid_rmses)