 __Package Import__

In [None]:
from surprise import SVD
from surprise import SVDpp
from surprise import NMF
from surprise import KNNBaseline
from sklearn.linear_model import LinearRegression
import numpy as np
import time
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from surprise import Reader, Dataset
from datetime import datetime
from sklearn.svm import SVR

__Data Import__

In [None]:
# Ratings: tab-separated (user, movie, rating, timestamp) records.
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv(
    '../input/ml-100k/u.data',
    sep='\t',
    names=r_cols,
    encoding='latin-1',
)

# Movies: pipe-separated metadata followed by one indicator column per genre.
m_cols = [
    'movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url',
    'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
movies = pd.read_csv(
    '../input/ml-100k/u.item',
    sep='|',
    names=m_cols,
    encoding='latin-1',
)

# Users: pipe-separated demographic records.
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(
    '../input/ml-100k/u.user',
    sep='|',
    names=u_cols,
    encoding='latin-1',
    parse_dates=True,
)

__Data Cleaning__

In [None]:
# Turn the epoch-second timestamps into datetimes and expose them as 'time'.
ratings['unix_timestamp'] = ratings['unix_timestamp'].apply(datetime.fromtimestamp)
ratings.rename(columns={'unix_timestamp': 'time'}, inplace=True)

# Parse the release date strings into datetimes.
movies['release_date'] = pd.to_datetime(movies['release_date'])

# Replace the categorical occupation column with one boolean column per
# distinct occupation value.
for job in users['occupation'].unique():
    users[job] = users['occupation'] == job
users.drop(columns='occupation', inplace=True)

__Define Useful Variables__

In [None]:
# User x movie rating matrix; NaN marks a (user, movie) pair with no rating.
ratings_p = pd.pivot_table(ratings, values='rating', index='user_id', columns='movie_id')

# Matrix dimensions, taken from the pivot table itself.
user_length, item_length = ratings_p.shape

# Boolean frame that is True where the user has NOT rated the movie.
# Row/column labels are derived from the pivot table instead of being
# hard-coded, so the mask stays aligned with ratings_p for any dataset size.
ratings_ma = np.ma.masked_invalid(ratings_p)
mask = pd.DataFrame(
    ratings_ma.mask,
    index=list(ratings_p.index),
    columns=list(ratings_p.columns),
)

# Full training set for the Surprise models (Reader() uses its default scale).
reader = Reader()
data = Dataset.load_from_df(ratings[['user_id', 'movie_id', 'rating']], reader)
trainset = data.build_full_trainset()

# Global mean and standard deviation over all observed ratings.
observed_ratings = ratings_p.stack()
mean = observed_ratings.mean()
std = observed_ratings.std()

# Per-movie and per-user rating count / mean / standard deviation.
ratings_movie_summary = ratings.groupby('movie_id')['rating'].agg(['count', 'mean', 'std'])
ratings_user_summary = ratings.groupby('user_id')['rating'].agg(['count', 'mean', 'std'])

__Baseline1__

In [None]:
# Baseline 1: prediction = movie mean + user mean - global mean.
# Both helper frames have shape (users, movies) with default integer labels.

# Every row repeats the per-movie mean ratings. reshape(1, -1) infers the
# movie count instead of hard-coding 1682.
movie_mean = np.ones(ratings_p.shape)
movie_mean = pd.DataFrame(movie_mean * np.array(ratings_movie_summary['mean']).reshape(1, -1))

# Built transposed so the per-user means broadcast along the last axis,
# then flipped back: every column repeats the per-user mean ratings.
user_mean = np.ones(ratings_p.T.shape)
user_mean = pd.DataFrame(user_mean * np.array(ratings_user_summary['mean'])).T

pred_baseline1 = movie_mean + user_mean - mean

__Baseline2 - ZScore__

In [None]:
# Per-user rating standard deviation, broadcast to a (users, movies) frame.
user_std = pd.DataFrame(
    np.ones(ratings_p.T.shape) * np.array(ratings_user_summary['std'])
).T

# Baseline 2: z-score of each movie mean against the global distribution,
# rescaled by the user's own spread and shifted by the user's own mean.
movie_zscore = (movie_mean - mean) / std
pred_baseline2 = user_mean + movie_zscore * user_std

__Baseline3 - SVR__

In [None]:
# Baseline 3: one SVR per user, regressing the user's ratings on movie means.

# ratings_p*0 is 0 where rated and NaN elsewhere, so X holds the movie mean
# only at the positions the user actually rated.
X = movie_mean + np.array(ratings_p * 0)
svm = SVR(C=1, gamma=1)
pred_svm = ratings_p.copy()

# Feature column used for prediction: the mean rating of every movie
# (the first row of movie_mean).
all_movie_means = np.array(movie_mean.iloc[0]).reshape(-1, 1)

for row in range(len(ratings_p)):
    rated_means = np.array(X.iloc[row].dropna()).reshape(-1, 1)
    rated_scores = ratings_p.iloc[row].dropna()
    svm.fit(rated_means, rated_scores)
    pred_svm.iloc[row] = svm.predict(all_movie_means)

__Models From Surprise__

In [None]:
# Matrix-factorisation models.
svd_grid = SVD(n_factors=140, n_epochs=100, reg_all=0.1)
svdpp_grid = SVDpp(lr_all=0.005, reg_all=0.015)
nmf_bias = NMF(n_factors=3, n_epochs=100, biased=True, reg_bu=0.1, reg_bi=0.1)
nmf_grid = NMF(n_factors=240, n_epochs=90)

# Item-based KNN models with different baseline-estimate settings.
knnb_item_sgd = KNNBaseline(
    k=70,
    sim_options={'user_based': False},
    bsl_options={'method': 'sgd', 'n_epochs': 100},
)
knnb_item_als = KNNBaseline(
    k=60,
    sim_options={'user_based': False},
    bsl_options={'n_epochs': 100},
)

# Fit every model on the full training set, printing each one first.
l = [svd_grid, svdpp_grid, nmf_bias, nmf_grid, knnb_item_sgd, knnb_item_als]
for algo in l:
    print(algo)
    algo.fit(trainset)

- __Coverage__<br/>
 - Item Space Coverage
 - Shannon Entropy
 
- __Diversity__
 - Intra List Distance

In [None]:
def get_evaluation_model(model, model_name, top_n=10):
    '''Print Item Space Coverage, Shannon Entropy and Intra List Distance
    for the top-N recommendations of a fitted model.

    Parameters
    ----------
    model : fitted model exposing ``predict(uid, iid)`` whose result has an
        ``est`` attribute.
    model_name : str, label used in the printed report.
    top_n : int, number of items recommended to each user (default 10).

    Reads the module-level ``movies``, ``ratings_p`` and ``mask`` objects.
    Prints the three metrics and the elapsed time; returns None.
    '''
    from collections import Counter

    a = time.time()

    # Genre indicator vectors per movie, used for ILD.
    genres = movies.drop(['title','release_date','video_release_date','imdb_url'],axis = 1)
    genres = genres.set_index('movie_id')

    # Predict with the actual user/movie ids taken from the rating matrix.
    # Passing 0-based positions to predict() while labelling the result with
    # 1-based ids would shift every prediction by one user and one movie.
    user_ids = list(ratings_p.index)
    item_ids = list(ratings_p.columns)
    pred_matrix = np.empty((len(user_ids), len(item_ids)))
    for row, uid in enumerate(user_ids):
        for col, iid in enumerate(item_ids):
            pred_matrix[row, col] = model.predict(uid, iid).est
    pred = pd.DataFrame(pred_matrix, index=user_ids, columns=item_ids)

    # Keep only unrated items (mask is True there); rated ones become NaN
    # and are dropped before ranking.
    pred = pred.where(mask)

    # Top-N recommendation list per user.
    rec_lists = [
        list(scores.dropna().sort_values(ascending = False).index[:top_n])
        for _, scores in pred.iterrows()
    ]
    recommended = [item for rec_list in rec_lists for item in rec_list]

    # Item coverage: share of the catalogue that appears in any list.
    coverage = format(len(set(recommended))/len(item_ids)*100, '.2f')

    # Shannon entropy of the recommendation frequency distribution.
    total = len(recommended)
    counts = Counter(recommended)
    entropy = -sum((c/total)*np.log(c/total) for c in counts.values())
    H = format(entropy,'.4f')

    # Intra List Distance: mean cosine distance (1 - similarity) over the
    # distinct item pairs of each list, averaged over users.
    ilds = []
    for rec_list in rec_lists:
        size = len(rec_list)
        if size < 2:
            # No pair to compare in this list.
            continue
        pairs = np.triu_indices(size, 1)
        similarity = cosine_similarity(genres.loc[rec_list,:])
        ilds.append((1 - similarity[pairs]).mean())
    ILD = format(np.mean(ilds),'.4f') if ilds else 'nan'

    b = time.time()
    times = format(b-a,'.4f')
    print('The Item Coverage for model ' + model_name +  ' is: ' + str(coverage) + '%.')
    print('The Shannon Entropy for model ' + model_name +  ' is: ' + str(H) + '.')
    print('The Intra List Distance for model ' + model_name +  ' is: ' + str(ILD) + '.')
    print('Time used for ' + model_name +  ": " + str(times) + ' seconds.')
    print('\n')

In [None]:
# Report the coverage / diversity metrics of every fitted Surprise model.
for fitted_model, label in [
    (svd_grid, 'SVD after Grid'),
    (svdpp_grid, 'SVD++'),
    (nmf_bias, 'NMF with Bias'),
    (nmf_grid, 'NMF after Grid'),
    (knnb_item_sgd, 'KNN with SGD'),
    (knnb_item_als, 'KNN with ALS'),
]:
    get_evaluation_model(fitted_model, label)

In [None]:
def get_evaluation_matrix(matrix, model_name, top_n=10):
    '''Print Item Space Coverage, Shannon Entropy and Intra List Distance
    for the top-N recommendations read from a prediction matrix.

    Parameters
    ----------
    matrix : 2-D array-like of predicted scores with the same shape as
        ``ratings_p``; rows and columns are matched to ``ratings_p`` by
        position, any labels it carries are ignored.
    model_name : str, label used in the printed report.
    top_n : int, number of items recommended to each user (default 10).

    Reads the module-level ``movies``, ``ratings_p`` and ``mask`` objects.
    Prints the three metrics and the elapsed time; returns None.
    '''
    from collections import Counter

    a = time.time()

    # Genre indicator vectors per movie, used for ILD.
    genres = movies.drop(['title','release_date','video_release_date','imdb_url'],axis = 1)
    genres = genres.set_index('movie_id')

    # Relabel a fresh frame built from the raw values so the caller's
    # object is never modified, using the ids of the rating matrix rather
    # than hard-coded ranges.
    pred = pd.DataFrame(
        np.asarray(matrix),
        index=list(ratings_p.index),
        columns=list(ratings_p.columns),
    )

    # Keep only unrated items (mask is True there). Rated ones become NaN and
    # are dropped before ranking, so they cannot outrank a negative score the
    # way a 0 produced by multiplying with the mask could.
    pred = pred.where(mask)

    # Top-N recommendation list per user.
    rec_lists = [
        list(scores.dropna().sort_values(ascending = False).index[:top_n])
        for _, scores in pred.iterrows()
    ]
    recommended = [item for rec_list in rec_lists for item in rec_list]

    # Item coverage: share of the catalogue that appears in any list.
    coverage = format(len(set(recommended))/pred.shape[1]*100, '.2f')

    # Shannon entropy of the recommendation frequency distribution.
    total = len(recommended)
    counts = Counter(recommended)
    entropy = -sum((c/total)*np.log(c/total) for c in counts.values())
    H = format(entropy,'.4f')

    # Intra List Distance: mean cosine distance (1 - similarity) over the
    # distinct item pairs of each list, averaged over users.
    ilds = []
    for rec_list in rec_lists:
        size = len(rec_list)
        if size < 2:
            # No pair to compare in this list.
            continue
        pairs = np.triu_indices(size, 1)
        similarity = cosine_similarity(genres.loc[rec_list,:])
        ilds.append((1 - similarity[pairs]).mean())
    ILD = format(np.mean(ilds),'.4f') if ilds else 'nan'

    b = time.time()
    times = format(b-a,'.4f')
    print('The Item Coverage for model ' + model_name +  ' is: ' + str(coverage) + '%.')
    print('The Shannon Entropy for model ' + model_name +  ' is: ' + str(H) + '.')
    print('The Intra List Distance for model ' + model_name +  ' is: ' + str(ILD) + '.')
    print('Time used for ' + model_name +  ": " + str(times) + ' seconds.')
    print('\n')

In [None]:
# Report the coverage / diversity metrics of the three baseline predictors.
for baseline_pred, label in [
    (pred_baseline1, 'Baseline Model 1'),
    (pred_baseline2, 'Baseline Model Using Z-Score'),
    (pred_svm, 'Baseline Model Using SVR'),
]:
    get_evaluation_matrix(baseline_pred, label)

__Earlier & Separated Versions__

In [None]:
# def gen_matrix_coverage(model, r_num, model_name):
#     a = time.time()
#     pred_matrix = np.ones(ratings_p.shape)
#     for i in range(user_length):
#         for j in range(item_length):
#             score = model.predict(i, j)
#             pred_matrix[i][j] = score.est
#     pred = pred_matrix
#     rec_index = []
#     pred = pred*mask
#     for i in range(len(pred)):
#         rec_list = list(pred.iloc[i,:].sort_values(ascending = False).index[:r_num])
#         for v in rec_list:
#             if v not in rec_index:
#                 rec_index.append(v)
#     ratio = format(len(rec_index)/1682*100, '.2f')
#     b = time.time()
#     print(b-a)
#     print('The Item Coverage for model ' + model_name + ' when recommend number is ' + str(r_num) + ' is: ' + str(ratio) + '%.')

In [None]:
# def gen_matrix_shannon(model, r_num, model_name):
#     a = time.time()
#     pred_matrix = np.ones(ratings_p.shape)
#     for i in range(user_length):
#         for j in range(item_length):
#             score = model.predict(i, j)
#             pred_matrix[i][j] = score.est
#     pred = pred_matrix
#     rec_index = []
#     pred = pred*mask
#     for i in range(len(pred)):
#         rec_list = list(pred.iloc[i,:].sort_values(ascending = False).index[:r_num])
#         for v in rec_list:
#             rec_index.append(v)
#     H_list = [(rec_index.count(i)/len(rec_index))*np.log(rec_index.count(i)/len(rec_index)) for i in list(movies['movie_id']) if rec_index.count(i) != 0]
#     H = sum(H_list)*(-1)
#     b = time.time()
#     print(b-a)
#     print('The Shannon Entrophy for model ' + model_name + ' when recommend ' + str(r_num) + ' items is: ' + str(H) + '.')

In [None]:
# gen_matrix_shannon(svd_grid, 1, 'SVD after Grid')
# gen_matrix_shannon(svd_grid, 10, 'SVD after Grid')
# gen_matrix_shannon(svdpp_grid, 1, 'SVD++')
# gen_matrix_shannon(svdpp_grid, 10, 'SVD++')
# gen_matrix_shannon(nmf_bias, 1, 'NMF with Bias')
# gen_matrix_shannon(nmf_bias, 10, 'NMF with Bias')
# gen_matrix_shannon(nmf_grid, 1, 'NMF after Grid')
# gen_matrix_shannon(nmf_grid, 10, 'NMF after Grid')
# gen_matrix_shannon(knnb_item_sgd, 1, 'KNN with SGD')
# gen_matrix_shannon(knnb_item_sgd, 10, 'KNN with SGD')
# gen_matrix_shannon(knnb_item_als, 1, 'KNN with ALS')
# gen_matrix_shannon(knnb_item_als, 10, 'KNN with ALS')

In [None]:
# def get_coverage(model, r_num, model_name):
#     a = time.time()
#     '''Get the ratio of unique first r_num recommended items over all items.'''
#     rec_index = []
#     user_i_est = movies[['movie_id']]
#     for i in range(1,943):
#         user_i_est['Estimate_Score'] = user_i_est['movie_id'].apply(lambda x: model.predict(i, x).est)
#         user_i_est = user_i_est.sort_values('Estimate_Score', ascending=False)
#         for v in list(user_i_est['movie_id'])[:n_num]:
#             rec_index.append(v)
#     H_list = [(rec_index.count(i)/len(rec_index))*np.log(rec_index.count(i)/len(rec_index)) for i in list(movies['movie_id']) if rec_index.count(i) != 0]
#     H = sum(H_list)*(-1)
#     b = time.time()
#     print(b-a)
#     print('The Shannon Entrophy for model ' + model_name + ' when recommend ' + str(r_num) + ' items is: ' + str(H) + '.')

In [None]:
# def get_ILD(model, r_num, model_name):
#     a = time.time()
#     pred_matrix = np.ones(ratings_p.shape)
#     for i in range(user_length):
#         for j in range(item_length):
#             score = model.predict(i, j)
#             pred_matrix[i][j] = score.est
#     pred = pred_matrix
#     rec_index = []
#     pred = pred*mask
    
#     for i in range(len(pred)):
#         rec_list = list(pred.iloc[i,:].sort_values(ascending = False).index[:r_num])
#         rec_index.append(rec_list)
        
#     iupper = np.triu_indices(r_num,1)
#     ilds = []
#     for ui in ild_index:
#         one = cosine_similarity(genres.loc[ui,:])[iupper].sum()/(r_num*(r_num-1))
#         ilds.append(one)
#     ILD = format(np.mean(ilds),'.4f')
    
#     b = time.time()
#     print(b-a)
    
#     print('The ILD for model ' + model_name + ' when recommend number is ' + str(r_num) + ' is: ' + str(ILD) + '.')