In [1]:
# Requirements: numpy, pandas, scikit-learn, scikit-surprise, plotly

# Prevent codes being displayed when exporting to an HTML file
import IPython.core.display as di
di.display_html('<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) {jQuery(".input_area").toggle(); jQuery(".prompt").toggle();} jQuery(".highlight").show();});</script>', raw=True)

# Dynamically add a button to show/hide codes
di.display_html("<script>jQuery(function() {jQuery(function() {var b = jQuery('<input type=\"button\" value=\"Show/Hide codes\"/>'); b.click(function(){jQuery('.input_area').each(function(){jQuery(this).toggle();});}); jQuery('#References').parent().append(b);});});</script>", raw=True)

## References
- [Surprise](http://surpriselib.com/)
- [Surprise Github](https://github.com/NicolasHug/Surprise)

## Load Dataset

In [2]:
# load from file
import pandas as pd
import numpy as np
filename = 'ml-latest-small/ratings.csv'

#headers = ['userId', 'movieId', 'rating', 'timestamp']
#original data 
rating_data = pd.read_csv(filename)

## Build R Matrix

In [None]:
# extract key information
users = rating_data['userId'].values
movies = rating_data['movieId'].values
ratings = rating_data['rating'].values
movie_set = set(movies)
user_set = set(users)

movieId_map_col = {} # key is the movieId, value is column index
def create_R(users, movies, ratings):
    user_num = len(user_set)
    movie_num = len(movie_set)

    for i in range(movie_num):
        movieId_map_col[list(movie_set)[i]] = i
        
    matrix_R = np.zeros((user_num, movie_num),dtype = float)
    for i in range(len(users)):
        matrix_R[users[i]-1][movieId_map_col[movies[i]]] = ratings[i]
    return matrix_R

matrix_R = create_R(users, movies, ratings)
print(matrix_R.shape)

## Sparsity of the Movie Rating Dataset (Q1) ##

In [None]:
total_ratings = len(set(movies)) * len(set(users))
avaliable_ratings = len(ratings)
sparsity = avaliable_ratings / total_ratings
print("The Sparsicity of the Dataset:")
print(sparsity)

## Frequency of the Rating Values (Q2) ##

In [None]:
import matplotlib.pyplot as plt

plt.hist(ratings, bins = [0.5,1,1.5,2,2.5,3,3.5,4,4.5,5], align='mid')
plt.title("The Number of Each Rating Score")
plt.ylabel("Occurence of Rating Score")

plt.show()

## Distribution of Ratings Among Movies (Q3) ##

In [None]:
unique, counts = np.unique(movies, return_counts = True)
movie_count_dict = {}
for i in range(len(unique)):
    movie_count_dict[unique[i]] = counts[i]

movie_count_dict = sorted(movie_count_dict.items(), key = lambda item : item[1], reverse = True)  
plt.bar(range(len(movie_count_dict)), [movie_tuple[1] for movie_tuple in movie_count_dict], align='center')
plt.title('Frequency of Each Movie')
plt.ylabel('The Number of Movies')
plt.show()

## Distribution of Ratings Among Users (Q4) ##

In [None]:
unique, counts = np.unique(users, return_counts = True)
user_count_dict = {}
for i in range(len(unique)):
    user_count_dict[unique[i]] = counts[i]

user_count_dict = sorted(user_count_dict.items(), key = lambda item : item[1], reverse = True)   
plt.bar(range(len(user_count_dict)), [user_tuple[1] for user_tuple in user_count_dict], align='center')
plt.title('Frequency of Each User')
plt.ylabel('The Number of User')
plt.show()

## Frequency of Movie Rating Variance (Q6) ##

In [None]:
var_matrix = []  # save the variance of each movies

for i in range(matrix_R.shape[1]):
    valid = np.nonzero(matrix_R[:,i])
    var_matrix.append(np.var(matrix_R[valid,i]))


print("max variance of the rating score: %f" % max(var_matrix))
print("min variance of the rating score: %f" % min(var_matrix))

plt.hist(var_matrix, bins = [0,0.5,1,1.5,2,2.5,3,3.5,4,4.5,5], align='mid')

plt.title("Frequency of Movie Rating Variance")
plt.ylabel("The Number of movies")

plt.show()

## K-NN -- Whole Test Set (Q10)

In [3]:
# load from file
from surprise import Dataset
from surprise import Reader

file_path = './ml-latest-small/ratings.csv'

reader = Reader(line_format='user item rating timestamp', sep=',', rating_scale=(0.5, 5), skip_lines=1)

data = Dataset.load_from_file(file_path, reader=reader)
# data.raw_ratings

In [4]:
import sys
import numpy as np
from surprise import KNNWithMeans
from surprise.model_selection import cross_validate

sim_options = {'name': 'pearson', 'user_based': True}
k = range(2, 102, 2)
rmse = []
mae = []

with open('log.txt', 'w') as f:
    savedStdout = sys.stdout
    sys.stdout = f
    for i in k:
        result = cross_validate(KNNWithMeans(k=i, sim_options=sim_options), data, measures=['rmse', 'mae'], cv=10, verbose=True)
        rmse.append(np.mean(result['test_rmse']))
        mae.append(np.mean(result['test_mae']))
        print('\nk = %d\n' % i)
        print('-' * 80)
        sys.stdout.flush()
    sys.stdout = savedStdout

In [6]:
import plotly.offline as py
import plotly.graph_objs as go

py.init_notebook_mode(connected=True)

def draw_curve(x, y, name, title='', xlabel='', ylabel='', ROC=True):
    data = []
    width, height = 800, 600
    
    if type(name) != list or len(name) == 1:
        width, height = 600, 450
        trace1 = go.Scatter(x=x, y=y, 
                            mode='lines', 
                            line=dict(color='darkorange', width=2),
                            name=name
                           )
        data.append(trace1)
    else:
        for i in range(len(name)):
            trace1 = go.Scatter(x=x[i], y=y[i], 
                                mode='lines', 
                                line=dict(width=2),
                                name=name[i]
                               )
            data.append(trace1)
    
    if ROC:
        title = 'Receiver Operating Characteristic'
        xlabel = 'False Positive Rate'
        ylabel = 'True Positive Rate' 
        trace2 = go.Scatter(x=[0, 1], y=[0, 1], 
                            mode='lines', 
                            line=dict(color='navy', width=2, dash='dash'),
                            showlegend=False)
        data.append(trace2)
    
    layout = go.Layout(title=title,
                       autosize=False,
                       width=width,
                       height=height,
                       xaxis=dict(title=xlabel, ticks='outside', mirror=True, linewidth=1),
                       yaxis=dict(title=ylabel, ticks='outside', mirror=True, linewidth=1),
                       legend=dict(x=.5, y=.2, bordercolor='#D3D3D3', borderwidth=1))
    
    if ROC:
        layout.update(yaxis=dict(title=ylabel, ticks='outside', mirror=True, linewidth=1, range=[0, 1.05]))
    
    fig = go.Figure(data=data, layout=layout)
    py.iplot(fig)

In [4]:
draw_curve([k,k], [rmse,mae], ['RMSE','MAE'], 'K-NN (Whole Test Set)', 'K', 'Measures', ROC=False)

## K-NN -- Trimmed Test Set

In [None]:
from surprise import accuracy
from surprise.model_selection import KFold

kf = KFold(n_splits=10)
k = range(2, 102, 2)

rates = {}
for row in data.raw_ratings:
    if row[1] not in rates:
        rates[row[1]] = []
    rates[row[1]].append(row[2])

var = {}
for key in rates:
    var[key] = np.var(rates[key])

### Popular movie - more than 2 ratings (Q12)

In [7]:
pop_movie = [x for x in rates if len(rates[x]) > 2]
rmse = []

with open('log.txt', 'w') as f:
    savedStdout = sys.stdout
    sys.stdout = f
    for i in k:
        algo = KNNWithMeans(k=i, sim_options=sim_options)
        score = []
        for trainset, testset in kf.split(data):
            algo.fit(trainset)
            trimset = [x for x in testset if x[1] in pop_movie]
            predictions = algo.test(trimset)
            score.append(accuracy.rmse(predictions, verbose=True))
        rmse.append(sum(score) / len(score))
        print('\nk = %d, Average RMSE = %.4f\n' % (i, sum(score) / len(score)))
        print('-' * 80)
        sys.stdout.flush()
    sys.stdout = savedStdout

draw_curve(k, rmse, '', 'K-NN (Popular Movie Trimming)', 'K', 'RMSE', ROC=False)
print('Minimum RMSE: %.4f' % min(rmse))

Minimum RMSE: 0.8991


### Unpopular movie - no more than 2 ratings (Q13)

In [None]:
unpop_movie = [x for x in rates if len(rates[x]) <= 2]
rmse = []

with open('log.txt', 'w') as f:
    savedStdout = sys.stdout
    sys.stdout = f
    for i in k:
        algo = KNNWithMeans(k=i, sim_options=sim_options)
        score = []
        for trainset, testset in kf.split(data):
            algo.fit(trainset)
            trimset = [x for x in testset if x[1] in unpop_movie]
            predictions = algo.test(trimset)
            score.append(accuracy.rmse(predictions, verbose=True))
        rmse.append(sum(score) / len(score))
        print('\nk = %d, Average RMSE = %.4f\n' % (i, sum(score) / len(score)))
        print('-' * 80)
        sys.stdout.flush()
    sys.stdout = savedStdout

draw_curve(k, rmse, '', 'K-NN (Unpopular Movie Trimming)', 'K', 'RMSE', ROC=False)
print('Minimum RMSE: %.4f' % min(rmse))

### High variance movie - variance >= 2 and ratings >= 5 (Q14)

In [None]:
highvar_movie = [x for x in rates if len(rates[x]) >= 5 and var[x] >= 2]
rmse = []

with open('log.txt', 'w') as f:
    savedStdout = sys.stdout
    sys.stdout = f
    for i in k:
        algo = KNNWithMeans(k=i, sim_options=sim_options)
        score = []
        for trainset, testset in kf.split(data):
            algo.fit(trainset)
            trimset = [x for x in testset if x[1] in highvar_movie]
            predictions = algo.test(trimset)
            score.append(accuracy.rmse(predictions, verbose=True))
        rmse.append(sum(score) / len(score))
        print('\nk = %d, Average RMSE = %.4f\n' % (i, sum(score) / len(score)))
        print('-' * 80)
        sys.stdout.flush()
    sys.stdout = savedStdout

draw_curve(k, rmse, '', 'K-NN (High Variance Movie Trimming)', 'K', 'RMSE', ROC=False)
print('Minimum RMSE: %.4f' % min(rmse))

## ROC (Q15)

In [7]:
import pandas as pd
from surprise.model_selection import train_test_split
from collections import namedtuple
from sklearn import metrics

k_best = 20
threshold = [2.5, 3, 3.5, 4]
kNN_threshold_3_fpr = None
kNN_threshold_3_tpr = None
with open('log.txt', 'w') as f:
    savedStdout = sys.stdout
    for theta in threshold:
        # load using pandas
        df = pd.read_csv('./ml-latest-small/ratings.csv')
        # df['rating'] = df['rating'].apply(lambda x: 0 if x < theta else 1)
        reader = Reader(rating_scale=(0.5, 5))
        data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)

        # split into train set and test set
        trainset, testset = train_test_split(data, test_size=.1)

        # fit and test
        sys.stdout = f
        algo = KNNWithMeans(k=k_best, sim_options=sim_options)
        algo.fit(trainset)
        predictions = algo.test(testset)
        sys.stdout.flush()
        sys.stdout = savedStdout

        trues = [0 if getattr(row, 'r_ui') < theta else 1 for row in predictions]
        scores = [getattr(row, 'est') for row in predictions]
        fpr, tpr, thresholds = metrics.roc_curve(trues, scores)
        if theta == 3:
            kNN_threshold_3_fpr = fpr
            kNN_threshold_3_tpr = tpr
        roc_auc = metrics.auc(fpr, tpr)

        name = 'K-NN θ=%.1f (area = %0.2f)' % (theta, roc_auc)
        draw_curve(fpr, tpr, name, ROC=True)

## NNMF (Q17-29)

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from surprise.prediction_algorithms import NMF
from surprise.prediction_algorithms import SVD
from surprise import Reader, Dataset
from surprise.model_selection import KFold
from surprise.model_selection import cross_validate
from surprise import accuracy
df = pd.read_csv('ml-latest-small/ratings.csv', names = ['userId', 'itemId', 'rating', 'timestamp'],header=0)
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(df[['userId', 'itemId', 'rating']], reader)
kf = KFold(n_splits=10)

## NNMF on original dataset (Q17-18)

In [None]:
rmse = []
mse = []
mink_mse = 0
minmse = 10
minrmse = 10
mink_rmse = 0
for i in range(2, 52, 2):
    result = cross_validate(NMF(n_factors=i, biased=False), data, measures=['rmse', 'mae'], cv=10, verbose=False)
    # Compute and print Root Mean Squared Error
    ave_mse = np.mean(result['test_mae'])
    ave_rmse = np.mean(result['test_rmse'])
    if minmse > ave_mse:
        mink_mse = i
        minmse = ave_mse

    if minrmse > ave_rmse:
        mink_rmse = i
        minrmse = ave_rmse

    print "For k = ", i
    print "rmse = ", ave_rmse
    print "mse = ", ave_mse
    rmse.append(ave_mse)
    mse.append(ave_rmse)

print "minmse happened at k = :", mink_mse
print "minrmse happened at k = :", mink_rmse
krange = range(2, 52, 2)
plt.plot(krange, mse)
plt.plot(krange, rmse)
plt.title("NNMF without bias for k in range 2-50")
plt.show()

## NNMF on popular movie trimmed test set (Q19)

In [10]:
rmse = []
mse = []
file_path = './ml-latest-small/ratings.csv'
reader = Reader(line_format='user item rating timestamp', sep=',', rating_scale=(0.5, 5), skip_lines=1)
data = Dataset.load_from_file(file_path, reader=reader)
k = range(2, 52, 2)

rates = {}
for row in data.raw_ratings:
    if row[1] not in rates:
        rates[row[1]] = []
    rates[row[1]].append(row[2])

var = {}
for key in rates:
    var[key] = np.var(rates[key])


pop_movie = [x for x in rates if len(rates[x]) > 2]
for i in k:
    algo = NMF(n_factors=i, biased=False)
    score = []
    for trainset, testset in kf.split(data):
        algo.fit(trainset)
        trimset = [x for x in testset if x[1] in pop_movie]
        predictions = algo.test(trimset)
        score.append(accuracy.rmse(predictions, verbose=True))
    rmse.append(sum(score) / len(score))
    print('\nPopular: k = %d, Average RMSE = %.4f\n' % (i, sum(score) / len(score)))

draw_curve(k, rmse, '', 'NNMF (Popular Movie Trimming)', 'K', 'RMSE', ROC=False)
print('Minimum RMSE: %.4f' % min(rmse))

RMSE: 1.1794
RMSE: 1.1715
RMSE: 1.1687
RMSE: 1.1807
RMSE: 1.1754
RMSE: 1.1677
RMSE: 1.1610
RMSE: 1.1791
RMSE: 1.1738
RMSE: 1.1647

Popular: k = 2, Average RMSE = 1.1722

RMSE: 1.0615
RMSE: 1.0477
RMSE: 1.0586
RMSE: 1.0578
RMSE: 1.0647
RMSE: 1.0533
RMSE: 1.0451
RMSE: 1.0598
RMSE: 1.0640
RMSE: 1.0535

Popular: k = 4, Average RMSE = 1.0566

RMSE: 0.9910
RMSE: 0.9975
RMSE: 1.0032
RMSE: 0.9867
RMSE: 0.9962
RMSE: 1.0034
RMSE: 0.9942
RMSE: 0.9955
RMSE: 0.9861
RMSE: 0.9951

Popular: k = 6, Average RMSE = 0.9949

RMSE: 0.9653
RMSE: 0.9578
RMSE: 0.9643
RMSE: 0.9537
RMSE: 0.9603
RMSE: 0.9591
RMSE: 0.9604
RMSE: 0.9647
RMSE: 0.9447
RMSE: 0.9670

Popular: k = 8, Average RMSE = 0.9597

RMSE: 0.9427
RMSE: 0.9361
RMSE: 0.9391
RMSE: 0.9455
RMSE: 0.9295
RMSE: 0.9366
RMSE: 0.9285
RMSE: 0.9453
RMSE: 0.9407
RMSE: 0.9376

Popular: k = 10, Average RMSE = 0.9381

RMSE: 0.9235
RMSE: 0.9213
RMSE: 0.9324
RMSE: 0.9313
RMSE: 0.9327
RMSE: 0.9194
RMSE: 0.9257
RMSE: 0.9292
RMSE: 0.9241
RMSE: 0.9204

Popular: k = 12, A

Minimum RMSE: 0.9168


## NNMF on unpopular trimmed test set (Q20)

In [12]:
unpop_movie = [x for x in rates if len(rates[x]) <= 2]
rmse = []
for i in k:
    algo = NMF(n_factors=i, biased=False)
    score = []
    for trainset, testset in kf.split(data):
        algo.fit(trainset)
        trimset = [x for x in testset if x[1] in unpop_movie]
        predictions = algo.test(trimset)
        score.append(accuracy.rmse(predictions, verbose=True))
    rmse.append(sum(score) / len(score))
    print('\nUnPopular: k = %d, Average RMSE = %.4f\n' % (i, sum(score) / len(score)))

draw_curve(k, rmse, '', 'NNMF (UnPopular Movie Trimming)', 'K', 'RMSE', ROC=False)
print('Minimum RMSE: %.4f' % min(rmse))

RMSE: 1.2848
RMSE: 1.2636
RMSE: 1.2067
RMSE: 1.2919
RMSE: 1.2414
RMSE: 1.4066
RMSE: 1.3425
RMSE: 1.3188
RMSE: 1.2575
RMSE: 1.3079

UnPopular: k = 2, Average RMSE = 1.2922

RMSE: 1.2696
RMSE: 1.3413
RMSE: 1.2604
RMSE: 1.2646
RMSE: 1.2870
RMSE: 1.2698
RMSE: 1.2810
RMSE: 1.3342
RMSE: 1.1803
RMSE: 1.2413

UnPopular: k = 4, Average RMSE = 1.2729

RMSE: 1.2799
RMSE: 1.2392
RMSE: 1.2832
RMSE: 1.3325
RMSE: 1.2825
RMSE: 1.2753
RMSE: 1.2495
RMSE: 1.3525
RMSE: 1.2267
RMSE: 1.2431

UnPopular: k = 6, Average RMSE = 1.2765

RMSE: 1.2538
RMSE: 1.2575
RMSE: 1.2802
RMSE: 1.3038
RMSE: 1.2277
RMSE: 1.2314
RMSE: 1.2771
RMSE: 1.2537
RMSE: 1.2805
RMSE: 1.2597

UnPopular: k = 8, Average RMSE = 1.2625

RMSE: 1.2758
RMSE: 1.2529
RMSE: 1.2412
RMSE: 1.2930
RMSE: 1.2542
RMSE: 1.2994
RMSE: 1.2894
RMSE: 1.1950
RMSE: 1.2625
RMSE: 1.2569

UnPopular: k = 10, Average RMSE = 1.2620

RMSE: 1.2918
RMSE: 1.2647
RMSE: 1.2646
RMSE: 1.2757
RMSE: 1.1999
RMSE: 1.2595
RMSE: 1.2520
RMSE: 1.2345
RMSE: 1.2697
RMSE: 1.2637

UnPopula

Minimum RMSE: 0.9168


## NNMF on high variance trimmed test set (Q21)

In [None]:
rmse = []
highvar_movie = [x for x in rates if len(rates[x]) >= 5 and var[x] >= 2]
for i in k:
    algo = NMF(n_factors=i, biased=False)
    score = []
    for trainset, testset in kf.split(data):
        algo.fit(trainset)
        trimset = [x for x in testset if x[1] in highvar_movie]
        predictions = algo.test(trimset)
        score.append(accuracy.rmse(predictions, verbose=True))
    rmse.append(sum(score) / len(score))
    print('\nHigh Variance: k = %d, Average RMSE = %.4f\n' % (i, sum(score) / len(score)))

draw_curve(k, rmse, '', 'NNMF (High variance Movie Trimming)', 'K', 'RMSE', ROC=False)
print('Minimum RMSE: %.4f' % min(rmse))

## Roc curve for NNMF at best latent factor in Q17 (Q22)

In [10]:
from surprise.model_selection import train_test_split
from collections import namedtuple
from sklearn import metrics
bestLF = 20
thresholds = [2.5, 3, 3.5, 4]
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(df[['userId', 'itemId', 'rating']], reader)
NNMF_threshold_3_fpr = None
NNMF_threshold_3_tpr = None
for threshold in thresholds:
    train_set, test_set = train_test_split(data, test_size=0.1)
    algo = NMF(n_factors=bestLF, biased=False)
    algo.fit(train_set)
    predictions = algo.test(test_set)
    trues = [0 if getattr(row, 'r_ui') < threshold else 1 for row in predictions]
    scores = [getattr(row, 'est') for row in predictions]
    fpr, tpr, _ = metrics.roc_curve(trues, scores)
    if threshold == 3:
        NNMF_threshold_3_fpr = fpr
        NNMF_threshold_3_tpr = tpr
    roc_auc = metrics.auc(fpr, tpr)
    name = 'NNMF theta=%.1f (area = %0.2f)' % (threshold, roc_auc)
    draw_curve(fpr, tpr, name, ROC=True)

## Q23

In [14]:
df2 = pd.read_csv('ml-latest-small/movies.csv', names = ['movieid', 'title', 'genres'],header=0)
train_set, test_set = train_test_split(data, test_size=0.1)
algo = NMF(n_factors=20, biased=False)
algo.fit(train_set)
V = algo.qi
for k in range(0,21):
    kMovies = V[:,k]
    moviesTuple = []
    for i in range(len(kMovies)):
        moviesTuple.append((i, kMovies[i]))
    moviesTuple.sort(key = lambda x:x[1], reverse=True)
    print "for k = ", k
    for mt in moviesTuple[0:10]:
        print df2['genres'][mt[0]]

for k =  0
Comedy|Drama
Drama
Drama|Mystery|Thriller
Thriller
Comedy|Romance
Crime|Drama|Thriller
Crime|Drama|Romance
Action|Adventure|Crime|Drama
Drama|Romance|Thriller
Adventure|Animation|Children|Drama
for k =  1
Comedy|Drama|Romance
Film-Noir
Comedy|Romance
Comedy|Drama
Comedy|Drama
Documentary|War
Comedy|Drama|Horror
Drama|War
Drama|Horror|Sci-Fi
Adventure|Animation|Comedy
for k =  2
Adventure|Animation|Fantasy|Musical|Sci-Fi
Comedy|Romance
Action|Adventure|Sci-Fi
Adventure|Animation|Children|Fantasy
Comedy|Crime|Romance
Adventure|Animation|Children|Fantasy
Action|Comedy
Animation|Children|Musical
Drama
Comedy
for k =  3
Comedy|Romance
Comedy|Drama|Romance
Drama|Romance
Horror
Comedy|Drama
Comedy|Fantasy|Romance
Drama|Horror|Thriller
Horror|Thriller
Comedy
Comedy|Horror|Sci-Fi
for k =  4
Drama|Mystery|Romance|Sci-Fi|Thriller
Drama|Thriller
Drama|War
Horror|Thriller
Drama|Mystery|Romance|Thriller
Action|Drama|Thriller
Adventure|Drama|Romance
Action|Sci-Fi|Thriller|IMAX
Drama
Action

## MF with bias (Q24-25)

In [20]:
rmse = []
mse = []
mink_mse = 0
minmse = 10
minrmse = 10
mink_rmse = 0
for i in range(2, 52, 2):
    result = cross_validate(SVD(n_factors=i, biased=True), data, measures=['rmse', 'mae'], cv=10, verbose=False)
    # Compute and print Root Mean Squared Error
    ave_mse = np.mean(result['test_mae'])
    ave_rmse = np.mean(result['test_rmse'])
    if minmse > ave_mse:
        mink_mse = i
        minmse = ave_mse

    if minrmse > ave_rmse:
        mink_rmse = i
        minrmse = ave_rmse

    print "For k = ", i
    print "rmse = ", ave_rmse
    print "mse = ", ave_mse
    rmse.append(ave_mse)
    mse.append(ave_rmse)

print "minmse happened at k = ", mink_mse
print "minrmse happened at k = ", mink_rmse
krange = range(2, 52, 2)
plt.plot(krange, mse)
plt.plot(krange, rmse)
plt.title("MF with bias for k in range 2-50")
plt.show()

For k =  2
rmse =  0.8878113597591579
mse =  0.6835179902679415
For k =  4
rmse =  0.8881977157133504
mse =  0.6845516942392382
For k =  6
rmse =  0.8869491526996509
mse =  0.682724131929329
For k =  8
rmse =  0.887171989949708
mse =  0.6828685919372814
For k =  10
rmse =  0.8870114559431534
mse =  0.6821802063987203
For k =  12
rmse =  0.8872678659335456
mse =  0.6829246489370487
For k =  14
rmse =  0.8873146645566997
mse =  0.6832631889070131
For k =  16
rmse =  0.8880469617197528
mse =  0.6833164682690153
For k =  18
rmse =  0.8867410262520987
mse =  0.6825985442373679
For k =  20
rmse =  0.8875363717769295
mse =  0.6830210604840878
For k =  22
rmse =  0.8873158738321736
mse =  0.6833315933704489
For k =  24
rmse =  0.8888710254059664
mse =  0.6843418139364037
For k =  26
rmse =  0.8869535906415299
mse =  0.6821766478462664
For k =  28
rmse =  0.8868963453491421
mse =  0.6821289453384127
For k =  30
rmse =  0.8870863328294941
mse =  0.6829346519434161
For k =  32
rmse =  0.888575850

## MF with bias on popular trimmed test set (Q26)

In [13]:
rmse = []
for i in krange:
    algo = SVD(n_factors=i, biased=True)
    score = []
    for trainset, testset in kf.split(data):
        algo.fit(trainset)
        trimset = [x for x in testset if x[1] in pop_movie]
        predictions = algo.test(trimset)
        score.append(accuracy.rmse(predictions, verbose=True))
    rmse.append(sum(score) / len(score))
    print('\nPopular: k = %d, Average RMSE = %.4f\n' % (i, sum(score) / len(score)))

draw_curve(krange, rmse, '', 'MF (Popular Movie Trimming)', 'K', 'RMSE', ROC=False)
print('Minimum RMSE: %.4f' % min(rmse))

NameError: name 'krange' is not defined

## MF with bias on unpopular trimmed test set (Q27)

In [None]:
rmse = []
for i in krange:
    algo = SVD(n_factors=i, biased=True)
    score = []
    for trainset, testset in kf.split(data):
        algo.fit(trainset)
        trimset = [x for x in testset if x[1] in unpop_movie]
        predictions = algo.test(trimset)
        score.append(accuracy.rmse(predictions, verbose=True))
    rmse.append(sum(score) / len(score))
    print('\nUnPopular: k = %d, Average RMSE = %.4f\n' % (i, sum(score) / len(score)))

draw_curve(krange, rmse, '', 'MF (UnPopular Movie Trimming)', 'K', 'RMSE', ROC=False)
print('Minimum RMSE: %.4f' % min(rmse))

## MF with bias on high variance trimmed test set (Q28)

In [None]:
rmse = []
for i in krange:
    algo = SVD(n_factors=i, biased=True)
    score = []
    for trainset, testset in kf.split(data):
        algo.fit(trainset)
        trimset = [x for x in testset if x[1] in highvar_movie]
        predictions = algo.test(trimset)
        score.append(accuracy.rmse(predictions, verbose=True))
    rmse.append(sum(score) / len(score))
    print('\nHigh Variance: k = %d, Average RMSE = %.4f\n' % (i, sum(score) / len(score)))

draw_curve(krange, rmse, '', 'MF (High variance Movie Trimming)', 'K', 'RMSE', ROC=False)
print('Minimum RMSE: %.4f' % min(rmse))

## Roc curve for MF with bias at best latent factor in Q24 (Q29)

In [13]:
from surprise.model_selection import train_test_split
from sklearn import metrics
bestLF = 16
thresholds = [2.5, 3, 3.5, 4]
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(df[['userId', 'itemId', 'rating']], reader)
MF_threshold_3_fpr = None
MF_threshold_3_tpr = None
for threshold in thresholds:
    train_set, test_set = train_test_split(data, test_size=0.1)
    algo = SVD(n_factors=bestLF, biased=True)
    algo.fit(train_set)
    predictions = algo.test(test_set)
    trues = [0 if getattr(row, 'r_ui') < threshold else 1 for row in predictions]
    scores = [getattr(row, 'est') for row in predictions]
    fpr, tpr, _ = metrics.roc_curve(trues, scores)
    if threshold == 3:
        MF_threshold_3_fpr = fpr
        MF_threshold_3_tpr = tpr
    roc_auc = metrics.auc(fpr, tpr)
    name = 'Mf theta=%.1f (area = %0.2f)' % (threshold, roc_auc)
    draw_curve(fpr, tpr, name, ROC=True)

## Naive Collaborative Filter Results in Original Data (Q30)

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.model_selection import KFold

pred_matrix = [] # save the prediction values(the mean of each users rating)

for row in matrix_R:
    valid = np.nonzero(row)
    pred_matrix.append(np.mean(row[valid]))

pred_RMSE1 = [] # RMSE in each fold

def get_test_result(pred_RMSE,dataset):
    kf = KFold(n_splits = 10, random_state = None, shuffle = False)
    for train_test, test_index in kf.split(dataset):
        pred_results = []
        gt_results = dataset.loc[test_index]['rating'].as_matrix()
        for index in test_index:
            test_userid = int(dataset.loc[index]['userId']) - 1
            pred_results.append(float(pred_matrix[test_userid]))
        pred_RMSE.append(sqrt(mean_squared_error(gt_results,pred_results)))
        avg_RMSE = np.mean(pred_RMSE)
    return avg_RMSE

results = get_test_result(pred_RMSE1,rating_data)
print("The Average RMSE for Original Test Set:")
print(results)

## Naive Collaborative Filter Results in Popular Movie Trimmed Test Set (Q31)

In [None]:
rare_movies = [] # save unpopular movies
movies = list(movies)
for movie in movie_set:
    if(movies.count(movie) <= 2):
        rare_movies.append(movie)

pop_rating_data = pd.DataFrame(columns = rating_data.columns)     
for index in rating_data.index:
    if(rating_data.loc[index]['movieId'] not in rare_movies):
        pop_rating_data.loc[index] = rating_data.loc[index]

pop_rating_data.index = range(len(pop_rating_data)) # reset index from 0-len(pop_rating_data)

pred_RMSE2 = []    
results = get_test_result(pred_RMSE2, pop_rating_data)
print("The Average RMSE for Popular Movie Trimmed Test Set:")
print(results)

## Naive Collaborative Filter Results in Unpopular Movie Trimmed Test Set (Q32) ##

In [None]:
unpop_rating_data = pd.DataFrame(columns = rating_data.columns)     
for index in rating_data.index:
    if(rating_data.loc[index]['movieId'] in rare_movies):
        unpop_rating_data.loc[index] = rating_data.loc[index]
unpop_rating_data.index = range(len(unpop_rating_data)) # reset index from 0-len(pop_rating_data)

pred_RMSE3 = []    
results = get_test_result(pred_RMSE3, unpop_rating_data)
print("The Average RMSE for Unpopular Movie Trimmed Test Set:")
print(results)

## Naive Collaborative Filter Results in High Variance Movie Trimmed Test Set (Q33) ##

In [None]:
filtered_movies = []
movies = list(movies)
for movie in movie_set:
    if(movies.count(movie) >= 5 and var_matrix[movieId_map_col[movie]] >= 2):
        filtered_movies.append(movie)

highvar_rating_data = pd.DataFrame(columns = rating_data.columns)     
for index in rating_data.index:
    if(rating_data.loc[index]['movieId'] in filtered_movies) :
        highvar_rating_data .loc[index] = rating_data.loc[index]
highvar_rating_data .index = range(len(highvar_rating_data )) 

pred_RMSE4 = []    
results = get_test_result(pred_RMSE4, highvar_rating_data)
print("The Average RMSE for High Variance Movie Trimmed Test Set:")
print(results)

## Performance comparison (Q34)

In [12]:
draw_curve([kNN_threshold_3_fpr, NNMF_threshold_3_fpr, MF_threshold_3_fpr], [kNN_threshold_3_tpr, NNMF_threshold_3_tpr, MF_threshold_3_tpr], ['kNN', 'NNMF', 'MF'], ROC=True)

## Ranking (Q35-39)

In [8]:
import pandas as pd
import numpy as np
from surprise import Dataset
from surprise import Reader
from surprise import KNNWithMeans
from surprise.prediction_algorithms import NMF
from surprise.prediction_algorithms import SVD
from surprise.model_selection import KFold
import sys
sim_options = {'name': 'pearson', 'user_based': True}
bestKNN_k = 20
bestNNMF_k = 20
bestMF_k = 16
threshold = 3
df = pd.read_csv('./ml-latest-small/ratings.csv')
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)

def plotRankingEvaluation(algo, data, threshold=3):
    kf = KFold(n_splits=10)
    averagePrecision_over_t = []
    averageRecall_over_t = []
    for t in range(1,26):
        averagePrecisions_eachFold = []
        averageRecalls_eachFold = []
        for trainset, testset in kf.split(data):
            algo.fit(trainset)
            predictions = algo.test(testset)
            userMovieRecomm = {}
            for eachData in predictions:
                if userMovieRecomm.get(eachData[0]) is None:
                    userMovieRecomm[eachData[0]] = []
                    userMovieRecomm[eachData[0]].append((eachData[1], eachData[2]))
                else:
                    userMovieRecomm[eachData[0]].append((eachData[1], eachData[2]))
            userMovieLike = {}
            userMovieRating = {}
            for eachData in testset:
                if userMovieLike.get(eachData[0]) is None:
                    userMovieLike[eachData[0]] = []
                    userMovieRating[eachData[0]] = 1
                    if eachData[2] >= threshold :
                        userMovieLike[eachData[0]].append(eachData[1])
                else:
                    userMovieRating[eachData[0]] += 1
                    if eachData[2] >= threshold :
                        userMovieLike[eachData[0]].append(eachData[1])
            keysToPop = []
            for key, value in userMovieRating.iteritems():
                if value < t:
                    keysToPop.append(key)
            for removeKey in keysToPop:
                userMovieLike.pop(removeKey, None)
                userMovieRecomm.pop(removeKey, None)
            precisions = []
            recalls = []
            for key, value in userMovieLike.iteritems():
                G = set(value)
                if len(G) != 0:
                    recommList = sorted(userMovieRecomm[key], key = lambda x:x[1], reverse=True)
                    recommList = recommList[0:t]
                    recommList = set([r[0] for r in recommList])
                    precisions.append(float(len(recommList.intersection(G)))/float(len(recommList)))
                    recalls.append(float(len(recommList.intersection(G)))/float(len(G)))
            averagePrecisions_eachFold.append(sum(precisions)/float(len(precisions)))
            averageRecalls_eachFold.append(sum(recalls)/float(len(recalls)))
        averagePrecision_over_t.append(sum(averagePrecisions_eachFold)/float(len(averagePrecisions_eachFold)))
        averageRecall_over_t.append(sum(averageRecalls_eachFold)/float(len(averageRecalls_eachFold)))
    return averagePrecision_over_t, averageRecall_over_t

with open('log.txt', 'w') as f:
    savedStdout = sys.stdout
    sys.stdout = f
    kNN_precision_over_t, kNN_recall_over_t = plotRankingEvaluation(KNNWithMeans(k=bestKNN_k, sim_options=sim_options), data=data, threshold=3)
    sys.stdout.flush()
    sys.stdout = savedStdout

NNMF_precision_over_t, NNMF_recall_over_t = plotRankingEvaluation(NMF(n_factors=bestNNMF_k, biased=False), data=data, threshold=3)
MF_precision_over_t, MF_recall_over_t = plotRankingEvaluation(SVD(n_factors=bestMF_k, biased=True), data=data, threshold=3)

draw_curve(range(1,26), kNN_precision_over_t, 'Precision', 'kNN Precision against t', 't', 'Measures', ROC=False)
draw_curve(range(1,26), kNN_recall_over_t, 'Recall', 'kNN Recall against t', 't', 'Measures', ROC=False)
draw_curve(kNN_precision_over_t, kNN_recall_over_t, 'Precision over Recall', 'kNN Precision against Recall', 'Precision', 'Recall', ROC=False)

draw_curve(range(1,26), NNMF_precision_over_t, 'Precision', 'NNMF Precision against t', 't', 'Measures', ROC=False)
draw_curve(range(1,26), NNMF_recall_over_t, 'Recall', 'NNMF Recall against t', 't', 'Measures', ROC=False)
draw_curve(NNMF_precision_over_t, NNMF_recall_over_t, 'Precision over Recall', 'NNMF Precision against Recall', 'Precision', 'Recall', ROC=False)

draw_curve(range(1,26), MF_precision_over_t, 'Precision', 'MF Precision against t', 't', 'Measures', ROC=False)
draw_curve(range(1,26), MF_recall_over_t, 'Recall', 'MF Recall against t', 't', 'Measures', ROC=False)
draw_curve(MF_precision_over_t, MF_recall_over_t, 'Precision over Recall', 'MF Precision against Recall', 'Precision', 'Recall', ROC=False)

[1.0]
[0.24483053873279995]
[1.0, 0.9837721379553079]
[0.24483053873279995, 0.3823150080976087]
[1.0, 0.9837721379553079, 0.9770274726420272]
[0.24483053873279995, 0.3823150080976087, 0.45391437119775935]
[1.0, 0.9837721379553079, 0.9770274726420272, 0.9757540631161316]
[0.24483053873279995, 0.3823150080976087, 0.45391437119775935, 0.4930800817419348]
[1.0, 0.9837721379553079, 0.9770274726420272, 0.9757540631161316, 0.97815670711833]
[0.24483053873279995, 0.3823150080976087, 0.45391437119775935, 0.4930800817419348, 0.5156729517249913]
[1.0, 0.9837721379553079, 0.9770274726420272, 0.9757540631161316, 0.97815670711833, 0.9784674914627296]
[0.24483053873279995, 0.3823150080976087, 0.45391437119775935, 0.4930800817419348, 0.5156729517249913, 0.5381964094291518]
[1.0, 0.9837721379553079, 0.9770274726420272, 0.9757540631161316, 0.97815670711833, 0.9784674914627296, 0.9809859484584035]
[0.24483053873279995, 0.3823150080976087, 0.45391437119775935, 0.4930800817419348, 0.5156729517249913, 0.538

[1.0, 0.9837721379553079, 0.9770274726420272, 0.9757540631161316, 0.97815670711833, 0.9784674914627296, 0.9809859484584035, 0.9787203940231924, 0.9793163400220293, 0.9756153105616268, 0.977089957754021, 0.9765524340433757, 0.97668009386276, 0.9750637852466266, 0.9741508565984647, 0.9730649276079669, 0.9719770793727254, 0.973060182823588, 0.9721297115762926, 0.9744744635075326, 0.9720720487743831, 0.9726191222909835]
[0.24483053873279995, 0.3823150080976087, 0.45391437119775935, 0.4930800817419348, 0.5156729517249913, 0.5381964094291518, 0.5537852113983422, 0.5726471802952922, 0.5804700426842779, 0.5984230303855564, 0.6047149614572808, 0.6175446844946876, 0.6294763021045536, 0.6434432903220707, 0.6461942555026002, 0.6596896291111889, 0.664944517080226, 0.6689290386905548, 0.6815779263964716, 0.6846827401000525, 0.6923954427686846, 0.703476571717123]
[1.0, 0.9837721379553079, 0.9770274726420272, 0.9757540631161316, 0.97815670711833, 0.9784674914627296, 0.9809859484584035, 0.9787203940231

[1.0, 0.9838223973878438, 0.9772993415111184, 0.9765586426672972, 0.9762773974683681, 0.9779467587279784, 0.9785643573388143, 0.9792039433017463, 0.9780925847908442, 0.9758801249748489, 0.9776106174206708, 0.9764874700990482, 0.9747511928997028, 0.9740159892182977, 0.9752703922234801, 0.9715484728909033]
[0.24955244521011202, 0.3804547662048766, 0.45557782697193383, 0.49426873804482946, 0.5188695583831351, 0.5359432163039421, 0.5541915884411257, 0.5662454436891282, 0.5884620307718968, 0.5987298649724504, 0.608818237730633, 0.6205906688158331, 0.6286076874368867, 0.6413006557275958, 0.648195417628474, 0.6566814812494578]
[1.0, 0.9838223973878438, 0.9772993415111184, 0.9765586426672972, 0.9762773974683681, 0.9779467587279784, 0.9785643573388143, 0.9792039433017463, 0.9780925847908442, 0.9758801249748489, 0.9776106174206708, 0.9764874700990482, 0.9747511928997028, 0.9740159892182977, 0.9752703922234801, 0.9715484728909033, 0.9709153594260774]
[0.24955244521011202, 0.3804547662048766, 0.45

In [12]:
draw_curve(kNN_precision_over_t, kNN_recall_over_t, 'Precision over Recall', 'kNN Precision against Recall', 'Precision', 'Recall', ROC=False)
draw_curve(NNMF_precision_over_t, NNMF_recall_over_t, 'Precision over Recall', 'NNMF Precision against Recall', 'Precision', 'Recall', ROC=False)
draw_curve(MF_precision_over_t, MF_recall_over_t, 'Precision over Recall', 'MF Precision against Recall', 'Precision', 'Recall', ROC=False)

In [10]:
draw_curve([kNN_precision_over_t, NNMF_precision_over_t, MF_precision_over_t], [kNN_recall_over_t, NNMF_recall_over_t, MF_recall_over_t], ['kNN', 'NNMF', 'MF'], 'Precision against Recall Comparison', 'Precision', 'Recall', ROC=False)