In [1]:
# Requirements: numpy, pandas, scikit-learn, scikit-surprise, plotly

# Hide code cells in exported HTML: when the page is not the live notebook app
# (no `body.notebook_app` element), toggle the input areas and prompts, then
# make sure `.highlight` elements are shown.
import IPython.core.display as di

HIDE_INPUT_JS = '<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) {jQuery(".input_area").toggle(); jQuery(".prompt").toggle();} jQuery(".highlight").show();});</script>'
di.display_html(HIDE_INPUT_JS, raw=True)

# Append a "Show/Hide codes" button next to the element with id `References`;
# clicking it toggles the visibility of every `.input_area`.
TOGGLE_BUTTON_JS = "<script>jQuery(function() {jQuery(function() {var b = jQuery('<input type=\"button\" value=\"Show/Hide codes\"/>'); b.click(function(){jQuery('.input_area').each(function(){jQuery(this).toggle();});}); jQuery('#References').parent().append(b);});});</script>"
di.display_html(TOGGLE_BUTTON_JS, raw=True)

## References
- [Surprise](http://surpriselib.com/)
- [Surprise GitHub](https://github.com/NicolasHug/Surprise)

## Load Dataset

In [2]:
# load from file
from surprise import Dataset
from surprise import Reader

# Ratings CSV, addressed relative to the notebook's working directory.
file_path = './ml-latest-small/ratings.csv'

# Each line is "user,item,rating,timestamp"; the first line is a header and is
# skipped. Ratings are declared to lie on a 0.5-5 scale.
reader = Reader(
    line_format='user item rating timestamp',
    sep=',',
    rating_scale=(0.5, 5),
    skip_lines=1,
)

# The parsed rows can be inspected afterwards through `data.raw_ratings`.
data = Dataset.load_from_file(file_path, reader=reader)

## K-NN -- Whole Test Set

In [3]:
import sys
import numpy as np
from contextlib import redirect_stdout
from surprise import KNNWithMeans
from surprise.model_selection import cross_validate

# User-based neighbourhoods with Pearson similarity.
sim_options = {'name': 'pearson', 'user_based': True}
# Neighbourhood sizes to sweep: 2, 4, ..., 100.
k = range(2, 102, 2)
# Mean 10-fold test RMSE / MAE, one entry per value in `k`.
rmse = []
mae = []

# The verbose cross-validation tables are sent to log.txt instead of the cell
# output. redirect_stdout restores sys.stdout even when cross_validate raises,
# so a failed run can no longer leave the kernel printing into a closed file.
with open('log.txt', 'w') as f, redirect_stdout(f):
    for i in k:
        result = cross_validate(KNNWithMeans(k=i, sim_options=sim_options), data, measures=['rmse', 'mae'], cv=10, verbose=True)
        rmse.append(np.mean(result['test_rmse']))
        mae.append(np.mean(result['test_mae']))
        print('\nk = %d\n' % i)
        print('-' * 80)
        # Flush after every k so progress can be followed in the log file.
        f.flush()

In [4]:
import plotly.offline as py
import plotly.graph_objs as go

# Initialise plotly's offline notebook mode; required before the py.iplot()
# call made by draw_curve further down.
# NOTE(review): connected=True presumably makes the notebook load plotly.js from
# a CDN rather than embedding it (so viewing needs network access) — confirm
# against the plotly documentation.
py.init_notebook_mode(connected=True)

def draw_curve(x, y, name, title='', xlabel='', ylabel='', ROC=True):
    """Render one or several line curves inline with plotly.

    Parameters
    ----------
    x, y : sequence, or list of sequences
        Coordinates of the curve(s). When `name` is a list with more than one
        entry, `x[i]` / `y[i]` are used for the i-th curve; otherwise `x` and
        `y` are plotted directly as a single curve.
    name : str or list
        Legend label(s). A non-list value, or a one-element list, selects the
        single-curve layout (600x450, dark-orange line) and is passed to the
        trace unchanged; a longer list selects the multi-curve layout (800x600).
    title, xlabel, ylabel : str
        Figure title and axis titles. Ignored when `ROC` is true.
    ROC : bool
        When true, the title and axis labels are replaced by the fixed ROC
        wording, a dashed navy diagonal from (0, 0) to (1, 1) is added, and the
        y-axis range is fixed to [0, 1.05].

    Returns
    -------
    None. The figure is displayed through `py.iplot`.
    """
    is_single = type(name) != list or len(name) == 1

    if is_single:
        width, height = 600, 450
        data = [
            go.Scatter(x=x, y=y,
                       mode='lines',
                       line=dict(color='darkorange', width=2),
                       name=name)
        ]
    else:
        width, height = 800, 600
        # Index-based access keeps x/y/name aligned by position.
        data = [
            go.Scatter(x=x[idx], y=y[idx],
                       mode='lines',
                       line=dict(width=2),
                       name=label)
            for idx, label in enumerate(name)
        ]

    if ROC:
        # ROC plots always use the canonical wording, whatever the caller passed.
        title = 'Receiver Operating Characteristic'
        xlabel = 'False Positive Rate'
        ylabel = 'True Positive Rate'
        # Chance-level reference diagonal, kept out of the legend.
        data.append(go.Scatter(x=[0, 1], y=[0, 1],
                               mode='lines',
                               line=dict(color='navy', width=2, dash='dash'),
                               showlegend=False))

    # Both axes share the same frame styling.
    axis_style = dict(ticks='outside', mirror=True, linewidth=1)
    xaxis = dict(title=xlabel, **axis_style)
    yaxis = dict(title=ylabel, **axis_style)
    if ROC:
        yaxis['range'] = [0, 1.05]

    layout = go.Layout(title=title,
                       autosize=False,
                       width=width,
                       height=height,
                       xaxis=xaxis,
                       yaxis=yaxis,
                       legend=dict(x=.5, y=.2, bordercolor='#D3D3D3', borderwidth=1))

    py.iplot(go.Figure(data=data, layout=layout))

In [5]:
# Plot the averaged RMSE and MAE against the neighbourhood size on one figure.
draw_curve(
    x=[k, k],
    y=[rmse, mae],
    name=['RMSE', 'MAE'],
    title='K-NN (Whole Test Set)',
    xlabel='K',
    ylabel='Measures',
    ROC=False,
)

## K-NN -- Trimmed Test Set

In [6]:
from surprise import accuracy
from surprise.model_selection import KFold

# 10-fold splitter and the neighbourhood sizes (2, 4, ..., 100) shared by the
# trimmed-test-set experiments below.
kf = KFold(n_splits=10)
k = range(2, 102, 2)

# Group every rating value by item id: field 1 of a raw rating row is the item,
# field 2 is the rating.
rates = {}
for raw_row in data.raw_ratings:
    rates.setdefault(raw_row[1], []).append(raw_row[2])

# Variance of the ratings received by each item.
var = {item_id: np.var(item_ratings) for item_id, item_ratings in rates.items()}

### Popular movie - more than 2 ratings

In [7]:
from contextlib import redirect_stdout

# Items that received more than 2 ratings in the full dataset.
pop_movie = [x for x in rates if len(rates[x]) > 2]
# Set copy for O(1) membership tests; scanning the list for every test-set
# rating (per fold, per k) is needlessly slow.
pop_movie_ids = set(pop_movie)
# Mean 10-fold RMSE on the trimmed test set, one entry per value in `k`.
rmse = []

# Per-fold RMSE lines go to log.txt. redirect_stdout restores sys.stdout even
# if fitting or testing raises, unlike a manual sys.stdout swap.
with open('log.txt', 'w') as f, redirect_stdout(f):
    for i in k:
        algo = KNNWithMeans(k=i, sim_options=sim_options)
        score = []
        for trainset, testset in kf.split(data):
            algo.fit(trainset)
            # Keep only the test ratings whose item is in the popular subset.
            trimset = [x for x in testset if x[1] in pop_movie_ids]
            predictions = algo.test(trimset)
            score.append(accuracy.rmse(predictions, verbose=True))
        mean_rmse = sum(score) / len(score)
        rmse.append(mean_rmse)
        print('\nk = %d, Average RMSE = %.4f\n' % (i, mean_rmse))
        print('-' * 80)
        f.flush()

draw_curve(k, rmse, '', 'K-NN (Popular Movie Trimming)', 'K', 'RMSE', ROC=False)
print('Minimum RMSE: %.4f' % min(rmse))

Minimum RMSE: 0.8994


### Unpopular movie - no more than 2 ratings

In [8]:
from contextlib import redirect_stdout

# Items that received at most 2 ratings in the full dataset.
unpop_movie = [x for x in rates if len(rates[x]) <= 2]
# Set copy for O(1) membership tests; scanning the list for every test-set
# rating (per fold, per k) is needlessly slow.
unpop_movie_ids = set(unpop_movie)
# Mean 10-fold RMSE on the trimmed test set, one entry per value in `k`.
rmse = []

# Per-fold RMSE lines go to log.txt. redirect_stdout restores sys.stdout even
# if fitting or testing raises, unlike a manual sys.stdout swap.
with open('log.txt', 'w') as f, redirect_stdout(f):
    for i in k:
        algo = KNNWithMeans(k=i, sim_options=sim_options)
        score = []
        for trainset, testset in kf.split(data):
            algo.fit(trainset)
            # Keep only the test ratings whose item is in the unpopular subset.
            trimset = [x for x in testset if x[1] in unpop_movie_ids]
            predictions = algo.test(trimset)
            score.append(accuracy.rmse(predictions, verbose=True))
        mean_rmse = sum(score) / len(score)
        rmse.append(mean_rmse)
        print('\nk = %d, Average RMSE = %.4f\n' % (i, mean_rmse))
        print('-' * 80)
        f.flush()

draw_curve(k, rmse, '', 'K-NN (Unpopular Movie Trimming)', 'K', 'RMSE', ROC=False)
print('Minimum RMSE: %.4f' % min(rmse))

Minimum RMSE: 1.1821


### High variance movie - variance >= 2 and ratings >= 5

In [9]:
from contextlib import redirect_stdout

# Items with at least 5 ratings whose rating variance is at least 2.
highvar_movie = [x for x in rates if len(rates[x]) >= 5 and var[x] >= 2]
# Set copy for O(1) membership tests; scanning the list for every test-set
# rating (per fold, per k) is needlessly slow.
highvar_movie_ids = set(highvar_movie)
# Mean 10-fold RMSE on the trimmed test set, one entry per value in `k`.
rmse = []

# Per-fold RMSE lines go to log.txt. redirect_stdout restores sys.stdout even
# if fitting or testing raises, unlike a manual sys.stdout swap.
with open('log.txt', 'w') as f, redirect_stdout(f):
    for i in k:
        algo = KNNWithMeans(k=i, sim_options=sim_options)
        score = []
        for trainset, testset in kf.split(data):
            algo.fit(trainset)
            # Keep only the test ratings whose item is in the high-variance subset.
            trimset = [x for x in testset if x[1] in highvar_movie_ids]
            predictions = algo.test(trimset)
            score.append(accuracy.rmse(predictions, verbose=True))
        mean_rmse = sum(score) / len(score)
        rmse.append(mean_rmse)
        print('\nk = %d, Average RMSE = %.4f\n' % (i, mean_rmse))
        print('-' * 80)
        f.flush()

draw_curve(k, rmse, '', 'K-NN (High Variance Movie Trimming)', 'K', 'RMSE', ROC=False)
print('Minimum RMSE: %.4f' % min(rmse))

Minimum RMSE: 1.5664


## ROC

In [6]:
import pandas as pd
from contextlib import redirect_stdout
from surprise.model_selection import train_test_split
from collections import namedtuple
from sklearn import metrics

# Neighbourhood size used for the ROC analysis.
k_best = 20
# Rating cut-offs: a true rating below the cut-off is labelled 0, otherwise 1.
threshold = [2.5, 3, 3.5, 4]

# Nothing in loading, splitting, fitting or predicting depends on the cut-off,
# so it is done once instead of once per threshold. This also means every ROC
# curve below is computed from the same held-out 10% of ratings, rather than
# from a different random split per threshold.
df = pd.read_csv('./ml-latest-small/ratings.csv')
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)

# split into train set and test set
trainset, testset = train_test_split(data, test_size=.1)

# fit and test; library output printed during this step goes to log.txt.
# redirect_stdout restores sys.stdout even if fit/test raises.
with open('log.txt', 'w') as f, redirect_stdout(f):
    algo = KNNWithMeans(k=k_best, sim_options=sim_options)
    algo.fit(trainset)
    predictions = algo.test(testset)

# The estimated rating is the ranking score; only the ground-truth labels
# change with the cut-off.
scores = [row.est for row in predictions]

for theta in threshold:
    trues = [0 if row.r_ui < theta else 1 for row in predictions]
    fpr, tpr, thresholds = metrics.roc_curve(trues, scores)
    roc_auc = metrics.auc(fpr, tpr)

    name = 'K-NN θ=%.1f (area = %0.2f)' % (theta, roc_auc)
    draw_curve(fpr, tpr, name, ROC=True)