### Recommendations using surprise library models:

This notebook uses the Surprise library to build rating-prediction models and to generate predicted ratings for the test data. \\
**Models:** SVD, baseline estimation, item-item KNN collaborative filtering, etc. are explored. Hyperparameter tuning is performed for the models. \\
**Evaluation:** The test data ratings are stored for further evaluation such as RMSE and MAE. The recommendations are evaluated using precision@5, recall@5, NDCG and overall accuracy. 

In [14]:
from surprise import SVD, SVDpp
from surprise.prediction_algorithms import KNNBasic, KNNWithMeans
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate
from surprise import accuracy
from surprise.model_selection import train_test_split

In [2]:
import math
from collections import defaultdict
import csv
from sklearn.metrics import ndcg_score
import numpy as np
import pandas as pd
import time

In [6]:
def convert_traintest_dataframe_forsurprise(training_dataframe, testing_dataframe):
    """Turn the train/test rating frames into Surprise train/test sets.

    Both frames must provide the columns ``userId``, ``tmdbId`` and ``rating``;
    only those three columns are handed to Surprise, using a reader declared
    with a rating scale of 0 to 5.

    Args:
        training_dataframe: frame holding the training ratings.
        testing_dataframe: frame holding the held-out ratings.

    Returns:
        A ``(trainset, testset)`` tuple: the result of ``construct_trainset``
        on the training ratings and of ``construct_testset`` on the test
        ratings.
    """
    rating_columns = ['userId', 'tmdbId', 'rating']
    scale_reader = Reader(rating_scale=(0, 5))

    train_data = Dataset.load_from_df(training_dataframe[rating_columns], scale_reader)
    test_data = Dataset.load_from_df(testing_dataframe[rating_columns], scale_reader)

    return (
        train_data.construct_trainset(train_data.raw_ratings),
        test_data.construct_testset(test_data.raw_ratings),
    )

In [7]:
# Locations of the processed train/test rating CSVs (relative paths, so they
# resolve against the directory the notebook kernel is started in).
file_path_train = '../5_data/processed/training_data.csv'
file_path_test = '../5_data/processed/testing_data.csv'
# Read both splits into DataFrames; the conversion below requires the
# 'userId', 'tmdbId' and 'rating' columns to be present in each file.
traindf = pd.read_csv(file_path_train)
testdf = pd.read_csv(file_path_test)
# Build the Surprise train/test structures that the evaluation cells consume.
trainset, testset = convert_traintest_dataframe_forsurprise(traindf, testdf)

In [8]:
def get_top_n(predictions, n):
    """Group predictions per user and keep each user's ``n`` best estimates.

    Args:
        predictions: iterable of 5-element records that unpack as
            ``(uid, iid, true_r, est, details)``.
        n: how many of the highest-estimate items to keep for each user.

    Returns:
        A tuple ``(top_n, org_ratings)`` of ``defaultdict(list)`` objects.
        ``top_n[uid]`` holds ``(iid, est)`` pairs in descending order of
        estimate, cut to at most ``n`` entries (ties keep input order).
        ``org_ratings[uid]`` holds every ``(iid, true_r)`` pair of that user
        in input order.
    """
    estimated_by_user = defaultdict(list)
    actual_by_user = defaultdict(list)

    # Bucket the estimated and the true rating of every prediction by user.
    for user, item, actual, estimate, _details in predictions:
        estimated_by_user[user].append((item, estimate))
        actual_by_user[user].append((item, actual))

    # Rank each user's estimates from best to worst and truncate to n.
    for user in list(estimated_by_user):
        ranked = sorted(estimated_by_user[user], key=lambda pair: pair[1], reverse=True)
        estimated_by_user[user] = ranked[:n]

    return estimated_by_user, actual_by_user

In [9]:
def dcg_at_k(scores):
    """Return the discounted cumulative gain of a ranked list of scores.

    The first score counts in full; the score at 1-based rank ``i >= 2`` is
    divided by ``log2(i)``.

    Args:
        scores: relevance scores in ranked order (best-ranked item first).

    Returns:
        The DCG value, or ``0.0`` for an empty sequence (an empty ranking
        carries no gain; previously this raised ``IndexError``).
    """
    # len() rather than truthiness so array-like inputs are handled too.
    if len(scores) == 0:
        return 0.0
    # math.log2 avoids the rounding error of the two-step math.log(x, 2).
    return scores[0] + sum(
        score / math.log2(rank) for rank, score in enumerate(scores[1:], start=2)
    )

def ndcg_at_k(scores):
    """Return the DCG of ``scores`` normalised by its ideal DCG.

    The ideal DCG is computed from the same scores sorted in descending
    order, so the result only reflects how well the supplied items are
    ordered among themselves.

    Args:
        scores: relevance scores in ranked order.

    Returns:
        ``dcg / ideal_dcg``, or ``0.0`` when the ideal DCG is not positive.
    """
    ideal_dcg = dcg_at_k(sorted(scores, reverse=True))
    if not ideal_dcg > 0.0:
        return 0.0
    return dcg_at_k(scores) / ideal_dcg

In [11]:
def precision_recall_at_k(predictions, k=5, threshold=3.5):
    """Return precision@k and recall@k averaged over all users.

    An item is "relevant" when its true rating is at least ``threshold`` and
    "recommended" when it is among the user's ``k`` highest estimates and its
    estimate is at least ``threshold``.

    Args:
        predictions: iterable of 5-element records that unpack as
            ``(uid, iid, true_r, est, details)``.
        k: number of top-estimated items considered per user.
        threshold: rating at or above which an item counts as relevant
            (true rating) or recommended (estimated rating).

    Returns:
        A ``(precision, recall)`` tuple holding the unweighted mean of the
        per-user values. ``(0.0, 0.0)`` is returned when ``predictions`` is
        empty, since there is no user to average over.
    """
    # Group (estimate, true rating) pairs by user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    # Without any user the means below would divide by zero.
    if not user_est_true:
        return 0.0, 0.0

    precisions = {}
    recalls = {}
    for uid, user_ratings in user_est_true.items():
        # Highest estimates first; the first k entries are the candidates.
        user_ratings.sort(key=lambda pair: pair[0], reverse=True)
        top_k = user_ratings[:k]

        # Relevant items across everything the user rated in the test data.
        n_rel = sum(true_r >= threshold for _, true_r in user_ratings)

        # Items in the top k that are actually recommended.
        n_rec_k = sum(est >= threshold for est, _ in top_k)

        # Items in the top k that are both recommended and relevant.
        n_rel_and_rec_k = sum(
            (true_r >= threshold) and (est >= threshold) for est, true_r in top_k
        )

        # NOTE(review): an undefined ratio (zero denominator) is scored as 1,
        # which raises the averages; the Surprise FAQ example appears to use 0
        # instead. Kept at 1 so earlier reported numbers stay comparable.
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1

    precision = sum(precisions.values()) / len(precisions)
    recall = sum(recalls.values()) / len(recalls)

    return precision, recall

In [12]:
def recommendation(algo, trainset, testset, k=5):
    """Fit ``algo``, predict the test set and compute accuracy/ranking metrics.

    Args:
        algo: algorithm object exposing ``fit(trainset)`` and ``test(testset)``.
        trainset: training data passed to ``algo.fit``.
        testset: held-out data passed to ``algo.test``.
        k: list length used for the top-N lists, precision@k, recall@k and
            NDCG. Defaults to 5, the value that was previously hard-coded.

    Returns:
        A 9-tuple ``(test_rmse, test_mae, fit_time, test_time, precision,
        recall, f_measure, ndcg, test_predictions)``. Times are in seconds.
        ``ndcg`` is the mean per-user NDCG of the true ratings of each user's
        top-k predicted items, so it measures ordering within that top-k only.
    """
    # Train the algorithm on the trainset and time it.
    start_fit = time.perf_counter()
    algo.fit(trainset)
    fit_time = time.perf_counter() - start_fit

    # Predict ratings for the test set and time it.
    start_test = time.perf_counter()
    test_predictions = algo.test(testset)
    test_time = time.perf_counter() - start_test

    test_rmse = accuracy.rmse(test_predictions)
    test_mae = accuracy.mae(test_predictions)

    # Use the same k for the top-N lists and for precision/recall.
    top_n, org_ratings = get_top_n(test_predictions, k)
    precision, recall = precision_recall_at_k(test_predictions, k=k)

    # F-measure is undefined when both components are 0; report 0 instead of
    # raising ZeroDivisionError.
    pr_sum = precision + recall
    f_measure = (2 * precision * recall) / pr_sum if pr_sum else 0.0

    ndcg_scores = {}
    for uid, user_ratings in top_n.items():
        # Map each item to its true rating; setdefault keeps the first rating
        # seen for an item, matching a first-match linear search.
        true_by_item = {}
        for iid, true_r in org_ratings[uid]:
            true_by_item.setdefault(iid, true_r)
        # True ratings in predicted order; unknown items contribute 0.
        scores = [true_by_item.get(iid, 0) for iid, _ in user_ratings]
        ndcg_scores[uid] = ndcg_at_k(scores)

    # Named mean_ndcg so the local does not shadow sklearn's ndcg_score import.
    mean_ndcg = sum(ndcg_scores.values()) / len(ndcg_scores) if ndcg_scores else 0.0

    return (test_rmse, test_mae, fit_time, test_time, precision, recall,
            f_measure, mean_ndcg, test_predictions)

#### Basic algorithm (Baseline approach):

In [13]:
# Empty results table: one row per evaluated algorithm is added by the
# evaluation loop, with accuracy, timing and ranking metrics as columns.
surprise_df = pd.DataFrame(columns= ['Algorithm', 'test_rmse', 'test_mae', 'fit_time', 'test_time', 'Precision', 'Recall', 'F-measure', 'NDCG'])

In [15]:
# Iterate over all algorithms
# Evaluate every algorithm once and collect one summary row per model.
# The table is rebuilt from scratch, so re-running this cell does not stack
# duplicate rows onto an existing surprise_df.
# NOTE(review): SVD and SVDpp are created without a fixed random_state, so
# their metrics are expected to vary slightly between runs - confirm and seed
# them if reproducible numbers are required.
result_columns = ['Algorithm', 'test_rmse', 'test_mae', 'fit_time', 'test_time',
                  'Precision', 'Recall', 'F-measure', 'NDCG']
result_rows = []
for algorithm in [KNNBasic(), KNNWithMeans(), SVD(), SVDpp()]:
    results = recommendation(algorithm, trainset, testset)

    # The class name is the display name (e.g. 'KNNBasic').
    name = type(algorithm).__name__
    print("Algorithm:", name)
    # results[8] holds the raw predictions, which are not part of the table.
    result_rows.append([name, *results[:8]])

# Build the frame in one go instead of concatenating inside the loop. Rows are
# reversed so the most recently evaluated model comes first, which is the row
# order the previous prepend-style concat produced.
surprise_df = pd.DataFrame(result_rows[::-1], columns=result_columns)
surprise_df.sort_values(by='test_rmse', ascending=False)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9674
MAE:  0.7441
Algorithm: KNNBasic
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9208
MAE:  0.7057
Algorithm: KNNWithMeans
RMSE: 0.9004
MAE:  0.6941
Algorithm: SVD
RMSE: 0.8942
MAE:  0.6865
Algorithm: SVDpp


Unnamed: 0,Algorithm,test_rmse,test_mae,fit_time,test_time,Precision,Recall,F-measure,NDCG
3,KNNBasic,0.967357,0.744137,0.609963,2.771856,0.792226,0.442964,0.568216,0.960785
2,KNNWithMeans,0.920777,0.705683,0.235047,2.114045,0.806334,0.394724,0.529998,0.956217
1,SVD,0.900438,0.694067,7.22963,0.19519,0.816592,0.403079,0.539738,0.960958
0,SVDpp,0.894213,0.686454,669.893635,10.205841,0.827397,0.403331,0.542305,0.962874


In [21]:
# Display the results ordered by precision, ascending (lowest precision first).
surprise_df.sort_values(by='Precision') 

Unnamed: 0,Algorithm,test_rmse,test_mae,fit_time,test_time,Precision,Recall,F-measure,NDCG
3,KNNBasic,0.967357,0.744137,0.609963,2.771856,0.792226,0.442964,0.568216,0.960785
2,KNNWithMeans,0.920777,0.705683,0.235047,2.114045,0.806334,0.394724,0.529998,0.956217
1,SVD,0.900438,0.694067,7.22963,0.19519,0.816592,0.403079,0.539738,0.960958
0,SVDpp,0.894213,0.686454,669.893635,10.205841,0.827397,0.403331,0.542305,0.962874


In [22]:
# Display the results ordered by F-measure, descending (highest first).
surprise_df.sort_values(by='F-measure', ascending=False) 

Unnamed: 0,Algorithm,test_rmse,test_mae,fit_time,test_time,Precision,Recall,F-measure,NDCG
3,KNNBasic,0.967357,0.744137,0.609963,2.771856,0.792226,0.442964,0.568216,0.960785
0,SVDpp,0.894213,0.686454,669.893635,10.205841,0.827397,0.403331,0.542305,0.962874
1,SVD,0.900438,0.694067,7.22963,0.19519,0.816592,0.403079,0.539738,0.960958
2,KNNWithMeans,0.920777,0.705683,0.235047,2.114045,0.806334,0.394724,0.529998,0.956217


In [23]:
# Display the results ordered by NDCG, descending (highest first).
surprise_df.sort_values(by='NDCG', ascending=False)

Unnamed: 0,Algorithm,test_rmse,test_mae,fit_time,test_time,Precision,Recall,F-measure,NDCG
0,SVDpp,0.894213,0.686454,669.893635,10.205841,0.827397,0.403331,0.542305,0.962874
1,SVD,0.900438,0.694067,7.22963,0.19519,0.816592,0.403079,0.539738,0.960958
3,KNNBasic,0.967357,0.744137,0.609963,2.771856,0.792226,0.442964,0.568216,0.960785
2,KNNWithMeans,0.920777,0.705683,0.235047,2.114045,0.806334,0.394724,0.529998,0.956217


In [25]:
# Persist the summary table as CSV; index=False leaves the row index out of the file.
surprise_df.to_csv('../2_results/2_collab_filtering@surprise_recs_result.csv', index = False)