# Importing Libraries

In [1]:
from sklearn.model_selection import GroupKFold
import pandas as pd
from recommendation2 import MovieRecommender
import numpy as np
import random
from scipy.sparse import csr_matrix
import time

# Creating the fundamental DataFrame and Functions

In [2]:
def return_rating(userId, movieId):
    user_index = user_mapper[userId]
    user_row = matrix[user_index, :]
    movie_index = movie_mapper[movieId]
    rating = user_row[0, movie_index]
    return rating

In [3]:
users = pd.read_csv('ml-25m/ratings.csv')
users.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [4]:
def create_matrix(df):
    unique_users = df['userId'].nunique()
    unique_movies = df['movieId'].nunique()

    user_mapper = dict(zip(np.unique(df["userId"]), list(range(unique_users))))
    movie_mapper = dict(zip(np.unique(df["movieId"]), list(range(unique_movies))))
    
    user_inv_mapper = dict(zip(list(range(unique_users)), np.unique(df["userId"])))
    movie_inv_mapper = dict(zip(list(range(unique_movies)), np.unique(df["movieId"])))
    
    user_index = [user_mapper[i] for i in df['userId']]
    item_index = [movie_mapper[i] for i in df['movieId']]

    matrix = csr_matrix((df["rating"], (user_index,item_index)), shape=(unique_users,unique_movies))
    
    return matrix, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper
matrix, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_matrix(users)

In [5]:
movies = pd.read_csv('ml-25m/movies4.csv')
movies.head()

Unnamed: 0,movieId,title,genres,year,imdbId
0,1,Toy Story,"['Adventure', 'Animation', 'Children', 'Comedy...",1995.0,tt0114709
1,2,Jumanji,"['Adventure', 'Children', 'Fantasy']",1995.0,tt0113497
2,3,Grumpier Old Men,"['Comedy', 'Romance']",1995.0,tt0113228
3,4,Waiting to Exhale,"['Comedy', 'Drama', 'Romance']",1995.0,tt0114885
4,5,Father of the Bride Part II,['Comedy'],1995.0,tt0113041


# Create the GroupKFold object

In [6]:
num_groups = 500         # Change this value to adjust the training : testing ratio
group_kfold = GroupKFold(n_splits=num_groups)

In [7]:
# Get the first training and testing group from the GroupKFold object
for train_index, test_index in group_kfold.split(users, groups=users['userId']):
    train_data = users.iloc[train_index]
    test_data = users.iloc[test_index]
    break

# Preprocess the training and testing data

In [8]:
train_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [9]:
test_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
123863,892,1,4.0,846970667
123864,892,3,3.0,838067328
123865,892,5,1.0,842098111
123866,892,7,3.0,838066338
123867,892,11,4.0,842654097


In [10]:
test_data1 = test_data[test_data['rating'] == 5]     # Change this value to determine which movies are included in the testing phase 
test_data1 = test_data1.sort_values(by=['userId','rating'], ascending=[True, False])
test_data1.head()

Unnamed: 0,userId,movieId,rating,timestamp
123868,892,14,5.0,838065135
123869,892,17,5.0,838064639
123871,892,21,5.0,838064380
123872,892,34,5.0,838064361
123878,892,105,5.0,842654210


In [11]:
# Randomly pick n samples of each user
random_samples = test_data1.groupby('userId').apply(lambda x: x.sample(min(3, len(x)))) # Change this value for the number of samples of each user
random_samples.reset_index(drop=True, inplace=True)
random_samples.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,892,551,5.0,838064715
1,892,314,5.0,842098288
2,892,509,5.0,838064663
3,1514,8361,5.0,1311010793
4,2060,55269,5.0,1280262504


In [12]:
print("Number of users:", users['userId'].nunique())
print("Number of training users:", train_data['userId'].nunique())
print("Number of testing users:", random_samples['userId'].nunique())
print("Training-Testing Ratio:", train_data['userId'].nunique()/users['userId'].nunique(), ":", random_samples['userId'].nunique() / users['userId'].nunique())

Number of users: 162541
Number of training users: 162281
Number of testing users: 248
Training : Testing Ratio: 0.9984004035904787 : 0.0015257688829279996


# Train the recommender system and perform the evaluation

In [13]:
recommender = MovieRecommender(train_data)

In [14]:
from IPython import display

start_time = time.time()
random_samples['Precision'] = np.nan
random_samples['Recall'] = np.nan
random_samples['Accuracy'] = np.nan
for index, row in random_samples.iterrows():
    processing_statement = f"Processing... {index}/{len(random_samples) - 1}"
    display.clear_output(wait=True)
    display.display(processing_statement)
    evaluation = []
    recommendations = recommender.find_similar_movies(row['movieId'])
    for movie in recommendations:
        rating = return_rating(row['userId'], row['movieId'])
        if rating > 3:
            evaluation.append(2)
        elif rating == 0:
            evaluation.append(0)
        else:
            evaluation.append(1)
    TP = evaluation.count(2)
    FP = evaluation.count(1)
    FN = evaluation.count(0)
    if TP + FP != 0:
        random_samples.loc[index, 'Precision'] = TP / (TP + FP)
    if TP + FN != 0:
        random_samples.loc[index, 'Recall'] = TP / (TP + FN)
    if TP + FP + FN != 0:
        random_samples.loc[index, 'Accuracy'] = (TP + FP) / (TP + FP + FN)
end_time = time.time()
print('Execution Time: ', end_time-start_time)

'Processing... 693/694'

Execution Time:  1350.6931533813477


In [15]:
random_samples

Unnamed: 0,userId,movieId,rating,timestamp,Precision,Recall,Accuracy
0,892,551,5.0,838064715,1.0,1.0,1.0
1,892,314,5.0,842098288,1.0,1.0,1.0
2,892,509,5.0,838064663,1.0,1.0,1.0
3,1514,8361,5.0,1311010793,1.0,1.0,1.0
4,2060,55269,5.0,1280262504,1.0,1.0,1.0
...,...,...,...,...,...,...,...
689,161893,161,5.0,829012593,1.0,1.0,1.0
690,161893,223,5.0,829012593,1.0,1.0,1.0
691,162135,58,5.0,857305887,1.0,1.0,1.0
692,162135,62,5.0,857305776,1.0,1.0,1.0


In [16]:
averagePrecision = random_samples['Precision'].mean()
averageRecall = random_samples['Recall'].mean()
averageAccuracy = random_samples['Accuracy'].mean()
print(f"Average Precision : {averagePrecision}")
print(f"Average Recall : {averageRecall}")
print(f"Average Accuracy : {averageAccuracy}")

Average Precision : 1.0
Average Recall : 1.0
Average Accuracy : 1.0


# Future Work
Iterate the evaluation of each group in the GroupKFold object then calculate the average performance across all iterations