In [41]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error

from similarity import pearson_similarity, manhattan_similarity

In [21]:
# Constants
MAX_NEIGHBORS = 50      # ~ 2*np.sqrt(num_users)

# Preprocessing

In [22]:
ratings, movies = pd.read_csv('./datasets/ratings.csv'), pd.read_csv('./datasets/movies.csv')

In [23]:
user_ids = ratings['userId'].unique().tolist()
movie_ids = movies['movieId'].unique().tolist()

matrix = pd.DataFrame(index=user_ids, columns=movie_ids, dtype=np.float32)

for i in range(len(ratings)):
    user_id, movie_id, rating = ratings.iloc[i]['userId'], ratings.iloc[i]['movieId'], ratings.iloc[i]['rating']
    matrix.at[user_id, movie_id] = rating

matrix.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


# Evaluation

After extracting a (little) sample of the Users, for each User we predict the Scores for the "already_seen" movies. The evaluation computed the Mean Absolute Error (MAE) between the real ratings and the predicted ratings.

In [24]:
input_users = np.random.randint(low=1, high=matrix.shape[0]+1, size=5)
print("Sample of Input Users for the Evaluation:", input_users)

Sample of Input Users for the Evaluation: [239 448  11 359 422]


In [26]:
vals = pd.DataFrame(index=input_users, columns=['Pearson_MAE', 'Manhattan_MAE'])

In [44]:
def predict_already_seen(matrix, input_user, similarities):
    input_mean = matrix.loc[input_user].mean()
    already_seen = matrix.loc[input_user].dropna().index.tolist()

    y_pred = list()

    for movie_id in already_seen:
        numerator, denominator = 0, 0
        for (other_user, similarity) in similarities.items():
            other_mean = matrix.loc[other_user].mean()
            if not np.isnan(matrix.at[other_user, movie_id]):
                numerator += similarity * (matrix.at[other_user, movie_id] - other_mean) 
                denominator += np.abs(similarity)
        
        prediction = input_mean + (numerator/denominator) if denominator != 0 else input_mean
        y_pred.append(prediction)
    
    return y_pred        

In [45]:
for input_user in input_users:
    other_users = [u for u in matrix.index.tolist() if u != input_user]

    p_similarities, m_similarities = dict(), dict()
    for u in other_users:
        p_similarities[u] = pearson_similarity(matrix, input_user, u)
        m_similarities[u] = manhattan_similarity(matrix, input_user, u)
    
    p_similarities = {k: v for k, v in sorted(p_similarities.items(), key=lambda item: item[1], reverse=True)}
    p_similarities = dict(list(p_similarities.items())[:MAX_NEIGHBORS])

    m_similarities = {k: v for k, v in sorted(m_similarities.items(), key=lambda item: item[1], reverse=True)}
    m_similarities = dict(list(m_similarities.items())[:MAX_NEIGHBORS])

    y_true = matrix.loc[input_user].dropna().tolist()
    y_pred_p = predict_already_seen(matrix, input_user, p_similarities)
    y_pred_m = predict_already_seen(matrix, input_user, m_similarities)

    vals.at[input_user, 'Pearson_MAE'] = mean_absolute_error(y_true, y_pred_p)
    vals.at[input_user, 'Manhattan_MAE'] = mean_absolute_error(y_true, y_pred_m)

    print("Evaluation Completed for User:", input_user)

Evaluation Completed for User: 239
Evaluation Completed for User: 448
Evaluation Completed for User: 11
Evaluation Completed for User: 359
Evaluation Completed for User: 422


In [46]:
vals

Unnamed: 0,Pearson_MAE,Manhattan_MAE
239,0.49402,0.559655
448,0.772757,0.84975
11,0.56965,0.677363
359,0.54935,0.636637
422,0.801736,0.845349
