In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

In [2]:
metric = 'cosine'
user = 1
n = 10

In [3]:
data = pd.read_json('../data/ratings_data.json', orient = 'split')
n_users = data['userId'].unique().shape[0]
n_items = data['movieId'].unique().shape[0]
users = data['userId'].unique()
movies = data['movieId'].unique()

In [4]:
train_data, test_data = train_test_split(data, test_size=0.25)

In [5]:
train_data_matrix = pd.DataFrame(np.zeros((n_users, n_items)), columns = movies, index = users)
for line in train_data.itertuples():
    train_data_matrix.at[line[4], line[1]] = line[2]

In [6]:
test_data_matrix = pd.DataFrame(np.zeros((n_users, n_items)), columns = movies, index = users)
for line in test_data.itertuples():
    test_data_matrix.at[line[4], line[1]] = line[2]

In [7]:
user_similarity = 1 - pairwise_distances(train_data_matrix, metric=metric)

In [8]:
def predict(ratings, similarity):
    mean_user_rating = ratings.mean(axis=1)
    ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
    pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T

    return pred

In [9]:
user_prediction = pd.DataFrame(predict(train_data_matrix.values, user_similarity), columns = movies, index = users)

In [10]:
def recommend(ratings, user, n = 10):
    user_movies = pd.DataFrame(train_data_matrix.loc[user])
    user_movies.columns = ['rating']
    viewed_movies = user_movies[user_movies['rating'] > 0].index
    predicted_ratings = pd.DataFrame(ratings.loc[user])
    predicted_ratings.columns = ['rating']
    recommended_movies = predicted_ratings.drop(viewed_movies).sort_values(['rating'], ascending=[0]).head(n).index.values

    return recommended_movies

In [11]:
def evaluate(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()

    return sqrt(mean_squared_error(prediction, ground_truth))

In [12]:
def get_avg_error(k=10):
    errors = []

    for i in range(k):
        train_data, test_data = train_test_split(data, test_size=0.25)
        
        train_data_matrix = pd.DataFrame(np.zeros((n_users, n_items)), columns = movies, index = users)
        for line in train_data.itertuples():
            train_data_matrix.at[line[4], line[1]] = line[2]
            
        test_data_matrix = pd.DataFrame(np.zeros((n_users, n_items)), columns = movies, index = users)
        for line in test_data.itertuples():
            test_data_matrix.at[line[4], line[1]] = line[2]
            
        similarity = 1 - pairwise_distances(train_data_matrix, metric=metric)
        prediction = pd.DataFrame(predict(train_data_matrix.values, similarity), columns = movies, index = users)
        
        error = evaluate(prediction.values, test_data_matrix.values)
        
        errors.append(round(error, 3))

    return pd.DataFrame(errors, columns=['Error']), round(sum(errors) / k, 3)

In [15]:
errors, avg_error = get_avg_error(10)

In [16]:
print(errors)
print(f'Average error: {avg_error}')

   Error
0  3.176
1  3.185
2  3.177
3  3.182
4  3.176
5  3.177
6  3.190
7  3.185
8  3.173
9  3.172
Average error: 3.179
