In [1]:
import os
import time
import math as mt
import numpy as np
import pandas as pd
import pickle
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

In [68]:
class SVDModel:
    """Builds train/test rating matrices from the ratings file and fits a truncated SVD.

    Attributes (populated by ``load_data``):
        n_users, n_items: number of distinct ``userId`` / ``movieId`` values.
        users, movies: the distinct ids; they label the rows / columns of the
            rating matrices.
    """

    def __init__(self):
        self.n_users = 0
        self.n_items = 0
        # Defined up front so the attributes exist even before load_data() runs.
        self.users = np.array([])
        self.movies = np.array([])

    def load_data(self, test_size=0.25, random_state=None):
        """Read the ratings file and split it into train and test rating matrices.

        Args:
            test_size: share of rating rows held out for testing; forwarded to
                ``train_test_split``.
            random_state: seed forwarded to ``train_test_split``. The default
                ``None`` keeps the split unseeded (different on every call);
                pass an int for a reproducible split.

        Returns:
            ``(train, test)`` where ``train`` is a float32 ``csr_matrix`` and
            ``test`` is a dense DataFrame indexed by user id with movie ids as
            columns. Both have shape ``(n_users, n_items)``; cells without a
            rating in the respective split are 0.
        """
        data = pd.read_json('../data/ratings_data.json', orient='split')
        self.users = data['userId'].unique()
        self.movies = data['movieId'].unique()
        self.n_users = self.users.shape[0]
        self.n_items = self.movies.shape[0]

        train_data, test_data = train_test_split(
            data, test_size=test_size, random_state=random_state)

        train_data_matrix = self._build_rating_matrix(train_data)
        test_data_matrix = self._build_rating_matrix(test_data)

        return csr_matrix(train_data_matrix, dtype=np.float32), test_data_matrix

    def _build_rating_matrix(self, ratings):
        """Scatter rating rows into a dense (n_users, n_items) DataFrame of zeros."""
        matrix = np.zeros((self.n_users, self.n_items))
        # Columns are read by POSITION: 0 -> movie id, 1 -> rating, 3 -> user id.
        # NOTE(review): assumes the file's column order is
        # (movieId, rating, <other>, userId) -- confirm against the JSON.
        user_pos = pd.Index(self.users).get_indexer(ratings.iloc[:, 3])
        movie_pos = pd.Index(self.movies).get_indexer(ratings.iloc[:, 0])
        # NOTE(review): assumes each (user, movie) pair occurs at most once;
        # with duplicates, which rating survives is not guaranteed by numpy.
        matrix[user_pos, movie_pos] = ratings.iloc[:, 1].to_numpy()
        return pd.DataFrame(matrix, columns=self.movies, index=self.users)

    def _save_pickle_file(self, file_name, data):
        """Pickle ``data`` to ``./models/SVD/<file_name>.pickle``."""
        file_name = f'./models/SVD/{file_name}.pickle'
        # Context manager closes the handle even if pickling fails part-way.
        with open(file_name, 'wb') as mapping_file:
            pickle.dump(data, mapping_file)

    def save(self, U, sigma, Vt, data_mean):
        """Persist the factorisation as u/sigma/vt/data_mean pickles under ./models/SVD.

        Existing files with those names are overwritten.
        """
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs('./models/SVD', exist_ok=True)

        self._save_pickle_file('u', U)
        self._save_pickle_file('sigma', sigma)
        self._save_pickle_file('vt', Vt)
        self._save_pickle_file('data_mean', data_mean)

    def train(self, data, k):
        """Factorise the row-demeaned rating matrix with a rank-``k`` truncated SVD.

        Args:
            data: rating matrix (rows are demeaned along axis 1), e.g. the
                sparse matrix returned by ``load_data``.
            k: number of singular values/vectors requested from ``svds``.

        Returns:
            ``(U, sigma, Vt, data_mean)`` where ``sigma`` is the diagonal
            matrix of singular values and ``data_mean`` holds the per-row means
            that were subtracted before factorising.
        """
        # The mean is taken over every column of a row, zeros included.
        data_mean = np.mean(data, axis=1)
        # Subtracting a dense column from a sparse matrix yields a dense matrix.
        data_demeaned = data - data_mean.reshape(-1, 1)
        U, sigma, Vt = svds(data_demeaned, k=k)
        sigma = np.diag(sigma)

        return U, sigma, Vt, data_mean

In [3]:
class SVDRecommender:
    """Serves rating predictions from the SVD factors pickled under ./models/SVD."""

    def __init__(self):
        self.U = SVDRecommender.load_pickle_file('./models/SVD/u')
        self.sigma = SVDRecommender.load_pickle_file('./models/SVD/sigma')
        self.Vt = SVDRecommender.load_pickle_file('./models/SVD/vt')
        self.data_mean = SVDRecommender.load_pickle_file('./models/SVD/data_mean')

    @staticmethod
    def load_pickle_file(file_name):
        """Unpickle and return the object stored at ``<file_name>.pickle``.

        Args:
            file_name: path without the ``.pickle`` extension.
        """
        # NOTE(security): pickle.load can execute arbitrary code embedded in
        # the file -- only load pickles produced by a trusted source.
        with open(f'{file_name}.pickle', 'rb') as file:
            return pickle.load(file)

    def get_predicted_ratings(self):
        """Rebuild the full predicted-rating table from the stored factors.

        Returns:
            ``(ratings_df, users_movies)``: ``ratings_df`` is the dense
            prediction table indexed by user id with movie ids as columns;
            ``users_movies`` is the ``movieId``/``userId`` columns of the
            ratings file, i.e. the pairs that already carry a rating.
        """
        data = pd.read_json('../data/ratings_data.json', orient='split')
        users_movies = data[['movieId', 'userId']]
        # data_mean may be an np.matrix (e.g. when it was computed from a
        # scipy sparse matrix); coerce to a plain column array before adding.
        user_means = np.asarray(self.data_mean).reshape(-1, 1)
        all_predicted_ratings = np.dot(np.dot(self.U, self.sigma), self.Vt) + user_means
        ratings_df = pd.DataFrame(
            all_predicted_ratings,
            columns=data['movieId'].unique(),
            index=data['userId'].unique(),
        )

        return ratings_df, users_movies

    def recommend(self, user_id, n=10):
        """Return the ``n`` highest-predicted movies the user has not rated yet.

        Prints the elapsed wall-clock time as a side effect.

        Args:
            user_id: label of the user's row in the prediction table.
            n: number of recommendations to return.

        Returns:
            DataFrame indexed by movie id with a single ``rating`` column,
            sorted by predicted rating in descending order.
        """
        start = time.time()

        ratings_df, users_movies = self.get_predicted_ratings()
        user_rated_movies = users_movies.loc[users_movies['userId'] == user_id, 'movieId'].values
        predicted_ratings = ratings_df.loc[user_id].to_frame(name='rating')
        recommended_movies = (
            predicted_ratings
            .drop(user_rated_movies)
            .sort_values(['rating'], ascending=False)
            .head(n)
        )

        end = time.time()
        print(f'Finished in: {end - start}')

        return recommended_movies

    def evaluate(self, ground_truth):
        """RMSE between predictions and the non-zero entries of ``ground_truth``.

        Args:
            ground_truth: array aligned with the prediction table; entries
                equal to 0 are treated as "no rating" and skipped.
        """
        ratings_df, _ = self.get_predicted_ratings()
        rated = ground_truth.nonzero()
        prediction = ratings_df.values[rated].flatten()
        actual = ground_truth[rated].flatten()

        return sqrt(mean_squared_error(actual, prediction))

In [60]:
def get_avg_error(data, k=10, n_factors=20):
    """Average the test RMSE over ``k`` independent train/test splits.

    Every run reloads and re-splits the ratings, fits an SVD with
    ``n_factors`` latent factors, saves it (overwriting the pickles under
    ./models/SVD), reloads it through ``SVDRecommender`` and scores it on that
    run's held-out matrix.

    Args:
        data: unused -- each run loads its own data; kept so existing calls
            keep working.
        k: number of runs to average over.
        n_factors: number of singular values used for the factorisation
            (previously hard-coded to 20).

    Returns:
        ``(errors_df, avg)``: a DataFrame with one ``Error`` row per run
        (rounded to 3 decimals) and the mean of those rounded errors, itself
        rounded to 3 decimals.

    Raises:
        ValueError: if ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f'k must be at least 1, got {k}')

    errors = []

    for _ in range(k):
        model = SVDModel()
        train_data, test_data = model.load_data()
        U, sigma, Vt, data_mean = model.train(train_data, n_factors)
        # The recommender only reads from disk, so the fit must be saved first.
        model.save(U, sigma, Vt, data_mean)
        recommender = SVDRecommender()

        error = recommender.evaluate(test_data.values)
        errors.append(round(error, 3))

    return pd.DataFrame(errors, columns=['Error']), round(sum(errors) / k, 3)

In [63]:
# get_avg_error ignores its first argument (every run reloads and re-splits
# the ratings itself), so pass None instead of `test_data.values`: that name
# is only created by a later cell and raises NameError on Restart & Run All.
errors, avg_error = get_avg_error(None, 10)

In [64]:
# Per-run error table followed by the mean across runs.
print(errors, f'Average error: {avg_error}', sep='\n')

   Error
0  3.059
1  3.066
2  3.050
3  3.057
4  3.059
5  3.062
6  3.070
7  3.057
8  3.058
9  3.063
Average error: 3.06


In [69]:
# Fresh model; its user/item counts stay 0 until load_data() is called.
model = SVDModel()

In [70]:
# train_data: sparse float32 rating matrix; test_data: dense DataFrame of the
# held-out ratings (users as rows, movies as columns).
# NOTE(review): the numbers in the saved output of this cell are not printed
# by the current load_data -- presumably left over from an earlier version;
# re-run the cell to refresh the output.
train_data, test_data = model.load_data()

610
9724
100836


In [31]:
# Wall-clock time (seconds) of a single fit with k=20 singular values.
start = time.time()
U, sigma, Vt, data_mean = model.train(train_data, 20)
end = time.time()
print(end - start)

0.22948932647705078


In [32]:
# Pickle the factors and row means under ./models/SVD so that
# SVDRecommender can load them.
model.save(U, sigma, Vt, data_mean)

In [33]:
# Reads the u / sigma / vt / data_mean pickles from ./models/SVD, so the save
# cell must have run first.
recommender = SVDRecommender()