In [30]:
import os
import time
import math as mt
import numpy as np
import pandas as pd
import pickle
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds

In [2]:
class SVDModel:
    """Builds a user x movie rating matrix, factorizes it with truncated SVD,
    and persists the factors as pickle files under ./models/SVD."""

    def __init__(self):
        # Populated by load_data(); zero until then.
        self.n_users = 0
        self.n_items = 0

    def load_data(self):
        """Read the ratings JSON and return it as a sparse user x movie matrix.

        Rows follow the order in which user ids first appear in the file and
        columns the order in which movie ids first appear (``Series.unique``
        order). Also sets ``self.users``, ``self.movies``, ``self.n_users``
        and ``self.n_items``.

        Returns:
            scipy.sparse.csr_matrix of dtype float32 with shape
            (n_users, n_items); pairs without a rating are zero.
        """
        data = pd.read_json('../data/ratings_data.json', orient='split')
        self.users = data['userId'].unique()
        self.movies = data['movieId'].unique()
        self.n_users = self.users.shape[0]
        self.n_items = self.movies.shape[0]

        # If a (user, movie) pair occurs more than once, the last occurrence
        # wins. csr_matrix would otherwise SUM duplicate coordinates.
        latest = data.drop_duplicates(subset=['userId', 'movieId'], keep='last')

        # Translate ids to row/column positions by column NAME rather than by
        # tuple position, so the mapping does not depend on column order.
        row_idx = pd.Index(self.users).get_indexer(latest['userId'])
        col_idx = pd.Index(self.movies).get_indexer(latest['movieId'])

        # NOTE(review): the rating value is taken from the second column of
        # the frame (position 1), as before; its column name is not visible
        # in this file -- confirm and switch to a name-based lookup.
        ratings = latest.iloc[:, 1].to_numpy(dtype=np.float32)

        # Build the sparse matrix straight from coordinates instead of filling
        # a dense n_users x n_items frame one cell at a time.
        matrix = csr_matrix(
            (ratings, (row_idx, col_idx)),
            shape=(self.n_users, self.n_items),
            dtype=np.float32,
        )
        # Do not store explicit zeros (a rating of 0 is the same as "absent").
        matrix.eliminate_zeros()
        return matrix

    def _save_pickle_file(self, file_name, data):
        """Pickle ``data`` to ./models/SVD/<file_name>.pickle.

        The target directory must already exist (``save`` creates it).
        """
        file_name = f'./models/SVD/{file_name}.pickle'
        # Context manager guarantees the handle is closed even if dump fails.
        with open(file_name, 'wb') as mapping_file:
            pickle.dump(data, mapping_file)

    def save(self, U, sigma, Vt, data_mean):
        """Persist the four training artifacts as u / sigma / vt / data_mean
        pickle files, creating ./models/SVD if needed."""
        # exist_ok avoids the check-then-create race of a separate exists().
        os.makedirs('./models/SVD', exist_ok=True)

        self._save_pickle_file('u', U)
        self._save_pickle_file('sigma', sigma)
        self._save_pickle_file('vt', Vt)
        self._save_pickle_file('data_mean', data_mean)

    def train(self, data, k):
        """Factorize the row-demeaned rating matrix with a rank-``k`` SVD.

        Args:
            data: user x item rating matrix (as returned by ``load_data``).
            k: number of singular values/vectors to compute; ``svds`` requires
                it to be smaller than the smallest matrix dimension.

        Returns:
            (U, sigma, Vt, data_mean) where ``sigma`` is the k x k diagonal
            matrix of singular values and ``data_mean`` is the per-row mean
            that was subtracted before factorization.
        """
        # The mean is taken over every column of the row, so unrated (zero)
        # entries pull it down; predictions are on that same scale.
        data_mean = np.mean(data, axis=1)
        data_demeaned = data - data_mean.reshape(-1, 1)
        U, sigma, Vt = svds(data_demeaned, k=k)
        # svds returns the singular values as a 1-D array; expand to a matrix
        # so U @ sigma @ Vt reconstructs the demeaned ratings.
        sigma = np.diag(sigma)

        return U, sigma, Vt, data_mean

In [71]:
class SVDRecommender:
    """Loads the pickled SVD factors from ./models/SVD and prints the
    highest-predicted unrated movies for a user."""

    def __init__(self):
        self.U = SVDRecommender.load_pickle_file('./models/SVD/u')
        self.sigma = SVDRecommender.load_pickle_file('./models/SVD/sigma')
        self.Vt = SVDRecommender.load_pickle_file('./models/SVD/vt')
        self.data_mean = SVDRecommender.load_pickle_file('./models/SVD/data_mean')

    @staticmethod
    def load_pickle_file(file_name):
        """Unpickle and return the object stored at ``<file_name>.pickle``.

        SECURITY: ``pickle.load`` can execute arbitrary code; only use this on
        files produced by this notebook, never on untrusted input.
        """
        # Context manager closes the handle (the file used to be left open).
        with open(f'{file_name}.pickle', 'rb') as file:
            return pickle.load(file)

    def get_predicted_ratings(self):
        """Reconstruct the full predicted-rating table from the SVD factors.

        Returns:
            (ratings_df, users_movies): ``ratings_df`` is indexed by user id
            with one column per movie id (both in first-appearance order of
            the ratings file); ``users_movies`` is the ``movieId``/``userId``
            columns of the ratings file, i.e. the pairs that were rated.
        """
        data = pd.read_json('../data/ratings_data.json', orient='split')
        users_movies = data[['movieId', 'userId']]
        # U . sigma . Vt rebuilds the demeaned ratings; add the per-row mean
        # back to return to the original scale.
        all_predicted_ratings = np.dot(np.dot(self.U, self.sigma), self.Vt) + self.data_mean.reshape(-1, 1)
        ratings_df = pd.DataFrame(all_predicted_ratings, columns=data['movieId'].unique(), index=data['userId'].unique())

        return ratings_df, users_movies

    def recommend(self, user_id, n=10):
        """Print the ``n`` movies with the highest predicted rating that
        ``user_id`` has not rated yet, followed by the elapsed time.

        Raises:
            KeyError: if ``user_id`` does not occur in the ratings file.
        """
        start = time.time()

        # Use this instance, not the notebook-level ``recommender`` global:
        # the global may be undefined or refer to a different object.
        ratings_df, users_movies = self.get_predicted_ratings()
        user_rated_movies = users_movies.loc[users_movies['userId'] == user_id]['movieId'].values
        predicted_ratings = pd.DataFrame(ratings_df.loc[user_id])
        predicted_ratings.columns = ['rating']
        # Drop everything the user already rated, then keep the top n.
        recommended_movies = predicted_ratings.drop(user_rated_movies).sort_values(['rating'], ascending=False).head(n)
        print(recommended_movies)

        end = time.time()
        print(f'Finished in: {end - start}')

In [13]:
# Create the trainer; user/item counts stay 0 until load_data() runs.
model = SVDModel()

In [14]:
# Read ../data/ratings_data.json into a sparse float32 user x movie matrix.
data = model.load_data()

In [15]:
# Factorize the row-demeaned ratings, keeping k=90 singular values/vectors.
U, sigma, Vt, data_mean = model.train(data, 90)

In [16]:
# Pickle the factors and row means under ./models/SVD for SVDRecommender.
model.save(U, sigma, Vt, data_mean)

In [72]:
# Loads the four pickles written by model.save(); that cell must run first.
recommender = SVDRecommender()

In [74]:
# Print the 20 highest-predicted movies that user 3 has not rated yet.
recommender.recommend(3, 20)

        rating
2571  1.822199
1704  1.625877
4993  1.457079
1     1.408088
480   1.392052
5952  1.355521
260   1.313654
2329  1.229197
590   1.080012
4306  1.061863
150   1.041650
47    0.991581
457   0.979779
1198  0.958697
2706  0.928712
364   0.903169
1193  0.888551
1196  0.873269
780   0.849104
380   0.818009
Finished in: 0.540163516998291
