In [93]:
import pandas as pd
import numpy as np
import random
import os
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import surprise
from tqdm.autonotebook import tqdm
import pickle

In [3]:
def read_data(file_path, train_frac=0.8, random_state=200):
    '''
    Read a '::'-separated ratings file into a pandas dataframe, drop the
    Timestamp column and return a random train/test split.

    Parameters
    ----------
    file_path : path of the ratings file. Each line is expected to hold
        UserID::MovieID::Rating::Timestamp (no header row).
    train_frac : fraction of rows sampled into the training frame (default 0.8).
    random_state : seed passed to DataFrame.sample so the split is reproducible
        (default 200).

    Returns
    -------
    (train, test) : two dataframes with columns UserID, MovieID, Rating.
        ``test`` holds every row whose index label was not sampled into ``train``.
    '''
    # Use the caller's path (the previous version ignored the argument and always
    # read a hard-coded file). A multi-character separator is only supported by
    # the python parser engine, so request it explicitly instead of relying on
    # the warning-emitting fallback.
    df = pd.read_csv(
        file_path,
        sep='::',
        engine='python',
        names=['UserID', 'MovieID', 'Rating', 'Timestamp'],
    )
    df = df.drop(columns='Timestamp')
    train = df.sample(frac=train_frac, random_state=random_state)
    # Everything not sampled into train becomes the test set.
    test = df.drop(train.index)
    return train, test

In [4]:
# Load the ratings file and keep the (train, test) dataframes returned by read_data.
train_data, test_data = read_data('ml-1m/ratings.dat')

  """


In [90]:
class MYSVD:
    '''
    Biased latent-factor rating predictor trained with stochastic gradient descent.

    A rating is modelled as
        avg + bu[user] + bi[item] + qi[item] . pu[user]
    where ``avg`` is the mean training rating, ``bu`` / ``bi`` are dicts of
    per-user / per-item biases and ``pu`` / ``qi`` are dicts of (1, K) factor arrays.

    Typical use: ``fit`` (initialise parameters), ``train`` (run SGD), ``test``.
    '''
    def __init__(self, K = 20, epoch = 20, lr = 0.007, reg = 0.002):
        '''
        K     : number of latent factors per user / item.
        epoch : number of passes over the training data made by ``train``.
        lr    : initial SGD learning rate (decayed in place by ``train``).
        reg   : L2 regularisation weight applied to biases and factors.
        '''
        self.K = K
        self.epoch = epoch
        self.lr = lr
        self.reg = reg

    def predict(self, user_id, item_id):
        '''
        Return the predicted rating for (user_id, item_id).

        Ids that have no parameters yet get a zero bias and a random factor
        vector (uniform in [0.1, 1.1)); those entries are stored on the model,
        so the first prediction for an unseen id mutates the model and depends
        on numpy's global random state.

        The result has the shape of ``np.dot(qi, pu.T)``, i.e. a (1, 1) array.
        Requires ``fit`` to have been called (reads self.avg and the dicts).
        '''
        self.bi.setdefault(item_id,0)
        self.bu.setdefault(user_id,0)
        self.qi.setdefault(item_id,np.random.random((1,self.K)) + 0.1)
        self.pu.setdefault(user_id,np.random.random((1,self.K)) + 0.1)
        return self.avg + self.bu[user_id] + self.bi[item_id] + np.dot(self.qi[item_id], self.pu[user_id].T)

    def fit(self, train_df):
        '''
        Initialise the model parameters from ``train_df``.

        ``train_df`` must have UserID, MovieID and Rating columns. Stores the
        frame, sets ``avg`` to the mean rating and creates a zero bias and a
        constant 0.1 factor vector for every distinct user and every distinct
        item. Any previously learned parameters are discarded.
        '''
        print ("Fitting starts")
        self.train_df = train_df
        self.avg = np.average(self.train_df['Rating'])
        self.bi={}
        self.bu={}
        self.qi={}
        self.pu={}
        # One entry per distinct id is all that is needed, so walk the unique
        # ids instead of every rating row.
        for user_id in self.train_df['UserID'].unique():
            self.bu[user_id] = 0
            self.pu[user_id] = np.zeros((1,self.K)) + 0.1
        for item_id in self.train_df['MovieID'].unique():
            # Item biases are keyed by item id (they used to be registered under
            # the user id, which made train() fail with KeyError for any movie id
            # that was not also a user id).
            self.bi[item_id] = 0
            self.qi[item_id] = np.zeros((1,self.K)) + 0.1
        print ("Fitting ends")

    def train(self, train_df):
        '''
        Run ``self.epoch`` SGD passes over ``train_df`` (columns UserID, MovieID, Rating).

        ``fit`` must have been called first with data covering every user and
        item in ``train_df``; otherwise the parameter lookups raise KeyError
        (or AttributeError if ``fit`` was never called).

        Side effects: updates bu / bi / pu / qi in place and multiplies
        ``self.lr`` by 0.93 after every epoch, so the decayed rate persists on
        the model after training.
        '''
        self.train_df = train_df

        users = self.train_df['UserID']
        items = self.train_df['MovieID']
        ratings = self.train_df['Rating']
        for i in tqdm(range(self.epoch)):
            print("Training epoch {}".format(i))
            for user_id, item_id, rating in zip(users, items, ratings):
                rui = self.avg + self.bu[user_id] + self.bi[item_id] + np.dot(self.qi[item_id], self.pu[user_id].T)
                eui = rating - rui

                self.bu[user_id] += self.lr * (eui - self.reg * self.bu[user_id])
                self.bi[item_id] += self.lr * (eui - self.reg * self.bi[item_id])
                self.pu[user_id] += self.lr * (eui * self.qi[item_id] - self.reg * self.pu[user_id])
                # NOTE: pu was just updated in place, so this item update sees the
                # new user factors. Kept as-is to preserve the trained results.
                self.qi[item_id] += self.lr * (eui * self.pu[user_id] - self.reg * self.qi[item_id])
            self.lr *= 0.93

    def test(self, test_df, normalize = False):
        '''
        Evaluate the model on ``test_df`` (columns UserID, MovieID, Rating).

        Returns a tuple ``(rmse, mae)``:
          * normalize=False (default): the SUM of squared errors and the SUM of
            absolute errors over all rows; divide by the row count (and take the
            square root of the first) to obtain RMSE / MAE.
          * normalize=True: the actual RMSE and MAE.

        Values carry the shape returned by ``predict`` ((1, 1) arrays) unless
        ``test_df`` is empty, in which case the unnormalised result is (0.0, 0).

        Raises ValueError when normalize=True and ``test_df`` has no rows.
        Calls ``predict``, so unseen ids get random parameters stored on the model.
        '''
        rmse=0.0
        mae=0
        for user_id, item_id, rating in zip(test_df['UserID'], test_df['MovieID'], test_df['Rating']):
            eui=rating-self.predict(user_id, item_id)
            rmse+=eui**2
            mae+=abs(eui)
        if not normalize:
            return rmse, mae
        N = test_df.shape[0]
        if N == 0:
            raise ValueError("cannot normalize errors: test_df has no rows")
        return np.sqrt(rmse / N), mae / N

    def read_model(self, path = 'svd.model'):
        '''
        Load a pickled MYSVD from ``path`` and copy its state onto this instance.

        NOTE(review): the original definition was an unfinished stub without a
        body; this implementation mirrors the pickle round-trip used by the
        notebook cells that save and reload 'svd.model' -- confirm this is the
        intended behaviour.

        Returns ``self``. Raises TypeError if the file does not contain a MYSVD.
        '''
        # SECURITY: pickle.load can execute arbitrary code; only use with files
        # that you created yourself or otherwise trust.
        with open(path, 'rb') as model_file:
            loaded = pickle.load(model_file)
        if not isinstance(loaded, MYSVD):
            raise TypeError("{} does not contain a MYSVD model".format(path))
        self.__dict__.update(loaded.__dict__)
        return self

In [91]:
# Build a model with 20 epochs, initialise its parameters from the training
# split, run the SGD training, then evaluate on the test split.
# The last expression displays the tuple returned by MYSVD.test, which holds the
# summed squared error and the summed absolute error (not yet divided by the row count).
model = MYSVD(epoch = 20)
model.fit(train_data)
model.train(train_data)
model.test(test_data)

Fitting starts


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Fitting ends


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

Training epoch 0
Training epoch 1
Training epoch 2
Training epoch 3
Training epoch 4
Training epoch 5
Training epoch 6
Training epoch 7
Training epoch 8
Training epoch 9
Training epoch 10
Training epoch 11
Training epoch 12
Training epoch 13
Training epoch 14
Training epoch 15
Training epoch 16
Training epoch 17
Training epoch 18
Training epoch 19



(array([[158371.950798]]), array([[140078.92807162]]))

In [84]:
# Number of ratings in the test split; used to turn summed errors into averages.
N = len(test_data)

In [92]:
# Derive the metrics from the model's summed errors instead of numbers pasted
# from an earlier cell's output, so the cell stays correct when the notebook is
# re-run. This costs one extra evaluation pass over the test split.
# "ase" in the message is the mean absolute error (summed absolute error / N).
sq_err_sum, abs_err_sum = model.test(test_data)
print('rmse is {0:3f}, ase is {1:3f}'.format(np.sqrt(np.asarray(sq_err_sum).item()/N), np.asarray(abs_err_sum).item()/N))

rmse is 0.889772, ase is 0.700248


In [94]:
# Persist the trained model object to 'svd.model' with pickle.
with open('svd.model', 'wb') as svd_model:
    pickle.dump(model, svd_model)

In [95]:
# Read the pickled model back from 'svd.model'.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open('svd.model', 'rb') as svd_model:
    # Deserialize the stored object into a separate variable (model_read).
    model_read = pickle.load(svd_model)


In [97]:
# Evaluate the reloaded model on the test split; presumably a check that the
# pickle round-trip reproduces the error sums of the in-memory model.
model_read.test(test_data)

(array([[158371.950798]]), array([[140078.92807162]]))