In [1]:
import os
import random
import sys
import math
import json

import numpy as np
import pandas as pd
from decimal import Decimal
from collections import defaultdict
from datetime import datetime

In [None]:
# Load the raw table and reshape it to long format: one row per
# (objectId, visitorIp) pair. Every column other than 'objectId' is melted
# into the 'visitorIp' column, and its cell value becomes 'counts'.
df2 = pd.read_csv(r'./data/df.csv')
df = (
    df2.reset_index()
    .drop(columns=['index'])
    .melt(id_vars=['objectId'])
    .rename(columns={'value': 'counts', 'variable': 'visitorIp'})
)
# Assign the filled column back explicitly. The previous
# `df['counts'].fillna(0, inplace=True)` operated on a column selection, which
# pandas warns about and which has no effect on `df` under Copy-on-Write.
df['counts'] = df['counts'].fillna(0)
df['counts'].unique()

In [3]:
def ensure_dir(file_path):
    """Create the parent directory of ``file_path`` if it does not exist.

    Only the directory part (``os.path.dirname(file_path)``) is created, so a
    path ending in a separator creates that directory itself, while a path
    ending in a file name creates the folder that contains the file.

    Args:
        file_path: Path to a file (or a directory path ending in ``/``).

    Returns:
        None.
    """
    directory = os.path.dirname(file_path)
    # A bare file name has no directory component; os.makedirs('') would raise.
    if not directory:
        return
    # exist_ok avoids the check-then-create race and makes the call idempotent.
    os.makedirs(directory, exist_ok=True)

In [4]:
class MatrixFactorization(object):
    """Biased matrix factorization trained one latent factor at a time with SGD.

    The model keeps one row of latent factors per user (``visitorIp``) and per
    item (``objectId``) plus a bias per user and per item. All factor values
    and hyper-parameters are ``decimal.Decimal`` objects stored in object-dtype
    numpy arrays.

    Input frames are expected to have the columns ``visitorIp``, ``objectId``
    and ``counts``; rows whose ``counts`` is 0 are ignored during training and
    error calculation.
    """

    # Hyper-parameters. These are class-level defaults; meta_parameter_train
    # overrides them on the instance for every learning rate it tries.
    Regularization = Decimal(0.002)
    BiasLearnRate = Decimal(0.001)
    BiasReg = Decimal(0.002)

    LearnRate = Decimal(0.001)

    item_bias = None
    user_bias = None
    # Not referenced anywhere in this class; kept for backward compatibility.
    beta = 0.02

    # Counts the epochs after which `finished` decided to continue.
    iterations = 0

    def __init__(self, save_path, max_iterations=10):
        """Set up an untrained model.

        Args:
            save_path: Base directory under which ``save`` writes ``/model/``.
            max_iterations: Upper bound on SGD epochs per latent factor.

        Side effects:
            Seeds the global ``random`` module with 42 and makes sure the
            parent directory of ``save_path`` exists.
        """
        self.save_path = save_path
        self.user_factors = None
        self.item_factors = None
        self.item_counts = None
        self.u_inx = None
        self.i_inx = None
        self.user_ids = None
        self.object_ids = None
        self.rmse_values = {}

        self.MAX_ITERATIONS = max_iterations
        random.seed(42)

        ensure_dir(save_path)

    def _learn_rate_tag(self):
        """Return the integer part of ``LearnRate * 10000`` as a string (used in paths)."""
        return str(self.LearnRate * 10000).split('.')[0]

    def initialize_factors(self, ratings, k=25):
        """Build the id-to-row lookups and reset factors and biases.

        Args:
            ratings: DataFrame with ``visitorIp``, ``objectId`` and ``counts``.
            k: Number of latent factors per user and per item.
        """
        self.user_ids = set(ratings['visitorIp'].values)
        self.object_ids = set(ratings['objectId'].values)
        self.item_counts = ratings[['objectId', 'counts']].groupby('objectId').count()
        self.item_counts = self.item_counts.reset_index()

        # Map every raw id to its row in the factor matrices.
        self.u_inx = {r: i for i, r in enumerate(self.user_ids)}
        self.i_inx = {r: i for i, r in enumerate(self.object_ids)}

        # Filling with a Decimal yields object-dtype arrays of Decimal values.
        self.item_factors = np.full((len(self.i_inx), k), Decimal(0.1))
        self.user_factors = np.full((len(self.u_inx), k), Decimal(0.1))

        print("User factors: {}".format(self.user_factors.shape))
        print("Item factors: {}".format(self.item_factors.shape))
        self.user_bias = defaultdict(lambda: 0)
        self.item_bias = defaultdict(lambda: 0)

    def predict(self, user, item):
        """Predict the count for a (user, item) pair of matrix row indexes.

        The raw score is the dot product of all latent factors plus both
        biases. Scores above 2 are returned as 2 and scores below 1 are
        returned as 0; anything in between is returned unchanged.

        Args:
            user: Row index into ``user_factors`` (a value of ``u_inx``).
            item: Row index into ``item_factors`` (a value of ``i_inx``).
        """
        pq = np.dot(self.item_factors[item], self.user_factors[user].T)
        b_ui = self.user_bias[user] + self.item_bias[item]
        prediction = pq + b_ui

        # NOTE(review): the lower branch maps to 0 rather than 1; kept as-is
        # because it looks deliberate - confirm against the intended range.
        if prediction > 2:
            prediction = 2
        elif prediction < 1:
            prediction = 0
        return prediction

    def build(self, ratings, params):
        """Train the model, optionally configured through ``params``.

        Args:
            ratings: DataFrame passed on to ``train``.
            params: Optional dict. ``'k'`` selects the number of factors
                (default 100, the same default ``train`` uses) and
                ``'save_path'`` replaces the current save path. May be
                ``None`` or empty.
        """
        # `k` used to be unbound when no params were supplied.
        k = 100
        if params:
            k = params.get('k', k)
            self.save_path = params.get('save_path', self.save_path)

        self.train(ratings, k)

    def split_data(self, ratings):
        """Split ratings by user into a 30% test part and a 70% train part.

        Only users with at least one row where ``counts > 0`` take part in the
        split; all rows of a selected user go to the same part.

        Args:
            ratings: DataFrame with ``visitorIp`` and ``counts`` columns.

        Returns:
            A ``(test, train)`` tuple of DataFrames - note the order.
        """
        users = set(ratings.loc[ratings['counts'] > 0.0]['visitorIp'].values)
        train_data_len = int((len(users) * 70 / 100))

        # Sample from a sorted list: the iteration order of a set of strings
        # changes between interpreter runs, which would make the split
        # irreproducible even with a fixed random seed.
        ordered_users = sorted(users, key=str)
        test_users = set(random.sample(ordered_users, (len(users) - train_data_len)))
        train_users = users - test_users

        train = ratings[ratings['visitorIp'].isin(train_users)]
        test = ratings[ratings['visitorIp'].isin(test_users)]

        return test, train

    def meta_parameter_train(self, ratings_df, learn_rates=None, factor_counts=None):
        """Grid-search learning rate and factor count, recording RMSE curves.

        For every learning rate and every factor count the model is
        re-initialised, trained factor by factor on the train split and
        evaluated on the test split after each epoch. After each factor count
        the factors are saved and the RMSE history is written to
        ``<save_path>/model/<lr tag>/rmse_values.json``.

        Args:
            ratings_df: DataFrame with ``visitorIp``, ``objectId``, ``counts``.
            learn_rates: Learning rates to try. Defaults to
                ``[0.0005, 0.001, 0.005, 0.01, 0.05, 0.1]``.
            factor_counts: Factor counts to try. Defaults to
                ``[25, 50, 75, 100, 150]``.
        """
        if learn_rates is None:
            learn_rates = [0.0005, 0.001, 0.005, 0.01, 0.05, 0.1]
        if factor_counts is None:
            factor_counts = [25, 50, 75, 100, 150]

        for lr in learn_rates:
            self.Regularization = Decimal(0.002)
            self.BiasReg = Decimal(0.002)

            self.BiasLearnRate = Decimal(lr)
            self.LearnRate = Decimal(lr)

            for k in factor_counts:
                self.rmse_values[k] = {}
                self.rmse_values[k]['train'] = []
                self.rmse_values[k]['test'] = []
                self.initialize_factors(ratings_df, k)
                print("Treniranje na {} faktora".format(k))
                print(str(k), "faktor, iteracija, train_mse, test_mse, vrijeme")

                # split_data takes only the ratings frame; the former extra
                # positional argument raised a TypeError.
                test_data, train_data = self.split_data(ratings_df)
                columns = ['visitorIp', 'objectId', 'counts']
                ratings = train_data[columns].to_numpy()
                test = test_data[columns].to_numpy()

                # Full random permutation of the row indexes (previously one
                # random row was silently left out).
                index_randomized = random.sample(range(len(ratings)), len(ratings))

                for factor in range(k):
                    factor_iteration = 0

                    last_err = sys.maxsize
                    last_test_mse = sys.maxsize
                    finished = False

                    # Each factor trains on a half-sized sample drawn with replacement.
                    indexes = random.choices(index_randomized, k=math.floor(len(index_randomized) * 0.5))
                    while not finished:
                        train_mse = self.stocastic_gradient_descent(factor, indexes, ratings)

                        test_mse = self.calculate_rmse(test, factor)

                        finished = self.finished(factor_iteration,
                                                 last_err,
                                                 train_mse,
                                                 last_test_mse,
                                                 test_mse)

                        last_err = train_mse
                        last_test_mse = test_mse
                        factor_iteration += 1
                        self.rmse_values[k]['train'].append(train_mse)
                        self.rmse_values[k]['test'].append(test_mse)

                self.save(k, False)

                # Use the configured save path, consistent with `save`.
                rmse_path = self.save_path + '/model/' + self._learn_rate_tag() + '/rmse_values.json'
                ensure_dir(rmse_path)

                with open(rmse_path, 'w') as outfile:
                    outfile.write(json.dumps(self.rmse_values))

    def calculate_rmse(self, ratings, factor):
        """Return the RMSE over all rows with a positive count.

        Predictions use only the latent factors ``0..factor`` plus both biases
        and are not clamped.

        Args:
            ratings: 2-D array whose rows are ``(visitorIp, objectId, counts)``.
            factor: Index of the last latent factor to include.

        Returns:
            The RMSE as a float, or ``0.0`` when no row has a positive count.
        """
        # Accumulate in Decimal only. Mixing Decimal and float results in an
        # object array made the former summation raise a TypeError.
        squared_error = Decimal(0)
        n = 0
        for row in ratings:
            rating = Decimal(row[2])
            if not rating > 0:
                continue

            user = self.u_inx[row[0]]
            item = self.i_inx[row[1]]

            pq = np.dot(self.item_factors[item][:factor + 1], self.user_factors[user][:factor + 1].T)
            b_ui = self.user_bias[user] + self.item_bias[item]
            prediction = pq + b_ui

            squared_error += (prediction - rating) ** 2
            n += 1

        # Nothing to evaluate: avoid dividing by zero.
        if n == 0:
            return 0.0
        return math.sqrt(squared_error / n)

    def train(self, ratings_df, k=100):
        """Train ``k`` latent factors one after another and save after each.

        Args:
            ratings_df: DataFrame with ``visitorIp``, ``objectId``, ``counts``.
            k: Number of latent factors to train.
        """
        self.initialize_factors(ratings_df, k)

        # The message announces the number of factors, so format `k`
        # (a timestamp used to be formatted here).
        print("Treniranje na {} faktora".format(k))

        ratings = ratings_df[['visitorIp', 'objectId', 'counts']].to_numpy()

        # Full random permutation of the row indexes (previously one random
        # row was silently left out and empty input raised a ValueError).
        index_randomized = random.sample(range(len(ratings)), len(ratings))

        for factor in range(k):
            factor_time = datetime.now()
            iterations = 0
            last_err = sys.maxsize
            iteration_err = sys.maxsize
            finished = False
            # Each factor trains on a half-sized sample drawn with replacement.
            indexes = random.choices(index_randomized, k=math.floor(len(index_randomized) * 0.5))

            while not finished:
                start_time = datetime.now()
                iteration_err = self.stocastic_gradient_descent(factor,
                                                                indexes,
                                                                ratings)

                iterations += 1
                print("epoha u {}, f={}, i={} err={}".format(datetime.now() - start_time,
                                                                       factor,
                                                                       iterations,
                                                                       iteration_err))
                finished = self.finished(iterations,
                                         last_err,
                                         iteration_err)
                last_err = iteration_err
            # `finished` is always True here, so this writes to <save_path>/model/.
            self.save(factor, finished)
            print("zavrsne faktor {} on f={} i={} err={}".format(factor,
                                                                  datetime.now() - factor_time,
                                                                  iterations,
                                                                  iteration_err))

    def stocastic_gradient_descent(self, factor, index_randomized, ratings):
        """Run one SGD epoch for a single latent factor.

        For every sampled row with a non-zero count the user and item biases
        and the ``factor``-th user and item factor are updated in place.

        Args:
            factor: Index of the latent factor being trained.
            index_randomized: Row indexes into ``ratings`` to visit, in order.
            ratings: 2-D array whose rows are ``(visitorIp, objectId, counts)``.

        Returns:
            The RMSE over all of ``ratings`` (not just the visited rows), using
            factors ``0..factor``.
        """
        lr = self.LearnRate
        b_lr = self.BiasLearnRate
        r = self.Regularization
        bias_r = self.BiasReg
        # Keeps the last index sample on the instance; not read in this class.
        self.test = index_randomized

        for inx in index_randomized:
            rating_row = ratings[inx]

            u = self.u_inx[rating_row[0]]
            i = self.i_inx[rating_row[1]]
            rating = Decimal(rating_row[2])

            # Zero counts carry no training signal.
            if rating == 0.0:
                continue

            # The error is measured against the clamped prediction.
            pred = self.predict(u, i)
            err = (rating - pred)

            self.user_bias[u] += b_lr * (err - bias_r * self.user_bias[u])
            self.item_bias[i] += b_lr * (err - bias_r * self.item_bias[i])

            # Read both factors before updating so each update uses the old values.
            user_fac = self.user_factors[u][factor]
            item_fac = self.item_factors[i][factor]

            self.user_factors[u][factor] += lr * (err * item_fac - r * user_fac)
            self.item_factors[i][factor] += lr * (err * user_fac - r * item_fac)

        return self.calculate_rmse(ratings, factor)

    def finished(self, iterations, last_err, current_err,
                 last_test_mse=0.0, test_mse=0.0):
        """Decide whether training of the current factor should stop.

        Training stops when the test error increased, when ``iterations``
        reached ``MAX_ITERATIONS``, or when the training error improved by
        less than 0.0001.

        Returns:
            True to stop, False to continue (which also increments
            ``self.iterations``).
        """
        if last_test_mse < test_mse or iterations >= self.MAX_ITERATIONS or last_err - current_err < 0.0001:
            print('Završeno sa: {} iteracija, diff: {}, last_err: {}, current_err {}, lst_rmse {}, rmse {}'
                             .format(iterations, last_err - current_err , last_err, current_err, last_test_mse, test_mse))
            return True
        else:
            self.iterations += 1
            return False

    def save(self, factor, finished):
        """Write factors and biases as JSON files.

        Files go to ``<save_path>/model/`` when ``finished`` is true, otherwise
        to ``<save_path>/model/<factor>_<lr tag>/``.

        Args:
            factor: Value used in the sub-directory name when not finished.
            finished: Whether to write to the top-level model directory.
        """
        save_path = self.save_path + '/model/'
        if not finished:
            save_path += str(factor) + '_' + self._learn_rate_tag() + '/'

        ensure_dir(save_path)

        print("spremanje faktora u {}".format(save_path))
        user_bias = {str(uid): float(self.user_bias[self.u_inx[uid]]) for uid in self.u_inx.keys()}
        item_bias = {str(iid): float(self.item_bias[self.i_inx[iid]]) for iid in self.i_inx.keys()}

        # Label rows from the lookup dicts: their insertion order is the row
        # order of the factor matrices by construction.
        uf = pd.DataFrame(self.user_factors,
                          index=list(self.u_inx))
        it_f = pd.DataFrame(self.item_factors,
                            index=list(self.i_inx))

        with open(save_path + 'user_factors.json', 'w') as outfile:
            outfile.write(uf.to_json())
        with open(save_path + 'item_factors.json', 'w') as outfile:
            outfile.write(it_f.to_json())
        with open(save_path + 'user_bias.json', 'w') as outfile:
            json.dump(user_bias, outfile, default = str)
        with open(save_path + 'item_bias.json', 'w') as outfile:
            json.dump(item_bias, outfile, default = str)


In [None]:
# Instantiate the model with up to 40 SGD epochs per latent factor, then train
# it on the long-format ratings frame using the default number of factors.
MF = MatrixFactorization(
    max_iterations=40,
    save_path='./{}'.format('save'),
)
MF.train(ratings_df=df)