In [None]:
import numpy as np
import pandas as pd
import time
import json
import math
from numba import njit, prange, types
from numba.typed import Dict
import matplotlib.pyplot as plt
import os
import preprocessing
import training
import test
import tools
import importlib

# Reload the project modules so that edits made to them on disk are picked up
# without restarting the kernel. Order is unchanged: tools, test, training,
# preprocessing.
for edited_module in (tools, test, training):
    importlib.reload(edited_module)
# Kept as the trailing expression of the cell so the cell output is unchanged.
importlib.reload(preprocessing)

# Preprocessing GroupLens Dataset
To test the model, we divide the dataset into two parts: training and test. The original dataset is ordered sequentially, so we shuffle it randomly before splitting. Because of memory limits, we build an adjacency matrix for the rating matrix: the whole rating matrix, about **`280000 X 58000`**, cannot practically be loaded into memory. "userMapper" and "movieMapper" are hash maps that convert user IDs and movie IDs into contiguous natural numbers.

In [None]:
# Train / test split ratio. `test_ratio` is only used in the log line below; it
# is rounded so that 1 - 0.8 prints as 0.2 rather than 0.19999999999999996.
train_ratio = 0.8
test_ratio = round(1 - train_ratio, 10)

# The shuffled ratings are cached on disk and generated only when missing.
# The helper is called through the imported module: a bare `shuffle_data()` is
# undefined in this notebook, which only does `import preprocessing`.
# NOTE(review): assumes `shuffle_data` lives in preprocessing.py and produces
# 'shuffled_ratings.csv' — confirm.
if not os.path.isfile('shuffled_ratings.csv'):
    preprocessing.shuffle_data()

ratings = pd.read_csv('shuffled_ratings.csv')
split_bound = int(ratings.shape[0] * train_ratio)

print(f"Load shuffled data : {ratings.shape}")
print(f"Train / Test ratio : {train_ratio} / {test_ratio}")
print(f"Split data : [0:{split_bound}] , [{split_bound}:{ratings.shape[0]}]")

# Positional split: the first `split_bound` rows are the training set, the rest
# the test set. `reset_index(drop=True)` returns new frames renumbered from 0,
# so no slice of `ratings` is mutated in place.
train_data = ratings.iloc[:split_bound].reset_index(drop=True)
test_data = ratings.iloc[split_bound:].reset_index(drop=True)

# Build the id mappers and the adjacency matrix from the training split,
# printing the wall-clock time of each step.
# The helpers are called through the imported module: the bare names are
# undefined in this notebook, which only does `import preprocessing`.
# NOTE(review): assumes userIdMapper, movieIdMapper and build_adjacency_matrix
# are defined in preprocessing.py — confirm.
print("##### Get mapper and adjacency matrix")

start_time = time.time()
userMapper = preprocessing.userIdMapper(train_data)
print(f"userMapper processing Time : {time.time() - start_time}")

start_time = time.time()
movieMapper = preprocessing.movieIdMapper(train_data)
print(f"movieMapper processing Time : {time.time() - start_time}")

start_time = time.time()
user_item = preprocessing.build_adjacency_matrix(train_data, userMapper, movieMapper)
print(f"adjacency matrix processing Time : {time.time() - start_time}")

# Training model

LFM (Latent Factor Model) is a modeling approach that assumes there are latent factors that characterize an entity. We use 20-dimensional feature vectors to represent the characteristics of each user and each movie. These features are the unknown characteristics used to calculate a user's rating of a movie. This is driven by the assumption that "some factors definitely affect the rating, but we do not know what those factors are".

In [None]:
# Collaborative Filtering (model-based): training
# Settings handed to tools.load_files / training.training below.
latent_dim = 20
regular_term = 0.1
learning_rate = 0.005
epoch = 1

# Number of keys held by each mapper.
user_count = len(userMapper.keys())
movie_count = len(movieMapper.keys())

# load_files hands back the two factor containers and the running list of
# per-epoch training errors.
fm_users, fm_movies, train_error = tools.load_files(user_count, movie_count, latent_dim)

for epoch_idx in range(epoch):
    start_time = time.time()
    epoch_loss = training.training(
        user_item, fm_users, user_count, fm_movies, learning_rate, regular_term
    )
    print(f"Time : {time.time() - start_time}")
    # Record the loss returned for this epoch, then report it.
    train_error.append(epoch_loss)
    print(f"###### epoch {epoch_idx} {epoch_loss}")

# Plot the recorded training errors.
plt.plot(train_error)
plt.ylabel('train errors per epoch')
plt.show()

# Testing

After training the model, we should check whether it has been trained well. We already split the dataset to obtain a test set, so we simply use it here! We measure the performance of the model using RMSE (Root Mean Squared Error).

In [None]:
# Compute the error on the held-out test split and report the elapsed time.
# NOTE(review): `test_error` is neither defined nor imported in the visible
# notebook cells (only `import test` is) — confirm where it comes from and
# which arguments it expects.
start_time = time.time()
test_err = test_error(test_data)
# The elapsed time must use `start_time`; the previous `startTime` was never
# assigned and raised NameError on a fresh kernel.
print(f"Time : {time.time() - start_time}")
print(f"Test dataset error : {test_err}")

In [None]:
# Persist the factor containers and the training-error history.
# Called through the module, like its counterpart `tools.load_files`; a bare
# `save_files` is undefined in this notebook, which only does `import tools`.
# NOTE(review): assumes `save_files` is defined in tools.py — confirm.
tools.save_files(fm_users, fm_movies, train_error)

In [None]:
# Print the prediction for one hard-coded pair of arguments.
# NOTE(review): `predict` is neither defined nor imported in the visible
# notebook cells, and it is unclear whether 50 and 10 are raw ids or mapped
# indices — confirm.
sample_prediction = predict(50, 10)
print(sample_prediction)