In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from subprocess import call
from sklearn.metrics import mean_squared_error
import math


In [20]:
# Load the data
df_full = pd.read_csv('/home/dustin/Documents/Study/Master2/CILProject22/data_raw/data_train.csv')
df_train = pd.read_csv('/home/dustin/Documents/Study/Master2/CILProject22/data_raw/cross_validation/train_split_4.csv')
df_test = pd.read_csv('/home/dustin/Documents/Study/Master2/CILProject22/data_raw/cross_validation/test_split_4.csv')

dic_full = {
    'user_id': [str(x).partition("_")[0][1:] for x in df_full['Id']],
    'item_id': [str(x).partition("_")[2][1:] for x in df_full['Id']],
    #'combined': [(str(x).partition("_")[0][1:],str(x).partition("_")[2][1:]) for x in df['Id']],
    'rating': df_full['Prediction'],
}
dic_train = {
    'user_id': [str(x).partition("_")[0][1:] for x in df_train['Id']],
    'item_id': [str(x).partition("_")[2][1:] for x in df_train['Id']],
    'rating': df_train['Prediction'],
}
dic_test = {
    'user_id': [str(x).partition("_")[0][1:] for x in df_test['Id']],
    'item_id': [str(x).partition("_")[2][1:] for x in df_test['Id']],
    'rating': df_test['Prediction'],
}

full_data = pd.DataFrame(dic_full)
train_data = pd.DataFrame(dic_train)
test_data = pd.DataFrame(dic_test)
test_data[:100]


Unnamed: 0,user_id,item_id,rating
0,61,1,3
1,120,1,2
2,457,1,2
3,670,1,3
4,966,1,5
...,...,...,...
95,3261,2,5
96,3278,2,4
97,3288,2,1
98,3303,2,5


In [21]:
n_users = 10000
n_items = 1000
train = np.zeros((n_users, n_items))
test = np.zeros((n_users, n_items))

# Create train and test sets via Tobi
for row in train_data.itertuples(index = False):
    train[int(row.user_id) - 1, int(row.item_id) - 1] = int(row.rating)

for row in test_data.itertuples(index = False):
    test[int(row.user_id) - 1, int(row.item_id) - 1] = int(row.rating)

# Create train and test sets randomly
def create_random_train_test(ratings):
    """
    split into training and test sets,
    remove 10 ratings from each user
    and assign them to the test set
    """
    test = np.zeros(ratings.shape)
    train = ratings.copy()
    for user in range(ratings.shape[0]):
        test_index = np.random.choice(
            np.flatnonzero(ratings[user]), size = 10, replace = True)

        train[user, test_index] = 0.0
        test[user, test_index] = ratings[user, test_index]
        
    # assert that training and testing set are truly disjoint
    assert np.all(train * test == 0)
    return train, test
    

# train, test = create_random_train_test(ratings)
# del ratings

print(train[:10, :10])

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 5. 0. 4. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 2. 0. 0. 0. 5. 0. 3. 0. 0.]
 [0. 0. 0. 0. 0. 5. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 3.]
 [0. 0. 0. 1. 0. 5. 0. 5. 0. 0.]]


In [22]:
# normalization per item
train_T = np.transpose(train)
test_T = np.transpose(test)
print(train_T.shape)
avgs, stds = [], []
for item in range(n_items):
    mask = np.nonzero(train_T[item])
    #mask_test = np.nonzero(test_T[item])
    avg = np.mean(train_T[item][mask])
    std = np.std(train_T[item][mask])
    avgs.append(avg)
    stds.append(std)
    train_T[item][mask] = train_T[item][mask] - avg / std
    #test_T[item][mask_test] = test_T[item][mask_test] - avg / std


train = np.transpose(train_T)
test = np.transpose(test_T)
print(train.shape)
print(test[:10][:10])

(1000, 10000)
(10000, 1000)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 2. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [23]:
# Todo:
# - change n_iter to some stop condition (often converges after 10 iterations)

class ExplicitMF:
    """
    Train a matrix factorization model using Alternating Least Squares
    to predict empty entries in a matrix
    
    Parameters
    ----------
    n_iters : int
        number of iterations to train the algorithm
        
    n_factors : int
        number of latent factors to use in matrix 
        factorization model, some machine-learning libraries
        denote this as rank
        
    reg : float
        regularization term for item/user latent factors,
        since lambda is a keyword in python we use reg instead
    """

    def __init__(self, n_iters, n_factors, reg):
        self.reg = reg
        self.n_iters = n_iters
        self.n_factors = n_factors  
        self.n_user, self.n_item = train.shape
        self.user_factors = np.random.random((self.n_user, self.n_factors))
        self.item_factors = np.random.random((self.n_item, self.n_factors))
        
        
    def fit(self, train, test):
        """
        pass in training and testing at the same time to record
        model convergence, assuming both dataset is in the form
        of User x Item matrix with cells as ratings
        """

        self.test_rmse_record  = []
        self.train_rmse_record = []   
        for _ in range(self.n_iters):
            self.user_factors = self._als_step(train, self.user_factors, self.item_factors)
            self.item_factors = self._als_step(train.T, self.item_factors, self.user_factors)
            predictions = self.predict()
            test_rmse = self.compute_rmse(test, predictions)
            train_rmse = self.compute_rmse(train, predictions)
            self.test_rmse_record.append(test_rmse)
            self.train_rmse_record.append(train_rmse)
        
        return self    
    
    def _als_step(self, ratings, solve_vecs, fixed_vecs):
        """
        when updating the user matrix,
        the item matrix is the fixed vector and vice versa
        """
        A = fixed_vecs.T.dot(fixed_vecs) + np.eye(self.n_factors) * self.reg
        b = ratings.dot(fixed_vecs)
        A_inv = np.linalg.inv(A)
        solve_vecs = b.dot(A_inv)
        return solve_vecs
    
    def predict(self):
        """predict ratings for every user and item"""
        pred = self.user_factors.dot(self.item_factors.T)
        return pred
    
    @staticmethod
    def compute_rmse(y_true, y_pred):
        """ignore zero terms prior to comparing the mse"""
        mask = np.nonzero(y_true)
        mse = mean_squared_error(y_true[mask], y_pred[mask])
        return math.sqrt(mse)

In [31]:
model = ExplicitMF(n_iters = 50, n_factors = 10, reg = .10)

model.fit(train, test)

<__main__.ExplicitMF at 0x7fe23fc5c400>

In [34]:
# print(model.test_mse_record)
pred = model.predict()
pred_T = np.transpose(pred)
for item in range(n_items):
    pred_T[item] = pred_T[item] * stds[item] + avgs[item]

pred = np.transpose(pred_T)

print(f'RMSE: {model.compute_rmse(test, pred)}')
# print(model.train_rmse_record, '\n')
print('\n', pred)

RMSE: 1.019889825420967

 [[3.35755022 3.51721261 3.44835208 ... 3.20554749 3.34785839 3.66994406]
 [3.35629181 3.50820292 3.46304968 ... 3.22317655 3.34363682 3.64951072]
 [3.3480242  3.50945019 3.45117381 ... 3.18928639 3.326243   3.61431565]
 ...
 [3.356552   3.51643673 3.4382637  ... 3.19712123 3.34064867 3.68857261]
 [3.34023736 3.50071495 3.41926294 ... 3.20418481 3.32930313 3.72256877]
 [3.37598691 3.52764566 3.47093533 ... 3.23919523 3.36585445 3.69036787]]


In [None]:
# ToDo
# normalization