In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from subprocess import call
from sklearn.metrics import mean_squared_error
import math


In [2]:
# Load the data
df_full = pd.read_csv('/home/dustin/Documents/Study/Master2/CILProject22/data_raw/data_train.csv')
df_train = pd.read_csv('/home/dustin/Documents/Study/Master2/CILProject22/data_raw/cross_validation/train_split_4.csv')
df_test = pd.read_csv('/home/dustin/Documents/Study/Master2/CILProject22/data_raw/cross_validation/test_split_4.csv')

dic_full = {
    'user_id': [str(x).partition("_")[0][1:] for x in df_full['Id']],
    'item_id': [str(x).partition("_")[2][1:] for x in df_full['Id']],
    #'combined': [(str(x).partition("_")[0][1:],str(x).partition("_")[2][1:]) for x in df['Id']],
    'rating': df_full['Prediction'],
}
dic_train = {
    'user_id': [str(x).partition("_")[0][1:] for x in df_train['Id']],
    'item_id': [str(x).partition("_")[2][1:] for x in df_train['Id']],
    'rating': df_train['Prediction'],
}
dic_test = {
    'user_id': [str(x).partition("_")[0][1:] for x in df_test['Id']],
    'item_id': [str(x).partition("_")[2][1:] for x in df_test['Id']],
    'rating': df_test['Prediction'],
}

full_data = pd.DataFrame(dic_full)
train_data = pd.DataFrame(dic_train)
test_data = pd.DataFrame(dic_test)
full_data[:100]


Unnamed: 0,user_id,item_id,rating
0,44,1,4
1,61,1,3
2,67,1,4
3,72,1,3
4,86,1,5
...,...,...,...
95,2706,1,4
96,2820,1,3
97,2883,1,2
98,2939,1,3


In [3]:
n_users = 10000
n_items = 1000
ratings = np.zeros((n_users, n_items))
train = np.zeros((n_users, n_items))
test = np.zeros((n_users, n_items))

# Create train and test sets via Tobi
for row in full_data.itertuples(index = False):
    ratings[int(row.user_id) - 1, int(row.item_id) - 1] = int(row.rating)

for row in train_data.itertuples(index = False):
    train[int(row.user_id) - 1, int(row.item_id) - 1] = int(row.rating)

for row in test_data.itertuples(index = False):
    test[int(row.user_id) - 1, int(row.item_id) - 1] = int(row.rating)

# Create train and test sets randomly
def create_random_train_test(ratings):
    """
    split into training and test sets,
    remove 10 ratings from each user
    and assign them to the test set
    """
    test = np.zeros(ratings.shape)
    train = ratings.copy()
    for user in range(ratings.shape[0]):
        test_index = np.random.choice(
            np.flatnonzero(ratings[user]), size = 10, replace = True)

        train[user, test_index] = 0.0
        test[user, test_index] = ratings[user, test_index]
        
    # assert that training and testing set are truly disjoint
    assert np.all(train * test == 0)
    return train, test
    

# train, test = create_random_train_test(ratings)
# del ratings

print(ratings[:10, :10])

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 5.]
 [0. 0. 0. 3. 0. 5. 0. 4. 0. 0.]
 [0. 0. 0. 2. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 2. 0. 0. 0. 5. 0. 3. 0. 0.]
 [0. 0. 0. 0. 0. 5. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 5. 0. 0. 0. 3.]
 [0. 0. 0. 1. 0. 5. 0. 5. 0. 0.]]


In [4]:
# normalization per item
train_T = np.transpose(train)
test_T = np.transpose(test)
print(train_T.shape)
avgs, stds = [], []
for item in range(n_items):
    mask = np.nonzero(train_T[item])
    #mask_test = np.nonzero(test_T[item])
    avg = np.mean(train_T[item][mask])
    std = np.std(train_T[item][mask])
    avgs.append(avg)
    stds.append(std)
    train_T[item][mask] = train_T[item][mask] - avg / std
    #test_T[item][mask_test] = test_T[item][mask_test] - avg / std


train = np.transpose(train_T)
test = np.transpose(test_T)
print(train.shape)
print(test[:10][:10])

(1000, 10000)
(10000, 1000)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 2. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [5]:
# Todo:
# - change n_iter to some stop condition (often converges after 10 iterations)

class ExplicitMF:
    """
    Train a matrix factorization model using Alternating Least Squares
    to predict empty entries in a matrix
    
    Parameters
    ----------
    n_iters : int
        number of iterations to train the algorithm
        
    n_factors : int
        number of latent factors to use in matrix 
        factorization model, some machine-learning libraries
        denote this as rank
        
    reg : float
        regularization term for item/user latent factors,
        since lambda is a keyword in python we use reg instead
    """

    def __init__(self, n_iters, n_factors, reg):
        self.reg = reg
        self.n_iters = n_iters
        self.n_factors = n_factors  
        self.n_user, self.n_item = train.shape
        self.user_factors = np.random.random((self.n_user, self.n_factors))
        self.item_factors = np.random.random((self.n_item, self.n_factors))
        
        
    def fit(self, train, test):
        """
        pass in training and testing at the same time to record
        model convergence, assuming both dataset is in the form
        of User x Item matrix with cells as ratings
        """

        self.test_rmse_record  = []
        self.train_rmse_record = []   
        for _ in range(self.n_iters):
            self.user_factors = self._als_step(train, self.user_factors, self.item_factors)
            self.item_factors = self._als_step(train.T, self.item_factors, self.user_factors)
            predictions = self.predict()
            test_rmse = self.compute_rmse(test, predictions)
            train_rmse = self.compute_rmse(train, predictions)
            self.test_rmse_record.append(test_rmse)
            self.train_rmse_record.append(train_rmse)
        
        return self    
    
    def _als_step(self, ratings, solve_vecs, fixed_vecs):
        """
        when updating the user matrix,
        the item matrix is the fixed vector and vice versa
        """
        A = fixed_vecs.T.dot(fixed_vecs) + np.eye(self.n_factors) * self.reg
        b = ratings.dot(fixed_vecs)
        A_inv = np.linalg.inv(A)
        solve_vecs = b.dot(A_inv)
        return solve_vecs
    
    def predict(self):
        """predict ratings for every user and item"""
        pred = self.user_factors.dot(self.item_factors.T)
        return pred
    
    @staticmethod
    def compute_rmse(y_true, y_pred):
        """ignore zero terms prior to comparing the mse"""
        mask = np.nonzero(y_true)
        mse = mean_squared_error(y_true[mask], y_pred[mask])
        return math.sqrt(mse)

In [29]:
model = ExplicitMF(n_iters = 100, n_factors = 30, reg = .1)
model.fit(train, test)
print(model.train_rmse_record)

[1.0589637714456948, 1.0106541033020207, 1.003833944275247, 1.0019876753488157, 1.0014608169827068, 1.0013819254836094, 1.0014727983194125, 1.001623904011846, 1.0017852766559567, 1.0019320581791025, 1.0020533614753724, 1.0021470927015326, 1.0022159530243262, 1.0022645103644356, 1.0022975142041084, 1.0023191473068396, 1.0023328054931293, 1.0023411184181479, 1.002346057456138, 1.0023490607991836, 1.0023511485317353, 1.002353019529055, 1.002355129711885, 1.002357753856125, 1.0023610337744342, 1.0023650155423793, 1.0023696780623121, 1.002374954846789, 1.0023807505306066, 1.0023869533044443, 1.0023934442029028, 1.002400103968601, 1.0024068180446846, 1.002413480114171, 1.0024199945007866, 1.002426277667915, 1.002432258995788, 1.0024378809780405, 1.002443098953103, 1.0024478804694845, 1.0024522043733448, 1.0024560596987655, 1.0024594444337842, 1.0024623642273216, 1.0024648310931072, 1.0024668621567712, 1.0024684784818587, 1.0024697040002595, 1.0024705645629957, 1.0024710871189475, 1.002471299

In [30]:
# print(model.test_mse_record)
pred = model.predict()
pred_T = np.transpose(pred)
for item in range(n_items):
    pred_T[item] = pred_T[item] * stds[item] + avgs[item]

pred = np.transpose(pred_T)

print(f'RMSE: {model.compute_rmse(test, pred)}')
print('\n', pred)

RMSE: 1.0144163560266977

 [[3.36261859 3.51664976 3.46123186 ... 3.21976197 3.35006664 3.66690271]
 [3.35008633 3.52553775 3.43770043 ... 3.30589592 3.3981146  3.68784434]
 [3.32860758 3.50610571 3.42209739 ... 3.22897011 3.3496739  3.68636958]
 ...
 [3.3678808  3.52400683 3.45386761 ... 3.18523598 3.3686551  3.7000951 ]
 [3.36416262 3.52812416 3.45112718 ... 3.2006763  3.38247363 3.77351732]
 [3.37324219 3.55258383 3.49219747 ... 3.22606623 3.37375106 3.68391879]]


In [26]:
# Big Cross Validation
regs = [0.5, 0.1, 0.05]
n_factors = [25, 30, 35]

best = (0, 0)
best_rmse = 100
for reg in regs:
    for n_factor in n_factors:
        model = ExplicitMF(n_iters = 100, n_factors = n_factor, reg = reg)
        model.fit(train, test)
        pred = model.predict()
        pred_T = np.transpose(pred)
        for item in range(n_items):
            pred_T[item] = pred_T[item] * stds[item] + avgs[item]
        pred = np.transpose(pred_T)
        rmse = model.compute_rmse(test, pred)
        if rmse < best_rmse:
            best_rmse = rmse
            best = (reg, n_factor)
            print(f'New best Hyperparameters: {best} with RMSE: {best_rmse}')


New best Hyperparameters: (0.5, 25) with RMSE: 1.0144726551695518
New best Hyperparameters: (0.5, 28) with RMSE: 1.014391762823866
New best Hyperparameters: (0.1, 28) with RMSE: 1.0143647130245828


# Submission

In [65]:
# normalize training data
ratings_T = np.transpose(ratings)
avgs_full, stds_full = [], []
for item in range(n_items):
    mask = np.nonzero(ratings_T[item])
    avg = np.mean(ratings_T[item][mask])
    std = np.std(ratings_T[item][mask])
    avgs_full.append(avg)
    stds_full.append(std)
    ratings_T[item][mask] = ratings_T[item][mask] - avg / std

ratings = np.transpose(ratings_T)

# train the model
model = ExplicitMF(n_iters = 100, n_factors = 30, reg = 3.0)
model.fit(ratings, test)

# predict ratings
pred = model.predict()
pred_T = np.transpose(pred)
for item in range(n_items):
    pred_T[item] = pred_T[item] * stds_full[item] + avgs_full[item]
pred = np.transpose(pred_T)
print(pred)

# write to submission-file
sample_sub = pd.read_csv("/home/dustin/Documents/Study/Master2/CILProject22/data_raw/sampleSubmission.csv")
prediction = []
for cell_id in sample_sub.Id:
    row, col = cell_id.split("_")
    prediction.append(pred[int(row[1:])-1, int(col[1:])-1])
sample_sub.Prediction = prediction
sample_sub.to_csv("../data/als.csv", index=False)
sample_sub

[[3.36541446 3.50222545 3.49768747 ... 3.22301762 3.32739128 3.68432503]
 [3.37129598 3.51177384 3.45411692 ... 3.35747631 3.40930599 3.74930346]
 [3.35422208 3.4945931  3.5054094  ... 3.25303381 3.36372622 3.65856614]
 ...
 [3.38551082 3.49370078 3.47494096 ... 3.22431976 3.37396801 3.73376356]
 [3.39680268 3.51936894 3.51356781 ... 3.21662515 3.31572792 3.75279551]
 [3.38298089 3.54173777 3.52161527 ... 3.31457306 3.35603526 3.74558618]]


Unnamed: 0,Id,Prediction
0,r37_c1,3.412387
1,r73_c1,3.367076
2,r156_c1,3.535411
3,r160_c1,3.450662
4,r248_c1,3.425766
...,...,...
1176947,r9974_c1000,3.684070
1176948,r9977_c1000,3.682352
1176949,r9978_c1000,3.629826
1176950,r9982_c1000,3.591850
