# Alternating Least Squares (Demo)
This Jupyter notebook demonstrates one of our baseline models: Alternating Least Squares (ALS). You may need to install some Python libraries if you run into import errors; the pip installation commands are provided as comments in the next cell.

In [4]:
# If running on the ETHZ Euler cluster, you need to add the '--user' option, e.g. !pip install --user scikit-surprise
#!pip install numpy
#!pip install pandas
#!pip install scikit-surprise

In [5]:
# Import Libraries
import csv
import time

import numpy as np
import pandas as pd
from surprise import AlgoBase, PredictionImpossible, Reader, Dataset, accuracy
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV

In [6]:
# Define some helper functions
def clean_df(df):
    '''
    Split the combined "Id" column (e.g. "r44_c1") into zero-based integer
    'row' (user) and 'col' (movie) columns.

    Parameters:
    df (pandas.DataFrame): frame with an "Id" column of the form "r<row>_c<col>"
        and a "Prediction" column

    Returns:
    data_df (pandas.DataFrame): frame with the columns 'row', 'col' and 'Prediction'
    '''
    def _zero_based(token, prefix):
        # "r12" -> 11: take what follows the prefix letter and shift to zero-based indexing
        return int(token.split(prefix)[1]) - 1

    id_parts = df["Id"].map(lambda raw_id: raw_id.split("_"))

    return pd.DataFrame(data={
        'row': id_parts.map(lambda parts: _zero_based(parts[0], "r")),
        'col': id_parts.map(lambda parts: _zero_based(parts[1], "c")),
        'Prediction': df.loc[:, 'Prediction'],
    })

def read_submission_indexes(file_path):
    '''
    Reads the indexes for which to make the predictions.

    Only the first column of the file (the ids of the form "r<row>_c<col>") is
    used; the remaining columns are ignored, so placeholder or empty prediction
    values in the file do not matter.

    Parameters:
    file_path (string): the file path to read

    Returns:
    indexes (list): a list where each element is a tuple of the form (row, column),
        both zero-based
    '''

    ids = pd.read_csv(file_path).iloc[:, 0]

    tuples = []

    for identifier in ids:
        parts = identifier.split('_')
        # Drop the leading 'r' / 'c' letter; indexing in the data starts from 1
        row, column = int(parts[0][1:]) - 1, int(parts[1][1:]) - 1
        tuples.append((row, column))

    return tuples

def write_predictions_to_csv(predictions, file_path):
    '''
    Writes on a csv file the predictions. The format of the prediction file is the following (second row is example):

    Id,Prediction
    r1_c1,1

    Parameters:
    predictions (iterable): (row, column, prediction) tuples, with zero-based row and column
    file_path (string): the path to the prediction file
    '''

    header = ['Id', 'Prediction']

    # newline='' lets the csv module control line endings itself
    # (avoids blank lines between rows on Windows)
    with open(file_path, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=header)
        writer.writeheader()
        for row, col, pred in predictions:
            writer.writerow({
                # Ids in the file are one-based
                'Id': 'r' + str(row + 1) + '_c' + str(col + 1),
                'Prediction': pred,
            })
            
def compute_rmse(target, pred):
    '''
    Root mean squared error between target and pred.
    '''
    residual = target - pred
    return np.sqrt(np.square(residual).mean())

## Prepare Input

In [7]:
# Read the raw training ratings; clean_df expects an "Id" column of the form
# "r<row>_c<col>" and a "Prediction" column
data_train_raw = pd.read_csv('../data/data-train.csv')
# Train dataset as a data frame with zero-based integer 'row' / 'col' columns
data_train_df = clean_df(data_train_raw)

In [8]:
# Set up the surprise dataset from the (row, col, rating) triples
reader = Reader()
dataset = Dataset.load_from_df(data_train_df[['row', 'col', 'Prediction']], reader)

# Now set up training and test set, with a test split of 25%.
# A fixed random_state makes the split, and therefore the reported test RMSE, reproducible.
trainset, testset = train_test_split(dataset, test_size=0.25, random_state=42)

In [9]:
def sum_column_norm_square(M):
    """
    Sum, over all columns of M, of the squared Euclidean norm of the column.

    Param:
    ========
    M: np.array, input matrix

    Return:
    ========
    The summed squared column norms
    """
    column_norms = np.linalg.norm(M, axis=0)
    return np.square(column_norms).sum()

def _observed_rmse(P, Q, obs_idx, target):
    """Return the RMSE between target and the entries of P^T Q selected by obs_idx."""
    pred = (P.T @ Q)[obs_idx]
    return np.sqrt(np.mean(np.square(target - pred)))


def _update_factors(ratings, fixed, lamb, out):
    """Overwrite each column of out with the regularized least-squares fit of the matching row of ratings against fixed."""
    identity = np.eye(fixed.shape[0])
    for idx in range(ratings.shape[0]):
        observed = np.nonzero(ratings[idx])[0]
        if observed.size == 0:
            # Nothing to fit: keep the current factor instead of solving a singular system
            continue
        basis = fixed[:, observed]
        # Normal equations of the ridge problem; the penalty is scaled by the number of observed ratings
        gram = basis @ basis.T + lamb * observed.size * identity
        out[:, idx] = np.linalg.solve(gram, basis @ ratings[idx, observed])


def als(trainset, k, lamb, max_iter):
    """
    Alternating Least Squares algorithm to decompose the input trainset into the product of
    two lower rank matrices, i.e. X_train ~ P^T Q, where P and Q are returned by this function.

    Entries of the rating matrix equal to zero are treated as unobserved.

    Training stops early when the training RMSE fails to decrease for two consecutive
    iterations; in that case the factors from before those two iterations are returned.

    Param:
    ========
    trainset: dataset formatted via surprise dataset (uses n_users, n_items and all_ratings())
    k: int, rank of the low-dimensional representation
    lamb: float, regularization factor lambda
    max_iter: int, maximum number of iterations

    Return:
    ========
    P: np.array of shape (k, n_users), user factors
    Q: np.array of shape (k, n_items), item factors
    """
    m, n = trainset.n_users, trainset.n_items
    # Get rating matrix
    A = np.zeros((m, n))
    for u, i, r in trainset.all_ratings():
        A[u, i] = r
    # Initialize P, Q
    P = np.ones((k, m))  # user matrix
    Q = np.ones((k, n))  # item matrix
    obs_idx = np.nonzero(A)
    target = A[obs_idx]
    # Sliding window over the last three states, oldest first. Copies are stored because
    # P and Q are updated in place; the initial state is included so that the oldest slot
    # is always filled by the time early stopping can trigger.
    history = [None, None, (P.copy(), Q.copy())]
    last_train_rmse = _observed_rmse(P, Q, obs_idx, target)
    num_rmse_incr = 0
    for num_iter in range(max_iter):
        # Optimize user matrix by fixing item matrix
        _update_factors(A, Q, lamb, P)
        # Optimize item matrix by fixing user matrix
        _update_factors(A.T, P, lamb, Q)
        history = history[1:] + [(P.copy(), Q.copy())]
        train_rmse = _observed_rmse(P, Q, obs_idx, target)
        if (num_iter + 1) % 5 == 0:
            print("Iter  {}: rmse error {}".format(num_iter+1, train_rmse))
        if train_rmse - last_train_rmse >= 0:
            num_rmse_incr += 1
        else:
            num_rmse_incr = 0
        if num_rmse_incr >= 2:
            print("RMSE stops decreasing at iter {}, early stopping...".format(num_iter-1))
            # State from before the two non-improving iterations
            return history[0]
        last_train_rmse = train_rmse
    return P, Q

class ALS(AlgoBase):
    """
    Model via Alternating Least Square

    Surprise algorithm wrapper around als(): fit() factorizes the trainset into
    user factors P (k x n_users) and item factors Q (k x n_items), and estimate()
    predicts a rating as the dot product of the two factor columns, clipped to [1, 5].

    Param:
    ========
    k: int, rank of the factorization
    lamb: float, regularization factor lambda
    max_iter: int, maximum number of ALS iterations
    """
    def __init__(self, k, lamb, max_iter):
        # Always initialize the base class first so its own attributes are set up
        AlgoBase.__init__(self)
        self.k = k
        self.lamb = lamb
        self.max_iter = max_iter

    def fit(self, trainset):
        """Train the factor matrices on trainset, print the elapsed time and return self."""
        AlgoBase.fit(self, trainset)
        start = time.time()
        self.P, self.Q = als(self.trainset, self.k, self.lamb, self.max_iter)
        print("Used time: {}".format(time.time() - start))
        return self

    def estimate(self, u, i):
        """
        Estimate the rating of inner user id u for inner item id i.

        Raises PredictionImpossible when the user or the item is not part of the
        trainset, so that the caller can fall back to its default prediction
        instead of failing on an invalid index.
        """
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unknown.')
        return np.clip(np.dot(self.P[:, u], self.Q[:, i]), 1, 5)


## Best Model via Grid Search
Here we present the best model found via grid search over selected hyper-parameters. The best hyper-parameters from our experiments are stored below.

The grid search was run on the Euler cluster with parallelization. The commented-out code in the "Grid Search" section below is provided in case you would like to re-run the search; however, we do not recommend doing so without sufficient computational power.

In [10]:
# Best params (hyper-parameters selected by the grid search)
k = 20
lamb = 0.1
max_iter = 100

# Define the model
model = ALS(k=k, lamb=lamb, max_iter=max_iter)

# Fit the model with trainset (prints the training RMSE every 5 iterations and the elapsed time)
model.fit(trainset)

# Compute the predictions on the held-out testset
predictions = model.test(testset)

# Compute RMSE on testset
test_rmse = accuracy.rmse(predictions)

Iter  5: rmse error 1.0188438604480456
Iter  10: rmse error 1.0029974188691189
Iter  15: rmse error 0.9983336284828854
Iter  20: rmse error 0.9964159854459105
Iter  25: rmse error 0.9928907078322519
Iter  30: rmse error 0.9711109889811866
Iter  35: rmse error 0.9560882997617441
Iter  40: rmse error 0.9422869316964702
Iter  45: rmse error 0.9224173758009514
Iter  50: rmse error 0.9063769574204679
Iter  55: rmse error 0.9031025960300871
Iter  60: rmse error 0.9026669211298688
Iter  65: rmse error 0.9024513082057595
Iter  70: rmse error 0.9023123896085158
Iter  75: rmse error 0.9022137441746727
Iter  80: rmse error 0.9021392963035701
Iter  85: rmse error 0.9020806743601565
Iter  90: rmse error 0.9020331362423839
Iter  95: rmse error 0.9019938403171276
Iter  100: rmse error 0.9019609910130367
Used time: 1756.3296949863434
RMSE: 0.9905


## Grid Search 
The grid search was run to find the ALS model with the lowest validation RMSE. You can modify `param_grid` to experiment with other ranges.

In [11]:
# grid_search_model = GridSearchCV(ALS, param_grid = {"k":[3, 5, 10, 20], "lamb":[0.05, 0.1, 0.5, 1], "max_iter":[10, 30, 50, 100]}, n_jobs = -1)
# grid_search_model.fit(dataset)
# grid_search_model.best_score, grid_search_model.best_params

## Submission

In [12]:
# Prepare submission indices: zero-based (row, column) pairs parsed from the sample submission file
pred_ids = read_submission_indexes("../data/sample-submission.csv")

In [13]:
# Re-create the model with the best params defined earlier (k, lamb, max_iter)
model = ALS(k=k, lamb=lamb, max_iter=max_iter)
# Fit on all training data (no held-out split) before predicting the submission entries
all_train  = dataset.build_full_trainset()
model.fit(all_train)

Iter  5: rmse error 1.0203619791401577
Iter  10: rmse error 1.0045349402366655
Iter  15: rmse error 0.9998780378420568
Iter  20: rmse error 0.9979634542924309
Iter  25: rmse error 0.9969707985014635
Iter  30: rmse error 0.9813964653038831
Iter  35: rmse error 0.9754864713389988
Iter  40: rmse error 0.972235010942289
Iter  45: rmse error 0.9595719837489957
Iter  50: rmse error 0.9569756982040228
Iter  55: rmse error 0.9543549400211193
Iter  60: rmse error 0.9496515609419316
Iter  65: rmse error 0.9454709174511202
Iter  70: rmse error 0.9411906575531267
Iter  75: rmse error 0.937933394540405
Iter  80: rmse error 0.9348407487685714
Iter  85: rmse error 0.9328052335998263
Iter  90: rmse error 0.9319471856753584
Iter  95: rmse error 0.9318032229448242
Iter  100: rmse error 0.9317605055117049
Used time: 2174.0647082328796


In [14]:
# Predictions on Kaggle test data.
# pred_ids holds raw (row, column) ids, whereas model.estimate expects surprise's inner ids,
# which are assigned when the trainset is built and generally differ from the raw ids.
# model.predict translates raw ids to inner ids (and handles unknown ids) before estimating.
submission = [(r, c, model.predict(r, c).est) for r, c in pred_ids]

In [15]:
import datetime 
import csv
# csv is used inside write_predictions_to_csv.
# Save submission predictions to a file whose name carries the current timestamp (YYYYmmdd_HHMM)
submission_csv = "../predictions/submission_als_{}.csv".format(datetime.datetime.now().strftime('%Y%m%d_%H%M'))
write_predictions_to_csv(submission, submission_csv)