# SVD

In this notebook we implement a simple SVD-based recommender in which missing ratings are imputed with the sample mean of the corresponding item (column). The rank k of the low-rank approximation is chosen by cross-validation.

In [2]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import math
import copy
from sklearn.metrics import mean_squared_error

In [3]:
# Load the training ratings; later cells read its "Id" and "Prediction" columns.
data = pd.read_csv('../data/data-train.csv')
# Load the sample submission; it is parsed for its ids further down and reused as the
# template for the exported predictions.
sample = pd.read_csv('../data/sample-Submission.csv')

In [17]:
# Clean input code
#
# Ids look like "r<row>_c<col>" with one-based numbers. A single vectorised regex
# extraction replaces four element-wise `apply` passes over the Id column.
id_parts = data["Id"].str.extract(r"r(\d+)_c(\d+)")
if id_parts.isna().any().any():
    # Fail loudly instead of letting an obscure IndexError surface from string splitting.
    raise ValueError("every Id must look like 'r<row>_c<col>'")

# Convert to zero-based integer indices so they can address a matrix directly.
id_parts = id_parts.astype(int) - 1

data['row'] = id_parts[0]
data['col'] = id_parts[1]

# Keep only the columns the models consume, in (row, col, Prediction) order.
data_df = data.loc[:,['row', 'col', 'Prediction']]

In [18]:
# Preview the first rows of the cleaned (row, col, Prediction) table.
data_df.head()

Unnamed: 0,row,col,Prediction
0,43,0,4
1,60,0,3
2,66,0,4
3,71,0,3
4,85,0,5


In [19]:
def separate_in_folds(df, k = 5, random_state = None):
    '''
    Split the ratings into k cross-validation folds, spreading every row's ratings over the folds.
    Input:
    df: data frame with (at least) a 'row' column; all columns are carried over to the folds
    k: number of folds (default 5)
    random_state: optional seed forwarded to DataFrame.sample to make the split reproducible
    Output:
    fold_list: list of k data frames (same columns and dtypes as df, original index kept),
               each ordered by 'row'
    Raises:
    ValueError: if k is smaller than 1
    '''
    if k < 1:
        raise ValueError("k must be at least 1")

    # Shuffle all ratings once, then stable-sort by row. The stable sort keeps the random
    # order inside each row, so this equals shuffling every row's ratings separately,
    # without a Python loop over row ids or repeated (quadratic) DataFrame appends.
    shuffled_df = df.sample(frac=1, random_state=random_state).sort_values(by="row", kind="mergesort")

    # Deal fold labels 1..k round-robin over the row-ordered ratings, so that consecutive
    # ratings of the same row land in different folds.
    fold_ids = np.arange(shuffled_df.shape[0]) % k + 1

    # One data frame per fold; boolean masking leaves shuffled_df itself untouched.
    return [shuffled_df[fold_ids == j] for j in np.arange(k) + 1]
    

In [20]:
# Split the cleaned ratings into cross-validation folds using the function's default k.
folds = separate_in_folds(df = data_df)

HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))




In [22]:
# Take a look at one of the folds (sanity check of the columns and the row ordering)

folds[0]

Unnamed: 0,row,col,Prediction
457577,0,391,5
15455,0,9,5
247518,0,205,2
720365,0,604,5
1139363,0,966,4
...,...,...,...
100355,9999,83,4
47781,9999,36,4
313922,9999,256,4
646329,9999,550,5


In [23]:
# Define the model: impute missing ratings with item means, then reconstruct from a rank-k truncated SVD

def svd_mean_recommender(data, param, shape = (10000, 1000)):
    '''
    Predict the full rating matrix with a truncated SVD after item-mean imputation.
    Input:
    data: data frame whose first three columns are, in this order, the zero-based row index,
          the zero-based column index and the observed rating
    param: number of leading singular values/vectors kept in the reconstruction
    shape: (rows, cols) of the rating matrix; defaults to the 10000 x 1000 matrix used in this notebook
    Output:
    predictions: dense rows x cols array holding the rank-`param` reconstruction
    Raises:
    ValueError: if data contains no ratings
    '''
    num_eigen = int(param)
    num_rows, num_cols = shape

    if data.shape[0] == 0:
        raise ValueError("data contains no ratings")

    # 1. Compute data matrix with imputed means

    # Columns are read by position; explicit casts also accept object-dtype columns.
    row_ind = np.asarray(data.iloc[:, 0], dtype=int)
    col_ind = np.asarray(data.iloc[:, 1], dtype=int)
    ratings = np.asarray(data.iloc[:, 2], dtype=float)

    # Start from an all-NaN matrix and scatter the observed ratings in one vectorised
    # assignment (for a repeated (row, col) pair the last rating wins, as in a sequential fill).
    data_matrix = np.full((num_rows, num_cols), np.nan)
    data_matrix[row_ind, col_ind] = ratings

    # Column (item) means over the observed entries only. An item without any rating has
    # no mean of its own and falls back to the global mean, so no NaN reaches the SVD.
    observed = ~np.isnan(data_matrix)
    counts = observed.sum(axis=0)
    global_mean = ratings.mean() if np.isnan(ratings).sum() == 0 else np.nanmean(data_matrix)
    item_means = np.where(counts > 0,
                          np.nansum(data_matrix, axis=0) / np.maximum(counts, 1),
                          global_mean)
    data_matrix = np.where(observed, data_matrix, item_means)  # item_means broadcasts over rows

    # 2. Perform SVD (this is exercise 2.2 of series 2)

    # The thin SVD is sufficient: only the leading num_eigen components are used, and it
    # avoids materialising the square rows x rows left factor of the full decomposition.
    u, s, vh = np.linalg.svd(data_matrix, full_matrices=False)

    # Rank-num_eigen reconstruction U_k diag(s_k) V_k^T. Splitting the singular values as
    # sqrt(S) * sqrt(S) between the two factors would give the same product, so the
    # square-root factors are not formed here.
    predictions = np.matmul(u[:, 0:num_eigen] * s[0:num_eigen], vh[0:num_eigen, :])

    # Note that this is still complete matrix representation
    return predictions
    

In [43]:
# Cross-validate the number of singular values kept by the recommender.
# For each candidate rank the model is trained on all folds but one and scored (RMSE) on the
# held-out fold; RMSE[m] is the mean score over the folds for eigen_values[m].
# NOTE: svd_mean_recommender is called once per (rank, fold) pair, so this cell is expensive.
RMSE = np.array([])
eigen_values = np.array([2,4,6,8,12,16,20,30,50,100])
for eig in eigen_values:

    # Cross-validation loop

    # Initialize scores array for RMSE's
    scores = np.array([])

    for i in np.arange(len(folds)):
        # Hold out fold i and train on the rest. Selecting by position avoids deep-copying
        # the entire fold list on every iteration, and nothing below mutates the folds.
        test_set = folds[i]
        train_set = pd.concat([fold for j, fold in enumerate(folds) if j != i])

        # Now train on training set
        # Insert any model for training here
        print("starting training...")
        pred_matrix = svd_mean_recommender(data = train_set, param = eig)

        # Predict on test_set
        print("starting predicting...")
        # Extract predictions; the casts make the lookup work for object-dtype columns too
        row_ind = test_set.loc[:,'row'].values.astype(int)
        col_ind = test_set.loc[:,'col'].values.astype(int)
        preds = pred_matrix[row_ind, col_ind]

        # Compute RMSE and add value to scores array
        print("compute RMSE")
        observed = test_set.loc[:,'Prediction'].values.astype(float)
        scores = np.append(scores, np.sqrt(mean_squared_error(observed, preds)))


    # Compute mean of scores for estimate of model performance
    RMSE = np.append(RMSE, np.mean(scores))

RMSE

starting training...
starting predicting...
compute RMSE
starting training...
starting predicting...
compute RMSE
starting training...
starting predicting...
compute RMSE
starting training...
starting predicting...
compute RMSE
starting training...
starting predicting...
compute RMSE
starting training...
starting predicting...
compute RMSE
starting training...
starting predicting...
compute RMSE
starting training...
starting predicting...
compute RMSE
starting training...
starting predicting...
compute RMSE
starting training...
starting predicting...
compute RMSE
starting training...
starting predicting...
compute RMSE
starting training...
starting predicting...
compute RMSE
starting training...
starting predicting...
compute RMSE
starting training...
starting predicting...
compute RMSE
starting training...
starting predicting...
compute RMSE
starting training...
starting predicting...
compute RMSE
starting training...
starting predicting...
compute RMSE
starting training...
starting p

array([1.01584237, 1.01324189, 1.01175288, 1.01040682, 1.0102738 ,
       1.01054016, 1.01090769, 1.01241311, 1.01484045, 1.02052501])

In [44]:
# Inspect which k is optimal: the candidate in eigen_values with the lowest mean CV RMSE

print(eigen_values[np.argmin(RMSE)])

12

In [45]:
# Define some helper functions

def clean_df(df):
    '''
    Cleans initial representation to separate rows (users) and columns (movies) into columns with integer values
    Input:
    df: data frame with an 'Id' column of the form "r<row>_c<col>" (one-based) and a 'Prediction' column
    Output:
    data_df: new data frame with columns row, col (both zero-based integers) and Prediction,
             indexed like df; df itself is not modified
    Raises:
    ValueError: if an Id does not contain the "r<row>_c<col>" pattern
    '''
    # One vectorised regex extraction instead of four element-wise `apply` passes.
    id_parts = df["Id"].str.extract(r"r(\d+)_c(\d+)")
    if id_parts.isna().any().any():
        raise ValueError("every Id must look like 'r<row>_c<col>'")

    # Ids are one-based; shift to zero-based indices usable for matrix lookups.
    id_parts = id_parts.astype(int) - 1

    data_df = pd.DataFrame(data = {'row': id_parts[0], 'col': id_parts[1], 'Prediction': df.loc[:,'Prediction']})

    return data_df

def predict_scores(test_set, pred_matrix):
    '''
    Retrieve predictions for test set from prediction matrix
    Input:
    test_set: data frame with columns: row, col and Prediction
    pred_matrix: full rows x cols matrix with predictions outputted from model
    Output:
    test_set: copy of the test set with updated predictions in the Prediction column
              (the frame passed in is left unchanged)
    '''

    # Explicit integer arrays: indexing an ndarray with object-dtype Series fails, and the
    # folds built in this notebook may carry such columns.
    row_ind = np.asarray(test_set.loc[:,'row'], dtype=int)
    col_ind = np.asarray(test_set.loc[:,'col'], dtype=int)
    preds = pred_matrix[row_ind, col_ind]

    # Work on a copy so the caller's frame is not silently overwritten.
    scored_set = test_set.copy()
    scored_set['Prediction'] = preds

    return scored_set

In [46]:
# Train model on optimal number of k

# Candidate rank with the lowest mean cross-validation RMSE
k = eigen_values[np.argmin(RMSE)]

# Parse the sample submission ids into zero-based (row, col) indices
test = clean_df(sample)

# Fit on the complete training data (no held-out fold) with the selected rank
pred_matrix = svd_mean_recommender(data_df, param = k)

# Look up the model's prediction for every (row, col) pair requested in the submission
test = predict_scores(test_set = test, pred_matrix = pred_matrix)


In [49]:
# Export

# Overwrite the template's Prediction column with the model output. The values are
# written exactly as looked up from the prediction matrix; no clipping or rounding is applied here.
sample['Prediction'] = test['Prediction']
# Write the submission without the data frame index.
sample.to_csv("../predictions/simple_svd.csv", index = False)