# Singular Value Thresholding

We implement the singular value thresholding algorithm as presented in Cai, Candès and Shen (2008). The algorithm is computationally slow, however, so it is infeasible to perform a grid search.

In [8]:
import math

import numpy as np
import pandas as pd
import scipy
from scipy.sparse import csc_matrix
from scipy.sparse import linalg


In [9]:
# Define helper functions

def clean_df(df):
    '''
    Turn the combined "Id" strings into separate zero-based integer indices.

    Each "Id" is split on "_"; the number following "r" in the first piece
    becomes the row (user) index and the number following "c" in the second
    piece becomes the column (movie) index. Both are shifted down by one so
    that they start at 0.

    Input:
    df: data frame with columns Id and Prediction
    Output:
    data frame with columns: row, col and Prediction
    '''
    def zero_based(token, marker):
        # e.g. token "r44" with marker "r" -> 43
        return int(token.split(marker)[1]) - 1

    id_parts = df["Id"].apply(lambda ident: ident.split("_"))

    tidy = pd.DataFrame(data = {
        'row': id_parts.apply(lambda parts: zero_based(parts[0], "r")),
        'col': id_parts.apply(lambda parts: zero_based(parts[1], "c")),
        'Prediction': df.loc[:, 'Prediction'],
    })

    return tidy

def predict_scores(test_set, pred_matrix):
    '''
    Look up the model's prediction for every (row, col) pair of the test set.

    Input:
    test_set: data frame with columns: row, col and Prediction
    pred_matrix: full rows x cols matrix with predictions outputted from model
    Output:
    test_set: the same data frame object, with its Prediction column
              overwritten in place by the looked-up values
    '''
    # One matrix entry per test-set line, selected by paired row/col indices
    lookup = (test_set.loc[:, 'row'], test_set.loc[:, 'col'])
    test_set['Prediction'] = pred_matrix[lookup]

    return test_set

In [10]:
# Read the raw training ratings (path is relative to the notebook's directory)
data = pd.read_csv('../data/data-train.csv')
# Split the combined "Id" strings into zero-based integer row/col columns
train_data = clean_df(data)
# Last expression of the cell: show the first rows as a quick sanity check
train_data.head()

Unnamed: 0,row,col,Prediction
0,43,0,4
1,60,0,3
2,66,0,4
3,71,0,3
4,85,0,5


In [11]:
# Implementation as in paper by Cai, Candes & Shen

def _svt_leading_svd(Y, tau, start_rank, increment):
    '''
    Return an SVD (u, sigma, vh) of Y that contains every singular value > tau.

    A truncated sparse SVD is tried first with start_rank singular values; the
    rank is raised by `increment` until the smallest computed value is <= tau.
    If the rank would exceed what svds accepts, a dense reduced SVD is returned
    instead, so the search always terminates.
    '''
    max_sparse_rank = min(Y.shape) - 1 # svds requires k < min(Y.shape)
    Y_sparse = csc_matrix(Y)
    k = max(start_rank, 1)
    while k <= max_sparse_rank:
        u, sigma, vh = linalg.svds(Y_sparse, k = k)
        if sigma.min() <= tau:
            return u, sigma, vh
        k += increment

    # All singular values are needed (or the matrix is tiny): use the dense SVD
    return np.linalg.svd(Y, full_matrices = False)


def _svt_shrink(Y, tau, start_rank, increment):
    '''
    Soft-threshold the singular values of Y by tau.

    Returns the thresholded matrix and the number of singular values > tau.
    '''
    if not Y.any():
        # Y is exactly zero (first pass): its shrinkage is zero, skip the SVD
        return np.zeros(Y.shape), 0

    u, sigma, vh = _svt_leading_svd(Y, tau, start_rank, increment)
    keep = sigma > tau
    # Scale the kept left singular vectors column-wise by (sigma - tau)
    X = np.dot(u[:, keep] * (sigma[keep] - tau), vh[keep, :])
    return X, int(np.count_nonzero(keep))


def singular_value_thresholding(data, param, stepsize = 1.99, epsilon = 0.1,
                                shape = (10000, 1000), max_iter = None):
    '''
    Matrix completion with the singular value thresholding iteration
    (implementation as in the paper by Cai, Candes & Shen).

    Input:
    data: data frame with columns: row, col and Prediction (observed entries)
    param: threshold tau applied to the singular values (expected positive)
    stepsize: step size of the update of the auxiliary matrix Y
    epsilon: stop once the relative error on the observed entries is below it
    shape: (rows, cols) of the matrix to complete
    max_iter: optional cap on the number of iterations (None = no cap)
    Output:
    pred_matrix: full rows x cols matrix, the shrinkage of the final Y

    The relative error on the observed entries is printed at every iteration.
    '''

    tau = param
    rank_increment = 5 # How many extra singular values to request per retry

    row_ind = np.asarray(data.loc[:,'row'], dtype = int)
    col_ind = np.asarray(data.loc[:,'col'], dtype = int)

    # Observed entries, everything else stays at zero
    A = np.zeros(shape)
    A[row_ind, col_ind] = data.loc[:,'Prediction']
    observed = A[row_ind, col_ind]
    observed_norm = np.linalg.norm(observed)

    if observed_norm == 0:
        # Nothing (or only zeros) observed: the relative error is undefined
        # and the iteration never leaves zero
        return np.zeros(shape)

    # Initialize
    Y = np.zeros(shape)
    r = 0
    err = np.inf # Make sure to enter while loop
    iteration = 0

    while err >= epsilon and (max_iter is None or iteration < max_iter):

        # Shrinkage operator; start the rank search one above the previous
        # rank so an unchanged rank is confirmed by a single truncated SVD
        X, r = _svt_shrink(Y, tau, start_rank = r + 1, increment = rank_increment)

        # Projection of (A - X) onto the observed entries
        residual = observed - X[row_ind, col_ind]

        # Print error
        err = np.linalg.norm(residual) / observed_norm
        print("Error is: ", err)

        # Step forward (Y only ever changes on the observed entries)
        Y[row_ind, col_ind] += stepsize * residual
        iteration += 1

    # Do final shrinkage; the reduced SVD avoids building a square rows x rows U
    u, s, vh = np.linalg.svd(Y, full_matrices = False)
    pred_matrix = np.dot(u * np.maximum(s - tau, 0), vh)

    return pred_matrix

In [132]:
# Run the SVT iteration on the training ratings with threshold tau = 10000
# (passed as `param`); the relative error is printed once per iteration below
pred_matrix = singular_value_thresholding(train_data, param = 10000, stepsize = 1.99)

Error is:  1.0
Error is:  1.0
Error is:  1.0
Error is:  0.7049881920503492
Error is:  0.5363885815842077
Error is:  0.48271987120531995
Error is:  0.44742632294615514
Error is:  0.4197623208120368
Error is:  0.39725733504870075
Error is:  0.3786496005068357
Error is:  0.3630839712567698
Error is:  0.34993582940492757
Error is:  0.3387354961752354
Error is:  0.3291231852021084
Error is:  0.32081906518951714
Error is:  0.31360255272246473
Error is:  0.307297670504084
Error is:  0.30064901964708973
Error is:  0.28978869835663773
Error is:  0.2855470612427575
Error is:  0.2824674559197408
Error is:  0.2798122500766365
Error is:  0.2774395770884058
Error is:  0.2752966468029135
Error is:  0.2733509824424347
Error is:  0.27157799772588187
Error is:  0.269957615004406
Error is:  0.2684729000061702
Error is:  0.26710932933242854
Error is:  0.2658543200562507
Error is:  0.2646968931753675
Error is:  0.2636274168241656
Error is:  0.2626374024081918
Error is:  0.2617193383647986
Error is:  0.2608

In [133]:
# Submit score: fill the sample submission with the SVT predictions and write it to disk

# The sample file supplies the "Id" strings of the entries that must be predicted
sample = pd.read_csv('../data/sample-Submission.csv')
# Parse the ids into zero-based row/col indices
temp = clean_df(sample)

# predict_scores overwrites temp's Prediction column in place and returns the same frame
temp = predict_scores(test_set = temp, pred_matrix = pred_matrix)
# Copy the predictions back next to the original "Id" strings
sample['Prediction'] = temp['Prediction']
# index = False keeps the pandas index out of the written CSV
sample.to_csv("../predictions/svt_tau_10000_2.csv", index = False)