In [7]:
import numpy as np
import pandas as pd
import math
from tqdm.auto import tqdm

from scipy.sparse import linalg
from scipy.sparse import csr_matrix, csc_matrix

import utils
import models

In [2]:
# Load the raw training ratings and run them through the project's cleaning helper.
# NOTE(review): utils.clean_df lives outside this notebook; judging by the preview below it
# yields integer 'row'/'col' indices plus a 'Prediction' rating column — confirm.
data = pd.read_csv('data_train.csv')
train_data = utils.clean_df(data)
train_data.head()

Unnamed: 0,row,col,Prediction
0,43,0,4
1,60,0,3
2,66,0,4
3,71,0,3
4,85,0,5


In [46]:
def _svt_shrink_dense(Y, tao):
    """Soft-threshold the singular values of ``Y`` by ``tao`` and rebuild the matrix.

    A thin SVD (``full_matrices=False``) is used on purpose: the extra columns of
    ``u`` that a full SVD returns would only ever be multiplied by zero rows of the
    padded singular-value matrix, so computing them changes nothing but the cost.
    """
    u, s, vh = np.linalg.svd(Y, full_matrices = False)
    s_shrunk = np.maximum(s - tao, 0)
    # Scaling the columns of u by the shrunk values equals u @ diag(s_shrunk).
    return np.dot(u * s_shrunk, vh)


def singular_value_thresholding(data, param, stepsize = 1, shape = (10000, 1000),
                                epsilon = 0.0001, max_iter = None):
    """Complete a ratings matrix with the singular value thresholding iteration.

    Starting from ``Y = 0`` the loop alternates

        X = shrink(Y, tao)                     (soft-threshold the singular values)
        Y = Y + stepsize * P(A - X)            (P keeps only the observed entries)

    until the relative error of ``X`` on the observed entries drops below
    ``epsilon``. The error of every iteration is printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Observed entries; must provide integer 'row' and 'col' index columns and a
        'Prediction' value column.
    param : float
        The shrinkage threshold ``tao`` (expected to be non-negative).
    stepsize : float, optional
        Step length of the update of ``Y``.
    shape : tuple of int, optional
        Shape of the matrix to complete. Defaults to the 10000 x 1000 matrix this
        notebook works with.
    epsilon : float, optional
        Stopping tolerance on the relative error over the observed entries.
    max_iter : int or None, optional
        Upper bound on the number of iterations; ``None`` (default) iterates until
        the tolerance is reached.

    Returns
    -------
    numpy.ndarray
        Array of the given ``shape``: the shrinkage of the final ``Y``.
    """
    tao = param

    row_ind = np.asarray(data.loc[:, 'row'])
    col_ind = np.asarray(data.loc[:, 'col'])

    # Dense matrix holding the observed values (zero where nothing was observed).
    A = np.zeros(shape)
    A[row_ind, col_ind] = data.loc[:, 'Prediction']
    observed = A[row_ind, col_ind]
    observed_norm = np.linalg.norm(observed)
    if observed_norm == 0:
        # All observed values are zero: the relative error is undefined (0 / 0) and
        # the iteration never leaves the zero matrix, so return it directly.
        return np.zeros(shape)

    Y = np.zeros(shape)
    err = np.inf
    n_iter = 0
    while err >= epsilon and (max_iter is None or n_iter < max_iter):
        # Shrinkage operator
        X = _svt_shrink_dense(Y, tao)

        # Residual on the observed entries only (the projection step)
        residual = observed - X[row_ind, col_ind]

        # Print error
        err = np.linalg.norm(residual) / observed_norm
        print("Error is: ", err)

        # Step forward; Y only ever changes on observed positions, so update it in
        # place instead of allocating a full projection matrix every iteration.
        Y[row_ind, col_ind] += stepsize * residual
        n_iter += 1

    # Do final shrinkage
    return _svt_shrink_dense(Y, tao)

In [51]:
# Dense SVT run with threshold tao = 200 and step size 1.5.
# The recorded run below was interrupted by hand (KeyboardInterrupt) while the relative
# error was still above the 1e-4 stopping tolerance, so it did not produce a result.
pred_matrix = singular_value_thresholding(train_data, param = 200, stepsize = 1.5)

Error is:  1.0
Error is:  0.5339300379285385
Error is:  0.23666940511409482
Error is:  0.11064290161109319
Error is:  0.0570297022067749
Error is:  0.03166974921414365
Error is:  0.019212210542069973
Error is:  0.01261903493552372
Error is:  0.00877545337467797
Error is:  0.006385587192746265
Error is:  0.0048158485604412994
Error is:  0.003738482733536121
Error is:  0.0029715033101875036
Error is:  0.002407478674056061


KeyboardInterrupt: 

## Now try cross validation

In [80]:
# Read the five pre-split cross-validation folds (fold_1.csv ... fold_5.csv);
# the first CSV column is used as the index.
folds = [pd.read_csv('fold_{}.csv'.format(i), index_col = 0) for i in range(1, 6)]
fold_1, fold_2, fold_3, fold_4, fold_5 = folds

In [93]:
# Cross-validate the dense SVT solver over a grid of thresholds (a single value, 50, here).
# NOTE(review): utils.k_fold_cv is defined outside this notebook; it presumably calls the
# solver as solver(train_part, tao_value) for each fold and returns one RMSE per grid
# value — confirm against utils.
tao = np.array([50])
rmse = utils.k_fold_cv(folds, tao, singular_value_thresholding)

Error is:  1.0
Error is:  0.3599283581513972
Error is:  0.04396712693193316
Error is:  0.014524659907214433
Error is:  0.005893872523799301
Error is:  0.0030002452530667025
Error is:  0.0019705377972428014
Error is:  0.0011927628091518627
Error is:  0.000903072132670947
Error is:  0.0007065488049324655
Error is:  0.000563361664929391
Error is:  0.00045554514886161186
Error is:  0.00037280336465938857
Error is:  0.00030841458777907153
Error is:  0.00025766836590867886
Error is:  0.0002171634172289716
Error is:  0.0001844215819926238
Error is:  0.00015763339193499402
Error is:  0.00013547366907393064
Error is:  0.00011696507703032067
Error is:  0.00010137850888592666
Error is:  8.816190408570791e-05
Error is:  1.0
Error is:  0.359887312888428
Error is:  0.044103069200794424
Error is:  0.014533105398321365
Error is:  0.00571469618563651
Error is:  0.003176845648051834
Error is:  0.0021824399647983636
Error is:  0.0014855420613217162
Error is:  0.0011700288361297065
Error is:  0.0007264565

In [94]:
# Cross-validated error for tao = 50.
# NOTE(review): the recorded value (~3.76) is very high for ratings that appear to lie in
# 1-5; check whether held-out entries are being predicted as (near) zero, e.g. because
# tao is too small — confirm before tuning further.
rmse

array([3.75723877])

## The full dense SVD is very slow; try the truncated SVD from `scipy.sparse.linalg`

In [3]:
# Dense ratings matrix: observed ratings at their (row, col) positions, zeros elsewhere.
row_ind = train_data['row']
col_ind = train_data['col']
A = np.zeros((10000, 1000))
A[row_ind, col_ind] = train_data['Prediction']
A[row_ind, col_ind]  # echo the stored ratings as a quick sanity check

array([4., 3., 4., ..., 3., 4., 3.])

In [32]:
# Truncated SVD of the ratings matrix: svds computes only k = 100 singular triplets
# (the largest ones by default) instead of the full decomposition.
# `linalg` and `csr_matrix` come from the notebook's import cell. Wrapping the dense
# array in CSR form lets svds use sparse matrix-vector products.
A_sparse = csr_matrix(A)
u, s, vt = linalg.svds(A_sparse, k = 100)
u.shape, s.shape, vt.shape

((10000, 100), (100,), (100, 1000))

In [26]:
# How many of the singular values in `s` exceed 200, the threshold tried in the dense run.
np.count_nonzero(s > 200)

63

In [41]:
# Implementation as in paper by Cai, Candes & Shen (kind of)

def _svt_shrink_truncated(Y, tao, rank_guess, increment):
    """Soft-threshold the singular values of ``Y`` using a truncated SVD when possible.

    ``svds`` is asked for ``rank_guess`` singular triplets, and for ``increment`` more
    each time the smallest one returned is still above ``tao``. As soon as the
    smallest computed value is <= ``tao`` every value that survives the threshold
    is known and the shrinkage is exact. ``svds`` can deliver at most
    ``min(Y.shape) - 1`` triplets; if even that many all exceed ``tao`` (or the guess
    is already larger), a dense thin SVD is used instead of retrying forever.

    Returns
    -------
    (X, r) : the shrunk matrix and the number of singular values above ``tao``.
    """
    if not Y.any():
        # Nothing to decompose; this also keeps the iterative solver away from an
        # all-zero operator.
        return np.zeros(Y.shape), 0

    max_k = min(Y.shape) - 1  # svds requires k < min(Y.shape)
    k = max(int(rank_guess), 1)
    factors = None
    while factors is None and k <= max_k:
        u, s, vh = linalg.svds(Y, k = k)
        # Use the minimum instead of s[0] so the check does not depend on the order
        # in which svds returns the singular values.
        if s.min() <= tao:
            factors = (u, s, vh)
        elif k == max_k:
            break
        else:
            k = min(k + increment, max_k)

    if factors is None:
        factors = np.linalg.svd(Y, full_matrices = False)

    u, s, vh = factors
    s_shrunk = np.maximum(s - tao, 0)
    return np.dot(u * s_shrunk, vh), int(np.count_nonzero(s > tao))


def singular_value_thresholding2(data, param, stepsize = 1, shape = (10000, 1000),
                                 epsilon = 0.0001, max_iter = None):
    """Singular value thresholding with truncated SVDs for the shrinkage step.

    Same iteration as ``singular_value_thresholding``:

        X = shrink(Y, tao)
        Y = Y + stepsize * P(A - X)            (P keeps only the observed entries)

    but each shrinkage only computes as many singular triplets as are needed to
    cover the values above ``tao``, starting from the count found in the previous
    iteration plus one. The error of every iteration is printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Observed entries; must provide integer 'row' and 'col' index columns and a
        'Prediction' value column.
    param : float
        The shrinkage threshold ``tao`` (expected to be non-negative).
    stepsize : float, optional
        Step length of the update of ``Y``.
    shape : tuple of int, optional
        Shape of the matrix to complete. Defaults to the 10000 x 1000 matrix this
        notebook works with.
    epsilon : float, optional
        Stopping tolerance on the relative error over the observed entries.
    max_iter : int or None, optional
        Upper bound on the number of iterations; ``None`` (default) iterates until
        the tolerance is reached.

    Returns
    -------
    numpy.ndarray
        Array of the given ``shape``: the shrinkage of the final ``Y``.
    """
    tao = param
    r = 100  # initial guess for the number of singular values above tao
    l = 50   # how many more triplets to request when the guess was too small

    row_ind = np.asarray(data.loc[:, 'row'])
    col_ind = np.asarray(data.loc[:, 'col'])

    # Dense matrix holding the observed values (zero where nothing was observed).
    A = np.zeros(shape)
    A[row_ind, col_ind] = data.loc[:, 'Prediction']
    observed = A[row_ind, col_ind]
    observed_norm = np.linalg.norm(observed)
    if observed_norm == 0:
        # All observed values are zero: the relative error is undefined (0 / 0) and
        # the iteration never leaves the zero matrix, so return it directly.
        return np.zeros(shape)

    Y = np.zeros(shape)
    err = np.inf
    n_iter = 0
    while err >= epsilon and (max_iter is None or n_iter < max_iter):
        # Shrinkage operator
        X, r = _svt_shrink_truncated(Y, tao, r + 1, l)

        # Residual on the observed entries only (the projection step)
        residual = observed - X[row_ind, col_ind]

        # Print error
        err = np.linalg.norm(residual) / observed_norm
        print("Error is: ", err)

        # Step forward; Y only ever changes on observed positions.
        Y[row_ind, col_ind] += stepsize * residual
        n_iter += 1

    # Do final shrinkage with the same truncated routine instead of a full dense SVD
    pred_matrix, _ = _svt_shrink_truncated(Y, tao, r + 1, l)

    return pred_matrix

In [43]:
# Truncated-SVD variant with threshold tao = 200 and step size 10.
# The recorded run below printed only the first iteration's error and was then
# interrupted by hand (KeyboardInterrupt).
pred_matrix = singular_value_thresholding2(train_data, param = 200, stepsize = 10)

Error is:  1.0


KeyboardInterrupt: 