In [29]:
import numpy as np
import pandas as pd
import math
from tqdm.auto import tqdm

from scipy.sparse import linalg
from scipy.sparse import csr_matrix, csc_matrix

import utils
import models

In [2]:
# Load the raw training ratings from CSV.
data = pd.read_csv('data_train.csv')
# utils.clean_df is a project helper (not shown here); judging by the displayed
# head below it yields one row per rating with 'row', 'col' and 'Prediction'
# columns — TODO confirm against utils.
train_data = utils.clean_df(data)
# Last expression of the cell: preview the first rows.
train_data.head()

Unnamed: 0,row,col,Prediction
0,43,0,4
1,60,0,3
2,66,0,4
3,71,0,3
4,85,0,5


## Naive implementation

In [56]:
def singular_value_thresholding(data, param, stepsize = 1.99, shape = (10000, 1000),
                                epsilon = 0.001, max_iter = None, verbose = True):
    """Complete a partially observed matrix by singular value thresholding (SVT).

    Repeats
        X = shrink(Y)                      (soft-threshold the singular values)
        Y = Y + stepsize * P_obs(A - X)    (step on the observed entries only)
    until the relative error of X on the observed entries is below ``epsilon``,
    then returns one final shrinkage of Y.

    Parameters
    ----------
    data : pandas.DataFrame
        Observed entries: integer columns 'row' and 'col' (zero-based positions
        inside ``shape``) and a numeric column 'Prediction'.
    param : float
        Threshold subtracted from every singular value (values below it become 0).
    stepsize : float, default 1.99
        Step length of the Y update.
    shape : tuple of int, default (10000, 1000)
        Shape of the matrix to complete.
    epsilon : float, default 0.001
        Stop once ||X_obs - A_obs|| / ||A_obs|| < epsilon.
    max_iter : int or None, default None
        Optional cap on the number of iterations; None means no cap. Useful
        because the iteration can diverge for some parameter choices.
    verbose : bool, default True
        Print the relative error after every iteration.

    Returns
    -------
    numpy.ndarray
        Dense matrix of shape ``shape`` with the completed values.
    """
    tau = param

    row_ind = data.loc[:, 'row'].to_numpy()
    col_ind = data.loc[:, 'col'].to_numpy()

    # Scatter the observations into a dense matrix and read them back, so that
    # repeated (row, col) pairs resolve to a single value (the last one written).
    A = np.zeros(shape)
    A[row_ind, col_ind] = data.loc[:, 'Prediction'].to_numpy()
    observed = A[row_ind, col_ind]
    observed_norm = np.linalg.norm(observed)
    if observed_norm == 0:
        # Nothing to fit; also avoids a 0/0 in the relative error.
        return np.zeros(shape)

    def shrink(matrix):
        # Economy SVD: a full SVD would allocate a shape[0] x shape[0] U for nothing.
        u, s, vh = np.linalg.svd(matrix, full_matrices = False)
        # Threshold the singular-value vector directly and scale U's columns,
        # instead of building a dense diagonal matrix.
        return (u * np.maximum(s - tau, 0)) @ vh

    Y = np.zeros(shape)
    err = np.inf
    n_iter = 0
    while err >= epsilon and (max_iter is None or n_iter < max_iter):
        X = shrink(Y)

        # Residual on the observed entries only (the projection step).
        residual = observed - X[row_ind, col_ind]

        err = np.linalg.norm(residual) / observed_norm
        if verbose:
            print("Error is: ", err)

        # Step forward; unobserved entries of Y are never touched.
        Y[row_ind, col_ind] += stepsize * residual
        n_iter += 1

    # Final shrinkage of the last dual iterate.
    return shrink(Y)

In [63]:
# Run the naive solver on the full training set with threshold 500 and unit step.
# NOTE(review): the recorded output below shows the printed error growing on
# every iteration until the run was interrupted, i.e. this run did not converge.
pred_matrix = singular_value_thresholding(train_data, param = 500, stepsize = 1)

Error is:  1.0
Error is:  6.559022766870573
Error is:  55.64959924674157
Error is:  507.18693044358474
Error is:  4659.458933518212
Error is:  42843.039169483905
Error is:  393972.6705687631
Error is:  3622900.3200248885
Error is:  33315563.180187136
Error is:  306364179.7621323
Error is:  2817272257.2368183
Error is:  25907150728.148342
Error is:  238237698622.0659
Error is:  2190792868010.5957
Error is:  20146154106990.75
Error is:  185260565354706.56
Error is:  1703624269588837.2


KeyboardInterrupt: 

## Now try cross-validation

In [5]:
# Read the five pre-split fold files (fold_1.csv ... fold_5.csv), using the
# first CSV column as the index.
folds = [pd.read_csv('fold_%d.csv' % fold_no, index_col = 0) for fold_no in range(1, 6)]
# Keep the individual fold names bound for any cell that refers to them.
fold_1, fold_2, fold_3, fold_4, fold_5 = folds

In [22]:
# Single candidate threshold, wrapped in an array for utils.k_fold_cv.
tao = np.array([500])
# utils.k_fold_cv is a project helper (not shown); presumably it fits the given
# solver across the folds for each value in `tao` and returns one score per
# value — TODO confirm against utils.
rmse = utils.k_fold_cv(folds, tao, singular_value_thresholding)

## too slow...

Error is:  1.0
Error is:  0.6853079970390306
Error is:  0.4475273534694455
Error is:  0.2974162309966758
Error is:  0.20043579446347662
Error is:  0.14285146178948485
Error is:  0.10809431623202413
Error is:  0.08600090682540557
Error is:  0.0708897898760776
Error is:  0.06006891233944709
Error is:  0.051958262064876484
Error is:  0.04570147226236277
Error is:  0.040672285103972805
Error is:  0.0365443905583592
Error is:  0.03306246369934366
Error is:  0.03009882978577329
Error is:  0.027517138073131577
Error is:  0.025265183539552684
Error is:  0.023262390494683653
Error is:  0.021484628583530216
Error is:  0.019879516020762492
Error is:  0.01843593940588571
Error is:  0.017119737865932256
Error is:  0.015925273551469745
Error is:  0.014827922133026005
Error is:  0.01382551666655284
Error is:  0.01289964670540607
Error is:  0.012049434510047148
Error is:  0.011261144210924933
Error is:  0.010534305815062489
Error is:  0.009858366481027121
Error is:  0.009233152406682587
Error is:  0.0

KeyboardInterrupt: 

## Same implementation as in the paper (Cai, Candès & Shen)

In [131]:
# Implementation as in paper by Cai, Candes & Shen

def singular_value_thresholding2(data, param, stepsize = 1.99, shape = (10000, 1000),
                                 epsilon = 0.1, increment = 5, max_iter = None,
                                 verbose = True):
    """Singular value thresholding with truncated SVDs (after Cai, Candes & Shen).

    Same iteration as the naive version,
        X = shrink(Y);   Y = Y + stepsize * P_obs(A - X),
    but each shrinkage only computes the leading singular triplets of Y with
    ``scipy.sparse.linalg.svds``: starting from the previous rank + 1, the
    number of requested triplets grows by ``increment`` until the smallest
    computed singular value is <= ``param``.

    Parameters
    ----------
    data : pandas.DataFrame
        Observed entries: integer columns 'row' and 'col' (zero-based positions
        inside ``shape``) and a numeric column 'Prediction'.
    param : float
        Threshold subtracted from every singular value (values below it become 0).
    stepsize : float, default 1.99
        Step length of the Y update.
    shape : tuple of int, default (10000, 1000)
        Shape of the matrix to complete.
    epsilon : float, default 0.1
        Stop once ||X_obs - A_obs|| / ||A_obs|| < epsilon.
    increment : int, default 5
        How many additional singular triplets to request when the current
        truncated SVD has not yet reached the threshold.
    max_iter : int or None, default None
        Optional cap on the number of outer iterations; None means no cap.
    verbose : bool, default True
        Print the relative error after every iteration.

    Returns
    -------
    numpy.ndarray
        Dense matrix of shape ``shape`` with the completed values.

    Raises
    ------
    ValueError
        If ``increment`` is smaller than 1 (the rank search could not advance).
    """
    if increment < 1:
        raise ValueError("increment must be at least 1")

    tau = param

    row_ind = data.loc[:, 'row'].to_numpy()
    col_ind = data.loc[:, 'col'].to_numpy()

    # Scatter the observations into a dense matrix and read them back, so that
    # repeated (row, col) pairs resolve to a single value (the last one written).
    A = np.zeros(shape)
    A[row_ind, col_ind] = data.loc[:, 'Prediction'].to_numpy()
    observed = A[row_ind, col_ind]
    observed_norm = np.linalg.norm(observed)
    if observed_norm == 0:
        # Nothing to fit; also avoids a 0/0 in the relative error.
        return np.zeros(shape)

    # svds (default ARPACK solver) requires k < min(shape).
    k_max = min(shape) - 1

    def leading_svd(Y, rank_guess):
        # Return (u, sigma, vh) containing every singular value of Y above tau.
        if not Y.any():
            # Zero matrix: no singular value can exceed the threshold, and
            # this avoids running the iterative solver on a zero operator.
            return np.zeros((shape[0], 0)), np.zeros(0), np.zeros((0, shape[1]))

        Y_sparse = csc_matrix(Y)  # Y is fixed during the search: convert once
        k = rank_guess + 1        # one more than the last rank, as in the paper
        while k <= k_max:
            u, sigma, vh = linalg.svds(Y_sparse, k = k)
            if sigma.min() <= tau:
                return u, sigma, vh
            k += increment

        # The truncated solver cannot go further; fall back to an exact economy
        # SVD instead of retrying the same capped k forever.
        return np.linalg.svd(Y, full_matrices = False)

    Y = np.zeros(shape)
    rank = 0
    err = np.inf
    n_iter = 0
    while err >= epsilon and (max_iter is None or n_iter < max_iter):
        # Shrinkage operator on the leading part of the spectrum.
        u, sigma, vh = leading_svd(Y, rank)
        rank = int(np.count_nonzero(sigma > tau))
        X = (u * np.maximum(sigma - tau, 0)) @ vh

        # Residual on the observed entries only (the projection step).
        residual = observed - X[row_ind, col_ind]

        err = np.linalg.norm(residual) / observed_norm
        if verbose:
            print("Error is: ", err)

        # Step forward; unobserved entries of Y are never touched.
        Y[row_ind, col_ind] += stepsize * residual
        n_iter += 1

    # Final shrinkage with an exact economy SVD (a full SVD would allocate a
    # shape[0] x shape[0] U matrix for nothing).
    u, s, vh = np.linalg.svd(Y, full_matrices = False)
    return (u * np.maximum(s - tau, 0)) @ vh

In [132]:
# Fit the truncated-SVD solver on the full training set with threshold 10000
# and step size 1.99; the result is the dense prediction matrix used by the
# submission cell.
pred_matrix = singular_value_thresholding2(train_data, param = 10000, stepsize = 1.99)

Error is:  1.0
Error is:  1.0
Error is:  1.0
Error is:  0.7049881920503492
Error is:  0.5363885815842077
Error is:  0.48271987120531995
Error is:  0.44742632294615514
Error is:  0.4197623208120368
Error is:  0.39725733504870075
Error is:  0.3786496005068357
Error is:  0.3630839712567698
Error is:  0.34993582940492757
Error is:  0.3387354961752354
Error is:  0.3291231852021084
Error is:  0.32081906518951714
Error is:  0.31360255272246473
Error is:  0.307297670504084
Error is:  0.30064901964708973
Error is:  0.28978869835663773
Error is:  0.2855470612427575
Error is:  0.2824674559197408
Error is:  0.2798122500766365
Error is:  0.2774395770884058
Error is:  0.2752966468029135
Error is:  0.2733509824424347
Error is:  0.27157799772588187
Error is:  0.269957615004406
Error is:  0.2684729000061702
Error is:  0.26710932933242854
Error is:  0.2658543200562507
Error is:  0.2646968931753675
Error is:  0.2636274168241656
Error is:  0.2626374024081918
Error is:  0.2617193383647986
Error is:  0.2608

In [122]:
# Single candidate threshold, wrapped in an array for utils.k_fold_cv.
tao = np.array([10000])
# utils.k_fold_cv is a project helper (not shown); presumably it fits the given
# solver across the folds for each value in `tao` and returns one score per
# value — TODO confirm against utils.
rmse = utils.k_fold_cv(folds, tao, singular_value_thresholding2)

Error is:  1.0
Error is:  1.0
Error is:  1.0
Error is:  0.9664584612149036
Error is:  0.6465622616240232
Error is:  0.5422575551420934
Error is:  0.49433710568904593
Error is:  0.46224224766563166
Error is:  0.43700277526503967
Error is:  0.4160550659559526
Error is:  0.39829028803438865
Error is:  0.3830447701404567
Error is:  0.36985054848208493
Error is:  0.35835335450877476
Error is:  0.3482758334396521
Error is:  0.33939659809915246
Error is:  0.33153651940779233
Error is:  0.3245490827033815
Error is:  0.31831333702662457
Error is:  0.3127286174667486
Error is:  0.30771052816686006
Error is:  0.30318784369496055
Error is:  0.29331197240372364
Error is:  0.28822056844615607
Error is:  0.2850817403531234
Error is:  0.2825780065578252
Error is:  0.28037904087727633
Error is:  0.27838564890096196
Error is:  0.2765572733091442
Error is:  0.27487089037560625
Error is:  0.27331017264437474
Error is:  0.2718621434038272
Error is:  0.2705159119583066
Error is:  0.2692620823610642
Error is

Error is:  0.24947023694385143
Error is:  0.24919983439578236
Error is:  0.24822334127718568
Error is:  0.2472240738048801
Error is:  0.24676531469748747
Error is:  0.24610810015588364
Error is:  0.2448581905440511
Error is:  0.24429175990106303
Error is:  0.24394099546422915
Error is:  0.2429898631454824
Error is:  0.24193949464211845
Error is:  0.24144864989370557
Error is:  0.24113655254064434
Error is:  0.2408870567746213
Error is:  0.24066402340906473
Error is:  0.23979600828536704
Error is:  0.23931970617164222
Error is:  0.23895165347077849
Error is:  0.23861420930596605
Error is:  0.23821967141366449
Error is:  0.23733155051328372
Error is:  0.2362057779962753
Error is:  0.23544018132515893
Error is:  0.2343672689857269
Error is:  0.23336356083696133
Error is:  0.23233042224658354
Error is:  0.23171489722622052
Error is:  0.23082552609920673
Error is:  0.229574060108862
Error is:  0.22855340322050272
Error is:  0.22725755497606637
Error is:  0.22595963664905194
Error is:  0.224

In [123]:
# Display the score array returned by utils.k_fold_cv (one entry per threshold tried).
rmse

array([1.02983031])

In [133]:
# Build a submission file from the fitted prediction matrix.

# The sample submission lists the entries that have to be predicted.
sample = pd.read_csv('sampleSubmission.csv')

# Clean the sample with the project helper, then look up the fitted score for
# each requested entry.
temp = utils.predict_scores(test_set = utils.clean_df(sample), pred_matrix = pred_matrix)

# Copy the predictions back into the sample frame and write it out.
sample['Prediction'] = temp['Prediction']
sample.to_csv("svt_tao_10000_2.csv", index = False)