In [1]:
import numpy as np
import pandas as pd
import math
from tqdm.auto import tqdm

from scipy.sparse import linalg
from scipy.sparse import csr_matrix, csc_matrix
import scipy

import utils
import models

In [124]:
data = pd.read_csv('data_train.csv')
train_data = utils.clean_df(data)
train_data.head()

Unnamed: 0,row,col,Prediction
0,43,0,4
1,60,0,3
2,66,0,4
3,71,0,3
4,85,0,5


## Naive implementation

In [3]:
def singular_value_thresholding(data, param, stepsize = 1.99):
    
    tao = param
    
    # Initialize
    Y = np.zeros([10000,1000])
    X = np.zeros([10000,1000])
    epsilon = 0.001
    err = 10
    
    
    A = np.zeros([10000,1000])
    row_ind = data.loc[:,'row']
    col_ind = data.loc[:,'col']
    A[row_ind, col_ind] = data.loc[:,'Prediction']
    

    while err >= epsilon:
        
        # Shrinkage operator
        u, s, vh = np.linalg.svd(Y, full_matrices = False)
        s_diag = np.diag(s) # Represent singular values in matrix
        s_new = s_diag - tao
        s_new[s_new < 0] = 0
        X = np.dot(u, np.dot(s_new, vh))
        
        # Projection
        proj = np.zeros([10000,1000])
        proj[row_ind, col_ind] = (A - X)[row_ind, col_ind]
        
        # Print error
        err = np.linalg.norm((X - A)[row_ind, col_ind], ord = 2)/np.linalg.norm(A[row_ind, col_ind])
        print("Error is: ", err)
        
        # Step forward
        Y = Y + stepsize * proj
    
    # Do final shrinkage
    u, s, vh = np.linalg.svd(Y)
    s_diag = np.concatenate((np.diag(s), np.zeros((9000,1000))), axis = 0) # Represent singular values in matrix
    s_new = s_diag - tao
    s_new[s_new < 0] = 0
    pred_matrix = np.dot(u, np.dot(s_new, vh))
    
    return pred_matrix

In [63]:
pred_matrix = singular_value_thresholding(train_data, param = 500, stepsize = 1)

Error is:  1.0
Error is:  6.559022766870573
Error is:  55.64959924674157
Error is:  507.18693044358474
Error is:  4659.458933518212
Error is:  42843.039169483905
Error is:  393972.6705687631
Error is:  3622900.3200248885
Error is:  33315563.180187136
Error is:  306364179.7621323
Error is:  2817272257.2368183
Error is:  25907150728.148342
Error is:  238237698622.0659
Error is:  2190792868010.5957
Error is:  20146154106990.75
Error is:  185260565354706.56
Error is:  1703624269588837.2


KeyboardInterrupt: 

## Now try cross validation

In [4]:
fold_1 = pd.read_csv('fold_1.csv', index_col = 0)
fold_2 = pd.read_csv('fold_2.csv', index_col = 0)
fold_3 = pd.read_csv('fold_3.csv', index_col = 0)
fold_4 = pd.read_csv('fold_4.csv', index_col = 0)
fold_5 = pd.read_csv('fold_5.csv', index_col = 0)
folds = [fold_1, fold_2, fold_3, fold_4, fold_5]

In [5]:
tao = np.array([500])
rmse = utils.k_fold_cv(folds, tao, singular_value_thresholding)

## too slow...

Error is:  1.0
Error is:  0.6870735970403185
Error is:  0.4492089015930965
Error is:  0.29856914578031857
Error is:  0.20065124757081537
Error is:  0.14220487659291436
Error is:  0.10672356494066394
Error is:  0.08406386810732433
Error is:  0.06850590879025638
Error is:  0.057329718389121716
Error is:  0.04894787240711076
Error is:  0.04248530185496356
Error is:  0.037311496980030896
Error is:  0.03307990118679394
Error is:  0.029534959184097594
Error is:  0.026535630369319305
Error is:  0.02394646884335389
Error is:  0.021705454334242688


KeyboardInterrupt: 

## Same implementation as in paper

In [131]:
# Implementation as in paper by Cai, Candes & Shen

def singular_value_thresholding2(data, param, stepsize = 1.99):
    
    tao = param
    
    # Initialize
    Y = np.zeros([10000,1000])
    X = np.zeros([10000,1000])
    epsilon = 0.1 ## Tune
    err = 10
    r = 1 # r=0 in paper
    l = 5
    
    A = np.zeros([10000,1000])
    row_ind = data.loc[:,'row']
    col_ind = data.loc[:,'col']
    A[row_ind, col_ind] = data.loc[:,'Prediction']
    
    

    while err >= epsilon:
        
        # Shrinkage operator
        sigma_min = tao + 1 # Make sure to enter while loop
        s = max(r, 1)  # s = r +1 in paper
        while sigma_min > tao:            
            u, sigma, vh = linalg.svds(csc_matrix(Y), k = min(s, 999)) # Compute SVD with s first singular values
            sigma_min = min(sigma) # look at this, was s[0] before. Should be the smallest singular value
            s = s + l
        r = np.count_nonzero(sigma > tao) # Update r to be the number of singular values > tao
        
        sigma_diag = np.diag(sigma) # Represent singular values in matrix
        sigma_new = sigma_diag - tao
        sigma_new[sigma_new < 0] = 0
        X = np.dot(u, np.dot(sigma_new, vh))
        
        # Projection
        proj = np.zeros([10000,1000])
        proj[row_ind, col_ind] = (A - X)[row_ind, col_ind]
        
        # Print error
        err = np.linalg.norm((X - A)[row_ind, col_ind], ord = 2)/np.linalg.norm(A[row_ind, col_ind])
        print("Error is: ", err)
        
        # Step forward
        Y = Y + stepsize * proj
    
    # Do final shrinkage
    u, s, vh = np.linalg.svd(Y)
    s_diag = np.concatenate((np.diag(s), np.zeros((9000,1000))), axis = 0) # Represent singular values in matrix
    s_new = s_diag - tao
    s_new[s_new < 0] = 0
    pred_matrix = np.dot(u, np.dot(s_new, vh))
    
    return pred_matrix

In [132]:
pred_matrix = singular_value_thresholding2(train_data, param = 10000, stepsize = 1.99)

Error is:  1.0
Error is:  1.0
Error is:  1.0
Error is:  0.7049881920503492
Error is:  0.5363885815842077
Error is:  0.48271987120531995
Error is:  0.44742632294615514
Error is:  0.4197623208120368
Error is:  0.39725733504870075
Error is:  0.3786496005068357
Error is:  0.3630839712567698
Error is:  0.34993582940492757
Error is:  0.3387354961752354
Error is:  0.3291231852021084
Error is:  0.32081906518951714
Error is:  0.31360255272246473
Error is:  0.307297670504084
Error is:  0.30064901964708973
Error is:  0.28978869835663773
Error is:  0.2855470612427575
Error is:  0.2824674559197408
Error is:  0.2798122500766365
Error is:  0.2774395770884058
Error is:  0.2752966468029135
Error is:  0.2733509824424347
Error is:  0.27157799772588187
Error is:  0.269957615004406
Error is:  0.2684729000061702
Error is:  0.26710932933242854
Error is:  0.2658543200562507
Error is:  0.2646968931753675
Error is:  0.2636274168241656
Error is:  0.2626374024081918
Error is:  0.2617193383647986
Error is:  0.2608

In [122]:
tao = np.array([10000])
rmse = utils.k_fold_cv(folds, tao, singular_value_thresholding2)

Error is:  1.0
Error is:  1.0
Error is:  1.0
Error is:  0.9664584612149036
Error is:  0.6465622616240232
Error is:  0.5422575551420934
Error is:  0.49433710568904593
Error is:  0.46224224766563166
Error is:  0.43700277526503967
Error is:  0.4160550659559526
Error is:  0.39829028803438865
Error is:  0.3830447701404567
Error is:  0.36985054848208493
Error is:  0.35835335450877476
Error is:  0.3482758334396521
Error is:  0.33939659809915246
Error is:  0.33153651940779233
Error is:  0.3245490827033815
Error is:  0.31831333702662457
Error is:  0.3127286174667486
Error is:  0.30771052816686006
Error is:  0.30318784369496055
Error is:  0.29331197240372364
Error is:  0.28822056844615607
Error is:  0.2850817403531234
Error is:  0.2825780065578252
Error is:  0.28037904087727633
Error is:  0.27838564890096196
Error is:  0.2765572733091442
Error is:  0.27487089037560625
Error is:  0.27331017264437474
Error is:  0.2718621434038272
Error is:  0.2705159119583066
Error is:  0.2692620823610642
Error is

Error is:  0.24947023694385143
Error is:  0.24919983439578236
Error is:  0.24822334127718568
Error is:  0.2472240738048801
Error is:  0.24676531469748747
Error is:  0.24610810015588364
Error is:  0.2448581905440511
Error is:  0.24429175990106303
Error is:  0.24394099546422915
Error is:  0.2429898631454824
Error is:  0.24193949464211845
Error is:  0.24144864989370557
Error is:  0.24113655254064434
Error is:  0.2408870567746213
Error is:  0.24066402340906473
Error is:  0.23979600828536704
Error is:  0.23931970617164222
Error is:  0.23895165347077849
Error is:  0.23861420930596605
Error is:  0.23821967141366449
Error is:  0.23733155051328372
Error is:  0.2362057779962753
Error is:  0.23544018132515893
Error is:  0.2343672689857269
Error is:  0.23336356083696133
Error is:  0.23233042224658354
Error is:  0.23171489722622052
Error is:  0.23082552609920673
Error is:  0.229574060108862
Error is:  0.22855340322050272
Error is:  0.22725755497606637
Error is:  0.22595963664905194
Error is:  0.224

In [123]:
rmse

array([1.02983031])

In [133]:
# Try a submission

sample = pd.read_csv('sampleSubmission.csv')
temp = utils.clean_df(sample)

temp = utils.predict_scores(test_set = temp, pred_matrix = pred_matrix)
#temp.head()
sample['Prediction'] = temp['Prediction']
sample.to_csv("svt_tao_10000_2.csv", index = False)

## Fast randomized approximation
Paper by Oh et al

In [50]:
def gramschmidt(A):
    """
    Applies the Gram-Schmidt method to A
    and returns Q and R, so Q*R = A.
    """
    R = np.zeros((A.shape[1], A.shape[1]))
    Q = np.zeros(A.shape)
    for k in range(0, A.shape[1]):
        R[k, k] = np.sqrt(np.dot(A[:, k], A[:, k]))
        Q[:, k] = A[:, k]/R[k, k]
        for j in range(k+1, A.shape[1]):
            R[k, j] = np.dot(Q[:, k], A[:, j])
            A[:, j] = A[:, j] - R[k, j]*Q[:, k]
    return Q

In [58]:
def frsvt(A, tau, k, p, q, Q_tilda = 0): # q isn't used???
    m, n = A.shape
    l = k + p #>0
    if Q_tilda == 0:
        is_range_prop = False#
    # Find optimal (for NNM ~> lowest rank) X*
    # X* = SVT(A) ~=Q*SVT(B) to avoid SVD on A (mn)

    # Step 1: estimate orthonormal Q (mn) with k-rank << n
    if not is_range_prop:
        Sigma = np.random.multivariate_normal() #nl
        Y = A@Sigma #(ml)
        Q = sp.linalg.qr(Y, pivoting=True)# interface to lapack dgeqp3, also exists a low-level wrapper
    else:
        Sigma = np.random.multivariate_normal() # np
        Y = A@Sigma #Y(mp)
        ## Initialize Q via range propagation
            ## orthonormal biasis evolve slow
        Q_Y = gramschmidt(Y) #todo
        Q = concat_by_col(Q_tilda, Y)
    ## power interation
    eta = 2
    while eta > 0: #what is eta?
        Q = sp.linalg.qr(A@A.T@Q)
        eta -= 1
    #Step 2: SVT on B (~kn)
        ## SVD (by eigen decomposition if pos semi-def) + shrinkage on SV
    # C = WP = WVDV.T
    H, C = sp.linalg.qr(A.T@Q)
    # Form a pos semi-def, W(cpl, mn), P(cpl, nn, herm pos semi-def)
    W, P = sp.linalg.polar(C)
    V, D = sp.linalg.eig(P)
    Q_tilda = Q@V
    SS_D = np.sign(D) * np.max(np.abs(D)-tau, 0) #soft shrinkage
    SVT_A = Q_tilda@SS_D@(H@W@V).T
    return SVT_A, Q_tilda

In [59]:
frsvt(A = np.random.rand(3,3), Q_tilda = np.array([[1,0,0],[0,1,0],[0,0,1]]), tau = 2, k = 2, p = 1, q = 0)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

## SVT without SVD (by Cai & Osher)

In [115]:
np.random.seed(0)
matrix = np.random.multivariate_normal(mean = np.zeros(500), cov = np.eye(500), size=500)
matrix.shape

(500, 500)

In [127]:
def svt_without_svd(Y, tau, delta):
    m, n = Y.shape
    epsilon = 10**(-6)
    max_iter = 1000
    
    # Step 1: compute polar decomposition Y = WZ
    W, Z = scipy.linalg.polar(Y)
    
    # Step 2: project Z onto 2-norm ball
    
    # Save computations
    Z_squared_25 = 0.25*Z@Z
    Z_25 = 0.25*Z
    Id = np.eye(n)
    I_75 = 0.75 * Id
    I_tau = tau*Id
    
    st1 = time.time()
    Sigma, V = scipy.linalg.eig(Z)
    cond = np.bitwise_and(Sigma <= tau*(1+delta), Sigma >= tau*(1-delta))
    Sigma1 = np.zeros(V.shape)
    np.fill_diagonal(Sigma1, np.where(cond, Sigma, np.zeros(Sigma.shape)))
    V1 = np.zeros(V.shape)
    V1[:, cond] = V[:, cond]
    Z -= V1@Sigma1@V1.T
    error = epsilon * np.linalg.norm(Z, ord = 'fro')
    P = np.zeros((n,n))
    #print("eig decomp: {}".format(time.time()-st1))
    for i in np.arange(max_iter):
        #if i==0:
            #st2 = time.time()
        temp = 0.5*P + Z_25 + I_75 - np.linalg.inv(2*P - Z - I_tau)@(P - Z_squared_25 - I_75)
        if (np.linalg.norm(temp - P, ord = 'fro') <= error):
            break
        else:
            P = temp
        #if i==0:
            #print("one iteration: {}".format(time.time()-st2))
    proj_Z = P
    
    # Step 3: return SVT
    return Y - W@proj_Z
    

In [100]:
## Takes long and does not give correct results...

In [135]:
# Implementation as in paper by Cai, Candes & Shen

def singular_value_thresholding3(data, param, stepsize = 1.99):
    
    tao = param
    
    # Initialize
    Y = np.zeros([10000,1000])
    X = np.zeros([10000,1000])
    epsilon = 0.1 ## Tune
    err = 10
    r = 1 # r=0 in paper
    l = 5
    
    A = np.zeros([10000,1000])
    row_ind = data.loc[:,'row']
    col_ind = data.loc[:,'col']
    A[row_ind, col_ind] = data.loc[:,'Prediction']
    
    

    while err >= epsilon:
        
        # Shrinkage operator
        X = svt_without_svd(Y, tao, 0.1)
        
        # Projection
        proj = np.zeros([10000,1000])
        proj[row_ind, col_ind] = (A - X)[row_ind, col_ind]
        
        # Print error
        err = np.linalg.norm((X - A)[row_ind, col_ind], ord = 2)/np.linalg.norm(A[row_ind, col_ind])
        print("Error is: ", err)
        
        # Step forward
        Y = Y + stepsize * proj
    
    # Do final shrinkage
    u, s, vh = np.linalg.svd(Y)
    s_diag = np.concatenate((np.diag(s), np.zeros((9000,1000))), axis = 0) # Represent singular values in matrix
    s_new = s_diag - tao
    s_new[s_new < 0] = 0
    pred_matrix = np.dot(u, np.dot(s_new, vh))
    
    return pred_matrix

In [137]:
pred_matrix = singular_value_thresholding3(train_data, param = 10000, stepsize = 1.99)

Error is:  1.000043735354056
Error is:  0.11722745456664233
Error is:  0.030390312541884287


# Delete Below?

In [117]:
tau = np.sqrt(500)/2
delta = 0.5
m1 = svt_without_svd(matrix, tau, delta)

  a.flat[:end:step] = val


eig decomp: 0.10676002502441406
one iteration: 0.011145830154418945


In [119]:
np.linalg.matrix_rank(m1)

500

In [120]:
st = time.time()
np.linalg.svd(matrix, full_matrices = False)
print(time.time()-st)

0.047998905181884766


In [26]:
u, s, vh = np.linalg.svd(matrix, full_matrices = False)
s_diag = np.diag(s) # Represent singular values in matrix
s_new = s_diag - tau
s_new[s_new < 0] = 0
m2 = np.dot(u, np.dot(s_new, vh))

In [29]:
m2

array([[0.46834937, 0.47240482, 0.46675819, ..., 0.46871294, 0.46742548,
        0.46567567],
       [0.45623515, 0.4601857 , 0.45468512, ..., 0.45658931, 0.45533515,
        0.4536306 ],
       [0.47580777, 0.4799278 , 0.47419125, ..., 0.47617713, 0.47486917,
        0.47309149],
       ...,
       [0.47353118, 0.47763149, 0.47192239, ..., 0.47389876, 0.47259706,
        0.47082789],
       [0.46755861, 0.47160721, 0.46597012, ..., 0.46792156, 0.46663628,
        0.46488942],
       [0.44974799, 0.45364237, 0.44822001, ..., 0.45009711, 0.44886079,
        0.44718048]])