In [1]:
import numpy as np
import pandas as pd
import math
from tqdm.auto import tqdm

from scipy.sparse import linalg
from scipy.sparse import csr_matrix, csc_matrix
import scipy

import utils
import models

In [2]:
data = pd.read_csv('data_train.csv')
train_data = utils.clean_df(data)
train_data.head()

Unnamed: 0,row,col,Prediction
0,43,0,4
1,60,0,3
2,66,0,4
3,71,0,3
4,85,0,5


## Naive implementation

In [3]:
def singular_value_thresholding(data, param, stepsize = 1.99):
    
    tao = param
    
    # Initialize
    Y = np.zeros([10000,1000])
    X = np.zeros([10000,1000])
    epsilon = 0.001
    err = 10
    
    
    A = np.zeros([10000,1000])
    row_ind = data.loc[:,'row']
    col_ind = data.loc[:,'col']
    A[row_ind, col_ind] = data.loc[:,'Prediction']
    

    while err >= epsilon:
        
        # Shrinkage operator
        u, s, vh = np.linalg.svd(Y, full_matrices = False)
        s_diag = np.diag(s) # Represent singular values in matrix
        s_new = s_diag - tao
        s_new[s_new < 0] = 0
        X = np.dot(u, np.dot(s_new, vh))
        
        # Projection
        proj = np.zeros([10000,1000])
        proj[row_ind, col_ind] = (A - X)[row_ind, col_ind]
        
        # Print error
        err = np.linalg.norm((X - A)[row_ind, col_ind], ord = 2)/np.linalg.norm(A[row_ind, col_ind])
        print("Error is: ", err)
        
        # Step forward
        Y = Y + stepsize * proj
    
    # Do final shrinkage
    u, s, vh = np.linalg.svd(Y)
    s_diag = np.concatenate((np.diag(s), np.zeros((9000,1000))), axis = 0) # Represent singular values in matrix
    s_new = s_diag - tao
    s_new[s_new < 0] = 0
    pred_matrix = np.dot(u, np.dot(s_new, vh))
    
    return pred_matrix

In [63]:
pred_matrix = singular_value_thresholding(train_data, param = 500, stepsize = 1)

Error is:  1.0
Error is:  6.559022766870573
Error is:  55.64959924674157
Error is:  507.18693044358474
Error is:  4659.458933518212
Error is:  42843.039169483905
Error is:  393972.6705687631
Error is:  3622900.3200248885
Error is:  33315563.180187136
Error is:  306364179.7621323
Error is:  2817272257.2368183
Error is:  25907150728.148342
Error is:  238237698622.0659
Error is:  2190792868010.5957
Error is:  20146154106990.75
Error is:  185260565354706.56
Error is:  1703624269588837.2


KeyboardInterrupt: 

## Now try cross validation

In [4]:
fold_1 = pd.read_csv('fold_1.csv', index_col = 0)
fold_2 = pd.read_csv('fold_2.csv', index_col = 0)
fold_3 = pd.read_csv('fold_3.csv', index_col = 0)
fold_4 = pd.read_csv('fold_4.csv', index_col = 0)
fold_5 = pd.read_csv('fold_5.csv', index_col = 0)
folds = [fold_1, fold_2, fold_3, fold_4, fold_5]

In [5]:
tao = np.array([500])
rmse = utils.k_fold_cv(folds, tao, singular_value_thresholding)

## too slow...

Error is:  1.0
Error is:  0.6870735970403185
Error is:  0.4492089015930965
Error is:  0.29856914578031857
Error is:  0.20065124757081537
Error is:  0.14220487659291436
Error is:  0.10672356494066394
Error is:  0.08406386810732433
Error is:  0.06850590879025638
Error is:  0.057329718389121716
Error is:  0.04894787240711076
Error is:  0.04248530185496356
Error is:  0.037311496980030896
Error is:  0.03307990118679394
Error is:  0.029534959184097594
Error is:  0.026535630369319305
Error is:  0.02394646884335389
Error is:  0.021705454334242688


KeyboardInterrupt: 

## Same implementation as in paper

In [131]:
# Implementation as in paper by Cai, Candes & Shen

def singular_value_thresholding2(data, param, stepsize = 1.99):
    
    tao = param
    
    # Initialize
    Y = np.zeros([10000,1000])
    X = np.zeros([10000,1000])
    epsilon = 0.1 ## Tune
    err = 10
    r = 1 # r=0 in paper
    l = 5
    
    A = np.zeros([10000,1000])
    row_ind = data.loc[:,'row']
    col_ind = data.loc[:,'col']
    A[row_ind, col_ind] = data.loc[:,'Prediction']
    
    

    while err >= epsilon:
        
        # Shrinkage operator
        sigma_min = tao + 1 # Make sure to enter while loop
        s = max(r, 1)  # s = r +1 in paper
        while sigma_min > tao:            
            u, sigma, vh = linalg.svds(csc_matrix(Y), k = min(s, 999)) # Compute SVD with s first singular values
            sigma_min = min(sigma) # look at this, was s[0] before. Should be the smallest singular value
            s = s + l
        r = np.count_nonzero(sigma > tao) # Update r to be the number of singular values > tao
        
        sigma_diag = np.diag(sigma) # Represent singular values in matrix
        sigma_new = sigma_diag - tao
        sigma_new[sigma_new < 0] = 0
        X = np.dot(u, np.dot(sigma_new, vh))
        
        # Projection
        proj = np.zeros([10000,1000])
        proj[row_ind, col_ind] = (A - X)[row_ind, col_ind]
        
        # Print error
        err = np.linalg.norm((X - A)[row_ind, col_ind], ord = 2)/np.linalg.norm(A[row_ind, col_ind])
        print("Error is: ", err)
        
        # Step forward
        Y = Y + stepsize * proj
    
    # Do final shrinkage
    u, s, vh = np.linalg.svd(Y)
    s_diag = np.concatenate((np.diag(s), np.zeros((9000,1000))), axis = 0) # Represent singular values in matrix
    s_new = s_diag - tao
    s_new[s_new < 0] = 0
    pred_matrix = np.dot(u, np.dot(s_new, vh))
    
    return pred_matrix

In [132]:
pred_matrix = singular_value_thresholding2(train_data, param = 10000, stepsize = 1.99)

Error is:  1.0
Error is:  1.0
Error is:  1.0
Error is:  0.7049881920503492
Error is:  0.5363885815842077
Error is:  0.48271987120531995
Error is:  0.44742632294615514
Error is:  0.4197623208120368
Error is:  0.39725733504870075
Error is:  0.3786496005068357
Error is:  0.3630839712567698
Error is:  0.34993582940492757
Error is:  0.3387354961752354
Error is:  0.3291231852021084
Error is:  0.32081906518951714
Error is:  0.31360255272246473
Error is:  0.307297670504084
Error is:  0.30064901964708973
Error is:  0.28978869835663773
Error is:  0.2855470612427575
Error is:  0.2824674559197408
Error is:  0.2798122500766365
Error is:  0.2774395770884058
Error is:  0.2752966468029135
Error is:  0.2733509824424347
Error is:  0.27157799772588187
Error is:  0.269957615004406
Error is:  0.2684729000061702
Error is:  0.26710932933242854
Error is:  0.2658543200562507
Error is:  0.2646968931753675
Error is:  0.2636274168241656
Error is:  0.2626374024081918
Error is:  0.2617193383647986
Error is:  0.2608

In [122]:
tao = np.array([10000])
rmse = utils.k_fold_cv(folds, tao, singular_value_thresholding2)

Error is:  1.0
Error is:  1.0
Error is:  1.0
Error is:  0.9664584612149036
Error is:  0.6465622616240232
Error is:  0.5422575551420934
Error is:  0.49433710568904593
Error is:  0.46224224766563166
Error is:  0.43700277526503967
Error is:  0.4160550659559526
Error is:  0.39829028803438865
Error is:  0.3830447701404567
Error is:  0.36985054848208493
Error is:  0.35835335450877476
Error is:  0.3482758334396521
Error is:  0.33939659809915246
Error is:  0.33153651940779233
Error is:  0.3245490827033815
Error is:  0.31831333702662457
Error is:  0.3127286174667486
Error is:  0.30771052816686006
Error is:  0.30318784369496055
Error is:  0.29331197240372364
Error is:  0.28822056844615607
Error is:  0.2850817403531234
Error is:  0.2825780065578252
Error is:  0.28037904087727633
Error is:  0.27838564890096196
Error is:  0.2765572733091442
Error is:  0.27487089037560625
Error is:  0.27331017264437474
Error is:  0.2718621434038272
Error is:  0.2705159119583066
Error is:  0.2692620823610642
Error is

Error is:  0.24947023694385143
Error is:  0.24919983439578236
Error is:  0.24822334127718568
Error is:  0.2472240738048801
Error is:  0.24676531469748747
Error is:  0.24610810015588364
Error is:  0.2448581905440511
Error is:  0.24429175990106303
Error is:  0.24394099546422915
Error is:  0.2429898631454824
Error is:  0.24193949464211845
Error is:  0.24144864989370557
Error is:  0.24113655254064434
Error is:  0.2408870567746213
Error is:  0.24066402340906473
Error is:  0.23979600828536704
Error is:  0.23931970617164222
Error is:  0.23895165347077849
Error is:  0.23861420930596605
Error is:  0.23821967141366449
Error is:  0.23733155051328372
Error is:  0.2362057779962753
Error is:  0.23544018132515893
Error is:  0.2343672689857269
Error is:  0.23336356083696133
Error is:  0.23233042224658354
Error is:  0.23171489722622052
Error is:  0.23082552609920673
Error is:  0.229574060108862
Error is:  0.22855340322050272
Error is:  0.22725755497606637
Error is:  0.22595963664905194
Error is:  0.224

In [123]:
rmse

array([1.02983031])

In [133]:
# Try a submission

sample = pd.read_csv('sampleSubmission.csv')
temp = utils.clean_df(sample)

temp = utils.predict_scores(test_set = temp, pred_matrix = pred_matrix)
#temp.head()
sample['Prediction'] = temp['Prediction']
sample.to_csv("svt_tao_10000_2.csv", index = False)

## Fast randomized approximation
Paper by Oh et al

In [50]:
def gramschmidt(A):
    """
    Applies the Gram-Schmidt method to A
    and returns Q and R, so Q*R = A.
    """
    R = np.zeros((A.shape[1], A.shape[1]))
    Q = np.zeros(A.shape)
    for k in range(0, A.shape[1]):
        R[k, k] = np.sqrt(np.dot(A[:, k], A[:, k]))
        Q[:, k] = A[:, k]/R[k, k]
        for j in range(k+1, A.shape[1]):
            R[k, j] = np.dot(Q[:, k], A[:, j])
            A[:, j] = A[:, j] - R[k, j]*Q[:, k]
    return Q

In [58]:
def frsvt(A, tau, k, p, q, Q_tilda = 0): # q isn't used???
    m, n = A.shape
    l = k + p #>0
    if Q_tilda == 0:
        is_range_prop = False#
    # Find optimal (for NNM ~> lowest rank) X*
    # X* = SVT(A) ~=Q*SVT(B) to avoid SVD on A (mn)

    # Step 1: estimate orthonormal Q (mn) with k-rank << n
    if not is_range_prop:
        Sigma = np.random.multivariate_normal() #nl
        Y = A@Sigma #(ml)
        Q = sp.linalg.qr(Y, pivoting=True)# interface to lapack dgeqp3, also exists a low-level wrapper
    else:
        Sigma = np.random.multivariate_normal() # np
        Y = A@Sigma #Y(mp)
        ## Initialize Q via range propagation
            ## orthonormal biasis evolve slow
        Q_Y = gramschmidt(Y) #todo
        Q = concat_by_col(Q_tilda, Y)
    ## power interation
    eta = 2
    while eta > 0: #what is eta?
        Q = sp.linalg.qr(A@A.T@Q)
        eta -= 1
    #Step 2: SVT on B (~kn)
        ## SVD (by eigen decomposition if pos semi-def) + shrinkage on SV
    # C = WP = WVDV.T
    H, C = sp.linalg.qr(A.T@Q)
    # Form a pos semi-def, W(cpl, mn), P(cpl, nn, herm pos semi-def)
    W, P = sp.linalg.polar(C)
    V, D = sp.linalg.eig(P)
    Q_tilda = Q@V
    SS_D = np.sign(D) * np.max(np.abs(D)-tau, 0) #soft shrinkage
    SVT_A = Q_tilda@SS_D@(H@W@V).T
    return SVT_A, Q_tilda

In [59]:
frsvt(A = np.random.rand(3,3), Q_tilda = np.array([[1,0,0],[0,1,0],[0,0,1]]), tau = 2, k = 2, p = 1, q = 0)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

## SVT without SVD (by Cai & Osher)

In [6]:
matrix = np.random.rand(10000,1000)

In [7]:
w, z = scipy.linalg.polar(matrix)

In [8]:
w.shape, z.shape

((10000, 1000), (1000, 1000))

In [13]:
def svt_without_svd(Y, tau):
    
    epsilon = 10**(-6)
    max_iter = 1000
    
    # Step 1: compute polar decomposition
    W, Z = scipy.linalg.polar(Y)
    
    # Step 2: project onto 2-norm ball
    
    # Save computations
    Z_squared = np.dot(Z,Z)
    Id = np.eye(Z.shape[1])
    error = epsilon * np.linalg.norm(Z, ord = 'fro')
    P = np.zeros([Z.shape[1], Z.shape[1]])
    
    for i in np.arange(max_iter):
        print(i)
        temp = 0.5*P + 0.25*Z + 0.75*Id - np.dot(np.linalg.inv(2*P - Z - tau*Id), P - 0.25*Z_squared - 0.75*Id)
        if (np.linalg.norm(temp - P, ord = 'fro') <= error):
            break
        else:
            P = temp
    
    proj_Z = P
    
    # Step 3: return SVT
    return Y - np.dot(W, proj_Z)
    

In [30]:
## Takes long and does not give correct results...

In [24]:
tau = 100

m1 = svt_without_svd(matrix, tau)
m1

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

array([[ 4.48888812e+07,  4.52775750e+07,  4.47363742e+07, ...,
         4.49237269e+07,  4.48003306e+07,  4.46326206e+07],
       [-2.94711110e+08, -2.97263018e+08, -2.93709850e+08, ...,
        -2.94939882e+08, -2.94129744e+08, -2.93028669e+08],
       [ 9.32247592e+07,  9.40319943e+07,  9.29080348e+07, ...,
         9.32971264e+07,  9.30408583e+07,  9.26925596e+07],
       ...,
       [ 8.54597213e+07,  8.61997187e+07,  8.51693783e+07, ...,
         8.55260605e+07,  8.52911379e+07,  8.49718505e+07],
       [-2.95247588e+08, -2.97804142e+08, -2.94244507e+08, ...,
        -2.95476778e+08, -2.94665165e+08, -2.93562085e+08],
       [ 6.89944978e+06,  6.95919233e+06,  6.87600977e+06, ...,
         6.90480566e+06,  6.88583949e+06,  6.86006261e+06]])

In [25]:
np.linalg.matrix_rank(m1)

999

In [12]:
np.linalg.svd(matrix, full_matrices = False)

(array([[-0.00999283, -0.01480008, -0.00232517, ...,  0.00568939,
         -0.00195651, -0.00497401],
        [-0.00973436,  0.00211894, -0.00058523, ..., -0.00284311,
          0.00270593, -0.00091853],
        [-0.01015197,  0.00319142, -0.00427293, ..., -0.0182403 ,
          0.01185327,  0.0012075 ],
        ...,
        [-0.01010339, -0.00484237, -0.00717768, ...,  0.0093838 ,
          0.01024332, -0.00540563],
        [-0.00997596,  0.01743875, -0.00593837, ..., -0.00024727,
          0.01169877,  0.00383054],
        [-0.00959595, -0.00049733, -0.00151274, ...,  0.00654988,
          0.00348116,  0.00608377]]),
 array([1582.30381962,   37.96443425,   37.86932916,   37.7642661 ,
          37.65236197,   37.59908041,   37.55029446,   37.47372743,
          37.4498594 ,   37.42068059,   37.36128846,   37.31705191,
          37.26316097,   37.25095251,   37.20327359,   37.17388219,
          37.1038561 ,   37.08034047,   37.0520557 ,   37.01520146,
          36.97868907,   36.89533

In [26]:
u, s, vh = np.linalg.svd(matrix, full_matrices = False)
s_diag = np.diag(s) # Represent singular values in matrix
s_new = s_diag - tau
s_new[s_new < 0] = 0
m2 = np.dot(u, np.dot(s_new, vh))

In [29]:
m2

array([[0.46834937, 0.47240482, 0.46675819, ..., 0.46871294, 0.46742548,
        0.46567567],
       [0.45623515, 0.4601857 , 0.45468512, ..., 0.45658931, 0.45533515,
        0.4536306 ],
       [0.47580777, 0.4799278 , 0.47419125, ..., 0.47617713, 0.47486917,
        0.47309149],
       ...,
       [0.47353118, 0.47763149, 0.47192239, ..., 0.47389876, 0.47259706,
        0.47082789],
       [0.46755861, 0.47160721, 0.46597012, ..., 0.46792156, 0.46663628,
        0.46488942],
       [0.44974799, 0.45364237, 0.44822001, ..., 0.45009711, 0.44886079,
        0.44718048]])