In [1]:
import scipy.io
import numpy as np
from scipy.sparse import csr_matrix
from proj_L1 import proj_L1
from time import time

In [2]:
def proj_nuc(Z, kappa):
    #PROJNUC This function implements the projection onto nuclear norm ball.
    
    u,s,vh = np.linalg.svd(Z, full_matrices=False)
    s = proj_L1(s, kappa)

    return u @ np.diag(s) @ vh

In [3]:
def run100k():
    data = scipy.io.loadmat('./dataset/ml-100k/ub_base')  # load 100k dataset

    Rating = data['Rating'].flatten()
    UserID = data['UserID'].flatten() - 1  # Python indexing starts from 0 whereas Matlab from 1
    MovID = data['MovID'].flatten() - 1    # Python indexing starts from 0 whereas Matlab from 1

    nM = np.amax(data['MovID'])
    nU = np.amax(data['UserID'])

    Z = csr_matrix((Rating, (MovID, UserID)),shape=(nM, nU),dtype=float).toarray()
    kappa = 5000

    tstart = time()
    Z_proj = proj_nuc(Z, kappa)
    elapsed = time() - tstart
    print('proj for 100k data takes {} sec'.format(elapsed))

In [4]:
# NOTE: This one can take few minutes!
def run1M():
    data = scipy.io.loadmat('./dataset/ml-1m/ml1m_base')  # load 1M dataset

    Rating = data['Rating'].flatten()
    UserID = data['UserID'].flatten() - 1  # Python indexing starts from 0 whereas Matlab from 1
    MovID = data['MovID'].flatten() - 1    # Python indexing starts from 0 whereas Matlab from 1

    nM = np.amax(data['MovID'])
    nU = np.amax(data['UserID'])

    Z = csr_matrix((Rating, (MovID, UserID)),shape=(nM, nU),dtype=float).toarray()
    kappa = 5000

    tstart = time()
    Z_proj = proj_nuc(Z, kappa)
    elapsed = time() - tstart
    print('proj for 1M data takes {} sec'.format(elapsed))

In [5]:
for i in range(5):
    print(f"Run #{i+1}")
    run100k()
    run1M()

Run #1
proj for 100k data takes 0.31986260414123535 sec
proj for 1M data takes 25.209717512130737 sec
Run #2
proj for 100k data takes 0.29839134216308594 sec
proj for 1M data takes 24.565474033355713 sec
Run #3
proj for 100k data takes 0.2955615520477295 sec
proj for 1M data takes 24.431020259857178 sec
Run #4
proj for 100k data takes 0.29817795753479004 sec
proj for 1M data takes 24.41025185585022 sec
Run #5
proj for 100k data takes 0.294313907623291 sec
proj for 1M data takes 24.31813359260559 sec
