In [1]:
import pandas as pd
import numpy as np

In [2]:
# read data
train = pd.read_csv("data_train.csv")
missing = pd.read_csv("sampleSubmission.csv")
nusers = 10000
nmovies = 1000

In [3]:
# extract row and column id from train
train_rowid = train['Id'].apply(lambda x: int(x.split("_")[0][1:]))
train_colid = train['Id'].apply(lambda x: int(x.split("_")[1][1:]))

In [4]:
# add row/col id as new columns
train['rowid'] = train_rowid
train['colid'] = train_colid

In [5]:
# calc missing row ids per column
## For each movie(col), build a set {1,..., num_users} subtracting the 
## train set {all users rated the movie}
missing = train.groupby(by='colid').apply(lambda x: list(set(np.arange(1, nusers+1)) \
                                                 - set(np.array(x['rowid']))))

In [12]:
# construct vectors of missing columns and rows
## TODO: forloop is still slow, haven't found a parallel way
missing_explode = missing.explode()
missing_colid = np.array(missing_explode.index)
missing_rowid = np.array(missing_explode)

In [13]:
# construct full dataframe from train and missing arrays
ratings = pd.DataFrame()
ratings['rowid'] = np.concatenate((train_rowid, missing_rowid))
ratings['colid'] = np.concatenate((train_colid, missing_colid))
ratings['rating'] = np.concatenate((train['Prediction'], np.zeros(len(missing_rowid))))

In [14]:
ratings.shape

(10000000, 3)

In [15]:
# Sort by row and then column, 'prediction' vector ready to be reshaped
a = np.array(ratings.sort_values(by=['rowid', 'colid'])['rating'])

In [16]:
full = np.reshape(a, newshape=(nusers, nmovies))

In [17]:
# Test
train.head()

Unnamed: 0,Id,Prediction,rowid,colid
0,r44_c1,4,44,1
1,r61_c1,3,61,1
2,r67_c1,4,67,1
3,r72_c1,3,72,1
4,r86_c1,5,86,1


In [18]:
assert full[43, 1]==4
assert full[60, 0]==3
assert full[66, 0]==4
assert full[71, 0]==3
assert full[85, 0]==5

In [98]:
X_sample = np.array([[1, 0, 3, 5], [0, 5, 0, 0], [0, 4, 2, 0]])
D, N = X_sample.shape
K = 2
U_sample, Z_sample = np.ones((D, K)), np.ones((N, K))
X_sample

array([[1, 0, 3, 5],
       [0, 5, 0, 0],
       [0, 4, 2, 0]])

In [30]:
def mse(X, U, Z): 
    obs_idx = np.nonzero(X)
    num_obs = obs_idx[0].shape[0]
    return np.sum(np.square(X[obs_idx] - (U@Z.T)[obs_idx])/2)/num_obs
‘’‘    
def mse_gradient(X, U, Z):
    obs_idx = np.nonzero(X)
    num_obs = obs_idx[0].shape[0]
    grad_U = (np.sum((X[obs_idx] - (U@Z.T)[obs_idx]))/num_obs)*Z
    grad_Z = (np.sum((X[obs_idx] - (U@Z.T)[obs_idx]))/num_obs)*U
    return np.vstack((grad_U, grad_Z))


In [99]:
def sgd(X, U, Z, gamma=0.1, max_iter=10):
    D, N = U.shape[0], Z.shape[0]
    objectives = []
    UZ = U@Z.T
    nonzero_idx = np.nonzero(X)
    while max_iter > 0:
        i = np.random.randint(nonzero_idx[0].shape[0])
        d, n = nonzero_idx[0][i], nonzero_idx[1][i]
        deviation = -(X[d,n] - UZ[d,n])
        sg_U, sg_Z = deviation*Z[n,:], deviation*U[d,:]
        U[d,:] = U[d,:] - gamma*sg_U
        Z[n,:] = Z[n,:] - gamma*sg_Z
        obj = mse(X, U, Z)
        objectives.append(obj)
        print("Loss: {}".format(obj))
        max_iter -=1
    return U, Z, objectives

In [112]:
U_final, Z_final, obj = sgd(X_sample, U_sample, Z_sample, gamma=0.01)

Loss: 0.15662004815392405
Loss: 0.1517754209950238
Loss: 0.15046823194392878
Loss: 0.14615117915396664
Loss: 0.14530823885199864
Loss: 0.14135220292263795
Loss: 0.1378068237709942
Loss: 0.13700290144063762
Loss: 0.13563784100131807
Loss: 0.13472214265483282


In [97]:
U_final@Z_final.T

array([[2.09509258, 3.75206   , 3.8992562 , 3.8473046 ],
       [1.88734   , 3.38      , 3.5126    , 3.4658    ],
       [1.4518    , 2.6       , 2.702     , 2.666     ]])

TODO:
1. Modify U, Z inplace or not?