In [1]:
import numpy as np 
import pandas as pd 

## Settings

In [2]:
train_path = '../data/data_train.csv'
submission_path = '../data/sampleSubmission.csv'
save_path = '../data/submission.csv'
number_of_users = 10000
number_of_movies = 1000

## Read data    

In [3]:
def read_data(impute_value=np.nan):
    data_pd = pd.read_csv(train_path) 
    users, movies = [np.squeeze(arr) for arr in np.split(data_pd.Id.str.extract('r(\d+)_c(\d+)').values.astype(int) - 1, 2, axis=-1)]
    predictions = data_pd.Prediction.values
    # Create data matrix
    data = np.full((number_of_users , number_of_movies), impute_value)
    for user, movie ,pred in zip(users, movies, predictions): 
        data[user][movie] = pred
    return data 

## Write submission file

In [15]:
def clip_data(data, clip_high=5, clip_low=1):
    data[data > clip_high] = clip_high
    data[data < clip_low] = clip_low
    return data

In [16]:
def write_submission(data, save_path=save_path):
    # clip data first 
    data = clip_data(data)
    # write submission
    data_pd = pd.read_csv(submission_path) 
    test_users, test_movies = [np.squeeze(arr) for arr in np.split(data_pd.Id.str.extract('r(\d+)_c(\d+)').values.astype(int) - 1, 2, axis=-1)]
    with open(save_path, 'w') as f: 
        f.write('Id,Prediction\n')
        for (user, movie) in zip(test_users, test_movies): 
            f.write("r{}_c{},{}\n".format(user + 1, movie + 1, data[user, movie]))

# Baseline algorithms 

In [5]:
data = read_data(impute_value=0)

## Singular Value Decomposition (SVD)

In [74]:
def svd_reconstruction(data, k=3, return_components=False):
    # SVD decomposition
    U, s, Vt = np.linalg.svd(data, full_matrices=False)
    # Using the top k eigenvalues
    S = np.zeros((number_of_movies, number_of_movies)) 
    S[:k, :k] = np.diag(s[:k])
    # Reconstruct matrix
    if return_components:
        return U, S, Vt
    else:
        return U.dot(S).dot(Vt)

## Alternating Least Squares (ALS)

In [75]:
def runALS(A, R, n_factors, n_iterations, lambda_):
    '''
    Runs Alternating Least Squares algorithm in order to calculate matrix.

    :param A: User-Item Matrix with ratings
    :param R: User-Item Matrix with 1 if there is a rating or 0 if not
    :param n_factors: How many factors each of user and item matrix will consider
    :param n_iterations: How many times to run algorithm
    :param lambda_: Regularization parameter
    :return:
    '''
    print ("Initiating ")
    lambda_ = 0.1
    n_factors = 3 
    n, m = A.shape; 
    n_iterations = 20
    Users = 5 * np.random.rand(n, n_factors)
    Items = 5 * np.random.rand(n_factors, m)

    def get_error(A, Users, Items, R):
        # This calculates the MSE of nonzero elements
        return np.sum((R * (A - np.dot(Users, Items))) ** 2) / np.sum(R)

    MSE_List = []

    print("Starting Iterations")
    for iter in range(n_iterations):
        for i, Ri in enumerate(R):
            Users[i] = np.linalg.solve(np.dot(Items, np.dot(np.diag(Ri), Items.T)) + lambda_ * np.eye(n_factors),
                                       np.dot(Items, np.dot(np.diag(Ri), A[i].T))).T
        print("Error after solving for User Matrix:", get_error(A, Users, Items, R))

        for j, Rj in enumerate(R.T):
            Items[:,j] = np.linalg.solve(np.dot(Users.T, np.dot(np.diag(Rj), Users)) + lambda_ * np.eye(n_factors),
                                     np.dot(Users.T, np.dot(np.diag(Rj), A[:, j])))
        print("Error after solving for Item Matrix:", get_error(A, Users, Items, R))

        MSE_List.append(get_error(A, Users, Items, R))
        print('th iteration is complete...', iter)

    print(MSE_List) 
    return 

## Neural Collaborative Filtering (NCF)

## Baseline Algorithm for Grade 4.0
#### 1. Normalize along items A_ij = (A_ij - mean(A_j)) / std(A_j)
#### 2. Impute missing data with 0 
#### 3. SVD with k = 3 
#### 4. Use SVD result for ALS init
#### 5. ALS with k = 3, lambda = 0.1 for 20 its.

In [90]:
# get data imputed with constant 0 
data = read_data(impute_value=0)
# normalize column-wise 
data_norm = (data - data.mean(axis=0)) / data.max(axis=0)
# svd with k = 3 
svd_rec = svd_reconstruction(data_norm)
# obtain indices of nonzero elements of svd_rec 
nonzero_indices = np.nonzero(svd_rec)
# run ALS 
R = np.zeros(svd_rec.shape)
runALS(svd_rec, R, n_factors = 3, n_iterations = 20, lambda_ = .1)

Initiating 
Starting Iterations


  return np.sum((R * (A - np.dot(Users, Items))) ** 2) / np.sum(R)


Error after solving for User Matrix: nan


KeyboardInterrupt: 

## Non-negative Matrix Factorization (NMF)
#### RMSE: 3.059

In [11]:
from sklearn.decomposition import NMF
model = NMF(init='nndsvd', random_state=0)
W = model.fit_transform(data)
H = model.components_

In [17]:
write_submission(W, save_path='../data/submission_nmf.csv')

## SVD++