In [29]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from scipy.sparse import csc_matrix
import tqdm
import jax 
from jax.config import config
config.update('jax_enable_x64', True)  # often needed for LBFGS that requires high-precision

import jax.numpy as jnp

In [32]:
# mount the Google Drive
from google.colab import drive

drive.mount("/content/drive")
%cd /content/drive/MyDrive/DSA4212/assignment_2
%ls

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/DSA4212/assignment_2
 [0m[01;34massignment_2_data[0m/
 dsa4212_2022_assignment_2-2.pdf
'Extension using collaborative filtering'
'Factor model from scratch.ipynb'
'Factor model using lecture and extensions'
'Normal baseline model'
 train_data.csv
 tutorial_matrix_factorization_CLASS.ipynb
 tutorial_matrix_factorization.ipynb
 tutorial_newton.ipynb
 tutorial_reconstruction.ipynb
 valid_data.csv


In [30]:
anime_data = pd.read_csv('assignment_2_data/assignment_2_anime.csv')
train_data = pd.read_csv('assignment_2_data/assignment_2_ratings_train.csv')
test_data = pd.read_csv('assignment_2_data/assignment_2_ratings_test.csv')

## Splitting training and validation data

In [31]:
train, valid = train_test_split(train_data, test_size = 0.2)

# Reset index for train and valid
train = train.reset_index()[['user_id', 'anime_id', 'rating']]
valid = valid.reset_index()[['user_id', 'anime_id', 'rating']]

## Encoding columns

In [33]:
def encode_column(column):
    keys = column.unique()
    key_to_id = {key:idx for idx,key in enumerate(keys)}
    return key_to_id, np.array([key_to_id[x] for x in column]), len(keys)

In [34]:
def encode_df(data):
    anime_ids, data['anime_id'], num_anime = encode_column(data['anime_id'])
    user_ids, data['user_id'], num_users = encode_column(data['user_id'])
    return data, num_users, num_anime, user_ids, anime_ids

In [35]:
train_data, num_users, num_anime, user_ids, anime_ids = encode_df(train_data)


In [36]:
def create_embeddings(n, K):
    """
    Creates a random numpy matrix of shape n, K with uniform values in (0, 11/K)
    n: number of items/users
    K: number of factors in the embedding 
    """
    return 11*np.random.random((n, K)) / K

## Create sparse matrix

In [37]:
def sparse_matrix_init(arraydata, rows, cols, colname): 
    return csc_matrix((arraydata[colname].values, (arraydata['user_id'].values, arraydata['anime_id'].values)), shape=(rows, cols))

## Prediction without doing U x V transpose

In [38]:
def predict(data, encoded_user, encoded_anime):
    data['prediction'] = np.sum(np.multiply(encoded_anime[data['anime_id']], encoded_user[data['user_id']]), axis = 1)
    return data

In [39]:
def mse(data, encoded_user, encoded_anime):
    A = sparse_matrix_init(data, encoded_user.shape[0], encoded_anime.shape[0], 'rating')
    A_approx = sparse_matrix_init(predict(data, encoded_user, encoded_anime), encoded_user.shape[0], encoded_anime.shape[0], 'prediction')
    MSE = np.sum((A - A_approx).power(2))/data.shape[0]
    
    return MSE

In [40]:
def gradient(data, encoded_user, encoded_anime):
    A = sparse_matrix_init(data, encoded_user.shape[0], encoded_anime.shape[0], 'rating')
    A_approx = sparse_matrix_init(predict(data, encoded_user, encoded_anime), encoded_user.shape[0], encoded_anime.shape[0], 'prediction')
    
    grad_user = (-2/data.shape[0])*((A - A_approx)*encoded_anime)
    grad_anime = (-2/data.shape[0])*((A - A_approx).T*encoded_user) 
    
    return grad_user, grad_anime

In [41]:
def model(data, encoded_user, encoded_anime, iter, lr):
    
    for i in tqdm.tqdm(range(iter)):
        grad_user, grad_anime = gradient(data, encoded_user, encoded_anime)
        
        encoded_user = encoded_user - lr*grad_user
        encoded_anime = encoded_anime - lr*grad_anime

    return encoded_user, encoded_anime

In [42]:
encoded_user = create_embeddings(num_users, 3)
encoded_anime = create_embeddings(num_anime, 3)

encoded_user, encoded_anime = model(train_data, encoded_user, encoded_anime, 100, 1)

100%|██████████| 100/100 [02:45<00:00,  1.65s/it]


In [43]:
def encode_new_data(valid, user_ids, anime_ids):
    """ Encodes valid_df with the same encoding as train_df.
    """
    df_val_chosen = valid['anime_id'].isin(anime_ids.keys()) & valid['user_id'].isin(user_ids.keys())
    valid_data = valid[df_val_chosen]
    valid_data['anime_id'] =  np.array([anime_ids[x] for x in valid_data['anime_id']])
    valid_data['user_id'] = np.array([user_ids[x] for x in valid_data['user_id']])
    return valid_data

valid_data = encode_new_data(valid, user_ids, anime_ids)

In [44]:
train_mse = mse(train_data, encoded_user, encoded_anime)
val_mse = mse(valid_data, encoded_user, encoded_anime)
print(train_mse, val_mse)

14.314559860986886 14.297594557583956


# Different experiments with the base model ## 
0. Base model  
train_MSE: 14.31  
valid_MSE: 14.30

1. Changing parameters  
- Changing number of latent factors  
10:  
train_MSE, 14.13  
valid_MSE, 14.16  
20:  
train_MSE, 27.42  
valid_MSE, 27.44

- Changing learning rate  
This indicates that base model have not converged yet    
10:    
train_MSE: 5.82   
valid_MSE: 5.86  
20:  
train_MSE: 6.28  
valid_MSE: 6.27  
Decaying LR:  
train_MSE: 5.83   
valid_MSE: 5.84

- Different distribution for initialisation for embeddings  
N(0,1):  
train_MSE: 142.78  
valid_MSE: 143.00  
Initialisation values might be too small  

2. Changing gradient descent  
- Alternative minimisation  
Anime first then user  
train_MSE: 10.87  
valid_MSE: 10.88  
User first then anime  
train_MSE: 5.53  
valid_MSE: 5.53

- Gradient descent with momentum  
B = 0.9  
train_MSE: 5.98  
valid_MSE: 5.98


3. Changing loss functions  
- Ridge regularisation  
B = 0.1  
train_MSE: 34.11  
valid_MSE: 34.08

4. Changing prediction method  
- Lower and upper bound predictions, basically if scores below one we make them one and if scores above 10 we make them 10.  
  


In [46]:
def ridge_mse(data, encoded_user, encoded_anime):
    A = sparse_matrix_init(data, encoded_user.shape[0], encoded_anime.shape[0], 'rating')
    A_approx = sparse_matrix_init(predict(data, encoded_user, encoded_anime), encoded_user.shape[0], encoded_anime.shape[0], 'prediction')
    MSE = np.sum((A - A_approx).power(2))/data.shape[0] + 0.1*np.sum(encoded_anime^2) + 0.1*np.sum(encoded_user^2)
    
    return MSE

def ridge_gradient(data, encoded_user, encoded_anime):
    A = sparse_matrix_init(data, encoded_user.shape[0], encoded_anime.shape[0], 'rating')
    A_approx = sparse_matrix_init(predict(data, encoded_user, encoded_anime), encoded_user.shape[0], encoded_anime.shape[0], 'prediction')
    
    lamb = 0.001
    grad_user = (-2/data.shape[0])*((A - A_approx)*encoded_anime) + 2*lamb*encoded_user
    grad_anime = (-2/data.shape[0])*((A - A_approx).T*encoded_user) + 2*lamb*encoded_anime
    
    return grad_user, grad_anime

def create_embeddings(n, K):
    """
    Creates a random numpy matrix of shape n, K with uniform values in (0, 11/K)
    n: number of items/users
    K: number of factors in the embedding 
    """
    return 11*np.random.random((n, K)) / K

def model(data, encoded_user, encoded_anime, iter, lr):
   # B = 0.9
    for i in tqdm.tqdm(range(iter)):
      #if (i == 0):
     # encoded_user_lag = encoded_user
     # encoded_anime_lag = encoded_anime
      grad_user, grad_anime = ridge_gradient(data, encoded_user, encoded_anime)
      encoded_user = encoded_user - lr*grad_user
      encoded_anime = encoded_anime - lr*grad_anime
      #user_change = encoded_user - encoded_user_lag
      #anime_change = encoded_anime - encoded_anime_lag
      #else:
      #  encoded_user_lag = encoded_user
      #  encoded_anime_lag = encoded_anime
      #  grad_user, grad_anime = gradient(data, encoded_user, encoded_anime)
      #  encoded_user = encoded_user - lr*grad_user + B*(user_change)
      #  encoded_anime = encoded_anime - lr*grad_anime + B*(anime_change)
      #  user_change = encoded_user - encoded_user_lag
      #  anime_change = encoded_anime - encoded_anime_lag
    #for i in tqdm.tqdm(range(iter)):
     #   grad_user, grad_anime = gradient(data, encoded_user, encoded_anime)
     #   #encoded_user = encoded_user - lr*grad_user
     #   encoded_anime = encoded_anime - lr*grad_anime

    return encoded_user, encoded_anime

encoded_user = create_embeddings(num_users, 3)
encoded_anime = create_embeddings(num_anime, 3)

encoded_user, encoded_anime = model(train_data, encoded_user, encoded_anime, 100, 10)

train_mse = mse(train_data, encoded_user, encoded_anime)
val_mse = mse(valid_data, encoded_user, encoded_anime)
print(train_mse, val_mse)


100%|██████████| 100/100 [02:44<00:00,  1.65s/it]


34.11222227641071 34.08283701338232


In [None]:
rate_getter = lambda x: 1 if x<1 else (10 if x>10 else x) 
vfunc = onp.vectorize(rate_getter)
#a = U_init[u_list.astype(int)] * V_init[v_list.astype(int)]
#b = a.sum(axis = 1)
#int_mat = U_init[u_list.astype(int)] * V_init[v_list.astype(int)]
#predictions = vfunc(int_mat.sum(axis = 1))
#MSE = onp.mean( (predictions - ratings_list)**2 )
#predictions = int_mat.sum(axis = 1)
#predictions.shape