In [21]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from scipy.sparse import csc_matrix
import tqdm
import jax 
from jax.config import config
config.update('jax_enable_x64', True)  # often needed for LBFGS that requires high-precision
import jax.numpy as jnp
import math

In [2]:
# Mount Google Drive into the Colab VM so that the assignment files become reachable
# under /content/drive. Colab-only: google.colab is not available in other environments.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
# Switch into the assignment folder so that the relative 'assignment_2_data/...' paths used
# when loading the CSVs resolve; the listing is only a sanity check of the folder contents.
# NOTE(review): absolute, user-specific Drive path -- adjust when running elsewhere.
%cd /content/drive/MyDrive/DSA4212/assignment_2
%ls

/content/drive/MyDrive/DSA4212/assignment_2
 [0m[01;34massignment_2_data[0m/                'Normal baseline model'
 dsa4212_2022_assignment_2-2.pdf    tutorial_matrix_factorization_CLASS.ipynb
'Factor model from scratch.ipynb'   tutorial_matrix_factorization.ipynb
'Factor model using lecture'


In [4]:
import jax 
from jax.config import config
config.update('jax_enable_x64', True)  # often needed for LBFGS that requires high-precision


import jax.numpy as jnp
import numpy as onp
import pylab as plt
import tqdm

import imageio
import os
from skimage.transform import rescale, resize, downscale_local_mean
import pandas as pd
import time
import scipy

In [5]:
# Load the three assignment CSVs; the paths are relative to the current working directory.
# The two rating tables are accessed later through their user_id, anime_id and rating columns.
anime_data = pd.read_csv('assignment_2_data/assignment_2_anime.csv')
train_data = pd.read_csv('assignment_2_data/assignment_2_ratings_train.csv')
test_data = pd.read_csv('assignment_2_data/assignment_2_ratings_test.csv')

In [6]:
#train, valid = train_test_split(train_data, test_size = 0.2)

# Reset index for train and valid
#train = train.reset_index()[['user_id', 'anime_id', 'rating']]
#valid = valid.reset_index()[['user_id', 'anime_id', 'rating']]

In [7]:
def encode_column(column):
    """Map every distinct value of `column` to a consecutive integer id.

    Ids are handed out in the order in which `column.unique()` returns the values.

    Returns a 3-tuple:
      * key_to_id -- dictionary {original value: integer id}
      * an array holding, for each entry of `column` (in order), its integer id
      * the number of distinct values
    """
    distinct = column.unique()
    n_distinct = len(distinct)
    key_to_id = dict(zip(distinct, range(n_distinct)))
    encoded = np.array(list(map(key_to_id.__getitem__, column)))
    return key_to_id, encoded, n_distinct

In [8]:
def encode_df(data):
    """Re-index the 'anime_id' and 'user_id' columns of a ratings table with integer ids.

    Both columns are passed through `encode_column`, so each distinct raw id is replaced by
    the integer id that `encode_column` assigns to it.

    The encoding is done on a copy: the frame passed in keeps its raw ids, so the caller's
    original data is not silently overwritten and the function can be called again on it.

    data -- DataFrame with (at least) 'anime_id' and 'user_id' columns

    Returns (encoded_data, num_users, num_anime, user_ids, anime_ids) where
      * encoded_data -- copy of `data` with both id columns replaced by their integer ids
      * num_users / num_anime -- number of distinct users / anime
      * user_ids / anime_ids -- dictionaries {raw id: integer id}
    """
    encoded = data.copy()
    anime_ids, encoded['anime_id'], num_anime = encode_column(encoded['anime_id'])
    user_ids, encoded['user_id'], num_users = encode_column(encoded['user_id'])
    return encoded, num_users, num_anime, user_ids, anime_ids

In [None]:
## This is to initialise our sparse matrix
# How this works is csc_matrix(data,(row_ind,col_ind),shape) , data contains the values and row_ind and col_ind are the indices where the values are
# if there are repeating [row_ind[k], col_ind[k]], they will just add the values in data[k]
# where data, row_ind and col_ind satisfy the relationship a[row_ind[k], col_ind[k]] = data[k]

# What this is doing in our context is that rating is the data, row_ind is user_id and col_ind is anime_id 
#def sparse_matrix_init(arraydata, rows, cols, colname): 
 #   return csc_matrix((arraydata[colname].values, (arraydata['user_id'].values, arraydata['anime_id'].values)), shape=(rows, cols))

In [40]:
# Replace the raw user/anime ids of the training ratings by integer indices and keep the
# raw-id -> index dictionaries (user_ids, anime_ids) so other data can be encoded the same way.
# NOTE(review): train_data is rebound to its encoded version, so re-running this cell encodes
# the already-encoded ids and overwrites the dictionaries with index -> index mappings.
train_data, num_users, num_anime, user_ids, anime_ids = encode_df(train_data)
# Optional checks (disabled): number of ratings per user and per rating value.
# Presumably the point is that a user_id appears in several rows -- one per rated anime.
#train_data.groupby("user_id").count()
#train_data.groupby("rating").count()

In [51]:
# The main difference between this implementation and the prof's implementation:
# there we were given a p x q sparse matrix A holding the ratings of the user/anime combinations,
# together with a user list u_list whose entries are row indices of A and a film list f_list whose
# entries are column indices of A, so the rating of the first observation is A[u_list[0], f_list[0]].
# In our case this is not directly possible: we are given raw anime_ids and user_ids, so we first have
# to re-express them as indices bounded by p and q respectively (this is what encode_df does above).

# Training observations as flat arrays: entry k of the three arrays describes one rating
# (encoded user index, encoded anime index, rating value).
u_list = train_data['user_id'].values
v_list = train_data['anime_id'].values
ratings_list = train_data['rating'].values

def model(U, V, user_idx=None, item_idx=None, ratings=None):
  """
  Mean squared error of the factor model on a set of observed ratings.

  The predicted rating of observation k is the dot product of the factor row of its user
  and the factor row of its film: sum_r U[user_idx[k], r] * V[item_idx[k], r].

  U -- (p, r) -- factors of the p users
  V -- (q, r) -- factors of the q films
  user_idx -- (n,) -- row index into U for each observation (default: notebook-level u_list)
  item_idx -- (n,) -- row index into V for each observation (default: notebook-level v_list)
  ratings -- (n,) -- observed ratings (default: notebook-level ratings_list)

  Returns the scalar MSE. Calling model(U, V) behaves as before; passing the three optional
  arrays evaluates the same loss on another data set (e.g. the encoded test ratings).

  NOTE: when this function is wrapped in jax.jit and called as model(U, V), the notebook-level
  arrays are captured at trace time; rebinding those names afterwards is not seen by the
  already-compiled function.
  """
  # Fall back to the notebook-level training arrays so existing model(U, V) calls keep working.
  if user_idx is None:
    user_idx = u_list
  if item_idx is None:
    item_idx = v_list
  if ratings is None:
    ratings = ratings_list

  predictions = jnp.sum(U[user_idx.astype(int)] * V[item_idx.astype(int)], axis=1)
  # jnp.mean (not np.mean): this function is differentiated/jitted by JAX, so the reduction
  # should be a JAX operation rather than rely on NumPy forwarding to the array's method.
  MSE = jnp.mean( (predictions - ratings)**2 )
  return MSE


In [52]:
# Jitted gradients of the loss, as in the lecture:
# argnums=0 differentiates with respect to U, argnums=1 with respect to V.
model_grad_U, model_grad_V = (jax.jit(jax.grad(model, argnums=k)) for k in (0, 1))

In [53]:
# Random initial factors: one row of LATENT_DIM factors per encoded user (U_init) and per
# encoded anime (V_init). The row counts come from the encoding step (num_users, num_anime)
# instead of hard-coded sizes, so they always match the indices stored in u_list / v_list.
LATENT_DIM = 3
init_rng = onp.random.default_rng(0)  # fixed seed: a fresh run reproduces the same starting point

# (num_users, LATENT_DIM) initial user factors
U_init = init_rng.normal(0, 1, size=(num_users, LATENT_DIM))

# (num_anime, LATENT_DIM) initial anime factors
V_init = init_rng.normal(0, 1, size=(num_anime, LATENT_DIM))

#model(U_init,V_init)

In [54]:
# Plain gradient descent on both factor matrices
niter, eta = 100, 100.
loss_traj = []

# start from copies so that U_init / V_init stay available
U, V = onp.copy(U_init), onp.copy(V_init)

for _ in tqdm.tqdm(range(niter)):
  # both gradients are evaluated at the current (U, V), before either matrix is updated
  grad_u, grad_v = model_grad_U(U, V), model_grad_V(U, V)

  # simultaneous GD step
  U, V = U - eta * grad_u, V - eta * grad_v

  # record the loss reached after the step
  loss_traj.append(model(U, V))


100%|██████████| 100/100 [00:10<00:00,  9.87it/s]


In [34]:
# Encode new (e.g. test) data with the mapping learnt on the training data, so that the same
# user/anime gets the same index in both data sets
def encode_new_data(test_data, user_ids, anime_ids):
    """Encode `test_data` with the raw-id -> index dictionaries built from the training data.

    Rows whose user_id or anime_id has no entry in `user_ids` / `anime_ids` are dropped,
    because no index exists for them. The surviving rows keep their original index
    labels and all other columns.

    test_data -- DataFrame with (at least) 'user_id' and 'anime_id' columns holding raw ids
    user_ids  -- dictionary {raw user_id: index}
    anime_ids -- dictionary {raw anime_id: index}

    Returns a new DataFrame with both id columns replaced by integer indices;
    `test_data` itself is not modified.
    """
    # Keep only the rows for which both the anime and the user are known from training
    known = test_data['anime_id'].isin(list(anime_ids)) & test_data['user_id'].isin(list(user_ids))

    # .copy() gives an independent frame, so the assignments below do not write into a slice
    # of the caller's frame (which pandas reports as SettingWithCopyWarning).
    encoded = test_data.loc[known].copy()

    # Every remaining id has a dictionary entry, so the lookup yields no missing values;
    # the cast keeps an integer dtype even when no row survives the filter.
    encoded['anime_id'] = encoded['anime_id'].map(anime_ids).astype(np.int64)
    encoded['user_id'] = encoded['user_id'].map(user_ids).astype(np.int64)
    return encoded

# NOTE(review): test_data is rebound to its encoded version here, so this cell is not safe to
# re-run -- a second run would look the already-encoded indices up in the raw-id dictionaries.
test_data = encode_new_data(test_data, user_ids, anime_ids)

In [41]:
# Number of rows of test_data per rating value (counted over the remaining columns)
#df_val_chosen = test_data['anime_id'].isin(anime_ids.keys()) & test_data['user_id'].isin(user_ids.keys())
test_data.groupby("rating").count()

Unnamed: 0_level_0,user_id,anime_id
rating,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3208,3208
2,4360,4360
3,7472,7472
4,19237,19237
5,51642,51642
6,112503,112503
7,238470,238470
8,280069,280069
9,217471,217471
10,175266,175266


In [43]:
# Evaluate the fitted factors on the encoded test ratings.
# Separate test_* names are used on purpose: model() reads the notebook-level
# u_list / v_list / ratings_list, so overwriting them with the test arrays would silently
# change what a later model(U, V) call measures.
test_u_list = test_data.user_id.values
test_v_list = test_data.anime_id.values
test_ratings_list = test_data.rating.values

# Predicted rating = dot product of the user's and the anime's factor rows
predictions = jnp.sum(U[test_u_list.astype(int)] * V[test_v_list.astype(int)], axis=1)
MSE = jnp.mean( (predictions - test_ratings_list)**2 )
MSE

Array(31.40143141, dtype=float64)