In [105]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from scipy.sparse import csc_matrix
import tqdm

In [106]:
anime_data = pd.read_csv('assignment_2_anime.csv')
train_data = pd.read_csv('assignment_2_ratings_train.csv')
test_data = pd.read_csv('assignment_2_ratings_test.csv')

## Splitting training and validation data

In [107]:
train, valid = train_test_split(train_data, test_size = 0.2)

# Reset index for train and valid
train = train.reset_index()[['user_id', 'anime_id', 'rating']]
valid = valid.reset_index()[['user_id', 'anime_id', 'rating']]

## Encoding columns

In [108]:
def encode_column(column):
    keys = column.unique()
    key_to_id = {key:idx for idx,key in enumerate(keys)}
    return key_to_id, np.array([key_to_id[x] for x in column]), len(keys)

In [109]:
def encode_df(data):
    anime_ids, data['anime_id'], num_anime = encode_column(data['anime_id'])
    user_ids, data['user_id'], num_users = encode_column(data['user_id'])
    return data, num_users, num_anime, user_ids, anime_ids

In [110]:
train_data, num_users, num_anime, user_ids, anime_ids = encode_df(train_data)

In [111]:
def create_embeddings(n, K):
    """
    Creates a random numpy matrix of shape n, K with uniform values in (0, 11/K)
    n: number of items/users
    K: number of factors in the embedding 
    """
    return 11*np.random.random((n, K)) / K

## Create sparse matrix

In [112]:
def sparse_matrix_init(arraydata, rows, cols, colname): 
    return csc_matrix((arraydata[colname].values, (arraydata['user_id'].values, arraydata['anime_id'].values)), shape=(rows, cols))

## Prediction without doing U x V transpose

In [113]:
def predict(data, encoded_user, encoded_anime):
    data['prediction'] = np.sum(np.multiply(encoded_anime[data['anime_id']], encoded_user[data['user_id']]), axis = 1)
    return data

In [114]:
def mse(data, encoded_user, encoded_anime):
    A = sparse_matrix_init(data, encoded_user.shape[0], encoded_anime.shape[0], 'rating')
    A_approx = sparse_matrix_init(predict(data, encoded_user, encoded_anime), encoded_user.shape[0], encoded_anime.shape[0], 'prediction')
    MSE = np.sum((A - A_approx).power(2))/data.shape[0]
    
    return MSE

In [115]:
def gradient(data, encoded_user, encoded_anime):
    A = sparse_matrix_init(data, encoded_user.shape[0], encoded_anime.shape[0], 'rating')
    A_approx = sparse_matrix_init(predict(data, encoded_user, encoded_anime), encoded_user.shape[0], encoded_anime.shape[0], 'prediction')
    
    lamb = 0.0002
    grad_user = (-2/data.shape[0])*((A - A_approx)*encoded_anime) + 2*lamb*encoded_user
    grad_anime = (-2/data.shape[0])*((A - A_approx).T*encoded_user) + 2*lamb*encoded_anime
    
    return grad_user, grad_anime

In [116]:
def model(data, encoded_user, encoded_anime, iter, lr):
    
    for i in tqdm.tqdm(range(iter)):
        grad_user, grad_anime = gradient(data, encoded_user, encoded_anime)
        
        encoded_user = encoded_user - lr*grad_user
        encoded_anime = encoded_anime - lr*grad_anime

    return encoded_user, encoded_anime

In [117]:
encoded_user = create_embeddings(num_users, 3)
encoded_anime = create_embeddings(num_anime, 3)

encoded_user, encoded_anime = model(train_data, encoded_user, encoded_anime, 800, 1)

100%|████████████████████████████████████████████████████████████████████████████████| 800/800 [12:49<00:00,  1.04it/s]


In [119]:
def encode_new_data(test_data, user_ids, anime_ids):
    """ Encodes valid_df with the same encoding as train_df.
    """
    df_val_chosen = test_data['anime_id'].isin(anime_ids.keys()) & test_data['user_id'].isin(user_ids.keys())
    test_data = test_data[df_val_chosen]
    test_data['anime_id'] =  np.array([anime_ids[x] for x in test_data['anime_id']])
    test_data['user_id'] = np.array([user_ids[x] for x in test_data['user_id']])
    return test_data

test_data = encode_new_data(test_data, user_ids, anime_ids)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['anime_id'] =  np.array([anime_ids[x] for x in test_data['anime_id']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['user_id'] = np.array([user_ids[x] for x in test_data['user_id']])


In [121]:
train_mse = mse(train_data, encoded_user, encoded_anime)
val_mse = mse(test_data, encoded_user, encoded_anime)
print(train_mse, val_mse)

6.036747635875389 6.06862669626926


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['prediction'] = np.sum(np.multiply(encoded_anime[data['anime_id']], encoded_user[data['user_id']]), axis = 1)
