Training Hybrid filtering My anime dataset

In [1]:
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from hybrid_filtering_pytorch.hybrid_recommender_torch import Hybrid_recommendation_system

seed_init = 10
torch.manual_seed(seed_init)
np.random.seed(seed_init)

In [2]:
dataset = pd.read_csv("my_anime_data_cleaned/rating_matrix_v2.csv").iloc[:, 1:] # we increment to leave the movieId

user_columns = list(dataset.columns)

dataset = dataset.to_numpy()
num_movie, num_user = dataset.shape

In [3]:
movie_feat = pd.read_csv("my_anime_data_cleaned/anime_data.csv", sep=",")
movie_feat = movie_feat.iloc[:, 2:].to_numpy()

In [4]:
# preparing the scalers
xm_scaler = MinMaxScaler()
labelScaler = MinMaxScaler()

# anime input vectors
xm_scaler.fit(movie_feat)
movie_feat = xm_scaler.transform(movie_feat)

# label input vectors
labelScaler.fit(dataset.T.reshape(-1, 1)[dataset.T.reshape(-1, 1) >= 0].reshape(-1, 1))

In [5]:
def retrieve_anime_index(anime_ids:str, index=False) -> list:
    ids = anime_ids.split("|")
    ids = ids[:-1] # last one is empty string
    ids = list(map(int, ids))

    if isinstance(movie_feat, pd.DataFrame) :
        indexes = movie_feat.loc[movie_feat["anime_id"].isin(ids), "anime_index"]
    else :
        df = pd.read_csv("my_anime_data_cleaned/anime_data.csv")
        indexes = df.loc[df["anime_id"].isin(ids), "anime_index"]

    return list(indexes)


def decaying_average(anime_indexes:list) -> np.ndarray :
    decay_rate = 0.95
    num_series = len(anime_indexes)

    decay_factor = np.array([decay_rate**x for x in range(num_series)]).reshape(-1, 1)

    series_features = movie_feat[anime_indexes]

    series_features *= decay_factor

    return np.mean(series_features, axis=0)    

In [6]:
# user data preparation
user_feat = pd.read_csv("my_anime_data_cleaned/user_features.csv", sep=",")

user_anime_indexes = [0 for _ in range(user_feat.shape[0])]
for i in range(len(user_feat)) :
    watched = user_feat.loc[i, "watched"]
    user_anime_indexes[i] = retrieve_anime_index(watched)

# user input vectors
user_feat = np.zeros((num_user, movie_feat.shape[-1]))
for i in range(len(user_anime_indexes)) :
    user_feat[i] = decaying_average(user_anime_indexes[i])

In [7]:
# model initializtion
u_dim = user_feat.shape[-1]
m_dim = movie_feat.shape[-1]

latent_dimension = 256
lr = 0.00001

model = Hybrid_recommendation_system(num_user, num_movie, u_dim, m_dim, lr=lr, l_d=latent_dimension, collab_filter_dim=256)

Device set to : NVIDIA GeForce RTX 3070 Laptop GPU
Device set to : NVIDIA GeForce RTX 3070 Laptop GPU
Device set to : NVIDIA GeForce RTX 3070 Laptop GPU


In [None]:
model.load_model(model_set="1")

In [8]:
# training loop params
epochs = 5
learning_rate = 0.0005

model.learning_rate = learning_rate

best_loss = 20000

test = 0
num_movie_train = num_movie - test

user_batch_size = 572
movie_batch_size = 300

In [9]:
# training loop
for epoch in range(epochs):
    total_loss = 0
    count = 0
    for u_batch in range(user_batch_size, num_user, user_batch_size) :
        for m_batch in range(movie_batch_size, num_movie_train, movie_batch_size) :
            # indexes preparation
            u_start = u_batch - user_batch_size
            m_start = m_batch - movie_batch_size

            rating = dataset.T[u_start:u_batch, m_start:m_batch].reshape(-1, 1)
            mask = np.where(rating >= 0, 1, 0)
            rating = labelScaler.transform(rating)

            loss = model.train(
                user_feat[u_start:u_batch], movie_feat[m_start:m_batch],
                param_idx=(np.arange(u_start, u_batch), np.arange(m_start, m_batch)),
                ratings=rating, rating_mask=mask,
                epochs=1, expand=True
            )
            
            total_loss += loss
            count += 1

            print(f"epochs {epoch} u_batch {u_batch} m_batch {m_batch} loss : {loss}", end="\r")

        print(f"epochs {epoch} loss : {total_loss / count}")

        if total_loss < best_loss :
            model.save_model(model_set="1")
            best_loss = total_loss

epochs 0 loss : 0.07021405547857285loss : 0.042571377009153366
NCF saved
content model saved
all model saved
epochs 1 loss : 0.05542684718966484loss : 0.052180472761392596
NCF saved
content model saved
all model saved
epochs 2 loss : 0.056162115186452866oss : 0.073691263794898995
epochs 3 loss : 0.04546348378062248loss : 0.044194262474775314
NCF saved
content model saved
all model saved
epochs 4 loss : 0.042294424027204514oss : 0.042847681790590286
NCF saved
content model saved
all model saved


In [None]:
model.save_model(model_set="1")

In [24]:
# testing the perf on random set
test_size = 100

user_test_index = np.random.randint(0, num_user - test_size)
user_test_end = user_test_index + test_size

anime_test_index = np.random.randint(0, num_movie - test_size)
anime_test_end = anime_test_index + test_size

label = dataset.T[user_test_index:user_test_index, anime_test_index:anime_test_end]
label.reshape(-1, 1)

label_mask = np.where(label >= 0 , 1, 0)
boolean_mask = label_mask > 0

test_u_feat = torch.tensor(user_feat[user_test_index:user_test_end], dtype=torch.float32, device=model.device)
test_m_feat = torch.tensor(movie_feat[anime_test_index:anime_test_end], dtype=torch.float32, device=model.device)

pred = model.prediction(
    test_u_feat, test_m_feat,
    indexes=(np.arange(user_test_index, user_test_end), np.arange(anime_test_index, anime_test_end)),
    weights=(0.5, 0.5),
    expand=True
)

pred = labelScaler.inverse_transform(pred.detach().cpu().numpy())

print("the prediction is : ", pred[boolean_mask][:10])
print("the label is : ", label[boolean_mask][:10])

print("the mean squared error is : ", np.mean((label - pred)**2))

print("the loss scaled is : ", np.mean((labelScaler.transform(label) - labelScaler.transform(pred))**2))

In [19]:
label = dataset.T[user_test_index:user_test_end, anime_test_index:anime_test_end]

In [23]:
label.shape

(0, 1)