# Multi-Stage clothing recommendation system

## 1. Retrieval model

In [None]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch import nn

In [None]:
# Seed NumPy's global RNG so the negative-article sampling done with
# np.random.choice in the dataset is repeatable between runs.
# NOTE(review): no torch seed is set in the visible code, so the model's
# weight initialisation is presumably not covered by this seed - confirm.
np.random.seed(2)

In [None]:
# Location of the transactions CSV on the author's machine; adjust when
# running elsewhere.
path_to_transactions = "C:\\Sajat\\Egyetem\\MSc\\Onallo\\HM_dataset\\transactions_train.csv"

# Number of transactions in original dataset: 31_788_324
# Only the first `max_training_transactions_count` rows of the file are read.
max_training_transactions_count = 500_000

transactions_df = pd.read_csv(
    filepath_or_buffer=path_to_transactions,
    usecols=["customer_id", "article_id", "t_dat"],
    nrows=max_training_transactions_count,
)

# Report the date of the first and last loaded row. The scalar value is
# printed (not a one-row Series, whose repr would also dump the index, the
# column name and the dtype into the message).
if not transactions_df.empty:
    print(f"First transaction date: {transactions_df['t_dat'].iloc[0]}")
    print(f"Last transaction date: {transactions_df['t_dat'].iloc[-1]}")

# The date was only needed for the report above.
transactions_df = transactions_df.drop(columns="t_dat")

# One row per distinct id, in order of first appearance. The row positions in
# these frames are what the dataset below hands to the model as embedding
# indexes.
articles_df = transactions_df[["article_id"]].drop_duplicates()
customers_df = transactions_df[["customer_id"]].drop_duplicates()

number_of_unique_article_ids = len(articles_df)
number_of_unique_customer_ids = len(customers_df)

print(f"Number of article IDs: {number_of_unique_article_ids}")
print(f"Number of customer IDs:  {number_of_unique_customer_ids}")

In [None]:
# Hyperparameters of the retrieval (two-tower) model and its training run.

# Number of full passes over the dataloader; read by training_function.
number_of_epochs = 1
# Number of nn.Linear layers stacked after the embedding layer in each tower.
number_of_hidden_linear_layers = 3
# Width of the embedding layer and of every Linear layer except the last one.
number_of_neurons_in_layer = 64
# Output size of each tower, i.e. the dimension the similarity is computed in.
embedding_vector_dimension = 16
# Number of dataset items per batch. Every item carries one positive and one
# negative (article, customer) pair, so the model sees twice as many pairs
# per optimisation step.
half_batch_size = 8

In [None]:
# Choose the compute backend: CUDA first, then Apple's MPS, otherwise the CPU.
# The MPS check is only evaluated when CUDA is unavailable.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

print(f"Using {device} device")

In [None]:
class RetrievalTrainingNegativeBatchSampledPairDataset(Dataset):
    """Yield one positive and one sampled negative pair per transaction.

    Item ``idx`` corresponds to row ``idx`` of ``transactions_df``. It is
    returned as ``(pairs, labels)`` where ``pairs`` is a 2x2 integer tensor
    ``[[positive_article_index, customer_index],
    [negative_article_index, customer_index]]`` and ``labels`` is
    ``tensor([1.0, 0.0])``. Indexes are row positions inside ``articles_df``
    and ``customers_df``. The negative article is drawn uniformly (via
    ``np.random.choice``) from the articles the customer never bought
    according to ``transactions_df``.

    The id lookups are computed once in ``__init__``, so the three frames must
    not be modified after the dataset has been constructed.

    Args:
        articles_df: frame with an ``article_id`` column; row position is the
            article index.
        customers_df: frame with a ``customer_id`` column; row position is
            the customer index.
        transactions_df: frame with ``customer_id`` and ``article_id``
            columns, one row per purchase.
    """

    def __init__(self, articles_df, customers_df, transactions_df):
        self.articles_df = articles_df
        self.customers_df = customers_df
        self.transactions_df = transactions_df

        # Precomputed lookups: they replace a full scan of the transactions
        # frame and of both id frames on every __getitem__ call.
        self._article_index_by_id = self._first_position_by_value(
            articles_df["article_id"]
        )
        self._customer_index_by_id = self._first_position_by_value(
            customers_df["customer_id"]
        )
        self._purchased_article_ids_by_customer = (
            transactions_df.groupby("customer_id", sort=False)["article_id"]
            .unique()
            .to_dict()
        )

    @staticmethod
    def _first_position_by_value(id_column):
        """Map every value of ``id_column`` to the first row position holding it."""
        position_by_value = {}
        for position, value in enumerate(id_column.tolist()):
            # setdefault keeps the first occurrence when a value is repeated.
            position_by_value.setdefault(value, position)
        return position_by_value

    def __len__(self):
        # Use the frame this instance was built from, not a module-level
        # variable that merely happens to share its name.
        return len(self.transactions_df)

    def __getitem__(self, idx):
        """Return ``(pairs, labels)`` for transaction row ``idx``.

        Raises:
            IndexError: if the transaction's article or customer id is absent
                from ``articles_df`` / ``customers_df``.
            ValueError: raised by ``np.random.choice`` when the customer has
                bought every article, leaving nothing to sample from.
        """
        transaction = self.transactions_df.iloc[idx]
        customer_id = transaction["customer_id"]
        article_id = transaction["article_id"]

        purchased_article_ids = self._purchased_article_ids_by_customer.get(
            customer_id, []
        )

        # Row positions of every article this customer has never purchased.
        indexes_of_negative_articles_of_user = np.where(
            ~self.articles_df["article_id"].isin(purchased_article_ids)
        )[0]
        negative_article_index = int(
            np.random.choice(indexes_of_negative_articles_of_user)
        )

        try:
            positive_article_index = self._article_index_by_id[article_id]
            customer_index = self._customer_index_by_id[customer_id]
        except KeyError as missing_id:
            # Unknown ids are reported as IndexError, the exception type the
            # positional lookup raised before the lookups were precomputed.
            raise IndexError(
                f"id {missing_id} of transaction {idx} is missing from the id frames"
            ) from missing_id

        return (
            torch.tensor(
                [
                    [positive_article_index, customer_index],
                    [negative_article_index, customer_index],
                ],
                dtype=torch.long,
            ),
            torch.tensor([1.0, 0.0]),
        )

In [None]:
def create_relu_stack(
        unique_input_count,
        number_of_hidden_linear_layers,
        number_of_neurons_in_layer,
        output_vector_dimension
):
    """Build one tower: an embedding layer followed by a stack of Linear layers.

    The embedding maps ``unique_input_count`` ids to vectors of size
    ``number_of_neurons_in_layer``. It is followed by
    ``number_of_hidden_linear_layers`` Linear layers: every layer but the last
    keeps that width and is followed by a ReLU, while the last one projects to
    ``output_vector_dimension`` and has no activation. With zero Linear layers
    the result contains the embedding layer only.

    Returns:
        An ``nn.Sequential`` holding the layers in the order described above.
    """
    layers = [nn.Embedding(unique_input_count, number_of_neurons_in_layer)]

    index_of_last_linear = number_of_hidden_linear_layers - 1
    for layer_position in range(number_of_hidden_linear_layers):
        if layer_position == index_of_last_linear:
            # Final projection down to the output size, without activation.
            layers.append(
                nn.Linear(number_of_neurons_in_layer, output_vector_dimension)
            )
            continue
        layers.extend([
            nn.Linear(number_of_neurons_in_layer, number_of_neurons_in_layer),
            nn.ReLU(),
        ])

    return nn.Sequential(*layers)

In [None]:
class TwoTowerModel(nn.Module):
    """Two-tower model: an item tower and a query tower compared by a similarity.

    Both towers are built with ``create_relu_stack`` using the same depth,
    width and output size; only the number of embedding rows differs.

    Args:
        unique_item_input_count: number of distinct item ids.
        unique_query_input_count: number of distinct query ids.
        number_of_hidden_linear_layers: Linear layers per tower.
        number_of_neurons_in_layer: width of the embedding and hidden layers.
        output_vector_dimension: size of each tower's output vector.
        similarity_function: callable applied to
            ``(item_embedding, query_embedding)``; its result is returned by
            ``forward``.
    """

    def __init__(
            self,
            unique_item_input_count,
            unique_query_input_count,
            number_of_hidden_linear_layers,
            number_of_neurons_in_layer,
            output_vector_dimension,
            similarity_function,
    ):
        super().__init__()

        # Settings shared by both towers.
        shared_tower_settings = dict(
            number_of_hidden_linear_layers=number_of_hidden_linear_layers,
            number_of_neurons_in_layer=number_of_neurons_in_layer,
            output_vector_dimension=output_vector_dimension,
        )

        # Each tower gets one embedding row more than there are known ids.
        # The original note marks this extra row as the out-of-vocabulary
        # (OOV) slot. NOTE(review): nothing in this class maps unknown ids to
        # that row - confirm where that is meant to happen.
        self.item_tower_model = create_relu_stack(
            unique_input_count=unique_item_input_count + 1,
            **shared_tower_settings,
        )
        self.query_tower_model = create_relu_stack(
            unique_input_count=unique_query_input_count + 1,
            **shared_tower_settings,
        )

        self.similarity_function = similarity_function

    def forward(self, x):
        """Return the similarity between the item and query embeddings.

        ``x[0]`` is fed to the item tower and ``x[1]`` to the query tower.
        """
        item_ids, query_ids = x[0], x[1]
        return self.similarity_function(
            self.item_tower_model(item_ids),
            self.query_tower_model(query_ids),
        )

In [None]:
# Build the retrieval model: articles are the items, customers the queries,
# and the two tower outputs are compared with cosine similarity.
two_tower_model = TwoTowerModel(
    unique_item_input_count=number_of_unique_article_ids,
    unique_query_input_count=number_of_unique_customer_ids,
    number_of_hidden_linear_layers=number_of_hidden_linear_layers,
    number_of_neurons_in_layer=number_of_neurons_in_layer,
    output_vector_dimension=embedding_vector_dimension,
    similarity_function=nn.CosineSimilarity(),
)
# Place the model's parameters on the selected compute device.
two_tower_model = two_tower_model.to(device)

In [None]:
# Loss and optimizer handed to training_function below.
# NOTE(review): the model outputs one cosine similarity per pair and the
# dataset labels are the floats 1.0 / 0.0 - confirm that CrossEntropyLoss
# (rather than a binary loss) is the intended objective for these inputs.
cross_entropy_loss_fn = nn.CrossEntropyLoss()
# Adam over all parameters of both towers, with the optimizer's default settings.
two_tower_model_optimizer = torch.optim.Adam(two_tower_model.parameters())

In [None]:
def training_function(
        dataloader,
        model,
        loss_fn,
        optimizer,
        epochs=None,
):
    """Train ``model`` on the batches produced by ``dataloader``.

    Every batch ``(X, y)`` is expected to hold, per dataset item, a positive
    and a negative ``[item_index, query_index]`` pair together with their
    labels. The pairs are flattened to one row per pair and transposed, so
    the model receives the item indexes as ``x[0]`` and the query indexes as
    ``x[1]``; the labels are flattened to match.

    Progress is printed every 100 batches. A batch that raises is reported
    with ``print`` and skipped, so one bad batch does not abort the run.

    Args:
        dataloader: iterable of ``(X, y)`` batches exposing ``.dataset``.
        model: module called with the transposed pair tensor.
        loss_fn: callable ``loss_fn(model_output, flattened_labels)``.
        optimizer: optimizer over ``model``'s parameters.
        epochs: number of passes over ``dataloader``. When omitted, the
            module-level ``number_of_epochs`` is used.
    """
    if epochs is None:
        # Fall back to the notebook-level hyperparameter.
        epochs = number_of_epochs

    # Batches come out of the dataloader on the CPU; they have to be moved to
    # wherever the model's parameters live, otherwise every step fails with a
    # device-mismatch error as soon as the model sits on an accelerator.
    first_parameter = next(model.parameters(), None)
    model_device = None if first_parameter is None else first_parameter.device

    for _epoch in range(epochs):
        size = len(dataloader.dataset)
        model.train()
        for batch_index, (X, y) in enumerate(dataloader):
            try:
                if model_device is not None:
                    X = X.to(model_device)
                    y = y.to(model_device)

                # One row per (item, query) pair, positives and negatives mixed.
                x_with_mixed_positives_and_negatives = X.view(-1, 2)
                logits = model(x_with_mixed_positives_and_negatives.t())
                reshaped_labels = y.view(-1)
                loss = loss_fn(logits, reshaped_labels)

                # Clear gradients before the backward pass so that nothing
                # left over from an earlier, failed batch leaks into this step.
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                if batch_index % 100 == 0:
                    loss_value, current = loss.item(), (batch_index + 1) * len(X)
                    print(f"loss: {loss_value:>7f}  [{current:>5d}/{size:>5d}]")
            except IndexError:
                print("Index error for:")
                print(f"Input: {X}")
                print(f"Labels: {y}")
            except Exception as e:
                # Best effort: report the failure and go on with the next batch.
                print(f"Other exception: {e}")

In [None]:
# Train the two-tower model on positive/negative pairs built from the loaded
# transactions.
training_function(
    dataloader=DataLoader(
        dataset=RetrievalTrainingNegativeBatchSampledPairDataset(
            articles_df=articles_df,
            customers_df=customers_df,
            transactions_df=transactions_df,
        ),
        # Items per batch; each item carries a positive and a negative pair.
        # No shuffle argument is given, so the DataLoader default ordering
        # applies.
        batch_size=half_batch_size,
    ),
    model=two_tower_model,
    loss_fn=cross_entropy_loss_fn,
    optimizer=two_tower_model_optimizer,
)