# Kotlin Clothing Webshop recommender algorithms and inspections

## 1. Simple collaborative filtering retrieval algorithm based on matrix factorization and two tower model

### 1. Import required libraries

In [1]:
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs
from typing import Dict, Text

### 2. Prepare the data

#### 1. Read the dataset

In [2]:
path_to_articles = "C:\\Sajat\\Egyetem\\MSc\\Onallo\\HM_dataset\\articles.csv"
path_to_transactions = "C:\\Sajat\\Egyetem\\MSc\\Onallo\\HM_dataset\\transactions_train.csv"

article_df = pd.read_csv(path_to_articles)
transactions_df = pd.read_csv(path_to_transactions)

article_df["article_id"] = article_df["article_id"].astype(str)
transactions_df["article_id"] = transactions_df["article_id"].astype(str)

#### 2. Reduce the size of the dataset to boost testing speed

In [3]:
transactions_date = transactions_df["t_dat"]

first_transaction_date = "2019-09-20"
last_transaction_date = "2020-09-20"

# Reduce the size of dataframe to make tests faster
train_df_first_transaction_date = "2020-09-17" #"2020-09-03"
train_df_and_test_df_separator_date = "2020-09-19" #"2020-09-17"
test_df_last_transaction_date = "2020-09-20" #"2020-09-20"

#### 3. Split the dataset into training and testing datasets

In [7]:
filtered_transactions_df = transactions_df[(transactions_date > train_df_first_transaction_date) & (transactions_date <= test_df_last_transaction_date)]
train_transactions_df = transactions_df[(transactions_date > train_df_first_transaction_date) & (transactions_date <= train_df_and_test_df_separator_date)]
test_transactions_df = transactions_df[(transactions_date > train_df_and_test_df_separator_date) & (transactions_date <= test_df_last_transaction_date)]

print("Count of articles:", article_df.size)
print("Count of transactions:", transactions_df.size)
print("Count of training transactions:", train_df.size)
print("Count of test transactions:", test_df.size)

Count of articles: 2638550
Count of transactions: 158941620
Count of training transactions: 380400
Count of test transactions: 157445


#### 3. Project only relevant item and query attributes and map pandas dataframe to tensorflow dataset

In [20]:
def project_transaction(transactions):
    return [transactions["customer_id"], transactions["article_id"]]


articles = tf.data.Dataset.from_tensor_slices(
    article_df[["article_id"]].to_dict(orient="list")
).map(lambda x: x["article_id"])
transactions = tf.data.Dataset.from_tensor_slices(
    filtered_transactions_df[["customer_id", "article_id"]].to_dict(orient="list")
).map(project_transaction)
customer_ids = tf.data.Dataset.from_tensor_slices(
    filtered_transactions_df[["customer_id"]].to_dict(orient="list")
).map(lambda x: x["customer_id"])
training_transactions_slices = tf.data.Dataset.from_tensor_slices(
    train_transactions_df[["customer_id", "article_id"]].to_dict(orient="list")
)
training_transactions = training_transactions_slices.map(project_transaction)
test_transactions_slices = tf.data.Dataset.from_tensor_slices(
    test_transactions_df[["customer_id", "article_id"]].to_dict(orient="list")
)
test_transactions = test_transactions_slices.map(project_transaction)

### 3. Implement the model

#### 1. Define common embedding dimension

In [9]:
embedding_dimension = 32

#### 2. Create StringLookup layers

In [10]:
customer_ids_vocabulary = tf.keras.layers.StringLookup(mask_token=None)
customer_ids_vocabulary.adapt(customer_ids)
article_ids_vocabulary = tf.keras.layers.StringLookup(mask_token=None)
article_ids_vocabulary.adapt(articles)

#### 3. Implement the query tower

In [15]:
customer_tower_model = tf.keras.Sequential([
    customer_ids_vocabulary,
    tf.keras.layers.Embedding(customer_ids_vocabulary.vocab_size(), embedding_dimension),
])



#### 4. Implement the candidate tower model

In [11]:
article_tower_model = tf.keras.Sequential([
    article_ids_vocabulary,
    tf.keras.layers.Embedding(article_ids_vocabulary.vocab_size(), embedding_dimension),
])



#### 5. Define task (metrics and loss) for two tower model

In [12]:
task = tfrs.tasks.Retrieval(
    metrics=tfrs.metrics.FactorizedTopK(
        candidates=articles.batch(64).map(article_tower_model)
    )
)

#### 6. Implement the two tower model

In [13]:
class TwoTowerModel(tfrs.Model):
    
    def __init__(self, query_tower_model, candidate_tower_model):
        super().__init__()
        self.candidate_tower_model: tf.keras.Model = candidate_tower_model
        self.query_tower_model: tf.keras.Model = query_tower_model
        self.task: tf.keras.layers.Layer = task

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        customer_embeddings = self.query_tower_model(features["customer_id"])
        positive_article_embeddings = self.candidate_tower_model(features["article_id"])

        return self.task(customer_embeddings, positive_article_embeddings)

### 4. Fitting and evaluating

#### 1. Fitting

In [22]:
model = TwoTowerModel(query_tower_model=customer_tower_model, candidate_tower_model=article_tower_model)

model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

model.fit(training_transactions_slices.batch(1024), epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1f2d5453f70>

#### 2. Evaluating

In [23]:
result = model.evaluate(test_transactions_slices.batch(1024), return_dict=True)

print(result)

{'factorized_top_k/top_1_categorical_accuracy': 0.002095970092341304, 'factorized_top_k/top_5_categorical_accuracy': 0.0066054812632501125, 'factorized_top_k/top_10_categorical_accuracy': 0.010003493167459965, 'factorized_top_k/top_50_categorical_accuracy': 0.030613865703344345, 'factorized_top_k/top_100_categorical_accuracy': 0.05046206712722778, 'loss': 5167.09033203125, 'regularization_loss': 0, 'total_loss': 5167.09033203125}
