# Embedding Bag Test
---

Experimenting with applying an embedding bag (an embedding layer followed by the average of all the resulting embedding vectors) to the categorical features of a time series dataframe.

## Import the necessary packages

In [None]:
# import dask.dataframe as dd                # Dask to handle big data in dataframes
# from dask.distributed import Client        # Dask scheduler
import pandas as pd                        # Pandas to load the data initially
import numpy as np                         # Mathematical operations package, allowing also for missing values representation
import torch                               # PyTorch for tensor and deep learning operations
import data_utils as du                    # Data science and machine learning relevant methods

In [None]:
# Tell data_utils which dataframe library to work with; plain pandas is used here
# NOTE(review): presumably a Dask-based backend can be selected instead (see the commented-out Dask imports) — confirm against data_utils
du.set_pandas_library('pandas')

In [None]:
import pixiedust                           # Debugging in Jupyter Notebook cells

## Initialize variables

Data that we'll be using:

In [None]:
# Toy dataset with one row per observation: the sequence identifier (`id`),
# the timestamp (`ts`) and a categorical feature (`Var0`). Several rows share
# the same (`id`, `ts`) pair and some categories are missing (NaN).
data_df = pd.DataFrame(
    data=[
        (103, 0, 'dog'),
        (103, 0, 'cat'),
        (103, 0, 'cat'),
        (103, 1, 'horse'),
        (103, 2, 'dog'),
        (104, 0, 'bunny'),
        (104, 1, np.nan),
        (104, 2, 'bunny'),
        (104, 3, np.nan),
        (105, 0, 'horse'),
        (105, 0, 'dog'),
        (105, 0, 'dog'),
        (105, 0, 'cat'),
        (105, 0, 'bunny'),
        (105, 0, 'horse'),
        (105, 1, 'horse'),
        (105, 1, 'bunny'),
        (105, 1, 'dog'),
        (105, 1, np.nan),
        (105, 1, 'horse'),
        (105, 1, 'horse'),
        (105, 1, 'horse'),
        (105, 1, 'horse'),
        (105, 1, 'horse'),
        (105, 1, 'horse'),
    ],
    columns=['id', 'ts', 'Var0'],
)
# To test on Dask instead of pandas, uncomment the line of code below
# data_df = dd.from_pandas(data_df, npartitions=2)
# Display the dataframe; when using Dask, comment this line and uncomment the next one
data_df
# data_df.compute()

In [None]:
# Save the raw dummy data to a CSV file; the dataframe index is written too, as `index` is left at its default
data_df.to_csv('dummy_data/embedding_bag_data_df.csv')

Embedding matrix used in the embedding layer:

In [None]:
# Pretrained embedding weights: one row per category code and three embedding
# dimensions per row. The first row (code 0) is all zeros.
embed_mtx = torch.tensor(
    [
        [0, 0, 0],
        [-1, 0, 1],
        [0, 1, -1],
        [1, 1, 0],
        [1, -1, 1],
    ],
    dtype=torch.float32,
)
embed_mtx

Simple embedding layer:

In [None]:
# Wrap the embedding matrix in a lookup layer: index i returns row i of `embed_mtx`
# (`from_pretrained` keeps the weights frozen unless `freeze=False` is passed)
simple_embed_layer = torch.nn.Embedding.from_pretrained(embed_mtx)
simple_embed_layer

Embedding layer + average operation (bagging):

In [None]:
# Same weights, but each bag of indices is reduced to a single vector;
# with `mode` left at its default, the reduction is the mean of the bag's embeddings
bag_embed_layer = torch.nn.EmbeddingBag.from_pretrained(embed_mtx)
bag_embed_layer

## One hot encode categories

In [None]:
# One hot encode the categorical column `Var0`
# NOTE(review): presumably `join_rows=False` keeps one row per original row, leaving the merge of rows with the same id and timestamp to the later `join_repeated_rows` call — confirm against data_utils
data_df_ohe = du.data_processing.one_hot_encoding_dataframe(data_df, columns='Var0', join_rows=False)
data_df_ohe

In [None]:
# List the columns of the one hot encoded dataframe that data_utils identifies as boolean
# NOTE(review): presumably these are exactly the one hot encoded `Var0` columns — confirm
bool_cols = du.search_explore.list_boolean_columns(data_df_ohe)
bool_cols

In [None]:
# Presumably merges the rows that share the same (`id`, `ts`) pair into a single row
# NOTE(review): how the values of the merged rows are combined is decided inside data_utils — confirm
data_df_ohe = du.embedding.join_repeated_rows(data_df_ohe, id_columns=['id', 'ts'])
data_df_ohe

In [None]:
# Save the one hot encoded dummy data to a CSV file (index included, as `index` is left at its default)
data_df_ohe.to_csv('dummy_data/embedding_bag_data_df_ohe.csv')

In [None]:
# Inspect the data type of each column of the one hot encoded dataframe
data_df_ohe.dtypes

## Enumerate categories

In [None]:
# Replace the categories of `Var0` with the values returned by data_utils and
# keep the accompanying mapping in `enum_dict`
# NOTE(review): presumably categories become integer codes and NaN becomes 0 (`nan_value=0`), matching the all-zeros first row of the embedding matrix — confirm the `forbidden_digit` semantics against data_utils
data_df['Var0'], enum_dict = du.embedding.enum_categorical_feature(
    data_df,
    'Var0',
    nan_value=0,
    forbidden_digit=0,
)
# Display the dataframe; when using Dask, comment this line and uncomment the next one
data_df
# data_df.compute()

In [None]:
# Inspect the dictionary returned alongside the enumerated `Var0` column
enum_dict

In [None]:
# If using Pandas, uncomment the line of code bellow and comment the next one, which uses Dask
data = torch.tensor(data_df.values)
# data = torch.tensor(data_df.compute().values)
data

## Apply embedding layer

In [None]:
# Look up the embedding vector of each row, using the third column of `data` as the indices
simple_embed_layer(data[:, 2])

In [None]:
# Put the first two columns of `data` (cast to float) side by side with the
# embedding vectors of the third column, and wrap the result in a dataframe
embed_data_df = pd.DataFrame(
    data=torch.cat(
        [data[:, :2].float(), simple_embed_layer(data[:, 2])],
        dim=1,
    ).numpy(),
    columns=['id', 'ts', 'E0', 'E1', 'E2'],
)
# To test on Dask instead of pandas, uncomment the line of code below
# embed_data_df = dd.from_pandas(embed_data_df, npartitions=2)
# Display the dataframe; when using Dask, comment this line and uncomment the next one
embed_data_df
# embed_data_df.compute()

## Apply embedding bag

In [None]:
# Get the two inputs that are later passed to the embedding bag layer: the values to embed and their offsets
# NOTE(review): `data_tnsr` and `ohe_feat` are not defined in any cell visible in this notebook, so running the cells from top to bottom raises a NameError here. Presumably `data_tnsr` is a padded 3D tensor built from `data_df_ohe` and `ohe_feat` holds the indices of its one hot encoded columns — define both before this cell
Var0_embed, Var0_offset = du.embedding.prepare_embed_bag(data_tnsr, features=ohe_feat)
print(f'Var0_embed: {Var0_embed}')
print(f'Var0_offset: {Var0_offset}')

In [None]:
# Run the embedding bag layer on the prepared inputs and discard the last output row
# NOTE(review): presumably `prepare_embed_bag` adds one extra trailing offset, which yields a dummy last bag — confirm against data_utils
embedding_values = bag_embed_layer(Var0_embed, Var0_offset)[:-1]
embedding_values

In [None]:
# Inspect the size of each embedding vector produced by the embedding bag layer
bag_embed_layer.embedding_dim

In [None]:
# Inspect the shape of the input tensor
data_tnsr.shape

In [None]:
# Inspect the shape of the embedding values before reshaping them
embedding_values.shape

In [None]:
# Reshape the embedding values to 3D: the first two dimensions of `data_tnsr` followed by the embedding size
embedding_values.view(data_tnsr.shape[0], data_tnsr.shape[1], bag_embed_layer.embedding_dim)

In [None]:
# Keep the 3D view of the embedding values (first two dimensions of `data_tnsr`, then the embedding size)
embedding_values_3d = embedding_values.view(data_tnsr.shape[0], data_tnsr.shape[1], bag_embed_layer.embedding_dim)
embedding_values_3d

In [None]:
# Append the embedding values to the input tensor along the last (feature) dimension.
# `data_tnsr` is used instead of the 2D `data` tensor: `torch.cat` requires every input
# to have the same number of dimensions, and `embedding_values_3d` was shaped after the
# first two dimensions of `data_tnsr`, so only that tensor lines up with it.
torch.cat((data_tnsr.double(), embedding_values_3d.double()), dim=2)

Run the full embedding values calculation and tensor joining pipeline:

In [None]:
# Display the one hot encoded dataframe again, for reference
data_df_ohe

In [None]:
# Run the data_utils pipeline on the input tensor with the embedding bag layer
# NOTE(review): presumably this reproduces the manual steps above (prepare the bag inputs, embed them, join the result with the data) in a single call — confirm against data_utils
data_embed = du.embedding.embedding_bag_pipeline(data_tnsr, bag_embed_layer, features=ohe_feat)
data_embed

In [None]:
# Flatten to 2D with 5 columns (the `id`, `ts`, `E0`, `E1`, `E2` columns named in the next cell) and convert to NumPy
data_embed = data_embed.view(-1, 5).numpy()
# Drop the rows whose first column equals 999999
# NOTE(review): presumably 999999 is the padding value used to build the padded tensor — confirm
data_embed = data_embed[data_embed[:, 0] != 999999]
data_embed

In [None]:
# Wrap the filtered array in a dataframe with the identifier columns followed by the three embedding columns
data_df_embed = pd.DataFrame(data_embed, columns=['id', 'ts', 'E0', 'E1', 'E2'])
data_df_embed

In [None]:
# Save the embedded dummy data to a CSV file (index included, as `index` is left at its default)
data_df_embed.to_csv('dummy_data/embedding_bag_data_df_embed.csv')