# Embedding Bag Test
---

Experimenting with applying an embedding bag (an embedding layer followed by the average of all the embedding vectors) to the categorical features of a time series dataframe.

## Import the necessary packages

In [None]:
import dask.dataframe as dd                # Dask to handle big data in dataframes
import pandas as pd                        # Pandas to load the data initially
from dask.distributed import Client        # Dask scheduler
import numpy as np                         # Mathematical operations package, allowing also for missing values representation
import torch                               # PyTorch for tensor and deep learning operations
import data_utils as du                    # Data science and machine learning relevant methods

In [None]:
import pixiedust                           # Debugging in Jupyter Notebook cells

## Initialize variables

Data that we'll be using:

In [None]:
# Toy dataframe: each row is one (id, ts, Var0) observation, where `Var0` is a
# categorical feature that is missing (NaN) in some rows
data_df = pd.DataFrame(
    data=[
        (103, 0, 'dog'),
        (103, 0, 'cat'),
        (103, 1, 'horse'),
        (104, 0, 'bunny'),
        (104, 1, np.nan),
        (105, 0, 'horse'),
        (105, 0, 'dog'),
        (105, 0, 'cat'),
        (105, 0, 'bunny'),
        (105, 1, 'bunny'),
        (105, 1, 'dog'),
        (105, 1, np.nan),
        (105, 1, 'horse'),
    ],
    columns=['id', 'ts', 'Var0'],
)
# Only uncomment the line of code below if you want to test on Dask
# data_df = dd.from_pandas(data_df, npartitions=2)
# With Pandas, display the dataframe directly; with Dask, comment out the
# line below and uncomment the `.compute()` one instead
data_df
# data_df.compute()

Embedding matrix used in the embedding layer:

In [None]:
# Fixed 5 x 3 matrix of float32 weights, later loaded as the pretrained
# weights of both embedding layers
embed_mtx = torch.tensor(
    [
        [0, 0, 0],
        [-1, 0, 1],
        [0, 1, -1],
        [1, 1, 0],
        [1, -1, 1],
    ],
    dtype=torch.float32,
)
embed_mtx

Simple embedding layer:

In [None]:
# Plain embedding lookup layer initialized with `embed_mtx`; `freeze=True`
# (the library default, spelled out here) keeps the weights fixed
simple_embed_layer = torch.nn.Embedding.from_pretrained(
    embed_mtx,
    freeze=True,
)
simple_embed_layer

Embedding layer + average operation (bagging):

In [None]:
# Embedding bag initialized with `embed_mtx`; `mode='mean'` and `freeze=True`
# are the library defaults, spelled out here so that the averaging behaviour
# is visible in the notebook
bag_embed_layer = torch.nn.EmbeddingBag.from_pretrained(
    embed_mtx,
    freeze=True,
    mode='mean',
)
bag_embed_layer

## Enumerate categories

In [None]:
# Overwrite `Var0` with the first value returned by `enum_categorical_feature`
# and keep the second returned value in `enum_dict`
data_df['Var0'], enum_dict = du.embedding.enum_categorical_feature(
    data_df,
    'Var0',
    forbidden_digit=0,
)
# With Pandas, display the dataframe directly; with Dask, comment out the
# line below and uncomment the `.compute()` one instead
data_df
# data_df.compute()

In [None]:
# Display the second value returned by `enum_categorical_feature`
# (presumably the category <-> number mapping - confirm against `data_utils`)
enum_dict

In [None]:
# Convert the dataframe's values into a PyTorch tensor.
# With Pandas, use the line below; with Dask, comment it out and uncomment
# the `.compute()` variant instead
data = torch.tensor(data_df.values)
# data = torch.tensor(data_df.compute().values)
data

## Apply embedding layer

In [None]:
# Pass column 2 of `data` (the enumerated `Var0` values) through the simple
# embedding layer, returning one embedding vector per row
simple_embed_layer(data[:, 2])

In [None]:
# Build a dataframe in which `Var0` is replaced by its embedding values:
# the first two columns (`id` and `ts`) are cast to float, joined column-wise
# with the embedding output and labelled `E0`, `E1` and `E2`
embed_data_df = pd.DataFrame(
    data=torch.cat(
        [data[:, :2].float(), simple_embed_layer(data[:, 2])],
        dim=1,
    ).numpy(),
    columns=['id', 'ts', 'E0', 'E1', 'E2'],
)
# Only uncomment the line of code below if you want to test on Dask
# embed_data_df = dd.from_pandas(embed_data_df, npartitions=2)
# With Pandas, display the dataframe directly; with Dask, comment out the
# line below and uncomment the `.compute()` one instead
embed_data_df
# embed_data_df.compute()

## Apply embedding bag

Concatenate rows that have the same `id` and `ts`:

In [None]:
# Cast the enumerated `Var0` values to strings
# NOTE(review): presumably `join_categorical_enum` in the next cell expects
# string values - confirm against `data_utils`
data_df.Var0 = data_df.Var0.astype(str)
# With Pandas, display the column directly; with Dask, comment out the line
# below and uncomment the `.compute()` one instead
data_df.Var0
# data_df.Var0.compute()

In [None]:
# Join the `Var0` values of rows that share the same `id` and `ts`
# (the exact joined format is defined by `join_categorical_enum`)
data_df = du.embedding.join_categorical_enum(data_df, cat_feat='Var0', id_columns=['id', 'ts'])
data_df

Convert to a PyTorch tensor:

In [None]:
# Turn the string-encoded `Var0` column back into a numeric one
# NOTE(review): called with `inplace=True` while also rebinding `data_df` to
# the return value; this presumes the helper returns the dataframe - confirm
data_df = du.embedding.string_encod_to_numeric(data_df, cat_feat='Var0', inplace=True)
data_df

In [None]:
# Move the current index back into regular columns
# (presumably `id` and `ts`, set as the index by the join step - confirm)
data_df = data_df.reset_index()
data_df

In [None]:
# Get the sequence lengths, using `id` as the sequence identifier and `ts` as
# the timestamp column (presumably one length per `id` - confirm)
seq_len_dict = du.padding.get_sequence_length_dict(data_df, id_column='id', ts_column='ts')
seq_len_dict

In [None]:
# Convert the dataframe into a padded tensor, grouping rows by `id`
# NOTE(review): the indexing in later cells (`data[2, 1, 2]`) implies a
# 3-dimensional result, presumably (sequence, timestamp, column) - confirm
data = du.padding.dataframe_to_padded_tensor(data_df, id_column='id', inplace=True)
data

Apply the embedding bag:

In [None]:
# Find the index of the column that we want to embed (`Var0`), so that it can
# be passed as the `feature` argument of `prepare_embed_bag` further down
Var0idx = du.search_explore.find_col_idx(data_df, feature='Var0')
Var0idx

In [None]:
# Inspect a single entry of the padded tensor as a full number string
# (presumably sequence 2, timestamp 1, column 2 - confirm the axis order)
x = du.utils.get_full_number_string(data[2, 1, 2].item())
x

In [None]:
# Same inspection for index 0 along the second axis; overwrites `x`
# (presumably sequence 2, timestamp 0, column 2 - confirm the axis order)
x = du.utils.get_full_number_string(data[2, 0, 2].item())
x

In [None]:
# Display the current contents of `data_df`
data_df

In [None]:
# Prepare the two inputs of the embedding bag for the `Var0` column: the
# values to embed and their offsets (presumably the start position of each
# bag - confirm against `prepare_embed_bag`)
Var0_embed, Var0_offset = du.embedding.prepare_embed_bag(data, feature=Var0idx)
print(f'Var0_embed: {Var0_embed}')
print(f'Var0_offset: {Var0_offset}')

In [None]:
# [TODO] Create and run a method that does the full embedding values calculation
# and tensor joining pipeline

In [None]:
# Run the embedding bag on the prepared values and offsets, then drop the last
# output row
# NOTE(review): presumably the final offset only marks the end of the input
# and produces an extra bag - confirm against `prepare_embed_bag`
bag_embed_layer(Var0_embed, Var0_offset)[:-1]