# Encoding and joining rows
---

Experimenting with converting categorical features into numerical encodings and then joining the rows that share the same identifier.

## Importing the necessary packages

In [None]:
import dask.dataframe as dd                # Dask to handle big data in dataframes
import pandas as pd                        # Pandas to load the data initially
from dask.distributed import Client        # Dask scheduler
import os                                  # os handles directory/workspace changes
import numpy as np                         # NumPy to handle numeric and NaN operations
from tqdm import tqdm_notebook             # tqdm allows to track code execution progress
from IPython.display import display        # Display multiple outputs on the same cell

In [None]:
# Change to the parent directory before importing utils
# (presumably utils.py is located there — TODO confirm)
os.chdir("..")
import utils                               # Contains auxiliary functions

In [None]:
# Debugging packages
import pixiedust                           # Debugging in Jupyter Notebook cells

In [None]:
# Move two directory levels up (presumably to "Documents")
os.chdir("../..")

# Path to the CSV dataset files, relative to the new working directory
data_path = 'Datasets/Thesis/eICU/uncompressed/'
# Path to the project's folder, relative to the new working directory
project_path = 'GitHub/eICU-mortality-prediction/'

In [None]:
# Connect to the Dask scheduler listening on the given local address
# NOTE(review): the port is hard-coded — presumably it must match the currently running scheduler; confirm before running
client = Client("tcp://127.0.0.1:60409")
# Leave the client as the last expression of the cell, so that the notebook displays it
client

In [None]:
# Upload the NeuralNetwork.py and utils.py files, so that the Dask cluster has access to relevant auxiliary functions
client.upload_file(f'{project_path}NeuralNetwork.py')
client.upload_file(f'{project_path}utils.py')

In [None]:
# Run os.getcwd on the Dask workers, to check which working directory they are using
client.run(os.getcwd)

## Creating data

Encoded dataframes:

In [None]:
# Toy data: each record is an (id, ts, Var0) observation, with repeated
# identifiers and several 'unknown' categories mixed in
data_df = pd.DataFrame(
    data=[
        (103, 0, 'dog'),
        (103, 0, 'unknown'),
        (103, 0, 'frog'),
        (103, 1, 'cat'),
        (104, 0, 'bird'),
        (105, 0, 'fish'),
        (106, 0, 'hamster'),
        (106, 0, 'turtle'),
        (106, 1, 'dog'),
        (107, 0, 'unknown'),
        (108, 0, 'unknown'),
        (108, 0, 'unknown'),
        (108, 0, 'unknown'),
        (108, 1, 'unknown'),
        (108, 1, 'dog'),
        (108, 1, 'unknown'),
        (108, 1, 'cat'),
        (108, 1, 'unknown'),
        (108, 2, 'dog'),
    ],
    columns=['id', 'ts', 'Var0'],
)
# Only run the following line if you want to test on Dask
data_df = dd.from_pandas(data_df, npartitions=2)
# If using Pandas, uncomment the following line and comment out the last one, which uses Dask
# data_df
data_df.compute()

## Encoding categories

In [None]:
# Replace the 'Var0' column with the first output of utils.enum_categorical_feature and keep
# its second output (presumably the category-to-number mapping — TODO confirm against utils)
data_df['Var0'], cat_embed_feat_enum = utils.enum_categorical_feature(data_df, 'Var0')
# Compute the dataframe, so as to inspect the result
data_df.compute()

In [None]:
# Display the second output of utils.enum_categorical_feature obtained for 'Var0'
cat_embed_feat_enum

## Joining rows

In [None]:
# Presumably joins the rows that share the same values in the identifier columns ('id' and 'ts'),
# combining their 'Var0' encodings — TODO confirm against utils.join_categorical_enum
# NOTE(review): has_timestamp is False even though 'ts' is one of the id columns — confirm this is intended
data_df = utils.join_categorical_enum(data_df, id_columns=['id', 'ts'], cat_feat=['Var0'], has_timestamp=False)
# Compute the dataframe, so as to inspect the result
data_df.compute()