# Handling multiple encodings and joins
---

Experimenting with converting encodings back into string-formatted categories and with joining two different encoding dictionaries.

## Importing the necessary packages

In [None]:
import dask.dataframe as dd                # Dask to handle big data in dataframes
import pandas as pd                        # Pandas to load the data initially
from dask.distributed import Client        # Dask scheduler
import os                                  # os handles directory/workspace changes
import numpy as np                         # NumPy to handle numeric and NaN operations
from tqdm import tqdm_notebook             # tqdm allows to track code execution progress
from IPython.display import display        # Display multiple outputs on the same cell

In [None]:
# Change to parent directory
# NOTE(review): presumably done so that the project's utils.py, expected one level up,
# can be found by the import below - confirm against the repository layout
os.chdir("..")
import utils                               # Contains auxiliary functions

In [None]:
# Debugging packages
import pixiedust                           # Debugging in Jupyter Notebook cells

In [None]:
# Move two directory levels up (presumably to "Documents"), so that the
# relative paths defined below resolve from there
os.chdir("../..")

# Path to the CSV dataset files
data_path = 'Datasets/Thesis/eICU/uncompressed/'
# Path to this project's source code, relative to the same directory
project_path = 'GitHub/eICU-mortality-prediction/'

In [None]:
# Connect to a Dask scheduler that is already running at this local address
# NOTE(review): the address and port are hard-coded; update them to match the
# scheduler started for the current session
client = Client("tcp://127.0.0.1:60773")
client

In [None]:
# Upload the NeuralNetwork.py and utils.py files, so that the Dask cluster has access to the project's auxiliary code
client.upload_file(f'{project_path}NeuralNetwork.py')
client.upload_file(f'{project_path}utils.py')

In [None]:
# Ask the cluster's workers to run os.getcwd, to check which directory they are working from
client.run(os.getcwd)

## Creating data

Encoded dataframes:

In [None]:
# Toy dataframes in which Var0 holds either a single integer code or a string
# with several codes joined by ';'
data1_df = pd.DataFrame([[103, 0, 1],
                         [103, 0, 0],
                         [103, 1, 0],
                         [104, 0, '3;1;6'],
                         [105, 0, '2;4'],
                         [106, 0, 5],
                         [107, 0, 0],
                         [108, 0, '1;2;3'],
                         [108, 1, 0],
                         [108, 2, '3;5;1;6;2'],
                         [108, 3, 6]], columns=['id', 'ts', 'Var0'])
data2_df = pd.DataFrame([[217, 0, 0],
                         [217, 1, 3],
                         [217, 2, '3;4'],
                         [426, 0, '1;2'],
                         [409, 0, '2;4'],
                         [378, 0, 1],
                         [290, 0, 0]], columns=['id', 'ts', 'Var0'])
# Convert to Dask dataframes; comment out these two lines to test on plain Pandas
data1_df = dd.from_pandas(data1_df, npartitions=2)
data2_df = dd.from_pandas(data2_df, npartitions=2)
# If using Pandas, display the dataframes directly (data1_df and data2_df) instead of
# calling .compute() on them, as that method only exists on the Dask versions
print('Dataframe 1:')
display(data1_df.compute())
print('Dataframe 2:')
display(data2_df.compute())

Encoding dictionaries:

In [None]:
# Each encoding dictionary maps an integer code to a category name; the names
# are listed in code order, so their position in the list is their code
# (code 0 maps to 'nan' in both dictionaries)
data1_names = ['nan', 'banana', 'orange', 'apple', 'strawberry', 'melon', 'peach']
data2_names = ['nan', 'orange', 'pear', 'blueberry', 'banana']
data1_dict = dict(enumerate(data1_names))
data2_dict = dict(enumerate(data2_names))
print(f'Dictionary for data 1: \n{data1_dict}')
print(f'Dictionary for data 2: \n{data2_dict}')

## Converting encodings to the original category names

In [None]:
# Sample encoded value: several integer codes joined by ';'
x = '1;2;3;4'

In [None]:
# Split the encoded value into its individual codes; str() makes sure that
# values which aren't strings can be split as well
enums = str(x).split(';')
enums

In [None]:
# Look up the category name of each code in the first encoding dictionary
categories = [data1_dict[int(n)] for n in enums]
categories

In [None]:
# Merge the category names back into a single ';'-separated string
categories = ';'.join(categories)
categories

Get the category names:

In [None]:
# Convert each row's Var0 codes with the first encoding dictionary; the keyword
# arguments are forwarded by apply to the conversion function, which receives
# the row as its first argument
data1_df['Var0_categories'] = data1_df.apply(utils.enum_category_conversion, axis=1,
                                             enum_column='Var0',
                                             enum_dict=data1_dict,
                                             meta=('df', str))
data1_df.compute()

In [None]:
# Convert each row's Var0 codes with the second encoding dictionary; the keyword
# arguments are forwarded by apply to the conversion function, which receives
# the row as its first argument
data2_df['Var0_categories'] = data2_df.apply(utils.enum_category_conversion, axis=1,
                                             enum_column='Var0',
                                             enum_dict=data2_dict,
                                             meta=('df', str))
data2_df.compute()

Recover the enumerations:

In [None]:
# Convert the category names back through the inverted first dictionary.
# The dictionary is inverted a single time here and forwarded by apply as a
# keyword argument, instead of being rebuilt inside a lambda for every row.
data1_df['Var0_num'] = data1_df.apply(utils.enum_category_conversion, axis=1,
                                      enum_column='Var0_categories',
                                      enum_dict=utils.invert_dict(data1_dict),
                                      meta=('df', str))
data1_df.compute()

In [None]:
# Convert the category names back through the inverted second dictionary.
# The dictionary is inverted a single time here and forwarded by apply as a
# keyword argument, instead of being rebuilt inside a lambda for every row.
data2_df['Var0_num'] = data2_df.apply(utils.enum_category_conversion, axis=1,
                                      enum_column='Var0_categories',
                                      enum_dict=utils.invert_dict(data2_dict),
                                      meta=('df', str))
data2_df.compute()

## Joining two encodings into one

In [None]:
# Inspect the inverted version of the first encoding dictionary
utils.invert_dict(data1_dict)

In [None]:
# Inspect the inverted version of the second encoding dictionary
utils.invert_dict(data2_dict)

In [None]:
# Invert both encoding dictionaries up front, then hand them to the converging helper
inverted_dict1 = utils.invert_dict(data1_dict)
inverted_dict2 = utils.invert_dict(data2_dict)
converged = utils.converge_enum(data1_df, data2_df, 'Var0', inverted_dict1, inverted_dict2)
# The helper returns three objects: presumably the two re-encoded dataframes
# and the encoding dictionary that they now share
new_data1_df, new_data2_df, all_data_dict = converged
all_data_dict

In [None]:
# Compute and show the first dataframe returned by converge_enum
new_data1_df.compute()

In [None]:
# Compute and show the second dataframe returned by converge_enum
new_data2_df.compute()

In [None]:
# Gather every category name that appears in either encoding dictionary
all_categories = set(data1_dict.values()) | set(data2_dict.values())
# Leave out the 'nan' entry (raises a KeyError if it isn't in the set)
all_categories.remove('nan')
all_categories

In [None]:
# Category-to-code dictionary in which several names
# ('nan', 'unknown', 'other' and 'null') share the same code 0
data3_dict = {'nan': 0,
              'orange': 1,
              'unknown': 0,
              'other': 0,
              'pear': 2,
              'blueberry': 3,
              'banana': 4,
              'null': 0}
data3_dict

In [None]:
# Invert the dictionary with the project's helper and inspect the result
# NOTE(review): four names share code 0 before the inversion, so presumably
# only one of them remains associated with 0 afterwards - verify in the output
data3_dict = utils.invert_dict(data3_dict)
data3_dict

In [None]:
# Explicitly set code 0 to 'nan', regardless of which name the inversion left there
data3_dict[0] = 'nan'
data3_dict

## Experiment with eICU data