# Handling multiple encodings and joins
---

Experimenting with converting encodings back into string-formatted categories and with joining two different encoding dictionaries.

## Importing the necessary packages

In [None]:
import os                                  # os handles directory/workspace changes
import numpy as np                         # NumPy to handle numeric and NaN operations
from tqdm import tqdm_notebook             # tqdm allows to track code execution progress
from IPython.display import display        # Display multiple outputs on the same cell

In [None]:
# Debugging packages
import pixiedust                           # Debugging in Jupyter Notebook cells

In [None]:
# Move the working directory four levels up from where the notebook runs
# (presumably the "Documents" folder - confirm for your machine).
# NOTE: the path is relative, so re-running this cell climbs another four levels.
os.chdir("../../../..")
# Path to the CSV dataset files (a relative path; presumably resolved from the new working directory)
data_path = 'Datasets/Thesis/eICU/uncompressed/'
# Path to the code files (a relative path; presumably resolved from the new working directory)
project_path = 'GitHub/eICU-mortality-prediction/'

In [None]:
# import modin.pandas as pd                  # Optimized distributed version of Pandas
import pandas as pd
import data_utils as du                    # Data science and machine learning relevant methods

## Creating data

Encoded dataframes:

In [None]:
# Two toy dataframes with the same layout: `id`, `ts` and the encoded feature `Var0`.
# `Var0` holds either a single integer code or a string with several codes joined by ';'.
# Several rows may share an `id` (e.g. 103 and 108), each with its own `ts` value.
data1_df = pd.DataFrame(
    data=[
        (103, 0, 1),
        (103, 0, 0),
        (103, 1, 0),
        (104, 0, '3;1;6'),
        (105, 0, '2;4'),
        (106, 0, 5),
        (107, 0, 0),
        (108, 0, '1;2;3'),
        (108, 1, 0),
        (108, 2, '3;5;1;6;2'),
        (108, 3, 6),
    ],
    columns=['id', 'ts', 'Var0'],
)
data2_df = pd.DataFrame(
    data=[
        (217, 0, 0),
        (217, 1, 3),
        (217, 2, '3;4'),
        (426, 0, '1;2'),
        (409, 0, '2;4'),
        (378, 0, 1),
        (290, 0, 0),
    ],
    columns=['id', 'ts', 'Var0'],
)
# Only use the lines of code below if you want to test on Dask
# (they need `dd`, i.e. dask.dataframe, which is not imported in the visible cells)
# data1_df = dd.from_pandas(data1_df, npartitions=2)
# data2_df = dd.from_pandas(data2_df, npartitions=2)
# Show both dataframes from the same cell; `display` is used because only the
# last bare expression of a cell would be rendered otherwise
print('Dataframe 1:')
display(data1_df)
# display(data1_df.compute()) # Dask
print('Dataframe 2:')
display(data2_df)
# display(data2_df.compute()) # Dask

Encoding dictionaries:

In [None]:
# Encoding dictionaries (integer code -> category name); each code is simply the
# position of the name in its list. Both map code 0 to 'nan', but shared categories
# get different codes (e.g. 'banana' is 1 in the first and 4 in the second).
data1_dict = dict(enumerate(['nan', 'banana', 'orange', 'apple', 'strawberry', 'melon', 'peach']))
data2_dict = dict(enumerate(['nan', 'orange', 'pear', 'blueberry', 'banana']))
print(f'Dictionary for data 1: \n{data1_dict}\n')
print(f'Dictionary for data 2: \n{data2_dict}')

## Converting encodings to the original category names

In [None]:
# Sample multi-category encoding: several integer codes joined by ';'
x = '1;2;3;4'

In [None]:
# Split the encoding into its individual codes (still strings at this point);
# the str() call lets a plain integer code go through the same path
enums = str(x).split(';')
enums

In [None]:
# Look up the category name of each code; the codes are strings, so they are
# converted to integers first to match the dictionary keys
categories = [data1_dict[code] for code in map(int, enums)]
categories

In [None]:
# Join the names back into a single ';'-separated string, mirroring the encoded format
categories = ';'.join(categories)
categories

Get the category names:

In [None]:
# Row-wise conversion (axis=1 hands the lambda one row at a time): the helper is
# presumably doing the split / lookup / join steps shown in the cells above for
# the codes in `Var0` - confirm in data_utils
data1_df['Var0_categories'] = data1_df.apply(
    lambda row: du.embedding.enum_category_conversion(row, enum_column='Var0', enum_dict=data1_dict),
    axis=1,
)
data1_df
# data1_df.compute() # Dask

In [None]:
# Same row-wise conversion for the second dataframe, using its own dictionary
# (axis=1 hands the lambda one row at a time)
data2_df['Var0_categories'] = data2_df.apply(
    lambda row: du.embedding.enum_category_conversion(row, enum_column='Var0', enum_dict=data2_dict),
    axis=1,
)
data2_df
# data2_df.compute() # Dask

Recover the enumerations:

In [None]:
# Show the code -> name dictionary of the first dataset for reference
data1_dict

In [None]:
# Preview the inverted dictionary (presumably name -> code; confirm in data_utils)
du.utils.invert_dict(data1_dict)

In [None]:
# Invert the dictionary once up front instead of inside the lambda, which
# `apply` calls once per row (assumes invert_dict simply returns a new mapping)
data1_inv_dict = du.utils.invert_dict(data1_dict)
# Convert the category names in `Var0_categories` back with the inverted dictionary;
# axis=1 hands the lambda one row at a time
data1_df['Var0_num'] = data1_df.apply(
    lambda row: du.embedding.enum_category_conversion(row, enum_column='Var0_categories', enum_dict=data1_inv_dict),
    axis=1,
)
data1_df
# data1_df.compute() # Dask

In [None]:
# Invert the dictionary once up front instead of inside the lambda, which
# `apply` calls once per row (assumes invert_dict simply returns a new mapping)
data2_inv_dict = du.utils.invert_dict(data2_dict)
# Convert the category names in `Var0_categories` back with the inverted dictionary;
# axis=1 hands the lambda one row at a time
data2_df['Var0_num'] = data2_df.apply(
    lambda row: du.embedding.enum_category_conversion(row, enum_column='Var0_categories', enum_dict=data2_inv_dict),
    axis=1,
)
data2_df
# data2_df.compute() # Dask

## Joining two encodings into one

In [None]:
# Inverted dictionary of the first dataset, in the form later passed as `dict1`
du.utils.invert_dict(data1_dict)

In [None]:
# Inverted dictionary of the second dataset, in the form later passed as `dict2`
du.utils.invert_dict(data2_dict)

In [None]:
# Sanity check that a feature name given as a literal is a str (always True);
# presumably left over from debugging how the feature name argument is type-checked
isinstance('Var0', str)

In [None]:
# Invert both dictionaries before handing them to converge_enum as dict1 / dict2
dict1_inverted = du.utils.invert_dict(data1_dict)
dict2_inverted = du.utils.invert_dict(data2_dict)
# Join the two encodings of `Var0`; the call returns three objects, presumably both
# dataframes re-encoded plus the unified dictionary - confirm in data_utils
new_data1_df, new_data2_df, all_data_dict = du.embedding.converge_enum(
    data1_df,
    cat_feat_name='Var0',
    df2=data2_df,
    nan_value=0,
    dict1=dict1_inverted,
    dict2=dict2_inverted,
)
all_data_dict

In [None]:
# First dataframe returned by converge_enum
new_data1_df

In [None]:
# Second dataframe returned by converge_enum
new_data2_df

In [None]:
# Every category name used by either encoding, as a set union of the two
# dictionaries' values (no intermediate lists needed)
all_categories = set(data1_dict.values()) | set(data2_dict.values())
# Drop the missing-value placeholder; discard() does not fail if it is absent
all_categories.discard('nan')
all_categories

In [None]:
# Name -> code dictionary in which several names ('nan', 'unknown', 'other', 'null')
# share code 0. The pair order is kept as is, since it can decide which of those
# names survives when the dictionary is inverted.
data3_dict = dict([
    ('nan', 0),
    ('orange', 1),
    ('unknown', 0),
    ('other', 0),
    ('pear', 2),
    ('blueberry', 3),
    ('banana', 4),
    ('null', 0),
])
data3_dict

In [None]:
# Invert to code -> name. Four names share code 0, so only one of them can remain
# as the value for key 0 (which one depends on invert_dict - TODO confirm)
data3_dict = du.utils.invert_dict(data3_dict)
data3_dict

In [None]:
# Explicitly set code 0 to 'nan', regardless of which name the inversion left there
data3_dict[0] = 'nan'
data3_dict