## Importing the necessary packages

In [None]:
import dask.dataframe as dd                # Dask to handle big data in dataframes
import pandas as pd                        # Pandas to load the data initially
from dask.distributed import Client        # Dask scheduler
from dask.diagnostics import ProgressBar   # Dask progress bar
import re                                  # re to do regex searches in string data
import os                                  # os handles directory/workspace changes
import numpy as np                         # NumPy to handle numeric and NaN operations
from tqdm import tqdm_notebook             # tqdm allows to track code execution progress
import numbers                             # numbers allows to check if data is numeric
import utils                               # Contains auxiliary functions

In [None]:
# Debugging packages
import pixiedust                           # Debugging in Jupyter Notebook cells

In [None]:
# Dataset (CSV files) and project repository locations; both are relative
# paths, so they only resolve after the working directory change below
data_path = 'Documents/Datasets/Thesis/eICU/uncompressed/'
project_path = 'Documents/GitHub/eICU-mortality-prediction/'

# Move three directory levels up (presumably to the folder that contains
# "Documents", since both paths above start there)
os.chdir("../../..")

In [None]:
# Connect to the Dask scheduler reachable at this local TCP address
# NOTE(review): presumably the scheduler is started outside this notebook - confirm
scheduler_address = "tcp://127.0.0.1:60008"
client = Client(scheduler_address)
# Bare expression so that the client is shown as the cell's output
client

In [None]:
# Send the project's Python modules (NeuralNetwork.py and utils.py) to the
# Dask cluster, so that it has access to the relevant auxiliary functions
modules_to_upload = (
    f'{project_path}NeuralNetwork.py',
    f'{project_path}utils.py',
)
for module_path in modules_to_upload:
    client.upload_file(module_path)

## Creating data

In [None]:
# Toy rows in the form (id, ts, Var0), where Var0 holds the raw text that is
# split into dosage and unit further down
sample_rows = [
    [103, 0, '2 mg'],
    [103, 0, '53 kg/L'],
    [103, 1, 'horse'],
    [104, 0, '69 kg bunny'],
    [105, 0, 'what 5'],
    [105, 0, '9'],
    [105, 0, '42 meaning_of_life?'],
    [105, 0, '123 $$$'],
    [105, 1, '1 polly parrot'],
    [105, 1, '2 idiots'],
    [105, 1, '77 10'],
]
data_df = pd.DataFrame(sample_rows, columns=['id', 'ts', 'Var0'])
# Only use the line of code below if you want to test on Dask
data_df = dd.from_pandas(data_df, npartitions=2)
# If using Pandas, uncomment the line of code below and comment the next one, which uses Dask
# data_df
data_df.compute()

## Separating dosages from units

In [None]:
# Create the two output columns up front, filled with NaN placeholders
for new_column in ('drugdosage', 'drugunit'):
    data_df[new_column] = np.nan
# Materialize the Dask dataframe to inspect the result
data_df.compute()

In [None]:
# Run the auxiliary parser on every row (axis=1), reading the raw text from
# 'Var0'; result_type='expand' spreads each returned sequence over columns
dosage_and_unit = data_df.apply(
    lambda row: utils.set_dosage_and_units(row, orig_column='Var0'),
    axis=1,
    result_type='expand',
)
# Store the expanded columns in the dosage and unit columns, in that order
data_df[['drugdosage', 'drugunit']] = dosage_and_unit
# Materialize the Dask dataframe to inspect the result
data_df.compute()