# Dataset Creation

## Import packages and set global variables

In [None]:
from scipy import stats
import numpy as np
import pandas as pd
import datetime as dt
import math
import pickle
from tqdm import tqdm

In [None]:
# Clustering parameters
number_of_stays = 1000  # size of the slice of stays taken in the data summary
start_index = 0  # position of the first stay in that slice
display_matrix = False  # not read anywhere in the visible part of the notebook

# File export suffix
file_suffix = f"_real_{number_of_stays}"
output_path = "output/"  # directory prefix used by the pickle helpers

In [None]:
def save_as_pickle(data, file_name, path=output_path):
    """Pickle ``data`` into the file ``path + file_name``.

    Parameters
    ----------
    data: object
        Any picklable object.
    file_name: str
        Name of the target file; it is appended to ``path`` as-is.
    path: str
        Directory prefix, expected to end with a path separator because the
        two strings are simply concatenated.
    """
    # The context manager closes the handle even when pickling fails, so a
    # failed dump no longer leaks an open file.
    with open(path + file_name, 'wb') as file:
        pickle.dump(data, file)


def get_pickle(file_name, path=output_path):
    """Load and return the object pickled in the file ``path + file_name``.

    Parameters
    ----------
    file_name: str
        Name of the pickle file; it is appended to ``path`` as-is.
    path: str
        Directory prefix, expected to end with a path separator because the
        two strings are simply concatenated.
    """
    # NOTE: unpickling can execute arbitrary code; only load files that this
    # project produced itself.
    # The context manager closes the handle that was previously left open.
    with open(path + file_name, 'rb') as file:
        return pickle.load(file)


## Import MIMIC IV data

In [None]:
# Load the two input tables from ./dataset_files/. The commented-out lines are
# alternative inputs that are currently disabled.
# metadata = pd.read_csv("./dataset_files/mimic-cxr-2.0.0-metadata.csv")
# ICU table; later cells read its hadm_id, label, param_type, valuenum and
# charttime columns.
icu = pd.read_csv('./dataset_files/icu_data_5000.csv')
# icu = pd.read_csv('./dataset_files/first_6000_patients.csv', index_col=0)
# icu_2000_2600 = pd.read_csv('./dataset_files/icu_data_2000-2600.csv')
# icu_2600_3200 = pd.read_csv('./dataset_files/icu_data_2600-3200.csv')
# icu_3200_3600 = pd.read_csv('./dataset_files/icu_data_3200-3600.csv')
# Stays table; the first CSV column becomes the index. Later cells read its
# intime and eventtype columns.
all_stays = pd.read_csv('./dataset_files/all_stays.csv', index_col=0)

In [None]:
# metadata.insert(0, 'metadata_id', range(0, 0 + len(metadata)))
# metadata.head()


In [None]:
# Report how many distinct admissions and distinct labels the ICU table holds.
num_stays = len(icu['hadm_id'].unique())
num_lables = len(icu['label'].unique())
print(f"#stays: {num_stays}\n#labels: {num_lables}")


## Set charttime (from intime) for all_stays & metadata

In [None]:
# Give all_stays a datetime 'charttime' column parsed from 'intime', the same
# column name that the ICU table uses for its timestamps.
all_stays['charttime'] = pd.to_datetime(all_stays['intime'])
# metadata['charttime'] = pd.to_datetime(metadata['StudyDate'], format='%Y%m%d')



## Preprocess ICU dataset

In [None]:
# Distinct ICU labels, in order of first appearance; the binning loop
# iterates over them.
labels = list(icu['label'].unique())

In [None]:
def freedman_diaconis(data):
    """
    Estimate histogram bin width and bin count with the Freedman-Diaconis rule.

    The width is ``2 * IQR / N**(1/3)``, clamped to a minimum of 1, where
    ``N`` is the number of non-NaN observations. NaN entries are ignored for
    the IQR, the observation count and the data range alike.

    Parameters
    ----------
    data: array-like
        Numeric values convertible to a float64 array.

    Returns
    -------
    bw: number
        The bin width; the integer ``1`` whenever the rule yields less than 1.
    bins: int
        ``int(data_range / bw + 1)``, the number of bins of width ``bw``
        suggested for the data range.

    Raises
    ------
    ValueError
        If ``data`` contains no non-NaN value.
    """
    # np.float64 instead of the np.float_ alias, which NumPy 2.0 removed.
    data = np.asarray(data, dtype=np.float64)

    # Drop NaNs once so that IQR, N, min and max all see the same values.
    valid = data[~np.isnan(data)]
    if valid.size == 0:
        raise ValueError('freedman_diaconis needs at least one non-NaN value')

    # scale=1.0 is the unscaled IQR; the old scale="raw" spelling is rejected
    # by recent SciPy releases.
    IQR = stats.iqr(valid, rng=(25, 75), scale=1.0)
    N = valid.size
    bw = max((2 * IQR) / np.power(N, 1/3), 1)

    datrng = valid.max() - valid.min()
    bins = int((datrng / bw) + 1)

    return bw, bins

In [None]:
def binned_data(data, width, num_bins):
    """Label every value in ``data`` with the ``width``-wide bin it falls in.

    Parameters
    ----------
    data: iterable of numbers
        Values to bin.
    width: number
        Bin width; bin ``k`` starts at ``k * width``.
    num_bins: number
        Only checked for NaN; it does not limit the bins that are produced.

    Returns
    -------
    list of str
        One label per value, formatted as ``'(lower, upper]'``.

    Raises
    ------
    ValueError
        If ``width`` or ``num_bins`` is NaN.
    """
    if np.isnan(width) or np.isnan(num_bins):
        # ValueError is still an Exception, so existing handlers keep working.
        raise ValueError('Width or num_bins is not a number')

    def interval_label(value):
        # NOTE: floor() places a value equal to a lower edge in the bin that
        # starts there, although the label is written as '(lower, upper]'.
        bin_index = math.floor(value / width)
        return f'({bin_index * width}, {(bin_index + 1) * width}]'

    return [interval_label(value) for value in data]


In [None]:
# Build icu_binned: the ICU rows of every label, with a 'value_categorical'
# column (label + bin interval) for numeric and checkbox parameters.
icu_binned = pd.DataFrame()
blacklist = []  # labels to leave out of the binned table
binned_frames = []

for label in tqdm(labels):
  # Skip excluded labels before doing any filtering work.
  if label in blacklist:
    continue

  # Explicit copy: the column added below must not be written to a view of icu.
  values = icu[icu['label'] == label].copy()
  val_type = set(values['param_type'])

  if 'Checkbox' in val_type:
    # Checkbox parameters are binned with a fixed width of 1.
    values['value_categorical'] = values['label'] + \
        binned_data(values['valuenum'], 1, 2)
  elif 'Text' not in val_type:
    # Numeric parameters get a Freedman-Diaconis bin width.
    width, bins = freedman_diaconis(values['valuenum'])
    values['value_categorical'] = values['label'] + \
        binned_data(values['valuenum'], width, bins)
  # Text-only labels are kept without a 'value_categorical' entry.

  binned_frames.append(values)

# Concatenate once at the end instead of re-copying the growing frame on
# every iteration.
if binned_frames:
  icu_binned = pd.concat(binned_frames)

## Set ICU charttime to correct data type

In [None]:
# Parse the ICU timestamps into datetimes, then display the column dtypes to
# check the conversion.
icu_binned['charttime'] = pd.to_datetime(icu_binned['charttime'])
icu_binned.dtypes

## Set event for combining all dataframes

In [None]:
# Give both tables an 'event' column whose prefix names the source table.
icu_binned['event'] = 'icu: ' +  icu_binned['label'].astype(str)
# metadata['event'] = 'photo: ' + \
#     metadata['PerformedProcedureStepDescription'].astype(str)
# NOTE(review): unlike 'label' above, 'eventtype' is not cast to str, so a
# missing eventtype presumably yields a missing event -- confirm.
all_stays['event'] = 'transfer: ' + all_stays['eventtype']

# data = pd.concat([all_stays, metadata, icu_binned])
# Stack the stays rows and the binned ICU rows into one event table.
data = pd.concat([all_stays, icu_binned])

data.head()

In [None]:

# Drop the source frames now that their rows live in `data`.
# del metadata
del icu_binned, all_stays


In [None]:
# data.sort_values(by=['hadm_id', 'charttime', 'event'], ascending=[False, False, True])
# data_sort_1 = data[data['hadm_id'] == 28722652]
# data.head()

In [None]:
# data_sort_1.head(n=20)

In [None]:
# Display the column dtypes of the combined event table.
data.dtypes

In [None]:
# Order the events: admissions descending, then chronologically within an
# admission, with ties on the timestamp broken by descending event name.
data = data.sort_values(
    by=['hadm_id', 'charttime', 'event'],
    ascending=[False, True, False],
)
data.head(n=15)


In [None]:
# Inspect the sorted events of one admission (hadm_id 28722652).
data_sort_2 = data[data['hadm_id'] == 28722652]
data_sort_2.head(n=50)


### Create encoded event 

In [None]:
# Replace every distinct event string by its integer category code.
data['event_encoded'] = data['event'].astype('category').cat.codes
# Number of distinct codes in use.
len(data['event_encoded'].unique())


### Transform event encodings to alphabet

In [None]:
# One lowercase letter per event code. The previous list skipped 'x', which
# left only 25 symbols and shifted the last two letters.
alphabet = list('abcdefghijklmnopqrstuvwxyz')


def number_to_character(index):
    """Return the letter that encodes the event code ``index``.

    Parameters
    ----------
    index: int
        Zero-based event code.

    Raises
    ------
    IndexError
        If ``index`` is negative or not smaller than ``len(alphabet)``.
        Negative codes are rejected explicitly because plain list indexing
        would wrap them around to the end of the alphabet, where they would
        collide with a real event code.
    """
    if not 0 <= index < len(alphabet):
        raise IndexError(
            f'event code {index} cannot be encoded with an alphabet of '
            f'{len(alphabet)} characters')
    return alphabet[index]


# Turn every integer event code into its letter.
data['event_encoded'] = data['event_encoded'].apply(number_to_character)

data.head()

### Sort events on time

In [None]:
# data = data.sort_values(by=['charttime'])
# data.head()

### Add ID to events

In [None]:
# Number the events 0..len(data)-1 in their current order, as the first column.
data.insert(0, 'event_id', range(len(data)))
# Display only: the re-indexed frame is not assigned back, so `data` keeps
# 'event_id' as a regular column.
data.set_index('event_id')

## Save data as pickle and csv file

In [None]:
# Persist the complete event table as a pickle in the default output path.
save_as_pickle(data, 'data_complete_v4.1')

In [None]:
# Write the CSV next to the pickle: build the path from output_path instead
# of repeating the directory name, so both exports follow the same setting.
data.to_csv(output_path + "data_complete_v4.1.csv")

## Create data export for distance matrix

In [None]:
# data = get_pickle('data_complete_v4')

In [None]:
# Number of distinct values in the 'label' column of the combined table.
len(data['label'].unique())

In [None]:
# Keep only the columns needed for the distance matrix and export them as
# pickle and CSV. Both files go to output_path, so changing that setting
# moves the two exports together.
distance_data = data[['event_id', 'hadm_id', 'event_encoded']]
save_as_pickle(distance_data, 'distance_data_v4.1')
distance_data.to_csv(output_path + 'distance_data_v4.1.csv')
distance_data.head()

# Data summary

In [None]:
# Admissions to summarise: a window of number_of_stays distinct hadm_ids,
# starting at start_index, in order of first appearance.
stays = list(distance_data['hadm_id'].unique())[
    start_index: start_index + number_of_stays]

# Count the events of every admission in a single pass instead of filtering
# the whole frame once per stay.
events_per_stay = distance_data['hadm_id'].value_counts()

# .get(..., 0) keeps the old result for an id that matches no row (e.g. NaN).
lengths = [[stay, int(events_per_stay.get(stay, 0))] for stay in tqdm(stays)]

length_data = pd.DataFrame(lengths, columns=['hadm_id', 'length'])
length_data

In [None]:
# Summary statistics of the per-admission sequence lengths.
length_data.describe()


In [None]:
# Show the first 100 rows of the final event table.
data.head(n=100)

In [None]:
# Spot-check the events of one admission (hadm_id 20001729).
test = data[data['hadm_id'] == 20001729]
test.head(n=50)
