# Preprocessing III

This notebook converts the data output from preprocessing II into the right format, making it ready to train models on.

In [1]:
import pandas as pd
import numpy as np

## Categorical Conversion
General one-hot encoding, for which no specific information about the feature is needed. One important requirement is that all output files end up with the same columns. Since different files may contain different values for the same feature, every file must be encoded against the same set of unique values. Therefore, we first import all files, collect the unique values of each categorical feature, and store them in a list of sets (one set per feature).

In [2]:
ethnic_group_names = ['unknown', 'white', 'other', 'asian', 'hispanic_latino', 'black_african_american', 'unable_to_obtain', 'american_indian_alaska_native']
categorical_columns = ['admission_type', 'admission_location', 'insurance', 'marital_status', 'gender']

# one set of observed values per categorical feature;
# categorical_sets[k] belongs to categorical_columns[k]
categorical_sets = [set() for _ in categorical_columns]

# iterate over all datasets and add their feature values to the sets
for name in ethnic_group_names:
    # import data; only the categorical columns are needed in this pass,
    # so the remaining columns of the file are not parsed at all
    print('Importing ', name)
    df = pd.read_csv('data/preprocessing_II/' + name + '.csv', usecols=categorical_columns)

    # add unique values for all features (NaN is kept as a value when present)
    print('Adding unique values for ', name)
    for observed_values, column in zip(categorical_sets, categorical_columns):
        observed_values.update(df[column].unique())
    

Importing  unknown
Adding unique values for  unknown
Importing  white
Adding unique values for  white
Importing  other
Adding unique values for  other
Importing  asian
Adding unique values for  asian
Importing  hispanic_latino
Adding unique values for  hispanic_latino
Importing  black_african_american
Adding unique values for  black_african_american
Importing  unable_to_obtain
Adding unique values for  unable_to_obtain
Importing  american_indian_alaska_native
Adding unique values for  american_indian_alaska_native


In [3]:
def convert_categorical (df):
    """One-hot encode every column listed in `categorical_columns`.

    For each categorical column, one boolean column is created per value in
    the matching entry of the module-level `categorical_sets`, named
    `<column>_<value>` (spaces replaced by underscores, lower-cased), e.g.
    `gender_m` or `marital_status_divorced`. Missing values are treated as
    the category 'unknown'. The original categorical column is removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing all columns named in `categorical_columns`.

    Returns
    -------
    pandas.DataFrame
        Frame with the categorical columns replaced by boolean indicator
        columns.
    """
    for column, observed_values in zip(categorical_columns, categorical_sets):
        # Map NaN to 'unknown' in the data as well as in the value set.
        # Renaming only the set entry would compare the column against
        # 'unknown' while the rows still hold NaN, so they would never match.
        present = df[column].notna()
        filled = df[column].astype(object).where(present, 'unknown')
        categories = {'unknown' if pd.isna(value) else value for value in observed_values}

        # sort so the generated column order is the same on every run,
        # independent of set iteration order
        for value in sorted(categories, key=str):
            # column name e.g. gender_m or marital_status_divorced
            value_name = str(value).replace(' ', '_')
            column_name = column + "_" + value_name.lower()
            # one-hot encoding with True and False
            df[column_name] = (filled == value).astype(bool)

        # remove old column
        df = df.drop(columns=[column])

    return df

## Datetime Conversion

In [4]:
def convert_datetime (df):
    """Parse the timestamp columns of `df` with `pd.to_datetime`.

    The columns 'admittime', 'edregtime', 'emar_charttime' and
    'lab_charttime' are overwritten on the passed frame, which is then
    returned.
    """
    for timestamp_column in ('admittime', 'edregtime', 'emar_charttime', 'lab_charttime'):
        parsed = pd.to_datetime(df[timestamp_column])
        df[timestamp_column] = parsed

    return df

## Float Conversion

All counts are integer values, but are still represented as floats, due to NaN values.

In [5]:
def convert_float (df):
    """Turn the float-typed count columns of `df` into int64.

    For 'icd_code_count', 'emar_count' and 'lab_count', missing values are
    replaced by 0 and the column is cast to int64. The columns are
    overwritten on the passed frame, which is then returned.
    """
    for count_column in ('icd_code_count', 'emar_count', 'lab_count'):
        # NaN cannot be stored in an int64 column, so fill before casting
        df[count_column] = df[count_column].fillna(0).astype('int64')

    return df

## eMAR Medication
### One-hot encoding with specific information selection:

Most common causes for kidney disease are:
1. diabetes
2. high blood pressure
3. high cholesterol

Related medicines are:
1. Insulin
2. ACE inhibitors such as Enalapril, Captopril, Lisinopril and Ramipril
3. Calcium channel blockers such as amLODIPine

In [6]:
def _count_unique_medicines (medications):
    """Count the distinct names in a stringified list such as "['A', 'B', 'A']"."""
    names = set()
    for token in medications.replace('[', '').replace(']', '').split(','):
        # strip the padding and quotes left over from the list representation,
        # so that "'A'" and " 'A'" are recognised as the same medicine
        cleaned = token.strip().strip('\'"').strip()
        if cleaned:
            names.add(cleaned)
    return len(names)


def convert_emar_medicine (df):
    """Derive medicine features from the 'emar_medications' column.

    Adds the following columns and removes 'emar_medications':

    * 'emar_medicine_count' (int64): number of distinct medicine names in
      the row; 0 when the value is missing or the list is empty.
    * 'emar_contains_insulin' (bool): the text contains 'Insulin'.
    * 'emar_contains_ace_inhabitors' (bool): the text contains 'Enalapril',
      'Captopril', 'Lisinopril' or 'Ramipril'.
    * 'emar_contains_calcium_blockers' (bool): the text contains 'amLODIPine'.

    Matching is case-sensitive substring matching on the raw text, so several
    flags can be True for the same row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an 'emar_medications' column of stringified lists or NaN.
        Any index is accepted.

    Returns
    -------
    pandas.DataFrame
        Frame with the new columns and without 'emar_medications'.
    """
    raw = df['emar_medications']
    # missing values become empty text, for which every check below is False / 0
    text = raw.astype(object).where(raw.notna(), '').astype(str)

    # unique medicine count
    df['emar_medicine_count'] = text.map(_count_unique_medicines).astype('int64')

    # multiple types of medicine may occur (e.g. insulin and ACE inhibitors)
    df['emar_contains_insulin'] = text.str.contains('Insulin', regex=False).astype(bool)

    ace_inhibitors = ('Enalapril', 'Captopril', 'Lisinopril', 'Ramipril')
    df['emar_contains_ace_inhabitors'] = text.str.contains('|'.join(ace_inhibitors), regex=True).astype(bool)

    df['emar_contains_calcium_blockers'] = text.str.contains('amLODIPine', regex=False).astype(bool)

    # drop old column
    df = df.drop(columns=['emar_medications'])

    return df

## eMAR Events
### One-hot encoding with specific information selection:

Most prescribed drugs are 'administered', but sometimes they are not. These are the interesting cases, as doctors might, for example, stop a certain prescription due to adverse events.

In [7]:
def convert_emar_events (df):
    """Flag noteworthy administration events found in the 'emar_events' column.

    Adds four boolean columns and removes 'emar_events':

    * 'emar_contains_not_given'   - the text contains 'Not Given'
    * 'emar_contains_not_flushed' - the text contains 'Not Flushed'
    * 'emar_contains_stopped'     - the text contains 'Stopped'
    * 'emar_contains_not_started' - the text contains 'Not Started'

    Matching is case-sensitive substring matching. Several flags can be True
    for the same row, and rows with a missing value get False everywhere.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an 'emar_events' column of text or NaN. Any index is
        accepted.

    Returns
    -------
    pandas.DataFrame
        Frame with the new columns and without 'emar_events'.
    """
    raw = df['emar_events']
    # missing values become empty text, which matches none of the events
    text = raw.astype(object).where(raw.notna(), '').astype(str)

    # new column -> event text to look for (insertion order = column order)
    event_by_column = {
        'emar_contains_not_given': 'Not Given',
        'emar_contains_not_flushed': 'Not Flushed',
        'emar_contains_stopped': 'Stopped',
        'emar_contains_not_started': 'Not Started',
    }
    for column_name, event in event_by_column.items():
        df[column_name] = text.str.contains(event, regex=False).astype(bool)

    # drop old column
    df = df.drop(columns=['emar_events'])

    return df

## Lab Flag
### One-hot encoding with specific information selection:

Either 'abnormal' or NaN. We want to count the occurrences of abnormal flags.

In [8]:
def convert_lab_flag (df):
    """Replace 'lab_flag' with a count of abnormal flags.

    Adds the int64 column 'abnormal_lab_flags', holding the number of times
    the text 'abnormal' occurs in the row's 'lab_flag' value (0 when the
    value is missing), and removes 'lab_flag'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'lab_flag' column of text or NaN. Any index is accepted.

    Returns
    -------
    pandas.DataFrame
        Frame with 'abnormal_lab_flags' and without 'lab_flag'.
    """
    raw = df['lab_flag']
    # missing values become empty text, which contains zero occurrences
    text = raw.astype(object).where(raw.notna(), '').astype(str)

    # count column (not a boolean): occurrences of 'abnormal' per row
    df['abnormal_lab_flags'] = text.str.count('abnormal').astype('int64')

    # drop old column
    df = df.drop(columns=['lab_flag'])

    return df

## Lab Comments
### One-hot encoding with specific information selection:

Either a detailed comment (string) or ',' or nan

In [9]:
def convert_lab_comments (df):
    """Replace 'lab_comments' with a boolean 'has_lab_comment' column.

    A row is flagged True when its 'lab_comments' value still contains text
    after removing double quotes, single quotes, commas and spaces. Missing
    values and values made up only of those filler characters (e.g. ',')
    are flagged False. The 'lab_comments' column is removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'lab_comments' column of text or NaN. Any index is
        accepted.

    Returns
    -------
    pandas.DataFrame
        Frame with 'has_lab_comment' and without 'lab_comments'.
    """
    raw = df['lab_comments']
    # missing values become empty text and therefore count as "no comment"
    remaining = raw.astype(object).where(raw.notna(), '').astype(str)

    # filter out empty comments: drop the characters that only separate or
    # quote individual comments
    for filler in ('"', '\'', ',', ' '):
        remaining = remaining.str.replace(filler, '', regex=False)

    # The filtered value is a string, so it must be compared with the empty
    # string. Comparing with an empty list is always unequal and would flag
    # every non-missing row.
    df['has_lab_comment'] = (remaining != '').astype(bool)

    # drop old column
    df = df.drop(columns=['lab_comments'])

    return df

## Lab Priority
### One-hot encoding with specific information selection:

Each priority is either STAT or ROUTINE.

In [10]:
def convert_lab_priority (df):
    """Replace 'lab_priority' with two boolean indicator columns.

    Adds 'lab_priority_stat' (the text contains 'STAT') and
    'lab_priority_routine' (the text contains 'ROUTINE'), then removes
    'lab_priority'. Both flags can be True for the same row. Rows with a
    missing value get False in both columns. Matching is case-sensitive.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'lab_priority' column of text or NaN. Any index is
        accepted.

    Returns
    -------
    pandas.DataFrame
        Frame with the new columns and without 'lab_priority'.
    """
    raw = df['lab_priority']
    # missing values become empty text, which matches neither priority
    text = raw.astype(object).where(raw.notna(), '').astype(str)

    df['lab_priority_stat'] = text.str.contains('STAT', regex=False).astype(bool)
    df['lab_priority_routine'] = text.str.contains('ROUTINE', regex=False).astype(bool)

    # drop old column
    df = df.drop(columns=['lab_priority'])

    return df


## Run All

In [11]:
def convert_all (df_name):
    """Run the whole preprocessing III pipeline for one dataset.

    Reads 'data/preprocessing_II/<df_name>.csv', drops the 'Unnamed: 0' and
    'ethnicity' columns, applies every conversion step in order and writes
    the result to 'data/preprocessing_III/<df_name>.csv'. Progress is
    printed; nothing is returned.
    """
    source_path = 'data/preprocessing_II/' + df_name + '.csv'
    target_path = "data/preprocessing_III/" + df_name + ".csv"

    # import
    print('Importing ', df_name)
    frame = pd.read_csv(source_path)

    # drop columns
    print('Converting ', df_name)
    frame = frame.drop(columns=['Unnamed: 0', 'ethnicity'])

    # general format conversions first, then the specific (one-hot) encodings
    pipeline = (
        convert_categorical,
        convert_datetime,
        convert_float,
        convert_emar_medicine,
        convert_emar_events,
        convert_lab_flag,
        convert_lab_comments,
        convert_lab_priority,
    )
    for step in pipeline:
        frame = step(frame)

    # save to .csv
    print('Saving .csv for ', df_name)
    frame.to_csv(target_path)

In [12]:
# convert every ethnic-group dataset, one file at a time
for group_name in ethnic_group_names:
    convert_all(group_name)

Importing  unknown
Converting  unknown
Saving .csv for  unknown
Importing  white
Converting  white
Saving .csv for  white
Importing  other
Converting  other
Saving .csv for  other
Importing  asian
Converting  asian
Saving .csv for  asian
Importing  hispanic_latino
Converting  hispanic_latino
Saving .csv for  hispanic_latino
Importing  black_african_american
Converting  black_african_american
Saving .csv for  black_african_american
Importing  unable_to_obtain
Converting  unable_to_obtain
Saving .csv for  unable_to_obtain
Importing  american_indian_alaska_native
Converting  american_indian_alaska_native
Saving .csv for  american_indian_alaska_native
