# Preprocessing III

This notebook extends the data output from preprocessing II, making it ready to train models on.

In [18]:
import glob
import os

import numpy as np
import pandas as pd

path = './data/preprocessing_II'
# glob returns matches in a file-system dependent order; sort them so the list
# position of every ethnicity file is the same on each run and machine.
files = sorted(glob.glob(path + "/*.csv"))

## Import data

Data is read from preprocessing II output files. For a quick analysis of the data distribution, the fraction of patients suffering from kidney issues is printed.

In [19]:
def readFiles(csv_files):
    """Load every entry of ``csv_files`` with ``pd.read_csv``.

    Returns a list of DataFrames in the same order as ``csv_files``.
    """
    frames = []
    for csv_file in csv_files:
        frames.append(pd.read_csv(csv_file))
    return frames

# One DataFrame per entry of `files`, in the same order.
ethn_list = readFiles(files)

Print ethnicity order and percentage of patients suffering from kidney disease

In [20]:
def printEthnicities(csv_files, df_list):
    """Print one line per file: list index, kidney-issue percentage and file stem.

    Parameters
    ----------
    csv_files : sequence of str
        Paths of the CSV files; only the file name without extension is printed.
    df_list : sequence of pandas.DataFrame
        Frames matching ``csv_files`` position by position. Each frame needs a
        ``has_kidney_issue`` column.

    The percentage is truncated (not rounded) to an integer. An empty frame is
    reported as 0% instead of raising ``ZeroDivisionError``.
    """
    for counter, csv_file in enumerate(csv_files):
        frame = df_list[counter]
        # os.path understands the platform's separator. Splitting on a literal
        # backslash fails for the forward-slash paths glob returns outside Windows.
        name = os.path.splitext(os.path.basename(csv_file))[0]
        if len(frame):
            percentage = int(sum(frame['has_kidney_issue']) / len(frame) * 100)
        else:
            percentage = 0
        print(str(counter) + "\t" + str(percentage) + "% \t" + name)

# Printed columns: list index, truncated % of rows with has_kidney_issue, file name.
printEthnicities(files, ethn_list)

0	22% 	american_indian_alaska_native
1	11% 	asian
2	22% 	black_african_american
3	15% 	hispanic_latino
4	14% 	other
5	7% 	unable_to_obtain
6	14% 	unknown
7	18% 	white


In [21]:
def getTestDf(df_list):
    """Return the first DataFrame in ``df_list``."""
    return df_list[0]

# NOTE(review): `df` is not read again in the visible cells; later loops rebind the name.
df = getTestDf(ethn_list)

## Processing types of columns and categorical variables

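The columns `emar_events`, `lab_priority` and `lab_comments` are dropped because they do not contain interpretable data with predictive power. The columns `Unnamed: 0`, `hadm_id`, `subject_id`, `ethnicity` and `lab_flag`, and the timestamps `edregtime`, `emar_charttime` and `lab_charttime`, are dropped as well.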

In [22]:
# Columns cast to a datetime dtype.
datetime = [
    'admittime',
    'edregtime',
    'emar_charttime',
    'lab_charttime',
]

# Columns cast to the nullable integer dtype.
ints = [
    'icd_code_count',
    'emar_count',
    'lab_count',
]

# Categorical columns passed to oneHot.
one_hots = [
    'admission_type',
    'admission_location',
    'insurance',
    'marital_status',
    'gender',
]

# Columns removed from every frame.
drop_cols = [
    'ethnicity',
    'emar_events',
    'lab_priority',
    'lab_comments',
    'lab_flag',
    'Unnamed: 0',
    'hadm_id',
    'subject_id',
    'edregtime',
    'emar_charttime',
    'lab_charttime',
]

def convert(type, dfs, cols):
    """Cast each column in ``cols`` of each frame in ``dfs`` to dtype ``type``.

    The frames are modified in place and the same ``dfs`` object is returned.
    """
    for frame, column in ((f, c) for f in dfs for c in cols):
        frame[column] = frame[column].astype(type)
    return dfs

def findAllUniques(dfs, col):
    """Return the set of distinct values of column ``col`` over all frames in ``dfs``."""
    seen = set()
    for frame in dfs:
        seen.update(frame[col].unique())
    return seen

def oneHot(dfs, cols):
    """One-hot encode the categorical columns ``cols`` in every frame of ``dfs``.

    The categories of a column are collected over all frames, so every frame
    receives the same new columns.

    * More than two distinct values: one boolean column per value (named after
      the value) is added and the source column is dropped.
    * Otherwise: the column is overwritten by a boolean that is True where it
      equals the reference value, i.e. the first value in sorted order.

    ``dfs`` is updated in place and returned.
    """
    for col in cols:
        vals = findAllUniques(dfs, col)
        # Set iteration order for strings differs between interpreter runs; sort
        # (by text, so mixed str/NaN values compare) to make the new column order
        # and the binary reference value reproducible.
        ordered = sorted(vals, key=str)
        for i in range(len(dfs)):
            if len(ordered) > 2:
                for val in ordered:
                    dfs[i][val] = np.where(dfs[i][col] == val, True, False)

                dfs[i] = dfs[i].drop(col, axis=1)
            else:
                # Compare against a value of *this* column. Reusing the loop
                # variable of the branch above would be undefined or stale here.
                reference = ordered[0] if ordered else None
                dfs[i][col] = np.where(dfs[i][col] == reference, True, False)
    return dfs

def dropCols(dfs, cols):
    """Remove every column named in ``cols`` from every frame in ``dfs``.

    The entries of ``dfs`` are replaced by the reduced frames; ``dfs`` itself
    is returned.
    """
    for column in cols:
        for position, frame in enumerate(dfs):
            dfs[position] = frame.drop(columns=column)
    return dfs

# pandas >= 2.0 rejects the unit-less 'datetime64' dtype in astype; naming the
# unit gives the same nanosecond dtype on older versions as well.
ethn_list = convert('datetime64[ns]', ethn_list, datetime)
ethn_list = convert('Int64', ethn_list, ints)
ethn_list = oneHot(ethn_list, one_hots)
ethn_list = dropCols(ethn_list, drop_cols)

## Process list variables

### emar_medication
This column is processed by finding the five most common medications among kidney disease patients.

These five are then one-hot encoded.

In [23]:
def getLiterals(dfs, cols):
    """Turn the stringified lists in ``cols`` into Python lists of strings.

    Missing values are treated as "[]". Square brackets and single quotes are
    removed from each string, which is then split on ", ". An empty list
    therefore becomes ``['']`` rather than ``[]``.

    The columns are overwritten in place; ``dfs`` is returned.
    """
    strip_table = str.maketrans("", "", "[]'")

    def to_list(text):
        return text.translate(strip_table).split(", ")

    for col in cols:
        for frame in dfs:
            frame[col] = frame[col].fillna("[]").apply(to_list)
    return dfs

def getUniquesInList(dfs, col):
    """Return the set of all list elements found in column ``col``.

    Only rows where ``has_kidney_issue`` is True are considered, over every
    frame in ``dfs``.
    """
    found = set()
    for frame in dfs:
        flagged = frame.loc[frame['has_kidney_issue'] == True, col]
        for entries in flagged:
            found.update(entries)
    return found

def countMeds(dfs):
    """Count, per medication, the kidney-issue rows whose list contains it.

    Returns a list of counts in the iteration order of the set returned by
    ``getUniquesInList(dfs, 'emar_medications')``. A row counts at most once per
    medication, even if the medication is listed several times.
    """
    uniques = getUniquesInList(dfs, 'emar_medications')
    counts = dict.fromkeys(uniques, 0)
    # Single pass over the data: filter each frame once and tally every row's
    # distinct medications, instead of re-filtering and re-scanning per medication.
    for frame in dfs:
        kid_iss = frame[frame['has_kidney_issue'] == True]
        for item in kid_iss['emar_medications']:
            for med in set(item):
                counts[med] += 1
    return [counts[med] for med in uniques]

In [24]:
# Parse the stringified medication lists into Python lists of strings.
ethn_list = getLiterals(ethn_list, ['emar_medications'])

In [25]:
# Distinct medications given to kidney-issue patients. The list order comes from
# set iteration and is not guaranteed to repeat across kernel restarts.
kidney_medications = getUniquesInList(ethn_list, 'emar_medications')
kidney_medications = list(kidney_medications)

# NOTE(review): the cached counts are matched to `kidney_medications` by position
# only. Presumably the file was written from countMeds(ethn_list); confirm it was
# produced with the same ordering, or regenerate it in this session.
med_count = np.genfromtxt('med_frequency.csv', delimiter=",")

# Fail loudly on a stale cache instead of selecting the wrong medications later.
if np.size(med_count) != len(kidney_medications):
    raise ValueError(
        "med_frequency.csv holds " + str(np.size(med_count)) + " counts but "
        + str(len(kidney_medications)) + " medications were found; regenerate the file."
    )

In [26]:
# N most frequent meds
N = 5

# Indices of med_count in ascending order of count.
inds = np.argsort(med_count)
# NOTE(review): with N = 5 this picks the entries ranked 1st, 3rd, 5th, 6th and 7th
# by count; the 2nd and 4th are skipped. The reason is not visible here, so confirm
# it is intentional.
most_frequent = np.append(np.append(inds[-1], inds[-3]), inds[-(N+2):-4])

# Translate the selected indices into medication names.
freq_med_names = [kidney_medications[item] for item in most_frequent]


In [27]:
def oneHotMeds(dfs, meds):
    """Add one boolean column per medication in ``meds`` to every frame.

    A cell is True when the medication occurs in that row's
    ``emar_medications`` list. The ``emar_medications`` column is dropped
    afterwards. ``dfs`` is updated and returned.
    """
    for med in meds:
        for frame in dfs:
            # Build the whole column in one pass rather than writing cell by cell
            # with .at; this is also independent of the frame's index labels.
            frame[med] = np.array(
                [med in row_meds for row_meds in frame['emar_medications']],
                dtype=bool,
            )
    dfs = dropCols(dfs, ['emar_medications'])
    return dfs

# Adds one boolean column per selected medication and drops 'emar_medications'.
ethn_list = oneHotMeds(ethn_list, freq_med_names)

In [28]:
for df in ethn_list:
    # Replace the admission timestamp by its numeric representation.
    df['admittime'] = pd.to_datetime(df['admittime']).pipe(pd.to_numeric)
    # Missing counts are filled with zero.
    for count_col in ('icd_code_count', 'emar_count', 'lab_count'):
        df[count_col] = df[count_col].fillna(0)

### Abnormal lab flags
The 'lab_flag' column is processed by counting the number of abnormal states.

EDIT: the `lab_flag` column is dropped instead, as its information is already encoded in `lab_count`. The function below is kept for reference and is not called.

In [29]:
def countAbnormal(dfs):
    """Print, for each row of each frame, how often 'abnormal' occurs in ``lab_flag``.

    Nothing is stored: ``dfs`` is returned unchanged.
    """
    for frame in dfs:
        for record in frame.itertuples():
            print(record.lab_flag.count('abnormal'))
    return dfs

## Save to .csv

In [32]:
# Name every output after the input file it was read from rather than from a
# hand-written list, so a frame cannot be saved under another group's name when
# the order of `files` differs from the assumed one.
df_name = [os.path.splitext(os.path.basename(file))[0] for file in files]

# to_csv does not create missing directories.
out_dir = "data/preprocessing_III/"
os.makedirs(out_dir, exist_ok=True)

for df in range(len(ethn_list)):
    print("Saving ", df_name[df], "...")
    ethn_list[df].to_csv(out_dir + df_name[df] + ".csv")


Saving  american_indian_alaska_native ...
Saving  asian ...
Saving  black_african_american ...
Saving  hispanic_latino ...
Saving  other ...
Saving  unable_to_obtain ...
Saving  unknown ...
Saving  white ...
