In [352]:
import pandas as pd
import numpy as np
import os
import timeit
from tqdm.notebook import trange, tqdm
import re
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

In [353]:
# When True, the path cell below points at the `data_demo/` tree instead of `data/`.
DEMO = True
# When True, the contents of `failed` are written to Step5_failed.txt near the end of the notebook.
WRITE_ERRORS_TO_FILE = False

In [354]:

# Per-ICU-stay folders: the demo tree and the full tree share the same layout,
# so each path is picked with a single conditional on DEMO.
CHARTEVENTS_BY_ICUSTAY_ID = 'data_demo/parquet/' if DEMO else 'data/parquet/'
REDUCED_CE_BY_ICUSTAY_ID = 'data_demo/samples/' if DEMO else 'data/samples/'
NUMERICAL_BY_ICUSTAY_ID = 'data_demo/samples_numerical/' if DEMO else 'data/samples_numerical/'

# Locations and suffix that do not depend on DEMO.
CHARTEVENTS_FILENAME = 'mimic-iii/CHARTEVENTS.csv'
READMISSION_FILENAME = 'data/readmission.csv'
PARQUET_EXT = '.parquet'


In [355]:
# Unit-conversion factors and percent bounds used by the convert_* helpers.
# 35.274 is the number of ounces in one kilogram, i.e. kg = oz / 35.274.
OZ_TO_KG = 35.274
# 2.205 is the number of pounds in one kilogram, i.e. kg = lb / 2.205.
LB_TO_KG = 2.205
# Centimetres per inch, i.e. cm = in * 2.54.
IN_TO_CM = 2.54
# Threshold convert_percent uses to recognise a reading stored as a fraction of 1.
MAX_FRACTION = 1.0
# Multiplier convert_percent applies to turn such a fraction into a percentage.
MAX_PERCENT = 100


In [356]:
# chartevents_dir_list = os.listdir(REDUCED_CE_BY_ICUSTAY_ID)
# len(chartevents_dir_list)

In [357]:
# test = pd.read_parquet(REDUCED_CE_BY_ICUSTAY_ID + chartevents_dir_list[5])
# test = test.reset_index(drop=True)


![Glasgow Coma Scale](assets/images/GCS.jpg)

https://www.firstaidforfree.com/glasgow-coma-scale-gcs-first-aiders/

|	Chart Event	|	Dim	|	Normal	| Initial dtype | Change to dtype |
| --- | :--- | :--- | :--- | :--- |
|	1. Glasgow coma scale eye opening	|	4	|	4 Spontaneously	| str | int |
|	2. Glasgow coma scale verbal response	|	5	|	5 Oriented	| str | int |
|	3. Glasgow coma scale motor response	|	6	|	6 Obeys Commands	| str | int |
|	4. Glasgow coma scale total	|	13	|	15	| none/int | int |
|	5. Capillary refill rate	|	2	|	Normal < 3 secs	| str | int |
|	6. Diastolic blood pressure	|	1	|	70	| int | int |
|	7. Systolic blood pressure	|	1	|	105	| int | int |
|	8. Mean blood pressure	|	1	|	87.5	| int/float | int |
|	9. Heart Rate	|	1	|	80	| int | int |
|	10. Glucose	|	1	|	85	| int | int |
|	11. Fraction inspired oxygen	|	1	|	0.21	| int/float | float? |
|	12. Oxygen saturation	|	1	|	97.5	| int | int |
|	13. Respiratory rate	|	1	|	15	| int | int |
|	14. Body Temperature	|	1	|	37	| float | float |
|	15. pH	|	1	|	7.4	| float | float |
|	16. Weight	|	1	|	80.7	| float | float |
|	17. Height	|	1	|	168.8	| float | float |


Because most columns contain many `NaN` values, which integer dtypes cannot represent, every value column is ultimately cast to `float64`.

In [358]:
# Lookup tables that turn the free-text chart labels into numeric codes (kept as
# strings here; the later cast to float64 makes them numbers). Both label
# spellings present in the data are listed for each score.
# `np.nan` is used instead of the `np.NaN` alias, which NumPy 2.0 removed.
gcs_eye_map = {
    '__missing__': np.nan,
    '4 Spontaneously': '4',
    '1 No Response': '1',
    '2 To pain': '2',
    '3 To speech': '3',
    'To Speech': '3',
    'Spontaneously': '4',
    'To Pain': '2',
}

gcs_motor_map = {
    '__missing__': np.nan,
    '6 Obeys Commands': '6',
    '5 Localizes Pain': '5',
    '1 No Response': '1',
    '4 Flex-withdraws': '4',
    '2 Abnorm extensn': '2',
    '3 Abnorm flexion': '3',
    'Localizes Pain': '5',
    'Obeys Commands': '6',
    'Flex-withdraws': '4',
    'No response': '1',
    'Abnormal Flexion': '3',
    'Abnormal extension': '2',
}

gcs_verbal_map = {
    '__missing__': np.nan,
    '5 Oriented': '5',
    '1.0 ET/Trach': '1',
    '4 Confused': '4',
    '2 Incomp sounds': '2',
    '1 No Response': '1',
    '3 Inapprop words': '3',
    'No Response-ETT': '1',
    'Oriented': '5',
    'No Response': '1',
    'Confused': '4',
    'Incomprehensible sounds': '2',
    'Inappropriate Words': '3',
}

# Capillary refill is reduced to a binary flag: '1' for the normal/brisk labels,
# '0' for every other listed label.
capillary_map = {
    '__missing__': np.nan,
    'Brisk': '1',
    'Delayed': '0',
    'Comment': '0',
    'Normal <3 secs': '1',
    'Abnormal >3 secs': '0',
    'Other/Remarks': '0',
    'Normal <3 Seconds': '1',
    'Abnormal >3 Seconds': '0',
}


In [359]:
# Reference table: one dict per chart event with the ITEMIDs that feed it and a
# unit label per ITEMID. The key is spelled 'CHAREVENT' throughout.
# NOTE(review): the GLUCOSE and RESP_RATE entries have fewer 'UNIT' labels than
# ITEMIDs, and several BODY_TEMP / HEIGHT units are blank or '?' — confirm before
# relying on positional ITEMID/UNIT pairing.
# NOTE(review): no other cell visible in this notebook reads this table.
event_to_id =[
{'CHAREVENT':'GCS_EYE', 'DESCRIPTION':'','ITEMID':[184, 220739],'UNIT':['NONE','NONE']},
{'CHAREVENT':'GCS_MOTOR', 'DESCRIPTION':'','ITEMID':[454, 223901],'UNIT':['NONE','NONE']},
{'CHAREVENT':'GCS_VERBAL', 'DESCRIPTION':'','ITEMID':[723, 223900],'UNIT':['NONE','NONE']},
{'CHAREVENT':'GCS_TOTAL', 'DESCRIPTION':'Sum of the 3 GCS events','ITEMID':[198],'UNIT':['NONE']},
{'CHAREVENT':'CAPILLARY_REFILL', 'DESCRIPTION':'','ITEMID':[3348, 115, 8377, 223951, 224308],'UNIT':['BINARY','BINARY','BINARY','BINARY','BINARY']},
{'CHAREVENT':'D_BLOOD_PRESSURE', 'DESCRIPTION':'','ITEMID':[8368, 220051, 225310, 8555, 8441, 220180, 8502, 8440, 8503, 8504, 8507, 8506, 224643, 227242],'UNIT':['mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg']},
{'CHAREVENT':'M_BLOOD_PRESSURE', 'DESCRIPTION':'','ITEMID':[52, 220052, 225312, 224, 6702, 224322, 456, 220181, 3312, 3314, 3316, 3322, 3320],'UNIT':['mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg']},
{'CHAREVENT':'S_BLOOD_PRESSURE', 'DESCRIPTION':'','ITEMID':[51, 220050, 225309, 6701, 455, 220179, 3313, 3315, 442, 3317, 3323, 3321, 224167, 227243],'UNIT':['mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg']},
{'CHAREVENT':'HEART_RATE', 'DESCRIPTION':'','ITEMID':[211, 220045],'UNIT':['bpm','bpm']},
{'CHAREVENT':'GLUCOSE', 'DESCRIPTION':'','ITEMID':[807, 811, 1529, 3745, 225664, 220621, 226537, 3447, 3816, 3744, 227015, 227016, 1310, 1455, 2338, 1812, 228388],'UNIT':['','']},
{'CHAREVENT':'FRAC_OXYGEN', 'DESCRIPTION':'','ITEMID':[3420, 223835, 3422, 189, 727],'UNIT':['%','%','%','%','%']},
{'CHAREVENT':'O2_SAT', 'DESCRIPTION':'','ITEMID':[834, 8498, 220227, 646, 220277],'UNIT':['%','%','%','%','%']},
{'CHAREVENT':'RESP_RATE', 'DESCRIPTION':'','ITEMID':[618, 220210, 3603, 224689, 614, 651, 224422, 615, 224690, 619, 224688],'UNIT':['insp/min','']},
{'CHAREVENT':'BODY_TEMP', 'DESCRIPTION':'','ITEMID':[3655, 677, 676, 223762, 3654, 678, 223761, 679, 8537, 645, 591, 226329, 597, 227054, 228242],'UNIT':['C','C','C','C','F','F','F','F','C','','','','C','','F']},
{'CHAREVENT':'PH', 'DESCRIPTION':'','ITEMID':[3839, 1673, 780, 1126, 223830, 4753, 4202, 860, 220274, 8387, 1880, 3777, 227586, 8385, 1352, 4755, 7966],'UNIT':['NONE','NONE','NONE','NONE','NONE','NONE','NONE','NONE','NONE','NONE','NONE','NONE','NONE','NONE','NONE','NONE','NONE']},
{'CHAREVENT':'WEIGHT', 'DESCRIPTION':'','ITEMID':[763, 224639, 226512, 3580, 3693, 3581, 226531, 3582],'UNIT':['kg','kg','kg','kg','kg','lb','lb','oz']},
{'CHAREVENT':'HEIGHT', 'DESCRIPTION':'','ITEMID':[226707, 226730, 1394],'UNIT':['?','cm','in']},
]

In [360]:
# dtypes applied right after a sample is read: every value column is kept as a
# raw object, every `<event>_ID` column is float64 and every `<event>_IND` flag
# is int8. The events are listed in the order the mapping enumerates them.
_INIT_EVENT_COLUMNS = (
    'GCS_EYE', 'GCS_MOTOR', 'GCS_VERBAL', 'GCS_TOTAL', 'CAPILLARY_REFILL',
    'D_BLOOD_PRESSURE', 'M_BLOOD_PRESSURE', 'S_BLOOD_PRESSURE', 'HEART_RATE',
    'GLUCOSE', 'FRAC_OXYGEN', 'O2_SAT', 'RESP_RATE', 'BODY_TEMP', 'PH',
    'WEIGHT', 'HEIGHT',
)

sample_dtype_init = {
    column: dtype
    for event in _INIT_EVENT_COLUMNS
    for column, dtype in (
        (event, object),
        # NOTE(review): FRAC_OXYGEN_ID is the only ID column typed as object
        # rather than float64 — confirm this exception is intentional.
        (event + '_ID', object if event == 'FRAC_OXYGEN' else np.float64),
    )
}
sample_dtype_init.update({event + '_IND': np.int8 for event in _INIT_EVENT_COLUMNS})

In [361]:
# dtypes applied once the text columns have been made numeric: every value and
# `<event>_ID` column becomes float64, every `<event>_IND` flag stays int8.
_FINAL_EVENT_COLUMNS = (
    'GCS_EYE', 'GCS_MOTOR', 'GCS_VERBAL', 'GCS_TOTAL', 'CAPILLARY_REFILL',
    'D_BLOOD_PRESSURE', 'M_BLOOD_PRESSURE', 'S_BLOOD_PRESSURE', 'HEART_RATE',
    'GLUCOSE', 'FRAC_OXYGEN', 'O2_SAT', 'RESP_RATE', 'BODY_TEMP', 'PH',
    'WEIGHT', 'HEIGHT',
)

sample_dtype_final = {
    column: np.float64
    for event in _FINAL_EVENT_COLUMNS
    for column in (event, event + '_ID')
}
sample_dtype_final.update({event + '_IND': np.int8 for event in _FINAL_EVENT_COLUMNS})


In [362]:
# Names of the `<event>_ID` columns, which are dropped before a sample is saved.
id_column_list = [
    event + '_ID'
    for event in (
        'GCS_EYE', 'GCS_MOTOR', 'GCS_VERBAL', 'GCS_TOTAL', 'CAPILLARY_REFILL',
        'D_BLOOD_PRESSURE', 'M_BLOOD_PRESSURE', 'S_BLOOD_PRESSURE', 'HEART_RATE',
        'GLUCOSE', 'FRAC_OXYGEN', 'O2_SAT', 'RESP_RATE', 'BODY_TEMP', 'PH',
        'WEIGHT', 'HEIGHT',
    )
]

In [363]:
# os.listdir returns entries in arbitrary order; sorting makes the processing
# order (and therefore the `sample_df` left over after the loop) reproducible.
chartevents_dir_list = sorted(os.listdir(REDUCED_CE_BY_ICUSTAY_ID))
len(chartevents_dir_list)

6

In [364]:
# test = pd.read_parquet(REDUCED_CE_BY_ICUSTAY_ID + chartevents_dir_list[0])
# test.head(1)

In [365]:
# BODY_TEMP ITEMIDs that convert_body_temp converts with (x - 32) * 5 / 9.
body_temp_f = [3654, 678, 223761, 679, 228242, 227054]
# BODY_TEMP ITEMIDs that are not converted; their readings are only collected.
body_temp_unknown = [645, 591]

# WEIGHT ITEMIDs handled by the pound and ounce branches of convert_weight.
weight_lb = [3581, 226531]
weight_oz = [3582]

# HEIGHT ITEMIDs that convert_height scales by IN_TO_CM.
height_in = [226707, 1394]

# One independent list per unknown-unit ITEMID, filled with [stay, value] pairs.
body_temp_unknown_unit = {item_id: [] for item_id in body_temp_unknown}
weight_unknown_unit = {}


In [366]:
def convert_percent(val):
    """Express a reading as a percentage.

    Values at or below MAX_FRACTION are taken to be fractions of 1 and are
    multiplied by MAX_PERCENT. Every other value is returned unchanged;
    previously those fell through and came back as None, which erased readings
    that were already percentages.

    Parameters
    ----------
    val : float or None
        The reading; None and NaN are passed through untouched.

    Returns
    -------
    float or None
        The reading on a percent scale, or the input itself when it is
        missing or already above MAX_FRACTION.
    """
    if val is None:
        return val
    # NaN compares False here, so it is returned as-is by the final return.
    if val <= MAX_FRACTION:
        return val * MAX_PERCENT
    return val

In [367]:
def convert_body_temp(row, stay):
    """Normalise the BODY_TEMP value of one sample row.

    Rows whose BODY_TEMP_ID is in `body_temp_f` are converted with
    (x - 32) * 5 / 9. Rows whose ID is in `body_temp_unknown` are left as they
    are, and [stay, value] is appended to `body_temp_unknown_unit[ID]`.
    Rows without a usable BODY_TEMP_ID are returned untouched.

    Parameters
    ----------
    row : mapping (e.g. a pandas Series)
        One sample row with 'BODY_TEMP' and 'BODY_TEMP_ID' entries.
    stay : object
        Identifier stored alongside unknown-unit readings.

    Returns
    -------
    The same `row` object, possibly with 'BODY_TEMP' updated.
    """
    raw_id = row.get('BODY_TEMP_ID')
    # A missing ID is the expected "nothing to convert" case, so it is handled
    # explicitly instead of relying on a blanket try/except around int(NaN).
    if raw_id is None or pd.isna(raw_id):
        return row
    try:
        item_id = int(raw_id)
    except (TypeError, ValueError):
        return row

    if item_id in body_temp_f:
        row['BODY_TEMP'] = ((row['BODY_TEMP'] - 32) * 5) / 9
    elif item_id in body_temp_unknown:
        body_temp_unknown_unit[item_id].append([stay, row['BODY_TEMP']])
    return row

In [368]:
def convert_weight(row):
    """Convert the WEIGHT value of one sample row to kilograms.

    OZ_TO_KG and LB_TO_KG hold the number of ounces / pounds in one kilogram,
    so the reading is divided by them. The previous multiplication inflated
    pound values about 2.2x and ounce values about 35x instead of shrinking
    them. Rows without a usable WEIGHT_ID, or whose ID is in neither
    `weight_oz` nor `weight_lb`, are returned untouched.

    Parameters
    ----------
    row : mapping (e.g. a pandas Series)
        One sample row with 'WEIGHT' and 'WEIGHT_ID' entries.

    Returns
    -------
    The same `row` object, possibly with 'WEIGHT' updated.
    """
    raw_id = row.get('WEIGHT_ID')
    # A missing ID simply means there is nothing to convert.
    if raw_id is None or pd.isna(raw_id):
        return row
    try:
        item_id = int(raw_id)
    except (TypeError, ValueError):
        return row

    if item_id in weight_oz:
        row['WEIGHT'] = row['WEIGHT'] / OZ_TO_KG
    elif item_id in weight_lb:
        row['WEIGHT'] = row['WEIGHT'] / LB_TO_KG
    return row

In [369]:
def convert_height(row):
    """Convert the HEIGHT value of one sample row to centimetres.

    Rows whose HEIGHT_ID is in `height_in` are multiplied by IN_TO_CM.
    Rows without a usable HEIGHT_ID, or with any other ID, are returned
    untouched.

    Parameters
    ----------
    row : mapping (e.g. a pandas Series)
        One sample row with 'HEIGHT' and 'HEIGHT_ID' entries.

    Returns
    -------
    The same `row` object, possibly with 'HEIGHT' updated.
    """
    raw_id = row.get('HEIGHT_ID')
    # A missing ID is handled explicitly rather than by swallowing the
    # ValueError that int(NaN) would raise.
    if raw_id is None or pd.isna(raw_id):
        return row
    try:
        item_id = int(raw_id)
    except (TypeError, ValueError):
        return row

    if item_id in height_in:
        row['HEIGHT'] = row['HEIGHT'] * IN_TO_CM
    return row

In [370]:
def convert(row, stay):
    """Run the temperature, weight and height converters on one row, in that order.

    `stay` is forwarded to convert_body_temp only; the converted row is returned.
    """
    return convert_height(convert_weight(convert_body_temp(row, stay)))

### Goals

1. Transform columns to numerical values
2. Transform dtypes
3. Convert measurements to common units (°C, kg, cm, %) and drop the `*_ID` columns

In [371]:
# Displayed and optionally written to disk by later cells. Nothing is appended
# here: any error raised while converting a stay propagates and stops the loop.
failed = []


def _strip_non_numeric(value):
    """Drop every character except digits and '.' from a string; pass other values through."""
    if isinstance(value, str):
        # The earlier pattern `[^1-9]*` also removed zeros and decimal points,
        # turning e.g. '100' into 1.
        return re.sub(r'[^0-9.]+', '', value)
    return value


# Make sure the output folder exists before the first sample is written.
os.makedirs(NUMERICAL_BY_ICUSTAY_ID, exist_ok=True)

for stay in tqdm(chartevents_dir_list, total=len(chartevents_dir_list)):
    sample_df = pd.read_parquet(REDUCED_CE_BY_ICUSTAY_ID + stay)
    sample_df = sample_df.reset_index(drop=True).astype(sample_dtype_init)

    # Text labels -> numeric codes; labels absent from a map become NaN.
    sample_df['GCS_EYE'] = sample_df['GCS_EYE'].map(gcs_eye_map)
    sample_df['GCS_MOTOR'] = sample_df['GCS_MOTOR'].map(gcs_motor_map)
    sample_df['GCS_VERBAL'] = sample_df['GCS_VERBAL'].map(gcs_verbal_map)
    sample_df['CAPILLARY_REFILL'] = sample_df['CAPILLARY_REFILL'].map(capillary_map)

    # Columns that may hold non-numeric text: coerce it to NaN and refresh the
    # matching presence indicator.
    sample_df['BODY_TEMP'] = pd.to_numeric(sample_df['BODY_TEMP'], errors='coerce')
    sample_df['BODY_TEMP_IND'] = sample_df['BODY_TEMP'].notnull().astype(np.int8)

    if sample_df['GLUCOSE'].dtype != np.float64:
        sample_df['GLUCOSE'] = pd.to_numeric(
            sample_df['GLUCOSE'].apply(_strip_non_numeric), errors='coerce')
        sample_df['GLUCOSE_IND'] = sample_df['GLUCOSE'].notnull().astype(np.int8)

    sample_df['PH'] = pd.to_numeric(sample_df['PH'], errors='coerce')
    sample_df['PH_IND'] = sample_df['PH'].notnull().astype(np.int8)

    sample_df = sample_df.astype(sample_dtype_final)

    # GCS total is recomputed from its three components; min_count=3 leaves it
    # NaN unless all three are present.
    sample_df['GCS_TOTAL'] = sample_df[['GCS_EYE', 'GCS_MOTOR', 'GCS_VERBAL']].sum(axis=1, min_count=3)
    # Indicator is 1 only for a total in the valid 3..15 range (NaN -> 0). The
    # earlier `.where(cond, 1)` did the opposite: it wrote 1 where the total
    # was missing or out of range.
    sample_df['GCS_TOTAL_IND'] = sample_df['GCS_TOTAL'].between(3, 15).astype(np.int8)

    # Bring fraction-style readings onto a percent scale, then normalise
    # temperature, weight and height units row by row.
    sample_df['FRAC_OXYGEN'] = sample_df['FRAC_OXYGEN'].apply(convert_percent)
    sample_df['O2_SAT'] = sample_df['O2_SAT'].apply(convert_percent)
    sample_df = sample_df.apply(convert, axis=1, stay=stay)

    # The ITEMID columns were only needed to pick the unit conversion.
    sample_df = sample_df.drop(id_column_list, axis=1).reset_index(drop=True)

    sample_df.to_parquet(NUMERICAL_BY_ICUSTAY_ID + stay)


  0%|          | 0/6 [00:00<?, ?it/s]

In [372]:
# Number of entries currently held in `failed`.
len(failed)

0

In [373]:
# Show the contents of `failed`.
failed

[]

In [374]:
# Optionally persist the failure list, one entry per line.
if WRITE_ERRORS_TO_FILE:
    with open('Step5_failed.txt', 'w') as f:
        f.writelines(entry + '\n' for entry in failed)

In [375]:
# [stay, value] pairs collected by convert_body_temp for the ITEMIDs in body_temp_unknown.
body_temp_unknown_unit

{645: [], 591: []}

In [376]:
# `sample_df` here is whatever the conversion loop assigned last, i.e. the final stay it processed.
sample_df.shape

(48, 34)

In [377]:
# Preview the first rows of the last converted stay (ID columns already dropped).
sample_df.head()

Unnamed: 0,GCS_EYE,GCS_MOTOR,GCS_VERBAL,GCS_TOTAL,CAPILLARY_REFILL,D_BLOOD_PRESSURE,M_BLOOD_PRESSURE,S_BLOOD_PRESSURE,HEART_RATE,GLUCOSE,...,S_BLOOD_PRESSURE_IND,HEART_RATE_IND,GLUCOSE_IND,FRAC_OXYGEN_IND,O2_SAT_IND,RESP_RATE_IND,BODY_TEMP_IND,PH_IND,WEIGHT_IND,HEIGHT_IND
0,4.0,6.0,5.0,15.0,,78.0,92.0,138.0,109.0,149.0,...,1,1,1,0,1,1,1,0,1,0
1,,,,,,95.0,111.0,157.0,112.0,,...,1,1,0,0,1,1,0,0,0,0
2,,,,,,79.0,88.0,116.0,86.0,,...,1,1,0,0,1,1,0,0,0,0
3,4.0,6.0,5.0,15.0,,96.0,104.0,132.0,88.0,127.0,...,1,1,1,0,1,1,1,0,0,0
4,,,,,,102.0,109.0,140.0,87.0,,...,1,1,0,0,1,1,0,0,0,0
