In [29]:
import pandas as pd
import numpy as np
import os
import timeit
from tqdm.notebook import trange, tqdm
import re
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

In [30]:
# When True, the per-stay directories below point at the 'data_demo/' tree instead of 'data/'.
DEMO = False
# When True, the entries collected in `failed` are written to STEP_FAIL_FILE at the end of the run.
WRITE_ERRORS_TO_FILE = True

In [31]:

# Per-ICU-stay directories; DEMO selects the demo tree, otherwise the full data tree is used.
CHARTEVENTS_BY_ICUSTAY_ID = 'data_demo/parquet/' if DEMO else 'data/parquet/'
REDUCED_CE_BY_ICUSTAY_ID = 'data_demo/samples/' if DEMO else 'data/samples/'
NUMERICAL_BY_ICUSTAY_ID = 'data_demo/samples_numerical/' if DEMO else 'data/samples_numerical/'

# Fixed file names and extensions; these do not depend on DEMO.
CHARTEVENTS_FILENAME = 'mimic-iii/CHARTEVENTS.csv'
READMISSION_FILENAME = 'data/readmission.csv'
PARQUET_EXT = '.parquet'
# Text file that receives the failure log of this step.
STEP_FAIL_FILE = 'Step5_failed.txt'


In [32]:
# Multiplicative conversion factors: value_in_source_unit * FACTOR == value_in_target_unit.
# convert_weight multiplies by these, so they must be kg-per-unit. The previous values
# (35.274 and 2.205) were oz-per-kg and lb-per-kg, which inflated every converted weight.
OZ_TO_KG = 0.028349523125  # 1 avoirdupois ounce in kilograms
LB_TO_KG = 0.45359237      # 1 avoirdupois pound in kilograms
IN_TO_CM = 2.54            # 1 inch in centimetres

# Threshold and scale used by convert_percent to turn fractions into percentages.
MAX_FRACTION = 1.0
MAX_PERCENT = 100


In [33]:
# chartevents_dir_list = os.listdir(REDUCED_CE_BY_ICUSTAY_ID)
# len(chartevents_dir_list)

In [34]:
# test = pd.read_parquet(REDUCED_CE_BY_ICUSTAY_ID + chartevents_dir_list[5])
# test = test.reset_index(drop=True)


![Glasgow Coma Scale](assets/images/GCS.jpg)

https://www.firstaidforfree.com/glasgow-coma-scale-gcs-first-aiders/

|	Chart Event	|	Dim	|	Normal	| Initial dtype | Change to dtype |
| --- | :--- | :--- | :--- | :--- |
|	1. Glasgow coma scale eye opening	|	4	|	4 Spontaneously	| str | int |
|	2. Glasgow coma scale verbal response	|	5	|	5 Oriented	| str | int |
|	3. Glasgow coma scale motor response	|	6	|	6 Obeys Commands	| str | int |
|	4. Glasgow coma scale total	|	13	|	15	| none/int | int |
|	5. Capillary refill rate	|	2	|	Normal < 3 secs	| str | int |
|	6. Diastolic blood pressure	|	1	|	70	| int | int |
|	7. Systolic blood pressure	|	1	|	105	| int | int |
|	8. Mean blood pressure	|	1	|	87.5	| int/float | int |
|	9. Heart Rate	|	1	|	80	| int | int |
|	10. Glucose	|	1	|	85	| int | int |
|	11. Fraction inspired oxygen	|	1	|	0.21	| int/float | float? |
|	12. Oxygen saturation	|	1	|	97.5	| int | int |
|	13. Respiratory rate	|	1	|	15	| int | int |
|	14. Body Temperature	|	1	|	37	| float | float |
|	15. pH	|	1	|	7.4	| float | float |
|	16. Weight	|	1	|	80.7	| float | float |
|	17. Height	|	1	|	168.8	| float | float |


Because most columns contain many `NaN` values, which integer dtypes cannot represent, every measurement column is stored as a float.

In [35]:
# Lookup tables that translate the charted text labels into numeric codes (kept as strings
# here; the columns are cast to float later). '__missing__' maps to NaN.
# np.nan is used instead of np.NaN because the np.NaN alias was removed in NumPy 2.0.
# Labels that are not listed in a map become NaN when passed through Series.map.

# Glasgow coma scale, eye opening (1-4).
# NOTE(review): no entry for a plain 'None' label — confirm whether the source data uses it.
gcs_eye_map = {
    '__missing__': np.nan,
    '4 Spontaneously': '4', '1 No Response': '1', '2 To pain': '2', '3 To speech': '3',
    'To Speech': '3', 'Spontaneously': '4', 'To Pain': '2',
}

# Glasgow coma scale, motor response (1-6).
gcs_motor_map = {
    '__missing__': np.nan,
    '6 Obeys Commands': '6', '5 Localizes Pain': '5', '1 No Response': '1',
    '4 Flex-withdraws': '4', '2 Abnorm extensn': '2', '3 Abnorm flexion': '3',
    'Localizes Pain': '5', 'Obeys Commands': '6', 'Flex-withdraws': '4',
    'No response': '1', 'Abnormal Flexion': '3', 'Abnormal extension': '2',
}

# Glasgow coma scale, verbal response (1-5).
gcs_verbal_map = {
    '__missing__': np.nan,
    '5 Oriented': '5', '1.0 ET/Trach': '1', '4 Confused': '4', '2 Incomp sounds': '2',
    '1 No Response': '1', '3 Inapprop words': '3', 'No Response-ETT': '1',
    'Oriented': '5', 'No Response': '1', 'Confused': '4',
    'Incomprehensible sounds': '2', 'Inappropriate Words': '3',
}

# Capillary refill as a binary flag: '1' for the normal/brisk labels, '0' otherwise.
# NOTE(review): 'Comment' and 'Other/Remarks' are mapped to '0' — confirm this is intended.
capillary_map = {
    '__missing__': np.nan,
    'Brisk': '1', 'Delayed': '0', 'Comment': '0',
    'Normal <3 secs': '1', 'Abnormal >3 secs': '0', 'Other/Remarks': '0',
    'Normal <3 Seconds': '1', 'Abnormal >3 Seconds': '0',
}


In [36]:
# Reference table: for each chart event, the ITEMIDs that feed it and the unit recorded for
# each ITEMID. 'ITEMID' and 'UNIT' are parallel lists (same length, same order); an empty
# string means the unit is not known/recorded here.
# Changes from the previous version:
#   * GLUCOSE and RESP_RATE unit lists are padded with '' so they line up with their ITEMIDs.
#   * 227054 (body temperature) is marked 'F' and 226707 (height) 'in', matching the
#     body_temp_f and height_in lists used by the conversion functions.
event_to_id =[
{'CHAREVENT':'GCS_EYE', 'DESCRIPTION':'','ITEMID':[184, 220739],'UNIT':['NONE','NONE']},
{'CHAREVENT':'GCS_MOTOR', 'DESCRIPTION':'','ITEMID':[454, 223901],'UNIT':['NONE','NONE']},
{'CHAREVENT':'GCS_VERBAL', 'DESCRIPTION':'','ITEMID':[723, 223900],'UNIT':['NONE','NONE']},
{'CHAREVENT':'GCS_TOTAL', 'DESCRIPTION':'Sum of the 3 GCS events','ITEMID':[198],'UNIT':['NONE']},
{'CHAREVENT':'CAPILLARY_REFILL', 'DESCRIPTION':'','ITEMID':[3348, 115, 8377, 223951, 224308],'UNIT':['BINARY','BINARY','BINARY','BINARY','BINARY']},
{'CHAREVENT':'D_BLOOD_PRESSURE', 'DESCRIPTION':'','ITEMID':[8368, 220051, 225310, 8555, 8441, 220180, 8502, 8440, 8503, 8504, 8507, 8506, 224643, 227242],'UNIT':['mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg']},
{'CHAREVENT':'M_BLOOD_PRESSURE', 'DESCRIPTION':'','ITEMID':[52, 220052, 225312, 224, 6702, 224322, 456, 220181, 3312, 3314, 3316, 3322, 3320],'UNIT':['mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg']},
{'CHAREVENT':'S_BLOOD_PRESSURE', 'DESCRIPTION':'','ITEMID':[51, 220050, 225309, 6701, 455, 220179, 3313, 3315, 442, 3317, 3323, 3321, 224167, 227243],'UNIT':['mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg']},
{'CHAREVENT':'HEART_RATE', 'DESCRIPTION':'','ITEMID':[211, 220045],'UNIT':['bpm','bpm']},
{'CHAREVENT':'GLUCOSE', 'DESCRIPTION':'','ITEMID':[807, 811, 1529, 3745, 225664, 220621, 226537, 3447, 3816, 3744, 227015, 227016, 1310, 1455, 2338, 1812, 228388],'UNIT':[''] * 17},
{'CHAREVENT':'FRAC_OXYGEN', 'DESCRIPTION':'','ITEMID':[3420, 223835, 3422, 189, 727],'UNIT':['%','%','%','%','%']},
{'CHAREVENT':'O2_SAT', 'DESCRIPTION':'','ITEMID':[834, 8498, 220227, 646, 220277],'UNIT':['%','%','%','%','%']},
{'CHAREVENT':'RESP_RATE', 'DESCRIPTION':'','ITEMID':[618, 220210, 3603, 224689, 614, 651, 224422, 615, 224690, 619, 224688],'UNIT':['insp/min'] + [''] * 10},
{'CHAREVENT':'BODY_TEMP', 'DESCRIPTION':'','ITEMID':[3655, 677, 676, 223762, 3654, 678, 223761, 679, 8537, 645, 591, 226329, 597, 227054, 228242],'UNIT':['C','C','C','C','F','F','F','F','C','','','','C','F','F']},
{'CHAREVENT':'PH', 'DESCRIPTION':'','ITEMID':[3839, 1673, 780, 1126, 223830, 4753, 4202, 860, 220274, 8387, 1880, 3777, 227586, 8385, 1352, 4755, 7966],'UNIT':['NONE','NONE','NONE','NONE','NONE','NONE','NONE','NONE','NONE','NONE','NONE','NONE','NONE','NONE','NONE','NONE','NONE']},
{'CHAREVENT':'WEIGHT', 'DESCRIPTION':'','ITEMID':[763, 224639, 226512, 3580, 3693, 3581, 226531, 3582],'UNIT':['kg','kg','kg','kg','kg','lb','lb','oz']},
{'CHAREVENT':'HEIGHT', 'DESCRIPTION':'','ITEMID':[226707, 226730, 1394],'UNIT':['in','cm','in']},
]

In [37]:
# Dtypes applied right after a per-stay sample is loaded: every value column is kept as
# object, every *_ID column is float64 (FRAC_OXYGEN_ID is the single exception, kept as
# object), and every *_IND column is int8.
_INIT_EVENT_NAMES = (
    'GCS_EYE', 'GCS_MOTOR', 'GCS_VERBAL', 'GCS_TOTAL', 'CAPILLARY_REFILL',
    'D_BLOOD_PRESSURE', 'M_BLOOD_PRESSURE', 'S_BLOOD_PRESSURE', 'HEART_RATE',
    'GLUCOSE', 'FRAC_OXYGEN', 'O2_SAT', 'RESP_RATE', 'BODY_TEMP', 'PH',
    'WEIGHT', 'HEIGHT',
)

sample_dtype_init = {}
for _event in _INIT_EVENT_NAMES:
    # Value column followed by its ITEMID column, in the same order as the event names.
    sample_dtype_init[_event] = object
    sample_dtype_init[_event + '_ID'] = object if _event == 'FRAC_OXYGEN' else np.float64
# The indicator columns come after all value/ID pairs.
sample_dtype_init.update((_event + '_IND', np.int8) for _event in _INIT_EVENT_NAMES)

In [38]:
# Dtypes applied once the text columns have been converted: every value column and every
# *_ID column is float64, every *_IND column is int8.
_FINAL_EVENT_NAMES = (
    'GCS_EYE', 'GCS_MOTOR', 'GCS_VERBAL', 'GCS_TOTAL', 'CAPILLARY_REFILL',
    'D_BLOOD_PRESSURE', 'M_BLOOD_PRESSURE', 'S_BLOOD_PRESSURE', 'HEART_RATE',
    'GLUCOSE', 'FRAC_OXYGEN', 'O2_SAT', 'RESP_RATE', 'BODY_TEMP', 'PH',
    'WEIGHT', 'HEIGHT',
)

sample_dtype_final = {}
for _event in _FINAL_EVENT_NAMES:
    # Value column followed by its ITEMID column, in the same order as the event names.
    sample_dtype_final[_event] = np.float64
    sample_dtype_final[_event + '_ID'] = np.float64
# The indicator columns come after all value/ID pairs.
sample_dtype_final.update((_event + '_IND', np.int8) for _event in _FINAL_EVENT_NAMES)


In [39]:
# Names of the *_ID helper columns; they are dropped from each sample before it is saved.
id_column_list = [
    event_name + '_ID'
    for event_name in (
        'GCS_EYE', 'GCS_MOTOR', 'GCS_VERBAL', 'GCS_TOTAL', 'CAPILLARY_REFILL',
        'D_BLOOD_PRESSURE', 'M_BLOOD_PRESSURE', 'S_BLOOD_PRESSURE', 'HEART_RATE',
        'GLUCOSE', 'FRAC_OXYGEN', 'O2_SAT', 'RESP_RATE', 'BODY_TEMP', 'PH',
        'WEIGHT', 'HEIGHT',
    )
]

In [40]:
# List every entry in the reduced per-stay sample directory and display how many there are.
chartevents_dir_list = os.listdir(REDUCED_CE_BY_ICUSTAY_ID)
len(chartevents_dir_list)

48075

In [41]:
# test = pd.read_parquet(REDUCED_CE_BY_ICUSTAY_ID + chartevents_dir_list[0])
# test.head(1)

In [42]:
# ITEMIDs whose body temperature values are treated as Fahrenheit by convert_body_temp.
body_temp_f = [3654, 678, 223761, 679, 228242, 227054]
# Apparently body temperature was charted as an event for these two ITEMIDs,
# but no numeric value was recorded.
body_temp_unknown = [645, 591]

# ITEMIDs whose weights are treated as pounds / ounces by convert_weight.
weight_lb = [3581, 226531]
weight_oz = [3582]

# ITEMIDs whose heights are treated as inches by convert_height.
height_in = [226707, 1394]

# One empty bucket per unknown-unit temperature ITEMID (only referenced by commented-out
# code at the moment), plus an empty placeholder for weights.
body_temp_unknown_unit = {item_id: [] for item_id in body_temp_unknown}
weight_unknown_unit = {}


In [43]:
def convert_percent(val):
    """Return `val` expressed as a percentage.

    Values at or below MAX_FRACTION are taken to be fractions and are multiplied by
    MAX_PERCENT; anything larger is assumed to already be a percentage and is returned
    unchanged. `None` and NaN pass through untouched.

    The previous version fell off the end of the function (returning None) for every
    value that was not below MAX_FRACTION, which erased genuine percentages such as an
    oxygen saturation of 97, and it did not scale a fraction of exactly 1.0.
    """
    if val is None:
        return val
    # NaN compares False here, so missing values fall through and are returned as-is.
    if val <= MAX_FRACTION:
        return val * MAX_PERCENT
    return val

In [44]:
def convert_body_temp(row, stay):
    """Convert a row's BODY_TEMP from Fahrenheit to Celsius when its ITEMID calls for it.

    Parameters:
        row: a sample row providing 'BODY_TEMP_ID' and 'BODY_TEMP'.
        stay: ICU stay identifier; currently unused (it was only needed by the
            disabled collection of unknown-unit readings).

    Returns:
        The same row, with 'BODY_TEMP' rewritten if its ITEMID is listed in body_temp_f.

    Only the ITEMID parse is guarded: a missing or non-numeric ID means there is nothing
    to convert. Any other error now propagates instead of being silently swallowed,
    so a real problem (for example an undefined lookup list) is no longer hidden.
    """
    try:
        item_id = int(row['BODY_TEMP_ID'])
    except (TypeError, ValueError, OverflowError):
        # NaN / None / non-numeric ID: no temperature was charted for this row.
        return row

    if item_id in body_temp_f:
        row['BODY_TEMP'] = ((row['BODY_TEMP'] - 32) * 5) / 9
    return row

In [45]:
def convert_weight(row):
    """Rescale a row's WEIGHT when its ITEMID marks it as ounces or pounds.

    Parameters:
        row: a sample row providing 'WEIGHT_ID' and 'WEIGHT'.

    Returns:
        The same row; 'WEIGHT' is multiplied by OZ_TO_KG for ITEMIDs in weight_oz and by
        LB_TO_KG for ITEMIDs in weight_lb, and left untouched otherwise.

    Only the ITEMID parse is guarded: a missing or non-numeric ID means there is nothing
    to convert. Any other error now propagates instead of being silently swallowed.
    NOTE(review): the result is only in kilograms if OZ_TO_KG / LB_TO_KG are
    kilograms-per-unit multipliers — verify the constant definitions.
    """
    try:
        item_id = int(row['WEIGHT_ID'])
    except (TypeError, ValueError, OverflowError):
        # NaN / None / non-numeric ID: no weight was charted for this row.
        return row

    if item_id in weight_oz:
        row['WEIGHT'] = row['WEIGHT'] * OZ_TO_KG
    elif item_id in weight_lb:
        row['WEIGHT'] = row['WEIGHT'] * LB_TO_KG
    return row

In [46]:
def convert_height(row):
    """Convert a row's HEIGHT from inches to centimetres when its ITEMID calls for it.

    Parameters:
        row: a sample row providing 'HEIGHT_ID' and 'HEIGHT'.

    Returns:
        The same row; 'HEIGHT' is multiplied by IN_TO_CM for ITEMIDs in height_in and
        left untouched otherwise.

    Only the ITEMID parse is guarded: a missing or non-numeric ID means there is nothing
    to convert. Any other error now propagates instead of being silently swallowed.
    """
    try:
        item_id = int(row['HEIGHT_ID'])
    except (TypeError, ValueError, OverflowError):
        # NaN / None / non-numeric ID: no height was charted for this row.
        return row

    if item_id in height_in:
        row['HEIGHT'] = row['HEIGHT'] * IN_TO_CM
    return row

In [47]:
def convert(row, stay):
    """Run the body temperature, weight and height unit conversions on one row, in that order.

    `stay` is forwarded to convert_body_temp only. Returns the converted row.
    """
    with_temp = convert_body_temp(row, stay)
    with_weight = convert_weight(with_temp)
    return convert_height(with_weight)

In [48]:
def clean_glucose(val):
    """Reduce a raw glucose entry to a numeric string, or NaN if no number can be recovered.

    Parameters:
        val: the raw cell value (usually a string, possibly None or NaN).

    Returns:
        A string made of the digits and decimal point found in `val` when that string
        parses as a number (e.g. '>500' -> '500', '106.5' -> '106.5'); otherwise np.nan.

    Fixes over the previous version:
      * `val != np.NaN` is always True, so a float NaN reached re.sub and raised TypeError;
        NaN is now detected explicitly.
      * the decimal point was stripped along with other characters, turning '106.5'
        into '1065'; it is now preserved.
    """
    # NaN is the only float that is not equal to itself.
    if val is None or (isinstance(val, float) and val != val):
        return np.nan

    cleaned = re.sub(r'[^0-9.]', '', str(val))
    try:
        float(cleaned)  # validation only: rejects '', '.', '1.2.3', ...
    except ValueError:
        return np.nan
    return cleaned
    

In [49]:
# Prefix of each per-stay entry name; the ICU stay ID is appended to it when reading and writing.
ICU_TXT = 'ICUSTAY_ID='
# ICU stay IDs (as strings) that the conversion loop below processes.
# NOTE(review): the name suggests these stays were missed by an earlier run — confirm.
missed_icustay_list = ['235557', '241818', '202092', '240486', '290401', '273909', '260462', '265380', '284071', '239584', '253638', '283573', '248457', '223240', '251107', '235392', '263481', '277476', '286008', '241950', '299393', '246438', '227800', '225500', '246137', '278150', '278850', '245630', '212859', '262641', '295147', '221698', '268945', '250543', '248546', '211512', '232465', '214502', '201102', '224849', '284459', '254188', '242700', '219601', '261149', '275551', '255012', '276690', '200559', '226237', '260735', '261232', '274027', '218746', '224037', '266631', '289647', '279090', '239751', '204508', '237798', '200890', '231624', '248159', '286712', '258588', '232587', '205409', '227552', '239493', '240865', '257665', '271814', '295017', '276442', '230034', '268187', '289680', '227461', '219050', '241804', '257263', '258774', '259389', '297925', '209237', '239621', '205858', '253641', '253796', '273526', '205464', '220514', '260840', '260354', '216256', '270360', '211306', '276680', '251565', '204432', '256334', '231314', '275349', '299911', '222558', '220860', '215053', '263414', '263461', '248350', '223238', '293512', '270835', '285340', '233066', '270057', '274265', '280177', '212187', '252283', '262101', '266395', '248427', '247366', '211619', '203493', '265660', '239830', '285934', '253540', '297117', '273981', '213515', '248413']


### Goals

1. Transform columns to numerical values
2. Transform dtypes
3. Convert units (°F → °C, lb/oz → kg, in → cm) and save the numerical samples

In [50]:
# Convert every listed ICU stay sample to purely numerical columns and save it.
# `failed` receives TWO entries per failing stay: the stay ID, then the error message.
failed = []

# Create the output folder up front; otherwise every to_parquet call would raise and
# each stay would simply be reported as failed.
os.makedirs(NUMERICAL_BY_ICUSTAY_ID, exist_ok=True)

for stay in tqdm(missed_icustay_list, total=len(missed_icustay_list)):
    try:
        sample_df = pd.read_parquet(REDUCED_CE_BY_ICUSTAY_ID + ICU_TXT + stay)
        sample_df = sample_df.reset_index(drop=True).astype(sample_dtype_init)

        # Text labels -> numeric codes; labels absent from a map become NaN.
        sample_df['GCS_EYE'] = sample_df['GCS_EYE'].map(gcs_eye_map)
        sample_df['GCS_MOTOR'] = sample_df['GCS_MOTOR'].map(gcs_motor_map)
        sample_df['GCS_VERBAL'] = sample_df['GCS_VERBAL'].map(gcs_verbal_map)
        sample_df['CAPILLARY_REFILL'] = sample_df['CAPILLARY_REFILL'].map(capillary_map)

        # Unparseable temperatures become NaN, so the indicator is recomputed afterwards.
        sample_df['BODY_TEMP'] = pd.to_numeric(sample_df['BODY_TEMP'], errors='coerce')
        sample_df['BODY_TEMP_IND'] = sample_df['BODY_TEMP'].notnull().astype(np.int8)

        # Fast path: the whole column is already numeric. Otherwise clean it value by
        # value and recompute the indicator, since cleaning may turn entries into NaN.
        try:
            sample_df['GLUCOSE'] = sample_df['GLUCOSE'].astype(np.float64)
        except Exception:
            sample_df['GLUCOSE'] = sample_df['GLUCOSE'].apply(clean_glucose)
            sample_df['GLUCOSE_IND'] = sample_df['GLUCOSE'].notnull().astype(np.int8)

        sample_df['PH'] = pd.to_numeric(sample_df['PH'], errors='coerce')
        sample_df['PH_IND'] = sample_df['PH'].notnull().astype(np.int8)

        sample_df = sample_df.astype(sample_dtype_final)

        # The total is the sum of the three components and is only defined when all
        # three are present (min_count=3); otherwise it is NaN.
        sample_df['GCS_TOTAL'] = sample_df[['GCS_EYE', 'GCS_MOTOR', 'GCS_VERBAL']].sum(axis=1, min_count=3)

        # Indicator is 1 exactly where a valid total (3..15) exists. The previous
        # `.where(valid, 1)` did the opposite: it forced the indicator to 1 on rows
        # WITHOUT a valid total and left it unchanged on rows with one.
        valid_total = sample_df['GCS_TOTAL'].between(3, 15)
        sample_df['GCS_TOTAL_IND'] = valid_total.astype(np.int8)

        # Bring fractions onto the percent scale, then apply the row-wise unit conversions.
        sample_df['FRAC_OXYGEN'] = sample_df['FRAC_OXYGEN'].apply(convert_percent)
        sample_df['O2_SAT'] = sample_df['O2_SAT'].apply(convert_percent)
        sample_df = sample_df.apply(convert, axis=1, stay=stay)

        # The ITEMID columns were only needed for the unit conversions.
        sample_df = sample_df.drop(id_column_list, axis=1).reset_index(drop=True)

        sample_df.to_parquet(NUMERICAL_BY_ICUSTAY_ID + ICU_TXT + stay)
    except Exception as e:
        # Best effort: record the stay and the reason, then move on to the next stay.
        failed.append(stay)
        failed.append(str(e))
        continue


  0%|          | 0/135 [00:00<?, ?it/s]

In [51]:
# Number of entries in the failure log. Each failed stay adds two entries (its ID and the
# error message), so the number of failed stays is half of this value.
len(failed)

0

In [52]:
# Show the failure log: stay ID and error message alternate.
failed

[]

In [53]:
# Persist the failure log, one entry per line; the file is overwritten on every run.
if WRITE_ERRORS_TO_FILE:
    with open(STEP_FAIL_FILE, 'w') as fail_log:
        fail_log.writelines(entry + '\n' for entry in failed)

In [54]:
# body_temp_unknown_unit[591]

In [55]:
# Sanity check on the last sample left over from the loop: (rows, columns).
sample_df.shape

(48, 34)

In [56]:
# Preview the first rows of the last sample left over from the loop.
sample_df.head()

Unnamed: 0,GCS_EYE,GCS_MOTOR,GCS_VERBAL,GCS_TOTAL,CAPILLARY_REFILL,D_BLOOD_PRESSURE,M_BLOOD_PRESSURE,S_BLOOD_PRESSURE,HEART_RATE,GLUCOSE,...,S_BLOOD_PRESSURE_IND,HEART_RATE_IND,GLUCOSE_IND,FRAC_OXYGEN_IND,O2_SAT_IND,RESP_RATE_IND,BODY_TEMP_IND,PH_IND,WEIGHT_IND,HEIGHT_IND
0,4.0,6.0,5.0,15.0,,70.0,,138.0,85.0,,...,1,1,0,0,1,1,0,0,0,0
1,4.0,6.0,5.0,15.0,,68.0,82.0,124.0,85.0,106.0,...,1,1,1,0,0,1,1,0,0,0
2,4.0,6.0,5.0,15.0,,,,,89.0,,...,0,1,0,0,0,1,0,0,0,0
3,4.0,6.0,5.0,15.0,,,,,,,...,0,0,0,0,0,0,0,0,0,0
4,4.0,6.0,5.0,15.0,,65.0,80.0,127.0,80.0,,...,1,1,0,0,1,1,1,0,0,0
