In [1]:
import pandas as pd
import numpy as np
import os
import timeit
from tqdm.notebook import trange, tqdm
import re

In [2]:
CHARTEVENTS_BY_ICUSTAY_ID = 'data/parquet/'
REDUCED_CE_BY_ICUSTAY_ID = 'data/samples/'
NUMERICAL_BY_ICUSTAY_ID = 'data/samples_numerical/'
CHARTEVENTS_FILENAME = 'mimic-iii/CHARTEVENTS.csv'
READMISSION_FILENAME = 'data/readmission.csv'
PARQUET_EXT = '.parquet'


In [3]:
# chartevents_dir_list = os.listdir(REDUCED_CE_BY_ICUSTAY_ID)
# len(chartevents_dir_list)

In [4]:
# test = pd.read_parquet(REDUCED_CE_BY_ICUSTAY_ID + chartevents_dir_list[5])
# test = test.reset_index(drop=True)


![Glascow](assets/images/GCS.jpg)

https://www.firstaidforfree.com/glasgow-coma-scale-gcs-first-aiders/

|	Chart Event	|	Dim	|	Normal	| Initial dtype | Change to dtype |
| --- | :--- | :--- | :--- | :--- |
|	1. Glasgow coma scale eye opening	|	4	|	4 Spontaneously	| str | int |
|	2. Glasgow coma scale verbal response	|	5	|	5 Oriented	| str | int |
|	3. Glasgow coma scale motor response	|	6	|	6 Obeys Commands	| str | int |
|	4. Glasgow coma scale total	|	13	|	15	| none/int | int |
|	5. Capillary refill rate	|	2	|	Normal < 3 secs	| str | int |
|	6. Diastolic blood pressure	|	1	|	70	| int | int |
|	7. Systolic blood pressure	|	1	|	105	| int | int |
|	8. Mean blood pressure	|	1	|	87.5	| int/float | int |
|	9. Heart Rate	|	1	|	80	| int | int |
|	10. Glucose	|	1	|	85	| int | int |
|	11. Fraction inspired oxygen	|	1	|	0.21	| int/float | float? |
|	12. Oxygen saturation	|	1	|	97.5	| int | int |
|	13. Respiratory rate	|	1	|	15	| int | int |
|	14. Body Temperature	|	1	|	37	| float | float |
|	15. pH	|	1	|	7.4	| float | float |
|	16. Weight	|	1	|	80.7	| float | float |
|	17. Height	|	1	|	168.8	| float | float |


Since there are so many `NaN` values, everything should just be a float?

In [5]:
gcs_eye_map     = {'__missing__': np.NaN, '4 Spontaneously': '4', '1 No Response': '1', '2 To pain': '2', '3 To speech': '3', 'To Speech': '3', 'Spontaneously': '4', 'To Pain': '2'}
gcs_motor_map   = {'__missing__': np.NaN, '6 Obeys Commands': '6', '5 Localizes Pain': '5', '1 No Response': '1', '4 Flex-withdraws': '4', '2 Abnorm extensn': '2', '3 Abnorm flexion': '3', 'Localizes Pain': '5', 
                    'Obeys Commands': '6', 'Flex-withdraws': '4', 'No response': '1', 'Abnormal Flexion': '3', 'Abnormal extension': '2'}
gcs_verbal_map  = {'__missing__': np.NaN, '5 Oriented': '5', '1.0 ET/Trach': '1', '4 Confused': '4', '2 Incomp sounds': '2', '1 No Response': '1', '3 Inapprop words': '3', 'No Response-ETT': '1', 
                    'Oriented': '5', 'No Response': '1', 'Confused': '4', 'Incomprehensible sounds': '2', 'Inappropriate Words': '3'}
capillary_map = {'__missing__': np.NaN, 'Brisk':'1', 'Delayed':'0', 'Comment':'0', 'Normal <3 secs':'1', 'Abnormal >3 secs':'0', 'Other/Remarks':'0', 'Normal <3 Seconds':'1', 'Abnormal >3 Seconds':'0'}


In [6]:
chartevents_columns = ["ROW_ID","SUBJECT_ID","HADM_ID","ICUSTAY_ID","ITEMID","CHARTTIME","STORETIME","CGID","VALUE","VALUENUM","VALUEUOM","WARNING","ERROR","RESULTSTATUS","STOPPED"]

In [7]:
event_to_id =[
{'CHAREVENT':'GCS_EYE', 'DESCRIPTION':'','ITEMID':[184, 220739],'UNIT':['NONE','NONE']},
{'CHAREVENT':'GCS_MOTOR', 'DESCRIPTION':'','ITEMID':[454, 223901],'UNIT':['NONE','NONE']},
{'CHAREVENT':'GCS_VERBAL', 'DESCRIPTION':'','ITEMID':[723, 223900],'UNIT':['NONE','NONE']},
{'CHAREVENT':'GCS_TOTAL', 'DESCRIPTION':'Sum of the 3 GCS events','ITEMID':[198],'UNIT':['NONE']},
{'CHAREVENT':'CAPILLARY_REFILL', 'DESCRIPTION':'','ITEMID':[3348, 115, 8377, 223951, 224308],'UNIT':['BINARY','BINARY','BINARY','BINARY','BINARY']},
]

In [8]:
chartevents_dtype = {'ROW_ID':object,
'SUBJECT_ID':object,
'HADM_ID':object,
'ICUSTAY_ID':object,
'ITEMID':object,
'CHARTTIME':object,
'STORETIME':object,
'CGID':object,
'VALUE':object,
'VALUENUM':object,
'VALUEUOM':object,
'WARNING':object,
'ERROR':object,
'RESULTSTATUS':object,
'STOPPED':object}

In [9]:
sample_dtype_init = {'GCS_EYE': object, 'GCS_EYE_ID': np.float64, 'GCS_MOTOR': object, 'GCS_MOTOR_ID': np.float64, 
                     'GCS_VERBAL': object, 'GCS_VERBAL_ID': np.float64, 'GCS_TOTAL': object, 'GCS_TOTAL_ID': np.float64, 
                     'CAPILLARY_REFILL': object, 'CAPILLARY_REFILL_ID': np.float64, 'D_BLOOD_PRESSURE': object, 
                     'D_BLOOD_PRESSURE_ID': np.float64, 'M_BLOOD_PRESSURE': object, 'M_BLOOD_PRESSURE_ID': np.float64, 
                     'S_BLOOD_PRESSURE': object, 'S_BLOOD_PRESSURE_ID': np.float64, 'HEART_RATE': object, 'HEART_RATE_ID': np.float64, 
                     'GLUCOSE': object, 'GLUCOSE_ID': np.float64, 'FRAC_OXYGEN': object, 'FRAC_OXYGEN_ID': object, 'O2_SAT': object, 
                     'O2_SAT_ID': np.float64, 'RESP_RATE': object, 'RESP_RATE_ID': np.float64, 'BODY_TEMP': object, 'BODY_TEMP_ID': np.float64, 
                     'PH': object, 'PH_ID': np.float64, 'WEIGHT': object, 'WEIGHT_ID': np.float64, 'HEIGHT': object, 'HEIGHT_ID': np.float64, 
                     'GCS_EYE_IND': np.int8, 'GCS_MOTOR_IND': np.int8, 'GCS_VERBAL_IND': np.int8, 'GCS_TOTAL_IND': np.int8, 
                     'CAPILLARY_REFILL_IND': np.int8, 'D_BLOOD_PRESSURE_IND': np.int8, 'M_BLOOD_PRESSURE_IND': np.int8, 'S_BLOOD_PRESSURE_IND': np.int8, 
                     'HEART_RATE_IND': np.int8, 'GLUCOSE_IND': np.int8, 'FRAC_OXYGEN_IND': np.int8, 'O2_SAT_IND': np.int8, 'RESP_RATE_IND': np.int8, 
                     'BODY_TEMP_IND': np.int8, 'PH_IND': np.int8, 'WEIGHT_IND': np.int8, 'HEIGHT_IND': np.int8}

In [10]:
sample_dtype_final = {'GCS_EYE': np.float64, 'GCS_EYE_ID': np.float64, 'GCS_MOTOR': np.float64, 'GCS_MOTOR_ID': np.float64, 
                     'GCS_VERBAL': np.float64, 'GCS_VERBAL_ID': np.float64, 'GCS_TOTAL': np.float64, 'GCS_TOTAL_ID': np.float64, 
                     'CAPILLARY_REFILL': np.float64, 'CAPILLARY_REFILL_ID': np.float64, 'D_BLOOD_PRESSURE': np.float64, 
                     'D_BLOOD_PRESSURE_ID': np.float64, 'M_BLOOD_PRESSURE': np.float64, 'M_BLOOD_PRESSURE_ID': np.float64, 
                     'S_BLOOD_PRESSURE': np.float64, 'S_BLOOD_PRESSURE_ID': np.float64, 'HEART_RATE': np.float64, 'HEART_RATE_ID': np.float64, 
                     'GLUCOSE': np.float64, 'GLUCOSE_ID': np.float64, 'FRAC_OXYGEN': np.float64, 'FRAC_OXYGEN_ID': np.float64, 'O2_SAT': np.float64, 
                     'O2_SAT_ID': np.float64, 'RESP_RATE': np.float64, 'RESP_RATE_ID': np.float64, 'BODY_TEMP': np.float64, 'BODY_TEMP_ID': np.float64, 
                     'PH': np.float64, 'PH_ID': np.float64, 'WEIGHT': np.float64, 'WEIGHT_ID': np.float64, 'HEIGHT': np.float64, 'HEIGHT_ID': np.float64, 
                     'GCS_EYE_IND': np.int8, 'GCS_MOTOR_IND': np.int8, 'GCS_VERBAL_IND': np.int8, 'GCS_TOTAL_IND': np.int8, 
                     'CAPILLARY_REFILL_IND': np.int8, 'D_BLOOD_PRESSURE_IND': np.int8, 'M_BLOOD_PRESSURE_IND': np.int8, 'S_BLOOD_PRESSURE_IND': np.int8, 
                     'HEART_RATE_IND': np.int8, 'GLUCOSE_IND': np.int8, 'FRAC_OXYGEN_IND': np.int8, 'O2_SAT_IND': np.int8, 'RESP_RATE_IND': np.int8, 
                     'BODY_TEMP_IND': np.int8, 'PH_IND': np.int8, 'WEIGHT_IND': np.int8, 'HEIGHT_IND': np.int8}


In [11]:
chartevents_dir_list = os.listdir(REDUCED_CE_BY_ICUSTAY_ID)
len(chartevents_dir_list)

48075

In [12]:
test = pd.read_parquet(REDUCED_CE_BY_ICUSTAY_ID + chartevents_dir_list[0])
test.head(1)

Unnamed: 0,GCS_EYE,GCS_EYE_ID,GCS_MOTOR,GCS_MOTOR_ID,GCS_VERBAL,GCS_VERBAL_ID,GCS_TOTAL,GCS_TOTAL_ID,CAPILLARY_REFILL,CAPILLARY_REFILL_ID,...,S_BLOOD_PRESSURE_IND,HEART_RATE_IND,GLUCOSE_IND,FRAC_OXYGEN_IND,O2_SAT_IND,RESP_RATE_IND,BODY_TEMP_IND,PH_IND,WEIGHT_IND,HEIGHT_IND
0,,,,,,,,,,,...,1,1,0,0,1,1,0,1,0,0


In [13]:
def process_body_temp(x):
    if x is None:
        return np.NaN
    elif x.isalpha():
        return np.NaN
    else:
        return x

### Goals

1. Transform columns to numerical values
2. Transform dtypes
3. 

In [14]:
failed = []
for stay in tqdm(chartevents_dir_list, total=len(chartevents_dir_list)):
    try:
        sample_df = pd.read_parquet(REDUCED_CE_BY_ICUSTAY_ID + stay)
        sample_df = sample_df.reset_index(drop=True).astype(sample_dtype_init)
        
        sample_df['GCS_EYE'] = sample_df['GCS_EYE'].map(gcs_eye_map)
        sample_df['GCS_MOTOR'] = sample_df['GCS_MOTOR'].map(gcs_motor_map)
        sample_df['GCS_VERBAL'] = sample_df['GCS_VERBAL'].map(gcs_verbal_map)
        sample_df['CAPILLARY_REFILL'] = sample_df['CAPILLARY_REFILL'].map(capillary_map)

        sample_df['BODY_TEMP'] = pd.to_numeric(sample_df['BODY_TEMP'],errors='coerce')
        sample_df['BODY_TEMP'] = sample_df['BODY_TEMP'].notnull().astype(np.int8)

        if sample_df['GLUCOSE'].dtype != np.float64:
            sample_df['GLUCOSE'] = pd.to_numeric(sample_df['GLUCOSE'].apply(lambda x: re.sub(r'[^1-9]*','',x) if x is not None else np.NaN), errors='coerce')
            sample_df['GLUCOSE_IND'] = sample_df['GLUCOSE'].notnull().astype(np.int8)

        sample_df['PH'] = pd.to_numeric(sample_df['PH'], errors='coerce')
        sample_df['PH'] = sample_df['PH'].notnull().astype(np.int8)

        sample_df = sample_df.astype(sample_dtype_final)

        sample_df['GCS_TOTAL'] = sample_df[['GCS_EYE','GCS_MOTOR','GCS_VERBAL']].sum(axis=1,min_count=3)

        sample_df['GCS_TOTAL_IND'] = sample_df['GCS_TOTAL_IND'].where((sample_df['GCS_TOTAL'] >= 3) & (sample_df['GCS_TOTAL'] <= 15), 1)


        sample_df.to_parquet(NUMERICAL_BY_ICUSTAY_ID+stay+PARQUET_EXT)
        # break
    except Exception as e:
        failed.append(stay)
        failed.append(str(e))
        continue


  0%|          | 0/48075 [00:00<?, ?it/s]

In [15]:
len(failed)

16

In [16]:

# re.sub(r'[a-zA-Z]*|-*| *','','208CS')

In [17]:
# uniq = []
# for l in failed:
#     if 'ICUSTAY' in l:
#         pass
#     else:
#         l = l.split(' ')
#         # print(l[-1])
#         uniq.append(l[-1])
#         # break

# uniq = pd.unique(uniq)
# uniq

In [None]:
# pd.Series([np.NaN]).str.isalnum()
# s1=re.sub('[a-zA-Z]|-| ','',np.NaN)

In [19]:
failed

['ICUSTAY_ID=241818',
 'expected string or bytes-like object',
 'ICUSTAY_ID=257263',
 'expected string or bytes-like object',
 'ICUSTAY_ID=284071',
 'expected string or bytes-like object',
 'ICUSTAY_ID=270057',
 'expected string or bytes-like object',
 'ICUSTAY_ID=246137',
 'expected string or bytes-like object',
 'ICUSTAY_ID=222558',
 'expected string or bytes-like object',
 'ICUSTAY_ID=268945',
 'expected string or bytes-like object',
 'ICUSTAY_ID=226237',
 'expected string or bytes-like object']

In [23]:
with open('Step5_failed.txt', 'w') as f:
    for line in failed:
        f.write(line)
        f.write('\n')

In [20]:
# sample_df[['GCS_TOTAL_ID','RESP_RATE','BODY_TEMP']].dtypes

In [21]:
# Need to convert dtypes after reading
# sample_df = pd.read_parquet(NUMERICAL_BY_ICUSTAY_ID + 'ICUSTAY_ID=200001').astype(sample_dtype_final)


In [22]:
# sample_df[['GCS_TOTAL_ID','RESP_RATE','BODY_TEMP']].dtypes

In [38]:
sample_df = pd.read_parquet(REDUCED_CE_BY_ICUSTAY_ID + failed[14])
sample_df['GLUCOSE'].dtype == np.float64

True