In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import trange, tqdm
import csv

In [2]:
# --- Directories (not referenced by the cells visible in this section) ---
CHARTEVENTS_BY_ICUSTAY_ID = 'data/parquet/'
REDUCED_CE_BY_ICUSTAY_ID = 'data/samples/'

# --- Input files ---
CHARTEVENTS_FILENAME = 'mimic-iii/CHARTEVENTS.csv'  # read in chunks further down
READMISSION_FILENAME = 'data/readmission.csv'

# --- Output files / extensions ---
DATASTORE_FILENAME = 'samples.h5'
FEATHER_EXT = '.feather'
STEP_UNIQUE_FILE = 'Step4_unique.txt'  # receives the unique VALUE strings per chart event

# --- Chunked-read settings ---
ROWS_TO_READ = 1_000_000  # rows per chunk passed to pd.read_csv(chunksize=...)
# Used together with ROWS_TO_READ to size the progress bar;
# presumably the row count of CHARTEVENTS.csv -- TODO confirm.
MAX_ROWS_CHARTEVENTS = 330_712_483

skip_rows = 0


![Glasgow Coma Scale](assets/images/GCS.jpg)

https://www.firstaidforfree.com/glasgow-coma-scale-gcs-first-aiders/

|	Chart Event	|	Dim	|	Normal	| Initial dtype | Change to dtype |
| --- | :--- | :--- | :--- | :--- |
|	1. Glasgow coma scale eye opening	|	4	|	4 Spontaneously	| str | int |
|	2. Glasgow coma scale verbal response	|	5	|	5 Oriented	| str | int |
|	3. Glasgow coma scale motor response	|	6	|	6 Obeys Commands	| str | int |
|	4. Glasgow coma scale total	|	13	|	15	| none/int | int |
|	5. Capillary refill rate	|	2	|	Normal < 3 secs	| str | int |
|	6. Diastolic blood pressure	|	1	|	70	| int | int |
|	7. Systolic blood pressure	|	1	|	105	| int | int |
|	8. Mean blood pressure	|	1	|	87.5	| int/float | int |
|	9. Heart Rate	|	1	|	80	| int | int |
|	10. Glucose	|	1	|	85	| int | int |
|	11. Fraction inspired oxygen	|	1	|	0.21	| int/float | float? |
|	12. Oxygen saturation	|	1	|	97.5	| int | int |
|	13. Respiratory rate	|	1	|	15	| int | int |
|	14. Body Temperature	|	1	|	37	| float | float |
|	15. pH	|	1	|	7.4	| float | float |
|	16. Weight	|	1	|	80.7	| float | float |
|	17. Height	|	1	|	168.8	| float | float |


Because so many values are `NaN`, which plain integer dtypes cannot represent, should every column simply be stored as a float?

In [8]:
# Column names applied to CHARTEVENTS.csv when it is read with `names=`;
# the order must match the column order in the file.
chartevents_columns = [
    "ROW_ID", "SUBJECT_ID", "HADM_ID", "ICUSTAY_ID",  # identifiers
    "ITEMID", "CHARTTIME", "STORETIME", "CGID",
    "VALUE", "VALUENUM", "VALUEUOM",                  # recorded value and unit
    "WARNING", "ERROR", "RESULTSTATUS", "STOPPED",
]

In [9]:
# One entry per chart event: (name, description, ITEMIDs, unit label).
# Entry order is: eye, motor, verbal, total, capillary refill.
_EVENT_SPECS = (
    ('GCS_EYE', '', (184, 220739), 'NONE'),
    ('GCS_MOTOR', '', (454, 223901), 'NONE'),
    ('GCS_VERBAL', '', (723, 223900), 'NONE'),
    ('GCS_TOTAL', 'Sum of the 3 GCS events', (198,), 'NONE'),
    ('CAPILLARY_REFILL', '', (3348, 115, 8377, 223951, 224308), 'BINARY'),
)

# Each dict carries the event name, a description, the ITEMIDs that record it,
# and one unit label per ITEMID. The name key is spelled 'CHAREVENT' (sic).
event_to_id = [
    {
        'CHAREVENT': name,
        'DESCRIPTION': description,
        'ITEMID': list(item_ids),
        'UNIT': [unit] * len(item_ids),
    }
    for name, description, item_ids, unit in _EVENT_SPECS
]

In [10]:
# Read every CHARTEVENTS column as `object`, so pandas performs no numeric
# inference while the file is streamed in chunks.
chartevents_dtype = dict.fromkeys(
    (
        'ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'ITEMID',
        'CHARTTIME', 'STORETIME', 'CGID',
        'VALUE', 'VALUENUM', 'VALUEUOM',
        'WARNING', 'ERROR', 'RESULTSTATUS', 'STOPPED',
    ),
    object,
)

In [11]:
# Number of chunks the progress bar should expect. Ceiling division, so a row
# count that is an exact multiple of ROWS_TO_READ does not over-count by one.
total = -(-MAX_ROWS_CHARTEVENTS // ROWS_TO_READ)

# Raw VALUE strings seen for each chart event. Values may repeat across
# chunks; only per-chunk duplicates are removed here.
eye_unique = []
motor_unique = []
verbal_unique = []
total_unique = []
capillary_unique = []

# Look ITEMIDs up by event name rather than by list position, so reordering
# `event_to_id` cannot silently attach values to the wrong event.
itemids_by_event = {event['CHAREVENT']: event['ITEMID'] for event in event_to_id}
unique_values_by_event = {
    'GCS_EYE': eye_unique,
    'GCS_MOTOR': motor_unique,
    'GCS_VERBAL': verbal_unique,
    'GCS_TOTAL': total_unique,
    'CAPILLARY_REFILL': capillary_unique,
}

# `names=` supplies the column names, and `skiprows=1` skips the file's first
# line (presumably its header row -- confirm against the CSV).
with pd.read_csv(CHARTEVENTS_FILENAME, names=chartevents_columns, dtype=chartevents_dtype,
                 skiprows=1, chunksize=ROWS_TO_READ) as file:
    for chartevents in tqdm(file, total=total):
        # Discard rows lacking any identifying field; ROW_ID is not needed.
        chartevents = chartevents.dropna(
            subset=['SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'CHARTTIME', 'ITEMID']
        ).drop('ROW_ID', axis=1)
        # ITEMID is read as object (string); cast so it compares with the int ITEMIDs.
        chartevents['ITEMID'] = chartevents['ITEMID'].astype(np.int64)

        for event_name, seen_values in unique_values_by_event.items():
            matches = chartevents['ITEMID'].isin(itemids_by_event[event_name])
            seen_values.extend(pd.unique(chartevents.loc[matches, 'VALUE']).tolist())
        

  0%|          | 0/331 [00:00<?, ?it/s]

In [None]:
# Collapse each accumulated list to its distinct values. The lists are
# converted to NumPy arrays first, then de-duplicated with pd.unique.
eye_unique, motor_unique, verbal_unique, total_unique, capillary_unique = (
    pd.unique(np.array(collected))
    for collected in (eye_unique, motor_unique, verbal_unique, total_unique, capillary_unique)
)

In [13]:
# Write one CSV row per chart event, in the order:
# eye, motor, verbal, total, capillary refill.
# newline='' is required by the csv module: without it the writer's own
# '\r\n' line endings are translated again on Windows, leaving blank rows.
with open(STEP_UNIQUE_FILE, 'w', newline='') as f:
    w = csv.writer(f)
    for unique_values in (eye_unique, motor_unique, verbal_unique, total_unique, capillary_unique):
        w.writerow(unique_values)

The unique values written to the text file were manually turned into dictionaries. These map each raw string to its numeric code (stored as a digit string).

Below are the resulting dictionaries.

In [None]:
# Lookup tables from the raw VALUE strings to numeric codes.
# Codes are stored as digit strings, not ints; '__missing__' maps to NaN.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.

# Eye opening: codes '1'-'4'.
gcs_eye_map = {
    '__missing__': np.nan,
    '4 Spontaneously': '4',
    '1 No Response': '1',
    '2 To pain': '2',
    '3 To speech': '3',
    'To Speech': '3',
    'Spontaneously': '4',
    'To Pain': '2',
}

# Motor response: codes '1'-'6'.
gcs_motor_map = {
    '__missing__': np.nan,
    '6 Obeys Commands': '6',
    '5 Localizes Pain': '5',
    '1 No Response': '1',
    '4 Flex-withdraws': '4',
    '2 Abnorm extensn': '2',
    '3 Abnorm flexion': '3',
    'Localizes Pain': '5',
    'Obeys Commands': '6',
    'Flex-withdraws': '4',
    'No response': '1',
    'Abnormal Flexion': '3',
    'Abnormal extension': '2',
}

# Verbal response: codes '1'-'5'.
gcs_verbal_map = {
    '__missing__': np.nan,
    '5 Oriented': '5',
    '1.0 ET/Trach': '1',
    '4 Confused': '4',
    '2 Incomp sounds': '2',
    '1 No Response': '1',
    '3 Inapprop words': '3',
    'No Response-ETT': '1',
    'Oriented': '5',
    'No Response': '1',
    'Confused': '4',
    'Incomprehensible sounds': '2',
    'Inappropriate Words': '3',
}

# Capillary refill: '1' for the brisk/normal labels, '0' for everything else.
# NOTE(review): 'Comment' and 'Other/Remarks' are coded '0', the same as the
# abnormal readings -- confirm they should not be treated as missing instead.
capillary_map = {
    '__missing__': np.nan,
    'Brisk': '1',
    'Delayed': '0',
    'Comment': '0',
    'Normal <3 secs': '1',
    'Abnormal >3 secs': '0',
    'Other/Remarks': '0',
    'Normal <3 Seconds': '1',
    'Abnormal >3 Seconds': '0',
}