# Chart Events for last 48 period

TODO:

1. [ ] Create or find list of `ITEMID`s that correlate to the 17 dimmensions from paper.
2. [ ] Extract the 17 dims from `CHARTEVENTS.csv` for each sample

|	Chart Event	|	Dim	|	Normal	|
| --- | :--- | :--- |
|	1. Glasgow coma scale eye opening	|	4	|	4 Spontaneously	|
|	2. Glasgow coma scale verbal response	|	5	|	5 Oriented	|
|	3. Glasgow coma scale motor response	|	6	|	6 Obeys Commands	|
|	4. Glasgow coma scale total	|	13	|	15	|
|	5. Capillary refill rate	|	2	|	Normal < 3 secs	|
|	6. Diastolic blood pressure	|	1	|	70	|
|	7. Systolic blood pressure	|	1	|	105	|
|	8. Mean blood pressure	|	1	|	87.5	|
|	9. Heart Rate	|	1	|	80	|
|	10. Glucose	|	1	|	85	|
|	11. Fraction inspired oxygen	|	1	|	0.21	|
|	12. Oxygen saturation	|	1	|	97.5	|
|	13. Respiratory rate	|	1	|	15	|
|	14. Body Temperature	|	1	|	37	|
|	15. pH	|	1	|	7.4	|
|	16. Weight	|	1	|	80.7	|
|	17. Height	|	1	|	168.8	|

![Glascow](assets/images/GCS.jpg)

https://www.firstaidforfree.com/glasgow-coma-scale-gcs-first-aiders/

In [1]:
event_to_id =[
{'CHAREVENT':'GCS_EYE', 'DESCRIPTION':'','ITEMID':['184','220739'],'UNIT':['NONE','NONE']},
{'CHAREVENT':'GCS_MOTOR', 'DESCRIPTION':'','ITEMID':['454','223901'],'UNIT':['NONE','NONE']},
{'CHAREVENT':'GCS_VERBAL', 'DESCRIPTION':'','ITEMID':['723','223900'],'UNIT':['NONE','NONE']},
{'CHAREVENT':'GCS_TOTAL', 'DESCRIPTION':'Sum of the 3 GCS events','ITEMID':['198'],'UNIT':['NONE']},
{'CHAREVENT':'CAPILLARY_REFILL', 'DESCRIPTION':'','ITEMID':['3348','115','8377','223951','224308'],'UNIT':['BINARY','BINARY','BINARY','BINARY','BINARY']},
{'CHAREVENT':'D_BLOOD_PRESSURE', 'DESCRIPTION':'','ITEMID':['8368','220051','225310','8555','8441','220180','8502','8440','8503','8504','8507','8506','224643','227242'],'UNIT':['mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg']},
{'CHAREVENT':'M_BLOOD_PRESSURE', 'DESCRIPTION':'','ITEMID':['52', '220052', '225312', '224', '6702', '224322', '456', '220181', '3312', '3314', '3316', '3322', '3320'],'UNIT':['mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg']},
{'CHAREVENT':'S_BLOOD_PRESSURE', 'DESCRIPTION':'','ITEMID':['51','220050','225309','6701','455','220179','3313','3315','442','3317','3323','3321','224167','227243'],'UNIT':['mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg','mmHg']},
{'CHAREVENT':'HEART_RATE', 'DESCRIPTION':'','ITEMID':['211','220045'],'UNIT':['bpm','bpm']},
{'CHAREVENT':'GLUCOSE', 'DESCRIPTION':'','ITEMID':['807', '811', '1529', '3745', '225664', '220621', '226537', '3447', '3816', '3744', '227015', '227016', '1310', '1455', '2338', '1812', '228388'],'UNIT':['','']},
{'CHAREVENT':'FRAC_OXYGEN', 'DESCRIPTION':'','ITEMID':['3420', '223835', '3422', '189', '727'],'UNIT':['%','%','%','%','%']},
{'CHAREVENT':'O2_SAT', 'DESCRIPTION':'','ITEMID':['834', '8498', '220227', '646', '220277'],'UNIT':['%','%','%','%','%']},
{'CHAREVENT':'RESP_RATE', 'DESCRIPTION':'','ITEMID':['618', '220210', '3603', '224689', '614', '651', '224422', '615', '224690', '619', '224688'],'UNIT':['insp/min','']},
{'CHAREVENT':'BODY_TEMP', 'DESCRIPTION':'','ITEMID':['3655','677','676','223762','3654','678','223761','679','8537','645','591','226329','597','227054','228242'],'UNIT':['C','C','C','C','F','F','F','F','C','','','','C','','F']},
{'CHAREVENT':'PH', 'DESCRIPTION':'','ITEMID':['3839','1673','780','1126','223830','4753','4202','860','220274','8387','1880','3777','227586','8385','1352','4755','7966'],'UNIT':['NONE','NONE','NONE','NONE','NONE','NONE','NONE','NONE','NONE','NONE','NONE','NONE','NONE','NONE','NONE','NONE','NONE']},
{'CHAREVENT':'WEIGHT', 'DESCRIPTION':'','ITEMID':['763','224639','226512','3580','3693','3581','226531','3582'],'UNIT':['?','kg','kg','kg','kg','lb','lb','oz']},
{'CHAREVENT':'HEIGHT', 'DESCRIPTION':'','ITEMID':['226707', '226730', '1394'],'UNIT':['?','cm','in']},
]

In [2]:
import pandas as pd
import numpy as np
from tqdm.notebook import trange, tqdm
import traceback
import os
import dask.dataframe as dd
from dask.distributed import Client

In [3]:
CHARTEVENTS_FILENAME = 'mimic-iii/CHARTEVENTS.csv'
READMISSION_FILENAME = 'data/readmission.csv'
SAMPLES_DIR = 'data/parquet/'
DATASTORE_FILENAME = 'samples.h5'
FEATHER_EXT = '.feather'
ROWS_TO_READ = 1000000
MAX_ROWS_CHARTEVENTS = 330712483

skip_rows = 0

In [4]:
chartevents_columns = ["ROW_ID","SUBJECT_ID","HADM_ID","ICUSTAY_ID","ITEMID","CHARTTIME","STORETIME","CGID","VALUE","VALUENUM","VALUEUOM","WARNING","ERROR","RESULTSTATUS","STOPPED"]



# chartevents_dtype = {'ROW_ID':object,
# 'SUBJECT_ID':np.int64,
# 'HADM_ID':np.int64,
# 'ICUSTAY_ID':np.int64,
# 'ITEMID':np.int64,
# 'CHARTTIME':object,
# 'STORETIME':object,
# 'CGID':np.int64,
# 'VALUE':object,
# 'VALUENUM':np.int64,
# 'VALUEUOM':object,
# 'WARNING':np.int64,
# 'ERROR':np.int64,
# 'RESULTSTATUS':object,
# 'STOPPED':object}

chartevents_dtype = {'ROW_ID':str,
'SUBJECT_ID':np.float64,
'HADM_ID':np.float64,
'ICUSTAY_ID':np.float64,
'ITEMID':np.float64,
'CHARTTIME':str,
'STORETIME':str,
'CGID':str,
'VALUE':str,
'VALUENUM':str,
'VALUEUOM':str,
'WARNING':str,
'ERROR':str,
'RESULTSTATUS':str,
'STOPPED':str}

In [5]:
readmission = pd.read_csv(READMISSION_FILENAME).sort_values(['SUBJECT_ID','HADM_ID','ICUSTAY_ID']).reset_index(drop=True)

# d = readmission.dtypes.to_dict()
# d

In [6]:
# print(readmission.shape)
# readmission.nunique()

In [7]:
# icustay_id = list(readmission['ICUSTAY_ID'].values)
# icustay_id.insert(0,0)
# icustay_id.sort()
# print(icustay_id[:5])

In [8]:
# readmission.dtypes

In [9]:
client = Client()
client

2023-05-01 21:41:54,847 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-q5nmiug2', purging
2023-05-01 21:41:54,847 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-9b_ng_y2', purging
2023-05-01 21:41:54,847 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-6fd_24lx', purging
2023-05-01 21:41:54,847 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-pumw74ne', purging


0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads: 16,Total memory: 62.57 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:36953,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads: 16
Started: Just now,Total memory: 62.57 GiB

0,1
Comm: tcp://127.0.0.1:43973,Total threads: 4
Dashboard: http://127.0.0.1:35231/status,Memory: 15.64 GiB
Nanny: tcp://127.0.0.1:46691,
Local directory: /tmp/dask-worker-space/worker-sg7rf6so,Local directory: /tmp/dask-worker-space/worker-sg7rf6so

0,1
Comm: tcp://127.0.0.1:38291,Total threads: 4
Dashboard: http://127.0.0.1:45837/status,Memory: 15.64 GiB
Nanny: tcp://127.0.0.1:37047,
Local directory: /tmp/dask-worker-space/worker-l1g79miv,Local directory: /tmp/dask-worker-space/worker-l1g79miv

0,1
Comm: tcp://127.0.0.1:43423,Total threads: 4
Dashboard: http://127.0.0.1:45269/status,Memory: 15.64 GiB
Nanny: tcp://127.0.0.1:39535,
Local directory: /tmp/dask-worker-space/worker-l7nlpnn8,Local directory: /tmp/dask-worker-space/worker-l7nlpnn8

0,1
Comm: tcp://127.0.0.1:46231,Total threads: 4
Dashboard: http://127.0.0.1:43767/status,Memory: 15.64 GiB
Nanny: tcp://127.0.0.1:40689,
Local directory: /tmp/dask-worker-space/worker-mywjrckx,Local directory: /tmp/dask-worker-space/worker-mywjrckx


In [10]:
df = dd.read_csv(CHARTEVENTS_FILENAME, dtype=chartevents_dtype ,assume_missing=True)

In [11]:
df = df.dropna(subset=['SUBJECT_ID','HADM_ID','ICUSTAY_ID','CHARTTIME']).drop('ROW_ID',axis=1)

In [12]:
df['SUBJECT_ID'] = df.map_partitions(lambda x: x['SUBJECT_ID'].astype(np.int64))
df['HADM_ID'] = df.map_partitions(lambda x: x['HADM_ID'].astype(np.int64))
df['ICUSTAY_ID'] = df.map_partitions(lambda x: x['ICUSTAY_ID'].astype(np.int64))
df['CHARTTIME'] = df.map_partitions(lambda x: pd.to_datetime(x['CHARTTIME'], errors='coerce'))

  df['CHARTTIME'] = df.map_partitions(lambda x: pd.to_datetime(x['CHARTTIME'], errors='coerce'))


In [13]:
df = df.set_index('ICUSTAY_ID',drop=True, sorted=False)

In [14]:
len(df.divisions)
(df.divisions)
df.info

<bound method DataFrame.info of Dask DataFrame Structure:
npartitions=551                                                                                                                             
200001               int64   int64  float64  datetime64[ns]    object  object  object   object   object  object  object       object  object
200060                 ...     ...      ...             ...       ...     ...     ...      ...      ...     ...     ...          ...     ...
...                    ...     ...      ...             ...       ...     ...     ...      ...      ...     ...     ...          ...     ...
299930                 ...     ...      ...             ...       ...     ...     ...      ...      ...     ...     ...          ...     ...
299999                 ...     ...      ...             ...       ...     ...     ...      ...      ...     ...     ...          ...     ...
Dask Name: sort_index, 17 graph layers>

In [15]:
# valid_chartevents = df.map_partitions(lambda df: dd.multi.merge(readmission[['SUBJECT_ID','HADM_ID','ICUSTAY_ID']],df,'inner',['SUBJECT_ID','HADM_ID','ICUSTAY_ID']))
valid_chartevents = df.map_partitions(lambda df: dd.multi.merge(readmission[['ICUSTAY_ID']],df,'inner',['ICUSTAY_ID']))

In [16]:
# df.get_partition(1).head()

In [17]:
valid_chartevents.info

<bound method DataFrame.info of Dask DataFrame Structure:
npartitions=551                                                                                                                                        
200001               int64      int64   int64  float64  datetime64[ns]    object  object  object   object   object  object  object       object  object
200060                 ...        ...     ...      ...             ...       ...     ...     ...      ...      ...     ...     ...          ...     ...
...                    ...        ...     ...      ...             ...       ...     ...     ...      ...      ...     ...     ...          ...     ...
299930                 ...        ...     ...      ...             ...       ...     ...     ...      ...      ...     ...     ...          ...     ...
299999                 ...        ...     ...      ...             ...       ...     ...     ...      ...      ...     ...     ...          ...     ...
Dask Name: lambda, 18 graph la

In [18]:
valid_chartevents.get_partition(0).to_parquet(SAMPLES_DIR, engine="pyarrow", partition_on='ICUSTAY_ID', compute=True)

Key:       ('shuffle-p2p-0109c0bda0dd0ea5404b03892367842b', 0)
Function:  shuffle_unpack
args:      ('61b2cd87c84096a9402139b4a7f134bc', 0, 0)
kwargs:    {}
Exception: "RuntimeError('shuffle_unpack failed during shuffle 61b2cd87c84096a9402139b4a7f134bc')"



RuntimeError: shuffle_unpack failed during shuffle 61b2cd87c84096a9402139b4a7f134bc

In [None]:
# def store_values(df, partition_info=None):
#     div = str(partition_info['division'])
#     dd.to_hdf(df, SAMPLES_DIR + div + '.hdf', div, mode="w")

In [None]:
# stored_charts = valid_chartevents.map_partitions(store_values, meta=valid_chartevents)

In [None]:
# for p in valid_chartevents.partitions:
#     div = str(p.divisions[0])
#     p.to_hdf(SAMPLES_DIR+div+'.hdf', 'data',compute=True) 

In [None]:
# valid_chartevents.to_parquet(SAMPLES_DIR,compute=True)

In [None]:
# valid_chartevents.to_parquet(SAMPLES_DIR, engine="pyarrow", partition_on='ICUSTAY_ID', compute=False)

In [None]:
# total = int(MAX_ROWS_CHARTEVENTS/ROWS_TO_READ)+1

# with pd.read_csv(CHARTEVENTS_FILENAME,names=chartevents_columns,dtype=chartevents_dtype,skiprows=1,chunksize=ROWS_TO_READ) as file:
# # chartevents = dd.read_csv(CHARTEVENTS_FILENAME,dtype=chartevents_dtype)
#     for chartevents in tqdm(file,total=total):
#         chartevents = chartevents.dropna(subset=['SUBJECT_ID','HADM_ID','ICUSTAY_ID','CHARTTIME']).drop('ROW_ID',axis=1)
#         #print(chartevents.head())
#         try:
#             chartevents['SUBJECT_ID'] = chartevents['SUBJECT_ID'].astype(np.int64)
#             chartevents['HADM_ID'] = chartevents['HADM_ID'].astype(np.int64)
#             chartevents['ICUSTAY_ID'] = chartevents['ICUSTAY_ID'].astype(np.int64)
#             chartevents['CHARTTIME'] = pd.to_datetime(chartevents['CHARTTIME'], errors='coerce')
#         except Exception as e:
#             #print('\nError converting IDs to int in range', lines, 'to', lines + ROWS_TO_READ)
#             print(str(e))


#         valid_chartevents = pd.merge(readmission[['SUBJECT_ID','HADM_ID','ICUSTAY_ID']],chartevents,'inner',['SUBJECT_ID','HADM_ID','ICUSTAY_ID'])
#         #print(valid_chartevents.iloc[0]['SUBJECT_ID'])
        
#         while valid_chartevents.shape[0] > 0:
#             subject_id = valid_chartevents.iloc[0]['SUBJECT_ID']
#             hadm_id = valid_chartevents.iloc[0]['HADM_ID']
#             icustay_id = valid_chartevents.iloc[0]['ICUSTAY_ID']

#             df = valid_chartevents[(valid_chartevents['SUBJECT_ID'] == subject_id) & (valid_chartevents['HADM_ID'] == hadm_id) & (valid_chartevents['ICUSTAY_ID'] == icustay_id)]
#             df = df.reset_index(drop=True)

#             name = [str(subject_id),str(hadm_id),str(icustay_id)]
#             filename = 'sample_' + '_'.join(name) + FEATHER_EXT
#             df.to_feather(SAMPLES_DIR + filename)
#             #df.to_hdf(''.join([SAMPLES_DIR,DATASTORE_FILENAME]),filename,'a',append=True, index=False)

#             valid_chartevents = valid_chartevents[(valid_chartevents['SUBJECT_ID'] != subject_id) | (valid_chartevents['HADM_ID'] != hadm_id) | (valid_chartevents['ICUSTAY_ID'] != icustay_id)]

