In [None]:
import pandas as pd
from scipy.stats import skew

In [2]:
# Stream the chartevents CSV in 100,000-row pieces, then stitch the pieces
# into a single frame with a fresh 0..n-1 index.
chunks = list(
    pd.read_csv('../data_mimic_IV/icu/chartevents.csv', chunksize=100000)
)

chart_events = pd.concat(chunks, ignore_index=True)
# Summary of columns, dtypes and non-null counts for the assembled frame.
chart_events.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 432997491 entries, 0 to 432997490
Data columns (total 11 columns):
 #   Column        Non-Null Count      Dtype  
---  ------        --------------      -----  
 0   subject_id    432997491 non-null  int64  
 1   hadm_id       432997491 non-null  int64  
 2   stay_id       432997491 non-null  int64  
 3   caregiver_id  417283234 non-null  float64
 4   charttime     432997491 non-null  object 
 5   storetime     432414186 non-null  object 
 6   itemid        432997491 non-null  int64  
 7   value         420557262 non-null  object 
 8   valuenum      169211246 non-null  float64
 9   valueuom      105866265 non-null  object 
dtypes: float64(3), int64(4), object(4)
memory usage: 35.5+ GB


In [3]:
# Parse `charttime` into pandas datetimes so it can be compared against the
# datetime `intime` column of the cohort table in the time-window filters below.
chart_events['charttime']= pd.to_datetime(chart_events['charttime'])

In [4]:
# Fold itemid 223762 into itemid 223761 ('Temp F'): rescale its numeric value
# with the Celsius -> Fahrenheit formula, then relabel the rows.
# NOTE(review): presumably 223762 is the Celsius temperature item -- confirm
# against d_items. Only `valuenum` and `itemid` are rewritten; the `value`
# and `valueuom` columns of the converted rows are left as they were.
to_convert = chart_events['itemid'] == 223762
chart_events.loc[to_convert, 'valuenum'] = (
    chart_events.loc[to_convert, 'valuenum'] * 9 / 5 + 32
)
# Rows already stored as 223761 need no change, so relabel only the converted rows.
chart_events.loc[to_convert, 'itemid'] = 223761

In [5]:
# Load the study cohort table; `intime` and `outtime` are parsed as datetimes
# at read time so they can be compared directly with chart timestamps.
sub = pd.read_csv(
    'studysubjects_final.csv',
    parse_dates=['intime', 'outtime'],
)
sub.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44421 entries, 0 to 44420
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   subject_id  44421 non-null  int64         
 1   hadm_id     44421 non-null  int64         
 2   stay_id     44421 non-null  int64         
 3   intime      44421 non-null  datetime64[ns]
 4   outtime     44421 non-null  datetime64[ns]
 5   duration    44421 non-null  float64       
dtypes: datetime64[ns](2), float64(1), int64(3)
memory usage: 2.0 MB


In [6]:
# Candidate chartevents itemids whose availability in the cohort is assessed.
important_itemids = [
    220179, 220051, 220050, 225310, 225624, 220615, 223835,
    226537, 220045, 227464, 220645, 225651, 226260, 220210,
    223830, 220224, 220545, 220739, 223900, 223901, 223761]

# Denominator for the coverage percentages: distinct subjects in the cohort.
total_subjects = sub['subject_id'].nunique()

# Keep only the candidate items, then attach each stay's `intime` from the
# cohort table (inner join: events of stays outside the cohort are dropped).
filtered = chart_events[chart_events['itemid'].isin(important_itemids)]

chart = filtered.merge(
    sub[['subject_id','hadm_id','stay_id', 'intime']],
    on=['subject_id','hadm_id','stay_id'],
    how='inner'
)

# Restrict to events charted from `intime` up to 24 hours later (both ends inclusive).
chart_24hr = chart[
    (chart['charttime'] >= chart['intime']) &
    (chart['charttime'] <= chart['intime'] + pd.Timedelta(hours=24))
]

# Number of distinct subjects with at least one event per item in the window.
# groupby() only yields items that actually occur, so reindex onto the full
# candidate list: an item nobody had charted is reported as 0 instead of
# silently disappearing from the table. Sorting the ids matches groupby's key
# order, so row labels are unchanged whenever every item is present.
coverage = (
    chart_24hr.groupby('itemid')['subject_id']
    .nunique()
    .reindex(pd.Index(sorted(set(important_itemids)), name='itemid'), fill_value=0)
    .reset_index(name='subjects_with_measurement')
)
coverage['percent_coverage'] = 100 * coverage['subjects_with_measurement'] / total_subjects

# Human-readable labels, used only for the printed report.
descriptions = {
    220179: 'NIBP Systolic',
    220051: 'Arterial BP Diastolic',
    220050: 'Arterial BP Systolic',
    225310: 'ART BP Diastolic',
    225624: 'BUN',
    220615: 'Creatinine',
    223835: 'FiO2',
    226537: 'Glucose',
    220045: 'Heart Rate',
    227464: 'Potassium',
    220645: 'Sodium',
    225651: 'Direct Bilirubin',
    226260: 'Mechanical Ventilation',
    220210: 'Respiratory Rate',
    223830: 'pH (Arterial)',
    220224: 'PaO2',
    220545: 'Hematocrit',
    220739: 'GCS Eye',
    223900: 'GCS Verbal',
    223901: 'GCS Motor',
    223761: 'Temp F'
}
coverage['description'] = coverage['itemid'].map(descriptions)

# Most widely available items first.
coverage = coverage.sort_values('percent_coverage', ascending=False)
print(coverage[['itemid', 'description', 'percent_coverage']])

    itemid            description  percent_coverage
0   220045             Heart Rate         99.943720
9   220739                GCS Eye         99.903199
13  223900             GCS Verbal         99.891943
14  223901              GCS Motor         99.887441
4   220210       Respiratory Rate         99.725355
8   220645                 Sodium         98.534477
7   220615             Creatinine         98.525472
16  225624                    BUN         98.496207
6   220545             Hematocrit         98.151775
10  223761                 Temp F         97.201774
3   220179          NIBP Systolic         89.802121
12  223835                   FiO2         49.787263
11  223830          pH (Arterial)         44.242588
5   220224                   PaO2         43.983701
2   220051  Arterial BP Diastolic         40.417820
1   220050   Arterial BP Systolic         40.402062
19  227464              Potassium         30.553117
18  226537                Glucose         30.204183
15  225310  

In [7]:
# Subset of chartevents itemids retained as features
# (the variable name suggests a SAPS feature set -- TODO confirm).
saps_feature_in_chart = [
    220179, 225624, 220615, 223835, 220045, 220645, 220210,
    223830, 220224, 220545, 220739, 223900, 223901, 223761,
]

# Chart events for the retained items only (all stays, any time).
filtered_chart = chart_events.loc[chart_events['itemid'].isin(saps_feature_in_chart)]

# Inner join against the cohort table so every event row carries the cohort
# columns for its stay; events of stays outside the cohort are dropped.
merged = pd.merge(
    filtered_chart,
    sub,
    how='inner',
    on=['subject_id', 'hadm_id', 'stay_id'],
)

# Keep events charted from `intime` up to 24 hours later; `between` includes
# both endpoints. The copy detaches the result from the intermediate join.
merged = merged.loc[
    merged['charttime'].between(
        merged['intime'],
        merged['intime'] + pd.Timedelta(hours=24),
    )
].copy()

In [8]:
# Show the item-filtered chart events. This is the frame from before the
# cohort merge and the 24-hour window were applied, so it spans all stays and times.
filtered_chart

Unnamed: 0,subject_id,hadm_id,stay_id,caregiver_id,charttime,storetime,itemid,value,valuenum,valueuom,warning
6,10000032,29079034,39553978,18704.0,2180-07-23 14:00:00,2180-07-23 14:20:00,223761,98.7,98.70,°F,0.0
7,10000032,29079034,39553978,18704.0,2180-07-23 14:11:00,2180-07-23 14:17:00,220179,84,84.00,mmHg,0.0
10,10000032,29079034,39553978,18704.0,2180-07-23 14:12:00,2180-07-23 14:17:00,220045,91,91.00,bpm,0.0
11,10000032,29079034,39553978,18704.0,2180-07-23 14:12:00,2180-07-23 14:17:00,220210,24,24.00,insp/min,0.0
94,10000032,29079034,39553978,18704.0,2180-07-23 14:30:00,2180-07-23 14:43:00,220045,93,93.00,bpm,0.0
...,...,...,...,...,...,...,...,...,...,...,...
432997468,19999987,23865745,36195440,,2145-11-04 05:01:00,2145-11-04 05:51:00,220645,147,147.00,mEq/L,1.0
432997469,19999987,23865745,36195440,,2145-11-04 05:01:00,2145-11-04 05:51:00,225624,22,22.00,mg/dL,1.0
432997475,19999987,23865745,36195440,,2145-11-04 07:18:00,2145-11-04 07:19:00,220224,69,69.00,mmHg,1.0
432997477,19999987,23865745,36195440,,2145-11-04 07:18:00,2145-11-04 07:19:00,223830,7.42,7.42,units,0.0
