In [3]:

import pandas as pd
import csv
import yaml
from Data_Creator_Helper import * 
from FilterEvents import filter_events
from Data_Creator_Preprocessing import *

# Make a csv for all stays

In [5]:
# Load the raw patient, admission and ICU-stay tables from the raw data folder.
patients = read_patients_data('./rawdata/')
admissions = read_admission_data('./rawdata/')
stays = read_icustays_data('./rawdata/')

# Drop stays with transfers, then join admission and patient information onto the stays.
stays = merge_on_subject(
    merge_on_subject_admission(remove_icustays_with_transfers(stays), admissions),
    patients,
)

# Remaining steps each take the stays table and return the updated one; order matters.
for _stay_step in (
    filter_admissions_on_nb_icustays,
    add_age_to_icustays,
    add_inhospital_mortality_to_icustays,
    filter_icustays_on_age,
    add_inunit_mortality_to_icustays,
):
    stays = _stay_step(stays)

# Persist the resulting table of stays.
all_stays_path = os.path.join('./data/', 'all_stays.csv')
stays.to_csv(all_stays_path, index=False)

# Make a csv for all diagnoses

In [6]:
# Read the ICD diagnoses and restrict them with respect to the stays table built above.
diagnoses = filter_diagnoses_on_stays(read_icd_diagnoses_data('./rawdata/'), stays)

# Persist the filtered diagnoses.
all_diagnoses_path = os.path.join('./data/', 'all_diagnoses.csv')
diagnoses.to_csv(all_diagnoses_path, index=False)

# Make a csv with diagnoses count


In [7]:
# Count the ICD codes in the filtered diagnoses; the helper is given a CSV path for its output.
diagnosis_counts_path = os.path.join('./data/', 'diagnosis_counts.csv')
counts = count_icd_codes(diagnoses, output_path=diagnosis_counts_path)


In [8]:
# Load the HCUP CCS 2015 definitions.
# - `with` closes the file handle deterministically (the original left it open).
# - `yaml.safe_load` replaces `yaml.load` without a Loader, which is deprecated in
#   PyYAML >= 5.1 and raises a TypeError in PyYAML >= 6.0; it also avoids constructing
#   arbitrary Python objects from the YAML file.
with open('resources/hcup_ccs_2015_definitions.yaml', 'r') as definitions_file:
    hcup_ccs_definitions = yaml.safe_load(definitions_file)

# Attach the HCUP CCS 2015 groups to the diagnoses.
phenotypes = add_hcup_ccs_2015_groups(diagnoses, hcup_ccs_definitions)

# Build the phenotype label matrix for the stays and write it out, quoting every
# non-numeric field in the CSV.
make_phenotype_label_matrix(phenotypes, stays).to_csv(os.path.join('./data', 'phenotype_labels.csv'),
                                                      index=False, quoting=csv.QUOTE_NONNUMERIC)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike


# Make a csv file with stays for each patient

In [9]:
# Distinct subject ids that appear in the stays table.
subjects = stays['SUBJECT_ID'].unique()

# Split the stays by subject, writing under ./data/.
break_up_stays_by_subject(
    stays,
    './data/',
    subjects=subjects,
    verbose=1,
)

SUBJECT 33798 of 33798...DONE!


# Make a csv file with diagnoses for each patient


In [10]:
# Split the phenotype-annotated diagnoses by subject, writing under ./data/.
break_up_diagnoses_by_subject(
    phenotypes,
    './data/',
    subjects=subjects,
    verbose=1,
)

SUBJECT 33798 of 33798...DONE!


# Make a csv file with events for each patient

This will make a folder for each patient under `./data/`. Each folder is expected to hold that patient's CSV files (e.g. `stays.csv` and `events.csv`).

In [8]:
# For testing only: restrict the run to roughly 1000 randomly chosen patients and a single event table.

#event_data = ['CHARTEVENTS', 'LABEVENTS', 'OUTPUTEVENTS']
#pat_idx = np.random.choice(patients.shape[0], size=1000)
#patients = patients.iloc[pat_idx]
#stays = stays.merge(patients[['SUBJECT_ID']], left_on='SUBJECT_ID', right_on='SUBJECT_ID')
#event_data = [event_data[2]]
#print('Using only', stays.shape[0], 'stays and only', event_data[0], 'table')

In [12]:
# Item dictionary whose ITEMID values define which lab events are kept.
path_to_items = './rawdata/D_LABITEMS.csv'

event_tables = ['LABEVENTS']

# Collect the distinct ITEMIDs as plain ints; None means "no item filter".
if path_to_items:
    lab_item_ids = pd.read_csv(path_to_items, header=0, index_col=0)['ITEMID'].unique()
    items_to_keep = {int(itemid) for itemid in lab_item_ids}
else:
    items_to_keep = None

# Read each event table and split it by subject, keeping only the selected items and subjects.
for table in event_tables:
    read_events_table_and_break_up_by_subject(
        './rawdata/',
        table,
        './data/',
        items_to_keep=items_to_keep,
        subjects_to_keep=subjects,
        verbose=1,
    )



finished processing LABEVENTS: ROW 27854054 of 27854056...last write (394668) 73 rows for subject 96442...DONE!


In [13]:
# Item dictionary whose ITEMID values define which chart/output events are kept.
path_to_items = './rawdata/D_ITEMS.csv'

event_tables = ['CHARTEVENTS', 'OUTPUTEVENTS']

# Collect the distinct ITEMIDs as plain ints; None means "no item filter".
if path_to_items:
    chart_item_ids = pd.read_csv(path_to_items, header=0, index_col=0)['ITEMID'].unique()
    items_to_keep = {int(itemid) for itemid in chart_item_ids}
else:
    items_to_keep = None

# Read each event table and split it by subject, keeping only the selected items and subjects.
for table in event_tables:
    read_events_table_and_break_up_by_subject(
        './rawdata/',
        table,
        './data/',
        items_to_keep=items_to_keep,
        subjects_to_keep=subjects,
        verbose=1,
    )

finished processing CHARTEVENTS: ROW 330712482 of 330712484...last write (3168090) 287 rows for subject 99781...DONE!
finished processing OUTPUTEVENTS: ROW 4349217 of 4349219...last write (351140) 39 rows for subject 68375...DONE!


### Remove data from events
The filtering is based on the assumptions made in the GitHub project.
The assumptions are:
* There is a one-to-one mapping between HADM_ID and ICUSTAY_ID in `stays.csv` files.
* HADM_ID and ICUSTAY_ID are not empty in `stays.csv` files.
* `stays.csv` and `events.csv` files are always present.
* After the initial filtering, there is no case in which an empty ICUSTAY_ID cannot be recovered.
  
Problems that are fixed by filtering the events:
* Remove all events for which HADM_ID is missing.
* Remove all events for which HADM_ID is not present in `stays.csv`.
* If ICUSTAY_ID is missing in an event and HADM_ID is not missing, then we look at `stays.csv` and try to recover ICUSTAY_ID.
* Remove all events for which we cannot recover ICUSTAY_ID.
* Remove all events for which ICUSTAY_ID is not present in `stays.csv`.

In [14]:
# Run the event-filtering step imported from FilterEvents; it is called with no arguments here.
filter_events()

processed 1 / 33798           


  if self.run_code(code, result):


processed 3001 / 33798           
processed 6001 / 33798           
processed 9001 / 33798           
processed 12001 / 33798           
processed 15001 / 33798           
processed 18001 / 33798           
processed 21001 / 33798           
processed 24001 / 33798           
processed 27001 / 33798           
processed 30001 / 33798           
processed 33001 / 33798           
n_events: 253116833
empty_hadm: 5162703
no_hadm_in_stay: 32266173
no_icustay: 15735688
recovered: 15735688
could_not_recover: 0
icustay_missing_in_stays: 7115720
