In [None]:

import csv
import os

import pandas as pd
import yaml
%run 'DataGatherer.ipynb' 
%run 'Preprocess_data.ipynb'

# Make a csv for all stays

In [None]:
# Read the raw patient, admission and ICU-stay tables from ./rawdata/.
patients = read_patients_data('./rawdata/')
admissions = read_admission_data('./rawdata/')
stays = read_icustays_data('./rawdata/')

# Clean the stays table, then attach the admission- and patient-level tables to it.
stays = remove_icustays_with_transfers(stays)
stays = merge_on_subject_admission(stays, admissions)
stays = merge_on_subject(stays, patients)

# The remaining steps each take the stays table and return an updated one,
# so they are applied one after another in this fixed order.
for stays_step in (
    filter_admissions_on_nb_icustays,
    add_age_to_icustays,
    add_inhospital_mortality_to_icustays,
    add_inunit_mortality_to_icustays,
):
    stays = stays_step(stays)

# Persist the final stays table for the downstream steps.
stays.to_csv(os.path.join('./data/', 'all_stays.csv'), index=False)

# Make a csv for all diagnoses

In [None]:
# Read the ICD diagnoses from ./rawdata/, filter them against `stays`,
# and write the result to ./data/all_diagnoses.csv.
diagnoses = filter_diagnoses_on_stays(read_icd_diagnoses_data('./rawdata/'), stays)
diagnoses.to_csv(os.path.join('./data/', 'all_diagnoses.csv'), index=False)

# Make a csv with diagnoses count


In [None]:
# Count the ICD codes in `diagnoses`; the helper is also handed the CSV path
# for the counts (presumably it writes the file itself -- verify in the helper).
diagnosis_counts_path = os.path.join('./data/', 'diagnosis_counts.csv')
counts = count_icd_codes(diagnoses, output_path=diagnosis_counts_path)


In [None]:
# Attach the HCUP CCS 2015 groups to the diagnoses and write the phenotype label
# matrix to ./data/phenotype_labels.csv.
#
# The definitions file is parsed with yaml.safe_load: calling yaml.load without an
# explicit Loader is deprecated since PyYAML 5.1 and raises TypeError on PyYAML >= 6.
# safe_load only builds plain Python objects (dicts, lists, scalars).
# NOTE(review): assumes the definitions file uses only standard YAML tags -- confirm.
# The `with` block also closes the file handle, which was previously left open.
with open('resources/hcup_ccs_2015_definitions.yaml', 'r') as definitions_file:
    phenotypes = add_hcup_ccs_2015_groups(diagnoses, yaml.safe_load(definitions_file))

# QUOTE_NONNUMERIC quotes every non-numeric field in the CSV output.
make_phenotype_label_matrix(phenotypes, stays).to_csv(os.path.join('./data', 'phenotype_labels.csv'),
                                                      index=False, quoting=csv.QUOTE_NONNUMERIC)

# Make a csv file with stays for each patient

In [None]:
# Collect the distinct subject ids that appear in the stays table and hand them,
# together with the stays, to the per-subject splitter writing under ./data/.
subjects = stays['SUBJECT_ID'].unique()
break_up_stays_by_subject(
    stays,
    './data/',
    subjects=subjects,
    verbose=1,
)

# Make a csv file with diagnoses for each patient


In [None]:
# Split the diagnoses per subject under ./data/. Note that `phenotypes` (the
# diagnoses with the HCUP CCS groups attached) is passed, not the plain `diagnoses`.
break_up_diagnoses_by_subject(
    phenotypes,
    './data/',
    subjects=subjects,
    verbose=1,
)

# Make a csv file with events for each patient

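This will make a folder for each patient. Each folder is meant to hold the files for that one patient (for example the `stays.csv` and `events.csv` files that the filtering step further down relies on).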

In [None]:
# Optional, disabled by default: a quick test run on roughly 1000 randomly chosen
# patients and a single event table. Uncomment the lines below to enable it.
# NOTE(review): `np` is not imported in this notebook's import cell; add
# `import numpy as np` (unless one of the %run notebooks already provides it).
# NOTE(review): `subjects` is computed from `stays` in an earlier cell, so it would
# have to be recomputed after this cell for the subsample to reach the event
# extraction below -- confirm before relying on it.
#This is for testing, choosing around 1000 patients and one event type only. 

#event_data = ['CHARTEVENTS', 'LABEVENTS', 'OUTPUTEVENTS']
#pat_idx = np.random.choice(patients.shape[0], size=1000)
#patients = patients.iloc[pat_idx]
#stays = stays.merge(patients[['SUBJECT_ID']], left_on='SUBJECT_ID', right_on='SUBJECT_ID')
#event_data = [event_data[2]]
#print('Using only', stays.shape[0], 'stays and only', event_data[0], 'table')

In [None]:
# Lab events: the ITEMIDs to keep come from the lab-item dictionary D_LABITEMS.csv.
path_to_items = './rawdata/D_LABITEMS.csv'
event_data = ['LABEVENTS']

# Build the set of integer item ids; None when no dictionary path is configured.
if path_to_items:
    items_to_keep = {int(itemid) for itemid in pd.read_csv(path_to_items)['ITEMID'].unique()}
else:
    items_to_keep = None

# Process each listed event table, restricted to the kept items and to `subjects`.
for table in event_data:
    read_events_table_and_break_up_by_subject(
        './rawdata/', table, './data/',
        items_to_keep=items_to_keep, subjects_to_keep=subjects, verbose=1,
    )



In [None]:
# Chart and output events: the ITEMIDs to keep come from the dictionary D_ITEMS.csv.
path_to_items = './rawdata/D_ITEMS.csv'
event_data = ['CHARTEVENTS', 'OUTPUTEVENTS']

# Build the set of integer item ids; stays None when no dictionary path is configured.
items_to_keep = None
if path_to_items:
    item_ids = pd.read_csv(path_to_items)['ITEMID'].unique()
    items_to_keep = set(map(int, item_ids))

# Process each listed event table, restricted to the kept items and to `subjects`.
for table in event_data:
    read_events_table_and_break_up_by_subject(
        './rawdata/',
        table,
        './data/',
        items_to_keep=items_to_keep,
        subjects_to_keep=subjects,
        verbose=1,
    )

# Remove data from events
This step relies on the assumptions made in the original GitHub project.
The assumptions are:
* There is one-to-one mapping between HADM_ID and ICUSTAY_ID in `stays.csv` files.
* HADM_ID and ICUSTAY_ID are not empty in `stays.csv` files.
* `stays.csv` and `events.csv` files are always present.
* After the initial filtering, there is no case in which an empty ICUSTAY_ID cannot be recovered.
  
Problems that are fixed by filtering the events:
* Remove all events for which HADM_ID is missing.
* Remove all events for which HADM_ID is not present in `stays.csv`.
* If ICUSTAY_ID is missing in an event and HADM_ID is not missing, then we look at `stays.csv` and try to recover ICUSTAY_ID.
* Remove all events for which we cannot recover ICUSTAY_ID.
* Remove all events for which ICUSTAY_ID is not present in `stays.csv`.

In [None]:
# Execute the FilterEvents notebook in this kernel.
# NOTE(review): presumably it implements the event clean-up rules described in the
# markdown cell above -- confirm against FilterEvents.ipynb.
%run 'FilterEvents.ipynb'