# Sort the data into a train and test set

This splits the data into a training set and a test set.
I have used the same test set as the GitHub project so that I can give a more precise comparison between my fastai approach and their approach.

The reason for making a general dataset is to be able to train on other areas of the data if time and resources allow.

In [2]:
import os
import shutil

# Root directory whose entries are moved into the train/test partitions.
DATA_PATH = './data/'

# Identifiers selected for the test partition. Each row of the CSV is read as
# "<identifier>,<flag>"; identifiers whose flag equals 1 are collected.
testset = set()
with open("resources/testset.csv", "r") as test_set_file:
    for line in test_set_file:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing on the two-value unpack below.
            continue
        x, y = line.split(',')
        if int(y) == 1:
            testset.add(x)

def move_to_partition(patients, partition):
    """Move patient folders into a partition sub-directory of DATA_PATH.

    Args:
        patients: Iterable of folder names located directly under DATA_PATH.
        partition: Name of the sub-directory of DATA_PATH that receives the
            folders; it is created when it does not exist yet.
    """
    partition_dir = os.path.join(DATA_PATH, partition)
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(partition_dir, exist_ok=True)
    for patient in patients:
        shutil.move(os.path.join(DATA_PATH, patient),
                    os.path.join(partition_dir, patient))


# Only the purely numeric entries directly under DATA_PATH are considered.
folders = [entry for entry in os.listdir(DATA_PATH) if entry.isdigit()]

# Route every folder to exactly one partition based on testset membership.
train_patients = []
test_patients = []
for folder in folders:
    if folder in testset:
        test_patients.append(folder)
    else:
        train_patients.append(folder)

# Sanity check: no folder may end up in both partitions.
assert not set(train_patients).intersection(test_patients)

move_to_partition(patients=train_patients, partition="train")
move_to_partition(patients=test_patients, partition="test")


This takes the train and test sets and creates datasets that are specific to the mortality prediction task.

In [3]:
from __future__ import print_function

import pandas as pd
import random
# Fixed seed so that any later use of the random module is reproducible.
random.seed(49297)

# Root directory for the task-specific datasets.
FINAL_DATA_PATH = './datasets/'

# exist_ok replaces the racy exists()-then-makedirs() pattern.
os.makedirs(FINAL_DATA_PATH, exist_ok=True)


def _select_events(ts_lines, eps, n_hours):
    """Keep the rows whose leading comma-separated field, read as a float,
    lies strictly inside (-eps, n_hours + eps)."""
    return [line for line in ts_lines
            if -eps < float(line.split(',')[0]) < n_hours + eps]


def _process_episode(patient, patient_folder, ts_filename, output_dir,
                     eps, n_hours):
    """Write the truncated time series of one episode.

    Returns:
        ``(output file name, mortality label)`` when the episode was written,
        or ``None`` when it was skipped.
    """
    # The label file shares the time-series name minus the "_timeseries" part.
    lb_filename = ts_filename.replace("_timeseries", "")
    label_df = pd.read_csv(os.path.join(patient_folder, lb_filename))

    # empty label file
    if label_df.shape[0] == 0:
        return None

    mortality = int(label_df.iloc[0]["Mortality"])
    los = 24.0 * label_df.iloc[0]['Length of Stay']  # in hours
    if pd.isnull(los):
        print("\n\t(length of stay is missing)", patient, ts_filename)
        return None

    # Stays shorter than the observation window are dropped.
    if los < n_hours - eps:
        return None

    # Read everything up front so the handle is closed before writing output.
    with open(os.path.join(patient_folder, ts_filename)) as tsfile:
        ts_lines = tsfile.readlines()

    # A zero-byte file has no header; it is handled as "no events" below
    # instead of raising IndexError.
    header = ts_lines[0] if ts_lines else ""
    rows = _select_events(ts_lines[1:], eps, n_hours)

    # no measurements in ICU
    if not rows:
        print("\n\t(no events in ICU) ", patient, ts_filename)
        return None

    output_ts_filename = patient + "_" + ts_filename
    with open(os.path.join(output_dir, output_ts_filename), "w") as outfile:
        outfile.write(header)
        outfile.writelines(rows)

    return output_ts_filename, mortality


def process_partition(partition, eps=1e-6, n_hours=48):
    """Build the mortality dataset for one partition.

    For every numeric folder in ``DATA_PATH/<partition>`` and every file in it
    whose name contains "timeseries", the matching label file is read. An
    episode is kept when it has a label row, a non-null length of stay of at
    least ``n_hours - eps`` hours, and at least one event inside the window.
    Kept episodes are written to ``FINAL_DATA_PATH/<partition>`` as
    ``<patient>_<ts_filename>`` containing the header plus the in-window rows.
    A ``listfile.csv`` with the columns ``stay,y_true`` is written alongside.

    Args:
        partition: Sub-directory name under both DATA_PATH (input) and
            FINAL_DATA_PATH (output). "train" entries are shuffled and "test"
            entries are sorted before the list file is written; any other
            value keeps processing order.
        eps: Tolerance applied to the length-of-stay and event-time
            comparisons.
        n_hours: Length of the observation window in hours.
    """
    output_dir = os.path.join(FINAL_DATA_PATH, partition)
    # makedirs also creates a missing FINAL_DATA_PATH and tolerates re-runs.
    os.makedirs(output_dir, exist_ok=True)

    xy_pairs = []
    partition_dir = os.path.join(DATA_PATH, partition)
    patients = list(filter(str.isdigit, os.listdir(partition_dir)))
    for patient_index, patient in enumerate(patients):
        patient_folder = os.path.join(partition_dir, patient)
        patient_ts_files = [name for name in os.listdir(patient_folder)
                            if "timeseries" in name]

        for ts_filename in patient_ts_files:
            pair = _process_episode(patient, patient_folder, ts_filename,
                                    output_dir, eps, n_hours)
            if pair is not None:
                xy_pairs.append(pair)

        if (patient_index + 1) % 100 == 0:
            print("\rprocessed {} / {} patients".format(patient_index + 1, len(patients)))

    print("\n", len(xy_pairs))
    if partition == "train":
        random.shuffle(xy_pairs)
    if partition == "test":
        xy_pairs = sorted(xy_pairs)

    with open(os.path.join(output_dir, "listfile.csv"), "w") as listfile:
        listfile.write('stay,y_true\n')
        for (x, y) in xy_pairs:
            listfile.write("%s,%d\n" % (x, y))


# Build the task-specific dataset for each partition, test first.
for partition_name in ("test", "train"):
    process_partition(partition_name)



	(no events in ICU)  52808 episode2_timeseries.csv
processed 100 / 5070 patients
processed 200 / 5070 patients
processed 300 / 5070 patients
processed 400 / 5070 patients
processed 500 / 5070 patients
processed 600 / 5070 patients
processed 700 / 5070 patients
processed 800 / 5070 patients
processed 900 / 5070 patients
processed 1000 / 5070 patients
processed 1100 / 5070 patients

	(no events in ICU)  21341 episode1_timeseries.csv
processed 1200 / 5070 patients
processed 1300 / 5070 patients
processed 1400 / 5070 patients
processed 1500 / 5070 patients
processed 1600 / 5070 patients
processed 1700 / 5070 patients
processed 1800 / 5070 patients
processed 1900 / 5070 patients
processed 2000 / 5070 patients
processed 2100 / 5070 patients
processed 2200 / 5070 patients
processed 2300 / 5070 patients
processed 2400 / 5070 patients
processed 2500 / 5070 patients
processed 2600 / 5070 patients

	(no events in ICU)  7512 episode1_timeseries.csv
processed 2700 / 5070 patients
processed 2800 / 


	(no events in ICU)  15215 episode1_timeseries.csv
processed 10200 / 36517 patients

	(no events in ICU)  1966 episode1_timeseries.csv
processed 10300 / 36517 patients
processed 10400 / 36517 patients

	(no events in ICU)  29267 episode1_timeseries.csv
processed 10500 / 36517 patients

	(no events in ICU)  32190 episode2_timeseries.csv
processed 10600 / 36517 patients

	(no events in ICU)  5716 episode1_timeseries.csv
processed 10700 / 36517 patients

	(no events in ICU)  98768 episode1_timeseries.csv
processed 10800 / 36517 patients

	(no events in ICU)  13134 episode1_timeseries.csv

	(no events in ICU)  10166 episode1_timeseries.csv
processed 10900 / 36517 patients

	(no events in ICU)  4156 episode1_timeseries.csv
processed 11000 / 36517 patients
processed 11100 / 36517 patients
processed 11200 / 36517 patients
processed 11300 / 36517 patients

	(no events in ICU)  15598 episode1_timeseries.csv
processed 11400 / 36517 patients
processed 11500 / 36517 patients

	(no events in ICU) 

processed 27500 / 36517 patients
processed 27600 / 36517 patients
processed 27700 / 36517 patients
processed 27800 / 36517 patients
processed 27900 / 36517 patients
processed 28000 / 36517 patients
processed 28100 / 36517 patients
processed 28200 / 36517 patients
processed 28300 / 36517 patients
processed 28400 / 36517 patients
processed 28500 / 36517 patients
processed 28600 / 36517 patients
processed 28700 / 36517 patients
processed 28800 / 36517 patients
processed 28900 / 36517 patients
processed 29000 / 36517 patients
processed 29100 / 36517 patients
processed 29200 / 36517 patients
processed 29300 / 36517 patients
processed 29400 / 36517 patients
processed 29500 / 36517 patients
processed 29600 / 36517 patients
processed 29700 / 36517 patients
processed 29800 / 36517 patients
processed 29900 / 36517 patients
processed 30000 / 36517 patients
processed 30100 / 36517 patients
processed 30200 / 36517 patients
processed 30300 / 36517 patients
processed 304