# Split train, validate and test

In [4]:
import os

import numpy as np
import pandas as pd

## Sort based on time

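We want to train on earlier patients and make predictions for later ones, so each dataset is sorted by admission time (`admittime`) and split chronologically. Because the dataset is large, we use the first 80% of rows for training, the next 10% for validation and the final 10% for testing.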

In [5]:
def sort_and_split(df_name, input_dir='data/preprocessing_III',
                   output_dir='data/preprocessing_IV',
                   train_end=0.8, validate_end=0.9):
    """Sort one dataset by admission time and write chronological splits.

    Reads ``<input_dir>/<df_name>.csv``, orders the rows by ``admittime``
    and cuts them by row position into three consecutive slices, which are
    written to ``<output_dir>/<df_name>_train.csv``,
    ``<output_dir>/<df_name>_validate.csv`` and
    ``<output_dir>/<df_name>_test.csv`` (without the index column).

    Parameters
    ----------
    df_name : str
        File stem of the dataset; also used as the prefix of the outputs.
    input_dir : str, optional
        Directory containing ``<df_name>.csv``.
    output_dir : str, optional
        Directory that receives the three output files. It is created if
        it does not exist yet.
    train_end : float, optional
        Cumulative fraction of rows at which the training slice ends.
    validate_end : float, optional
        Cumulative fraction of rows at which the validation slice ends;
        everything after it is the test slice.

    Raises
    ------
    ValueError
        If the cut points do not satisfy
        ``0 <= train_end <= validate_end <= 1``.
    """
    if not 0.0 <= train_end <= validate_end <= 1.0:
        raise ValueError(
            "expected 0 <= train_end <= validate_end <= 1, got "
            "train_end=%r, validate_end=%r" % (train_end, validate_end)
        )

    # import data
    print('Importing ', df_name)
    df = pd.read_csv(os.path.join(input_dir, df_name + '.csv'),
                     parse_dates=['admittime', 'edregtime', 'emar_charttime', 'lab_charttime'])

    # sort
    print("Sorting ", df_name)
    # A stable sort keeps rows that share an admittime in their file order,
    # so the rows that end up next to a cut point are reproducible.
    # Rows with a missing admittime are placed last (pandas default).
    df = df.sort_values(by='admittime', kind='mergesort').reset_index(drop=True)

    # Row positions of the two cut points (rounded to the nearest row).
    n_rows = len(df)
    train_stop = int(np.around(n_rows * train_end))
    validate_stop = int(np.around(n_rows * validate_end))

    # split
    print("Splitting ", df_name)
    # NOTE(review): the cut is made purely by row position. If several rows
    # can belong to the same admission, one admission may end up on both
    # sides of a cut point -- confirm whether that is acceptable.
    splits = (
        ('train', df.iloc[:train_stop]),
        ('validate', df.iloc[train_stop:validate_stop]),
        ('test', df.iloc[validate_stop:]),
    )

    print("Saving train, validate and test for ", df_name)
    # to_csv does not create missing directories, so make sure it exists.
    os.makedirs(output_dir, exist_ok=True)
    for suffix, part in splits:
        part.to_csv(os.path.join(output_dir, df_name + '_' + suffix + '.csv'), index=False)
    
    

In [6]:
# One entry per group; each entry is the name handed to sort_and_split.
ethnic_group_names = [
    'unknown',
    'white',
    'other',
    'asian',
    'hispanic_latino',
    'black_african_american',
    'unable_to_obtain',
    'american_indian_alaska_native',
]

# Sort and split every group in turn, in list order.
for name in ethnic_group_names:
    sort_and_split(name)


Importing  unknown
Sorting  unknown
Splitting  unknown
Saving train, validate and test for  unknown
Importing  white
Sorting  white
Splitting  white
Saving train, validate and test for  white
Importing  other
Sorting  other
Splitting  other
Saving train, validate and test for  other
Importing  asian
Sorting  asian
Splitting  asian
Saving train, validate and test for  asian
Importing  hispanic_latino
Sorting  hispanic_latino
Splitting  hispanic_latino
Saving train, validate and test for  hispanic_latino
Importing  black_african_american
Sorting  black_african_american
Splitting  black_african_american
Saving train, validate and test for  black_african_american
Importing  unable_to_obtain
Sorting  unable_to_obtain
Splitting  unable_to_obtain
Saving train, validate and test for  unable_to_obtain
Importing  american_indian_alaska_native
Sorting  american_indian_alaska_native
Splitting  american_indian_alaska_native
Saving train, validate and test for  american_indian_alaska_native
