# Split into train, validation and test sets

In [1]:
import pandas as pd
import numpy as np

## Sort based on time

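We want to train on earlier patients and make predictions for later ones, so each dataset is sorted by admission time (`admittime`) before it is split. Because the dataset is large, we use the first 80% of admissions for training, the next 10% for validation and the final 10% for testing.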

In [4]:
def sort_and_split(df_name, input_dir='data/preprocessing_III', output_dir='data/preprocessing_IV',
                   train_cutoff=0.8, validate_cutoff=0.9):
    """Sort one dataset chronologically and write train/validate/test CSVs.

    Reads ``<input_dir>/<df_name>.csv``, parses the ``admittime`` column as
    datetimes, sorts the rows by it (oldest first) and slices the sorted frame
    by position into three consecutive parts:

    * train:    rows ``[0, train_cutoff)``
    * validate: rows ``[train_cutoff, validate_cutoff)``
    * test:     rows ``[validate_cutoff, 1]``

    The parts are written to ``<output_dir>/<df_name>_train.csv``,
    ``..._validate.csv`` and ``..._test.csv`` without the index column.

    Parameters
    ----------
    df_name : str
        Base name of the CSV file (without the ``.csv`` extension).
    input_dir : str, optional
        Directory containing the input CSV.
    output_dir : str, optional
        Directory the three split files are written to. It is created if it
        does not exist yet.
    train_cutoff : float, optional
        Cumulative fraction of rows at which the training split ends.
    validate_cutoff : float, optional
        Cumulative fraction of rows at which the validation split ends; the
        remaining rows form the test split.

    Returns
    -------
    None
        The function is used for its side effect of writing three CSV files.

    Raises
    ------
    ValueError
        If the cutoffs do not satisfy
        ``0 <= train_cutoff <= validate_cutoff <= 1``.
    """
    # Local import so the function does not rely on the notebook's import cell.
    import os

    if not 0 <= train_cutoff <= validate_cutoff <= 1:
        raise ValueError(
            "cutoffs must satisfy 0 <= train_cutoff <= validate_cutoff <= 1, got "
            + repr((train_cutoff, validate_cutoff))
        )

    # import data
    print('Importing ', df_name)
    df = pd.read_csv(os.path.join(input_dir, df_name + '.csv'))
    # NOTE(review): the recorded cell output lists an 'Unnamed: 0' column, which
    # looks like a previously saved index; it is passed through unchanged here.

    # sort
    print("Sorting ", df_name)
    df['admittime'] = pd.to_datetime(df['admittime'])
    # mergesort is stable: rows sharing the same admittime keep their input
    # order, so the split boundaries are reproducible from run to run.
    # Missing admittime values (NaT) sort last and therefore end up in the
    # latest split.
    df = df.sort_values(by='admittime', kind='mergesort').reset_index(drop=True)

    # Positional cut points. The cutoffs are cumulative fractions (0.8 / 0.9)
    # rather than per-split sizes so the default boundaries are computed from
    # exactly the same floating-point products as before.
    n_rows = len(df)
    train_end = int(np.around(n_rows * train_cutoff))
    validate_end = int(np.around(n_rows * validate_cutoff))

    # split
    print("Splitting ", df_name)
    # NOTE(review): rows are split by admission time only, so the same
    # subject_id may appear in more than one split - confirm this is intended.
    train = df.iloc[:train_end]
    validate = df.iloc[train_end:validate_end]
    test = df.iloc[validate_end:]

    print("Saving train, validate and test for ", df_name)
    # to_csv does not create missing directories, so make sure the target exists.
    os.makedirs(output_dir, exist_ok=True)
    for suffix, part in (('train', train), ('validate', validate), ('test', test)):
        part.to_csv(os.path.join(output_dir, df_name + '_' + suffix + '.csv'), index=False)
    
    

In [5]:
# Base names of the per-ethnic-group CSV files that need to be split.
ethnic_group_names = [
    'unknown',
    'white',
    'other',
    'asian',
    'hispanic_latino',
    'black_african_american',
    'unable_to_obtain',
    'american_indian_alaska_native',
]

# Sort and split every group in turn, in the order listed above.
for name in ethnic_group_names:
    sort_and_split(name)


Importing  unknown
Unnamed: 0                                                    int64
hadm_id                                                       int64
subject_id                                                    int64
admittime                                                    object
edregtime                                                    object
anchor_age                                                    int64
anchor_year                                                   int64
icd_code_count                                                int64
emar_count                                                    int64
emar_charttime                                               object
lab_count                                                     int64
lab_charttime                                                object
has_kidney_issue                                               bool
admission_type_urgent                                          bool
admission_type_direct_emer.  

KeyboardInterrupt: 