## Data Preprocessing for Modeling

In [1]:
import os
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

## Load Data

In [2]:
# Path of the cleaned cohort extract that the rest of the notebook works on.
cohort_csv_path = "./data/vent_trach_cohort_clean25May2020.csv"
df = pd.read_csv(cohort_csv_path)

## Drop and Categorize Data

In [3]:
# Drop columns: SUBJECT_ID, HADM_ID, ETHNICITY, COHORT
df = df.drop(columns=['SUBJECT_ID', 'HADM_ID', 'ETHNICITY', 'COHORT'])

# Label-encode the two outcome columns as integers.
# (GENDER is NOT label-encoded here; it is one-hot encoded below.)
# `le` is refit per column, so after this loop it holds the classes of
# MORTALITY_1YR_DISCH only.
le = LabelEncoder()
for outcome_col in ['MORTALITY_3MO_DISCH', 'MORTALITY_1YR_DISCH']:
    df[outcome_col] = le.fit_transform(df[outcome_col].tolist())

## Before any more preprocessing or splitting, keep an unscaled, un-encoded
## snapshot as `tmp` for the ProVent logistic regression.
## An explicit copy is taken so that later writes to `df` can never leak into
## `tmp`, regardless of whether the steps below return new frames or not.
tmp = df.copy()

# One-hot encode: ADMISSION_TYPE, ADMISSION_LOCATION, FIRST_CAREUNIT, GENDER
# (each dummy column is prefixed with the name of the column it came from)
one_hot_cols = ['ADMISSION_TYPE', 'ADMISSION_LOCATION', 'FIRST_CAREUNIT', 'GENDER']
df = pd.get_dummies(df, prefix=one_hot_cols, columns=one_hot_cols)

### Standard Scaler for normal distributions

In [4]:
# Feature scaling, applied in place on `df`.
#
# Every group of columns is defined exactly once and used on both sides of the
# assignment, so the selected columns and the written columns cannot drift
# apart. Both scalers work column by column, so each column is rescaled using
# only its own values.
#
# NOTE(review): the scaling statistics are computed over all rows of `df`,
# i.e. before the train/test split performed later in the notebook, so test
# rows contribute to them. Confirm this is intended.

def _measure_columns(prefix, measures, stats):
    """Build column names of the form ``<prefix>_<measure>_<stat>``.

    Names are produced measure by measure, e.g. prefix ``"LD1"``, measures
    ``["SODIUM", "PLATELET"]`` and stats ``["MIN", "MAX"]`` give
    ``LD1_SODIUM_MIN, LD1_SODIUM_MAX, LD1_PLATELET_MIN, LD1_PLATELET_MAX``.
    """
    return ["{}_{}_{}".format(prefix, measure, stat)
            for measure in measures
            for stat in stats]


min_max_stats = ["MIN", "MAX"]
min_max_mean_stats = ["MIN", "MAX", "MEAN"]

# Measures that get standard scaling
standard_lab_measures = ["SODIUM", "PLATELET", "POTASSIUM", "CHLORIDE", "GLUCOSE",
                         "HEMATOCRIT", "HEMOGLOBIN", "BICARBONATE", "ANIONGAP", "ALBUMIN"]
standard_vital_measures = ["GLUCOSE", "MEANBP", "RESPRATE", "TEMPC",
                           "HEARTRATE", "SYSBP", "DIASBP"]
# Lab measures that get min-max scaling
min_max_lab_measures = ["BANDS", "BILIRUBIN", "CREATININE", "LACTATE", "PTT",
                        "INR", "PT", "BUN", "WBC"]

# Standard scaler (zero mean, unit variance per column)
standard_scaled_groups = [
    # General
    ## Scores
    ["ICU_D1_APSIII", "ICU_D1_OASIS", "ICU_D1_SOFA", "ICU_D1_LODS", "ICU_D1_SAPSII"],
    ## Misc
    ["HEIGHT_AVG", "ADMISSION_AGE"],
    # D1
    ## Lab values
    _measure_columns("LMVD1", standard_lab_measures, min_max_stats),
    _measure_columns("LD1", standard_lab_measures, min_max_stats),
    ## Vital signs
    _measure_columns("VMVD1", standard_vital_measures, min_max_mean_stats),
    _measure_columns("VD1", standard_vital_measures, min_max_mean_stats),
    ## Misc
    ["WEIGHTD1_WEIGHT_AVG", "WEIGHTMVD1_WEIGHT_AVG", "BMID1", "BMIMVD1"],
    # D7
    ## Lab values
    _measure_columns("LMVD7", standard_lab_measures, min_max_stats),
    ## Vital signs
    _measure_columns("VMVD7", standard_vital_measures, min_max_mean_stats),
    ## Misc
    ["WEIGHTMVD7_WEIGHT_AVG", "BMIMVD7"],
]
for scaled_cols in standard_scaled_groups:
    df[scaled_cols] = preprocessing.scale(df[scaled_cols])

# Min Max Scaler (each column mapped onto [0, 1])
min_max_scaler = preprocessing.MinMaxScaler()
min_max_scaled_groups = [
    # Vital signs
    ## Day 1
    _measure_columns("VD1", ["SPO2"], min_max_mean_stats)
    + _measure_columns("VMVD1", ["SPO2"], min_max_mean_stats),
    ## Day 7
    _measure_columns("VMVD7", ["SPO2"], min_max_mean_stats),
    # Lab values
    ## Day 1
    _measure_columns("LD1", min_max_lab_measures, min_max_stats),
    _measure_columns("LMVD1", min_max_lab_measures, min_max_stats),
    ## Day 7
    _measure_columns("LMVD7", min_max_lab_measures, min_max_stats),
]
# The single scaler object is refit for every group, so once the loop ends it
# only remembers the statistics of the last group (the day-7 lab values).
for scaled_cols in min_max_scaled_groups:
    df[scaled_cols] = min_max_scaler.fit_transform(df[scaled_cols])

### Training/Testing Split
Here we pull the 3-month and 1-year mortality outcomes out of `df` as the target variables. Both outcome columns are then dropped from `df`, and the remaining feature columns are assigned to `X`.

In [5]:
# Split `df` into targets (y_*) and features (X)
outcome_cols = ['MORTALITY_3MO_DISCH', 'MORTALITY_1YR_DISCH']

y_3mo = df['MORTALITY_3MO_DISCH']
y_1yr = df['MORTALITY_1YR_DISCH']

## Features: everything in df except the two outcome columns
X = df.drop(columns=outcome_cols)

## The outcomes are already held in y_3mo / y_1yr, so tmp does not need them either
tmp = tmp.drop(columns=outcome_cols)

## X.head()
## Class balance of each outcome
for outcome_col in outcome_cols:
    print(df[outcome_col].value_counts())

0    2584
1    1750
Name: MORTALITY_3MO_DISCH, dtype: int64
0    2192
1    2142
Name: MORTALITY_1YR_DISCH, dtype: int64


In [6]:
# Class balance of the extracted targets (3-month first, then 1-year)
for outcome_series in (y_3mo, y_1yr):
    print(outcome_series.value_counts())

0    2584
1    1750
Name: MORTALITY_3MO_DISCH, dtype: int64
0    2192
1    2142
Name: MORTALITY_1YR_DISCH, dtype: int64


## Splitting

In [7]:
from sklearn.model_selection import StratifiedShuffleSplit

# 80/20 shuffle split, stratified on the 1-year outcome only.
sss = StratifiedShuffleSplit(n_splits=4, test_size=0.2, random_state=1)

# Four splits are generated but only the last one is kept for everything below.
# NOTE(review): the first three splits are discarded; confirm that keeping just
# the fourth is intended before changing n_splits, since that would change the
# train/test membership.
train_index, test_index = list(sss.split(X, y_1yr))[-1]

# Scaled / one-hot encoded model features
X_train = X.iloc[train_index]
X_test = X.iloc[test_index]

# Both outcomes follow the same row split
y_3mo_train = y_3mo.iloc[train_index]
y_3mo_test = y_3mo.iloc[test_index]
y_1yr_train = y_1yr.iloc[train_index]
y_1yr_test = y_1yr.iloc[test_index]

## Note: tmp holds the unscaled data used for ProVent
x_pv_train = tmp.iloc[train_index]
x_pv_test = tmp.iloc[test_index]

In [None]:
## Sanity Check
#print(y_3mo_train.head(10))
#print(y_1yr_train.head(10))

#print(y_3mo_test.head(10))
#print(y_1yr_test.head(10))

## print(sum(y_3mo_test))
## print(sum(y_1yr_test))

### ProVent Data (Same Split)

In [8]:
def _provent_features(unscaled):
    """Build the ProVent predictor table from a slice of the unscaled data.

    Parameters
    ----------
    unscaled : pandas.DataFrame
        Rows holding the raw (unscaled, not one-hot encoded) columns
        VPMVD7_VP, LMVD7_PLATELET_MIN, RRTMVD7_RRT, ADMISSION_AGE and
        FIRST_CAREUNIT. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: VPMVD7_VP, LMVD7_PLATELET_MIN, RRTMVD7_RRT,
        FIRST_CAREUNIT, AGE, where

        * LMVD7_PLATELET_MIN is 1 when the raw value is <= 100, else 0
        * FIRST_CAREUNIT is 1 when the unit is anything other than 'TSICU', else 0
        * AGE is 0 for ADMISSION_AGE < 50, 1 for 50-64, 2 for >= 65

    Raises
    ------
    ValueError
        From the final integer cast if any ADMISSION_AGE is missing.
    """
    # Work on an explicit copy: assigning columns on a slice of another frame
    # triggers SettingWithCopyWarning and may not behave as intended.
    provent = unscaled[['VPMVD7_VP', 'LMVD7_PLATELET_MIN', 'RRTMVD7_RRT',
                        'ADMISSION_AGE', 'FIRST_CAREUNIT']].copy()

    ## Flag first care units that are NOT the trauma surgery ICU (1 = non-TSICU)
    provent['FIRST_CAREUNIT'] = (provent['FIRST_CAREUNIT'] != 'TSICU').astype(int)

    ## Convert platelets to a 0/1 flag. This must run exactly once on the raw
    ## values: applied to the 0/1 flags again it would set every row to 1.
    provent['LMVD7_PLATELET_MIN'] = (provent['LMVD7_PLATELET_MIN'] <= 100).astype(int)

    ## Convert age to categories:
    ## < 50 : 0
    ## >= 50: 1
    ## >= 65: 2
    ## (first matching condition wins; a missing age stays NaN so the cast fails loudly)
    admission_age = provent['ADMISSION_AGE']
    provent['AGE'] = np.select(
        [admission_age >= 65, admission_age >= 50, admission_age < 50],
        [2, 1, 0],
        default=np.nan,
    )

    ## drop admission_age now that we're done with it
    provent = provent.drop(columns=['ADMISSION_AGE'])
    provent['AGE'] = provent['AGE'].astype(int)
    return provent


## Training Set
x_pv_train = _provent_features(x_pv_train)

## Testing Set (identical transformation)
x_pv_test = _provent_features(x_pv_test)

In [9]:
# Number of positive (1) labels per outcome and split:
# 3-month train, 3-month test, 1-year train, 1-year test
for split_labels in (y_3mo_train, y_3mo_test, y_1yr_train, y_1yr_test):
    print(sum(split_labels))

1390
360
1714
428


### Save

In [10]:
## Full feature tables, written while they still contain ICUSTAY_ID
## (the DataFrame index is written as well)
for csv_path, feature_table in (('./data/X_train.csv', X_train),
                                ('./data/X_test.csv', X_test)):
    feature_table.to_csv(csv_path)

## The ids are on disk now, so remove ICUSTAY_ID from the in-memory features
X_train, X_test = (feature_table.drop(columns=['ICUSTAY_ID'])
                   for feature_table in (X_train, X_test))

## ProVent .csv files for Logistic Regression (no index column)
for csv_path, provent_table in (('./data/X_pv_train.csv', x_pv_train),
                                ('./data/X_pv_test.csv', x_pv_test)):
    provent_table.to_csv(csv_path, index = False)

## Outcomes as .csv (currently disabled)
## y_3mo_train.to_csv('./data/y_3mo_train.csv', index = False, header = None)
## y_3mo_test.to_csv('./data/y_3mo_test.csv', index = False, header = None)

## y_1yr_train.to_csv('./data/y_1yr_train.csv', index = False, header = None)
## y_1yr_test.to_csv('./data/y_1yr_test.csv', index = False, header = None)

## npy input: features (without ICUSTAY_ID) and both outcomes, train and test
npy_outputs = [
    ("./data/X_train_s.npy", X_train),
    ("./data/X_test_s.npy", X_test),
    ("./data/y_3mo_train_s.npy", y_3mo_train),
    ("./data/y_3mo_test_s.npy", y_3mo_test),
    ("./data/y_1yr_train_s.npy", y_1yr_train),
    ("./data/y_1yr_test_s.npy", y_1yr_test),
]
for npy_path, npy_values in npy_outputs:
    np.save(npy_path, npy_values)