# Model Implementation - Train/Test Data Split

## Import Libraries

In [1]:
# import necessary libraries
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.utils import to_categorical

## Random Seed

In [2]:
# note: set the random seed for reproducibility of results
# the same value is passed as random_state to both the shuffle and the train/test split below

# define variable to store seed value
seed = 2

## Read Dataset to Pandas Dataframe

In [3]:
# load the model input data from modelinputdata.csv into a pandas dataframe
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which looks like a row
# index written out by an earlier to_csv call; nothing in this notebook drops it, so it
# is carried into the saved train/test files - confirm whether that is intended
model_df = pd.read_csv('modelinputdata.csv')
# preview the first rows, then report how many observations were loaded
display(model_df.head())
print('model_df row count:', len(model_df))

Unnamed: 0,ROA,F_ROA,CFO,F_CFO,CROA,F_CROA,ACCRUAL,F_ACCRUAL,CLEVER,F_CLEVER,CLIQUID,F_CLIQUID,EQ_OFFER,F_EQ_OFFER,CMARGIN,F_CMARGIN,CTURN,F_CTURN,F_SCORE
0,0.038256,1,0.055301,1,-0.008951,0,-0.017044,1,0.137563,0,0.049832,1,0.492,0,-0.008424,0,0.034591,1,5
1,0.024139,1,0.074192,1,-0.014118,0,-0.050053,1,-0.056161,1,0.410468,1,-0.891,1,0.010205,1,-0.230628,0,7
2,0.033413,1,0.065422,1,0.009274,1,-0.032009,1,-0.026959,1,0.120341,1,0.178,0,0.014049,1,-0.034682,0,7
3,0.026403,1,0.021188,1,0.051317,1,0.005215,0,0.046283,0,0.338023,1,-0.908,1,0.027259,1,0.372579,1,7
4,0.054896,1,0.044205,1,0.006296,1,0.010691,0,-0.023845,1,-0.164047,0,0.072,0,-0.010221,0,0.183351,1,5


model_df row count: 3550


## Shuffle Dataset

In [4]:
# randomly reorder the rows of model_df using the fixed seed
model_df = shuffle(model_df, random_state=seed)
# renumber the rows from 0 so the index no longer reflects the pre-shuffle order
model_df = model_df.reset_index(drop=True)

## One Hot Encoding of F-SCORE

In [5]:
# note: the F-SCORE variable has to be one hot encoded in order to calculate the categorical cross entropy loss

# labels for the ten indicator columns, one per F_SCORE value 0 through 9
f_score_classes = [str(score) for score in range(10)]

# one hot encode F-SCOREs
# num_classes is fixed explicitly: by default to_categorical infers the number of columns
# from the largest label present, so a dataset with no observation at the top score would
# yield fewer than ten columns and fail against the ten column labels below
F_SCORE_enc = to_categorical(model_df['F_SCORE'], num_classes=len(f_score_classes))
# reuse model_df's index so the column-wise concat below lines up row for row
F_SCORE_enc_df = pd.DataFrame(F_SCORE_enc, columns=f_score_classes,
                              index=model_df.index).astype(int)
# replace the original F_SCORE column with the ten indicator columns
model_df = pd.concat([model_df.drop(['F_SCORE'], axis=1), F_SCORE_enc_df], axis=1)

## Train/Test Split

In [6]:
# hold out 25% of model_df for testing; the remaining 75% is used for training
# sklearn's train_test_split performs the split, seeded via random_state
prototype_one_train_df, prototype_one_test_df = train_test_split(
    model_df,
    test_size=0.25,
    random_state=seed,
)

## Save Train/Test Dataset to CSV File

In [7]:
# renumber the rows of both splits from 0
prototype_one_train_df, prototype_one_test_df = (
    split_df.reset_index(drop=True)
    for split_df in (prototype_one_train_df, prototype_one_test_df)
)

# write each split to its own csv file, leaving the index out of the file
prototype_one_train_df.to_csv('prototype_one_train.csv', index=False)
prototype_one_test_df.to_csv('prototype_one_test.csv', index=False)