In [1]:
# Importing modules
import pandas as pd
import numpy as np
from config import config

# Load the raw dataset: 'TACE_LR.csv' is appended directly to the configured
# `path_raw` prefix.
raw_csv_path = config['path_raw'] + 'TACE_LR.csv'
raw_dataset = pd.read_csv(raw_csv_path)

# Report the (rows, columns) shape, then end the cell on the summary
# statistics so the notebook renders them as the cell's output.
shape_message = "The raw_dataset has the following shape: {}."
print(shape_message.format(raw_dataset.shape))
raw_dataset.describe()

The raw_dataset has the following shape: (5073, 22).


Unnamed: 0,ID,state,time,treatment_before,gender,age,Etiology,ECOG,tumor_number,tumor_size,...,ALB,TBIL,AST,ALT,PLT,INR,BUN,Cr,WBC,HGB
count,5073.0,5073.0,5073.0,5073.0,5073.0,5073.0,5073.0,5073.0,5073.0,5073.0,...,5073.0,5073.0,5073.0,5073.0,5073.0,5073.0,5073.0,5073.0,5073.0,5073.0
mean,5054.965109,0.456535,32.353119,0.046521,0.849793,54.967988,1.064065,0.27262,1.777055,6.478766,...,40.522624,16.853966,54.996409,54.9593,143.088602,1.061043,5.45226,74.631538,5.197461,136.292104
std,2291.59278,0.498156,22.551936,0.210631,0.357309,11.913047,0.244892,0.445351,1.413436,3.767633,...,4.840245,7.621541,77.91935,77.191351,77.455903,0.115742,1.927816,20.905981,3.353453,20.091716
min,1001.0,0.0,1.033333,0.0,0.0,18.0,1.0,0.0,1.0,0.7,...,20.0,0.1,1.0,2.0,11.0,0.69,0.9,3.0,1.0,33.3
25%,2736.0,0.0,13.466667,0.0,1.0,46.0,1.0,0.0,1.0,3.5,...,37.2,11.5,29.0,25.0,86.0,0.99,4.31,64.0,3.4,125.0
50%,5664.0,0.0,28.166667,0.0,1.0,55.0,1.0,0.0,1.0,5.5,...,40.5,15.3,41.0,37.0,134.0,1.05,5.3,73.088394,4.64,138.0
75%,7018.0,1.0,46.533333,0.0,1.0,64.0,1.0,1.0,2.0,8.8,...,43.8,20.5,58.614578,60.0,185.0,1.13,6.2,83.0,6.26928,150.0
max,8427.0,1.0,141.233333,1.0,1.0,91.0,2.0,1.0,12.0,22.6,...,75.4,84.0,2900.0,2700.0,634.0,3.1,51.0,824.0,75.0,207.0


In [None]:
# Defining the time and event column
time_column = 'time'
event_column = 'state'

# `dataset` is the modeling frame without the identifier column.
# `dataset_with_id` keeps every raw column (ID included); it is a second
# reference to `raw_dataset`, not a copy -- nothing below mutates it in place.
dataset = raw_dataset.drop(columns=['ID'])
dataset_with_id = raw_dataset

# Defining the modeling features: every column except the time/event pair.
# The names come from the constants above so the two cannot drift apart.
# np.setdiff1d returns the remaining names sorted and de-duplicated.
features_all = np.setdiff1d(dataset.columns, [time_column, event_column]).tolist()
# Hand-picked subset of features.
# NOTE(review): these names are not validated against dataset.columns here.
features_select = ["tumor_size","therapy", "AFP", "AST", "tumor_number", "ECOG", "ALB", "HGB", "INR"]

# Checking for null values (feature columns only; the time/event columns are
# not inspected by this check).
N_null = int(dataset[features_all].isnull().sum().sum())
print("The dataset contains {} null values".format(N_null))  # original author's note: 0 null values

# Removing duplicates if there exist.
# Duplicates are detected on the frame *without* ID, so rows that differ only
# by ID are treated as duplicates; the first occurrence is kept.
is_duplicate = dataset.duplicated(keep='first')
N_dupli = int(is_duplicate.sum())
N_dupli_list = dataset.index[is_duplicate].tolist()
print(N_dupli_list)
# The same boolean mask filters both frames (they share raw_dataset's index),
# which keeps them row-aligned after the index reset.
dataset = dataset.loc[~is_duplicate].reset_index(drop=True)
dataset_with_id = dataset_with_id.loc[~is_duplicate].reset_index(drop=True)
print("The dataset contains {} duplicates".format(N_dupli))

# Both frames must have lost exactly the same rows.
assert len(dataset) == len(dataset_with_id)

print(dataset.shape)
print(dataset_with_id.shape)

In [None]:
# From here on `dataset` refers to the de-duplicated frame that still carries
# the ID column, so the files saved below include ID as well.
dataset = dataset_with_id

# Building training and testing sets
from sklearn.model_selection import train_test_split

# Split row *positions* (0 .. n-1); the held-out fraction and the seed come
# from the project config.
index_train, index_test = train_test_split(
    range(dataset.shape[0]),
    test_size=config['valid_ratio'],
    random_state=config['seed'],
    shuffle=True,
)

# The indices are positions, so select with .iloc. Label-based .loc would
# only be correct while the frame happens to have a fresh 0..n-1 index.
data_train = dataset.iloc[index_train].reset_index(drop=True)
data_test = dataset.iloc[index_test].reset_index(drop=True)

# Every row must land in exactly one of the two sets.
assert len(data_train) + len(data_test) == len(dataset)

# Print out the data size.
print("The training set has the following shape: {}.".format(data_train.shape))
print("The testing set has the following shape: {}.".format(data_test.shape))

# save the data (without the pandas index) under the configured
# `path_processed` prefix
data_train.to_csv(config['path_processed'] + 'data_train.csv', index=False)
data_test.to_csv(config['path_processed'] + 'data_test.csv', index=False)