In [1]:
import numpy as np
import pandas as pd

from sklearn.pipeline import make_pipeline
from sklearn.datasets import make_classification
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder, PowerTransformer
from sklearn.model_selection import cross_val_score, train_test_split, KFold, cross_val_predict
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import log_loss, accuracy_score

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Dropout
from keras.regularizers import l2
from keras.optimizers import Adam
from keras_tuner import RandomSearch
from keras_tuner import Objective
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical

from sklearn.metrics import confusion_matrix

In [2]:
train = pd.read_csv('train.csv')
original = pd.read_csv('original.csv')
test = pd.read_csv('test.csv')
ss = pd.read_csv('sample_submission.csv')

In [3]:
def process(train, test, original):
    df_train = train.drop(['id', 'Status'], axis=1)
    original = original.dropna()
    df_original = original.drop(['ID', 'Status'], axis=1)
    df_train = pd.concat([df_train, df_original])
    df_test = test.drop(['id'], axis=1)
    
    # Categorical Cols
    # Train
    categorical_cols = ['Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema', 'Stage']
    encoder = OneHotEncoder(drop='first')
    encoder.fit(df_train[categorical_cols])
    df_train_cat = pd.DataFrame(encoder.transform(df_train[categorical_cols]).toarray(), columns=encoder.get_feature_names_out())
    # Test
    df_test_cat = pd.DataFrame(encoder.transform(df_test[categorical_cols]).toarray(), columns=encoder.get_feature_names_out())
    
    # Numerical Cols
    # Train
    df_train_num = df_train.drop(categorical_cols, axis=1)
    df_train_num = np.log1p(df_train_num)
    scaler = StandardScaler()
    df_train_num = pd.DataFrame(scaler.fit_transform(df_train_num), columns=df_train_num.columns)
    p_transformer = PowerTransformer()
    df_train_num = pd.DataFrame(p_transformer.fit_transform(df_train_num), columns=df_train_num.columns)
    # Test
    df_test_num = df_test.drop(categorical_cols, axis=1)
    df_test_num = np.log1p(df_test_num)
    df_test_num = pd.DataFrame(scaler.transform(df_test_num), columns=df_test_num.columns)
    df_test_num = pd.DataFrame(p_transformer.transform(df_test_num), columns=df_test_num.columns)
    
    # Combine Num/Cat
    train_final = pd.concat([df_train_num, df_train_cat], axis=1)
    test_final = pd.concat([df_test_num, df_test_cat], axis=1)
    
    # Feature Engineering
    # https://www.kaggle.com/code/ashishkumarak/ps3e26-liver-cirrhosis-survival-prediction#%F0%9F%92%BB-Feature-Engineering
    threshold_platelets = 150
    train_final['thrombocytopenia'] = np.where(train_final['Platelets'] < threshold_platelets, 1, 0)
    test_final['thrombocytopenia'] = np.where(test_final['Platelets'] < threshold_platelets, 1, 0)
    threshold_alk_phos_upper = 147  # Upper limit of normal range
    threshold_alk_phos_lower = 44   # Lower limit of normal range
    train_final['elevated_alk_phos'] = np.where((train_final['Alk_Phos'] > threshold_alk_phos_upper) | (train_final['Alk_Phos'] < threshold_alk_phos_lower), 1, 0)
    test_final['elevated_alk_phos'] = np.where((test_final['Alk_Phos'] > threshold_alk_phos_upper) | (test_final['Alk_Phos'] < threshold_alk_phos_lower), 1, 0)
    normal_copper_range = (62, 140)
    train_final['normal_copper'] = np.where((train_final['Copper'] >= normal_copper_range[0]) & (train_final['Copper'] <= normal_copper_range[1]), 1, 0)
    test_final['normal_copper'] = np.where((test_final['Copper'] >= normal_copper_range[0]) & (test_final['Copper'] <= normal_copper_range[1]), 1, 0)
    normal_albumin_range = (3.4, 5.4)
    train_final['normal_albumin'] = np.where((train_final['Albumin'] >= normal_albumin_range[0]) & (train_final['Albumin'] <= normal_albumin_range[1]), 1, 0)
    test_final['normal_albumin'] = np.where((test_final['Albumin'] >= normal_albumin_range[0]) & (test_final['Albumin'] <= normal_albumin_range[1]), 1, 0)
    normal_bilirubin_range = (0.2, 1.2)
    train_final['normal_bilirubin'] = np.where((train_final['Bilirubin'] >= normal_bilirubin_range[0]) & (train_final['Bilirubin'] <= normal_bilirubin_range[1]), 1, 0)
    test_final['normal_bilirubin'] = np.where((test_final['Bilirubin'] >= normal_bilirubin_range[0]) & (test_final['Bilirubin'] <= normal_bilirubin_range[1]), 1, 0)
    
    # Encode Target
    le_encoder = LabelEncoder()
    y = le_encoder.fit_transform(train.Status)
    y_original = le_encoder.fit_transform(original.Status)
    
    # Split X, y
    X_train = train_final
    y_train = np.concatenate([y, y_original])
    
    X_test = test_final
    
    return X_train, y_train, X_test

In [4]:
X, y, test_final = process(train, test, original)
cv = KFold(n_splits=5, shuffle=True, random_state=0)

In [5]:
# Define your build_model function
def build_model(hp):
    model = Sequential()
    model.add(Dense(units=hp.Int('units_input', min_value=32, max_value=512, step=32), activation='relu', input_dim=X.shape[1], kernel_regularizer=l2(hp.Float('learning_rate', min_value=1e-4, max_value=1e-1, sampling="log"))))
    for i in range(hp.Int('num_hidden_layers', min_value=0, max_value=3)):
        model.add(Dense(units=hp.Int(f'units_{i}', min_value=32, max_value=512, step=32), activation=hp.Choice(f'activation_{i}', values=['relu', 'sigmoid', 'tanh'])))
    model.add(Dense(3, activation='softmax'))
    
    model.compile(optimizer=Adam(learning_rate=hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling="log")), 
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.1, shuffle=True, random_state=0)

In [7]:
y_train_one_hot = to_categorical(y_train, num_classes=3)

early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

tuner = RandomSearch(build_model, 
                     objective=Objective("val_loss", direction="min"),
                     max_trials=100,
                     executions_per_trial=3,
                     directory=f'C:/Users/Anes3/OneDrive/Bureau/cirrhosis-patient-survival-prediction/cirrhosis-patient-survival-prediction/keras_tuner_dir',
                     project_name=f'my_hyperparameter_search'
        )

tuner.search(X_train, y_train_one_hot,
             epochs=10,
             batch_size=32,
             validation_data=(X_test, to_categorical(y_test, num_classes=3)),  # Use one-hot encoded labels
             callbacks=[early_stopping])

Trial 100 Complete [00h 00m 09s]
val_loss: 0.8375592430432638

Best val_loss So Far: 0.46925535798072815
Total elapsed time: 00h 18m 04s
INFO:tensorflow:Oracle triggered exit


In [11]:
best_model = tuner.hypermodel.build(tuner.get_best_hyperparameters(num_trials=1)[0])
best_model.fit(X_train, to_categorical(y_train, num_classes=3), epochs=10, batch_size=32, validation_data=(X_test, to_categorical(y_test, num_classes=3)))

# Use the trained model to predict on the test data
y_pred = best_model.predict(X_test)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [12]:
# Assuming you have completed the search
best_trials = tuner.oracle.get_best_trials(1)[0]

# Get the best hyperparameters
best_hyperparameters = best_trials.hyperparameters.values

# Display the best hyperparameters
print("Best Hyperparameters:")
print(best_hyperparameters)

Best Hyperparameters:
{'units_input': 448, 'learning_rate': 0.00027788857041120114, 'num_hidden_layers': 2, 'units_0': 448, 'activation_0': 'relu', 'units_1': 192, 'activation_1': 'relu', 'units_2': 32, 'activation_2': 'relu'}


In [13]:
log_loss(to_categorical(y_test, num_classes=3), y_pred)

0.4691603233263094

In [24]:
final_model = tuner.hypermodel.build(tuner.get_best_hyperparameters(num_trials=1)[0])
final_model.fit(X, to_categorical(y, num_classes=3), epochs=10, batch_size=32)
final_predictions = final_model.predict(test_final)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [25]:
res = pd.DataFrame(final_predictions, columns=['Status_C', 'Status_CL', 'Status_D'])
res['id'] = test.id
res = res[['id', 'Status_C', 'Status_CL', 'Status_D']]
res.to_csv('submission_regul.csv', index=False)