In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RepeatedStratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score

from keras.models import Sequential
from keras.layers import Dense
from keras_tuner import RandomSearch
from keras_tuner import Objective

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import AUC
from tensorflow.keras.regularizers import L2

# Load Data

In [2]:
train = pd.read_csv('train.csv', index_col='id')
test = pd.read_csv('test.csv', index_col='id')

In [3]:
train.shape

(414, 7)

In [4]:
train.head()

Unnamed: 0_level_0,gravity,ph,osmo,cond,urea,calc,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1.013,6.19,443,14.8,124,1.45,0
1,1.025,5.4,703,23.6,394,4.18,0
2,1.009,6.13,371,24.5,159,9.04,0
3,1.021,4.91,442,20.8,398,6.63,1
4,1.021,5.53,874,17.8,385,2.21,1


In [5]:
train.target.value_counts()

0    230
1    184
Name: target, dtype: int64

In [6]:
X = train.drop('target', axis=1).copy()
y = train.target

# Feature Scaling

In [7]:
#First we need to combine train and test, scale then split again
X1 = pd.concat([X, test])

scaler = StandardScaler()
X2 = pd.DataFrame(scaler.fit_transform(X1) ,columns = X.columns)

X = X2.iloc[:len(train), :]
test = X2.iloc[len(train):, :]

# Define MLP model

In [8]:
# Define your build_model function
def build_model(hp):
    model = Sequential()
    model.add(Dense(units=hp.Int('units_input', min_value=32, max_value=512, step=32), activation='relu', input_dim=X.shape[1]))
    for i in range(hp.Int('num_hidden_layers', min_value=0, max_value=3)):
        model.add(Dense(units=hp.Int(f'units_{i}', min_value=32, max_value=512, step=32), activation=hp.Choice(f'activation_{i}', values=['relu', 'sigmoid', 'tanh'])))
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(optimizer=Adam(learning_rate=hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling="log")),
                  loss='binary_crossentropy',
                  metrics=['AUC'])
    return model

# Define an empty array to store ensemble predictions
ensemble_predictions = np.zeros_like(y, dtype=float)

rskf = RepeatedStratifiedKFold(n_splits=10, n_repeats=5, random_state=0)
best_hyperparameters = []
best_models = []

for i, (train_index, test_index) in enumerate(rskf.split(X, y)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    tuner = RandomSearch(build_model, 
                         objective=Objective("val_auc", direction="max"),
                         max_trials=100,
                         executions_per_trial=1,
                         directory=f'C:/Users/Anes3/Documents/keras_tuner_dir_fold_{i}',
                         project_name=f'my_hyperparameter_search_fold_{i}'
            )

    tuner.search(X_train, y_train,
                 epochs=20,
                 batch_size=32,
                 validation_data=(X_test, y_test))
    
    # Get the best hyperparameters for this fold
    best_hyperparameters.append(tuner.get_best_hyperparameters(num_trials=1)[0])

    # Build and train the best model for this fold on the full dataset
    best_model = tuner.hypermodel.build(best_hyperparameters[-1])
    best_model.fit(X, y, epochs=10, batch_size=32)
    best_models.append(best_model)
    
    # Use the trained model to predict on the test data
    y_pred = best_model.predict(X_test)
    
    # Add the predictions to the ensemble array
    ensemble_predictions[test_index] += y_pred.flatten()

Trial 100 Complete [00h 00m 02s]
val_auc: 0.8611111640930176

Best val_auc So Far: 0.8949275016784668
Total elapsed time: 00h 03m 40s
INFO:tensorflow:Oracle triggered exit
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [10]:
# Average the ensemble predictions
ensemble_predictions /= len(best_models)

# Convert the ensemble predictions to binary using a threshold (e.g., 0.5)
ensemble_predictions_binary = np.where(ensemble_predictions > 0.5, 1, 0)

# Evaluate the ensemble predictions
ensemble_auc = roc_auc_score(y, ensemble_predictions)

print("Ensemble AUC:", ensemble_auc.round(4))

Ensemble AUC: 0.8547


In [11]:
# Make predictions on new data using the trained ensemble
ensemble_predictions_new = np.zeros((test.shape[0],))  # Initialize an array to store ensemble predictions for new data

for best_model in best_models:
    # Predict using each model in the ensemble
    y_pred = best_model.predict(test)
    ensemble_predictions_new += y_pred.flatten()

# Average the ensemble predictions for new data
ensemble_predictions_new /= len(best_models)

# Convert the ensemble predictions for new data to binary using a threshold (e.g., 0.5)
ensemble_predictions_binary_new = np.where(ensemble_predictions_new > 0.5, 1, 0)



In [None]:
# for best_hyperparams in hyper_params:
#     # Access the best hyperparameters
#     best_input_units = best_hyperparams.get('units_input')
#     best_hidden_layers = best_hyperparams.get('num_hidden_layers')

#     best_activation = []
#     best_units = []

#     # Extract the values of activation functions and units for each hidden layer
#     for i in range(best_hidden_layers):
#         best_activation.append(best_hyperparams.get(f'activation_{i}'))
#         best_units.append(best_hyperparams.get(f'units_{i}'))


#     # Print the best hyperparameters
#     print("Best Hyperparameters:")
#     print("input units =", best_input_units)
#     print("number of hidden layers =", best_hidden_layers)
#     print("activation =", best_activation)
#     print("units =", best_units)
    
#     print('-'*60)

In [12]:
submission = pd.concat([pd.Series(test.index, name='id'), pd.Series(ensemble_predictions_binary_new, name='target')], axis=1)
submission

Unnamed: 0,id,target
0,414,0
1,415,0
2,416,1
3,417,1
4,418,0
...,...,...
271,685,1
272,686,0
273,687,0
274,688,0


# Submission

In [13]:
submission.to_csv('submission_MLP_model_rskf_ensemble.csv', index=False)