In [30]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [31]:
# Locations of the training file and the classification test file (HDF5).
train_path = "Data files/AppML_InitialProject_train.h5"
test_path = "Data files/AppML_InitialProject_test_classification.h5"

# Read both files into DataFrames in one pass.
train_df, test_df = (pd.read_hdf(path) for path in (train_path, test_path))

# Positional column split of the training frame: the second-to-last column is
# taken as the classification target, every column before it as a feature.
data_cls_target = train_df.iloc[:, -2]
data_train = train_df.iloc[:, :-2]

# Seeded 50/50 split used for the first, all-features model.
X_train_init, X_test_init, y_train_init, y_test_init = train_test_split(
    data_train,
    data_cls_target,
    random_state=42,
    test_size=0.5,
)

In [32]:
# Coarse search: a fixed, small ensemble size while the learning rate is swept
# from 1e-4 up to 1.0.
parameters_GridSearch = dict(
    n_estimators=[10],
    learning_rate=[0.0001, 0.001, 0.01, 0.1, 0.5, 1.0],
)

# 5-fold cross-validated search; train scores are recorded alongside the
# validation scores and the best candidate is refit at the end.
GridSearch = GridSearchCV(
    estimator=AdaBoostClassifier(),
    param_grid=parameters_GridSearch,
    cv=5,
    refit=True,
    return_train_score=True,
    verbose=3,
)
GridSearch.fit(X_train_init, y_train_init)

Fitting 5 folds for each of 6 candidates, totalling 30 fits




KeyboardInterrupt: 

In [None]:
# Baseline AdaBoost model trained on every feature column. Its
# feature_importances_ are what the notebook later ranks to pick the feature
# subset, so the estimator is seeded (same seed as the train/test splits):
# without random_state, tie-breaking inside the tree learners is random and the
# importances -- and therefore the selected features -- can differ between runs.
bst_init = AdaBoostClassifier(learning_rate=0.8, n_estimators=30, random_state=42)
bst_init.fit(X_train_init, y_train_init)

# Column 1 of predict_proba is the probability of the second class in
# bst_init.classes_; it is scored against the held-out half of the split.
pred_proba_init = bst_init.predict_proba(X_test_init)[:, 1]
auc_init = roc_auc_score(y_test_init, pred_proba_init)
auc_init



0.9880462046413974

In [None]:
# Pair each feature column with the importance the baseline model assigned to
# it and rank them, most important first.
all_features = data_train.columns
feature_importances_all = pd.Series(
    data=bst_init.feature_importances_,
    index=all_features,
).sort_values(ascending=False)

# Names of the 20 highest-ranked features, as a NumPy array.
selected_features = feature_importances_all.index[:20].values

In [33]:
# Repeat the seeded 50/50 split, this time restricted to the selected feature
# columns.
X_train, X_test, y_train, y_test = train_test_split(
    data_train[selected_features],
    data_cls_target,
    random_state=42,
    test_size=0.5,
)

# Narrower grid: three ensemble sizes crossed with four learning rates
# (12 candidates).
parameters_GridSearch = dict(
    n_estimators=[30, 40, 50],
    learning_rate=[0.6, 0.8, 0.9, 1.0],
)

# 5-fold cross-validated search; only configured here, fitted in the next cell.
GridSearch = GridSearchCV(
    estimator=AdaBoostClassifier(),
    param_grid=parameters_GridSearch,
    cv=5,
    refit=True,
    return_train_score=True,
    verbose=3,
)

In [34]:
# Run the cross-validated grid search on the reduced-feature training split.
# Kept as the cell's last expression so the fitted search object is displayed.
GridSearch.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits




[CV 1/5] END learning_rate=0.6, n_estimators=30;, score=(train=0.958, test=0.954) total time=   9.3s




[CV 2/5] END learning_rate=0.6, n_estimators=30;, score=(train=0.956, test=0.957) total time=   6.0s




[CV 3/5] END learning_rate=0.6, n_estimators=30;, score=(train=0.958, test=0.957) total time=   7.1s




[CV 4/5] END learning_rate=0.6, n_estimators=30;, score=(train=0.957, test=0.956) total time=   5.9s




[CV 5/5] END learning_rate=0.6, n_estimators=30;, score=(train=0.956, test=0.957) total time=   6.8s




[CV 1/5] END learning_rate=0.6, n_estimators=40;, score=(train=0.959, test=0.956) total time=   8.4s




[CV 2/5] END learning_rate=0.6, n_estimators=40;, score=(train=0.958, test=0.959) total time=   8.7s




[CV 3/5] END learning_rate=0.6, n_estimators=40;, score=(train=0.959, test=0.959) total time=   8.6s




[CV 4/5] END learning_rate=0.6, n_estimators=40;, score=(train=0.959, test=0.958) total time=   8.4s




[CV 5/5] END learning_rate=0.6, n_estimators=40;, score=(train=0.958, test=0.958) total time=   7.8s




[CV 1/5] END learning_rate=0.6, n_estimators=50;, score=(train=0.960, test=0.957) total time=  11.1s




[CV 2/5] END learning_rate=0.6, n_estimators=50;, score=(train=0.960, test=0.959) total time=  12.3s




[CV 3/5] END learning_rate=0.6, n_estimators=50;, score=(train=0.961, test=0.960) total time=  12.3s




[CV 4/5] END learning_rate=0.6, n_estimators=50;, score=(train=0.960, test=0.958) total time=  10.0s




[CV 5/5] END learning_rate=0.6, n_estimators=50;, score=(train=0.958, test=0.960) total time=  10.1s




[CV 1/5] END learning_rate=0.8, n_estimators=30;, score=(train=0.959, test=0.956) total time=   6.2s




[CV 2/5] END learning_rate=0.8, n_estimators=30;, score=(train=0.959, test=0.958) total time=   6.3s




[CV 3/5] END learning_rate=0.8, n_estimators=30;, score=(train=0.958, test=0.959) total time=   5.7s




[CV 4/5] END learning_rate=0.8, n_estimators=30;, score=(train=0.958, test=0.957) total time=   7.7s




[CV 5/5] END learning_rate=0.8, n_estimators=30;, score=(train=0.958, test=0.959) total time=   6.5s




[CV 1/5] END learning_rate=0.8, n_estimators=40;, score=(train=0.960, test=0.957) total time=   8.0s




[CV 2/5] END learning_rate=0.8, n_estimators=40;, score=(train=0.960, test=0.960) total time=   8.7s




[CV 3/5] END learning_rate=0.8, n_estimators=40;, score=(train=0.959, test=0.959) total time=   8.1s




[CV 4/5] END learning_rate=0.8, n_estimators=40;, score=(train=0.960, test=0.958) total time=   8.3s




[CV 5/5] END learning_rate=0.8, n_estimators=40;, score=(train=0.959, test=0.960) total time=   7.9s




[CV 1/5] END learning_rate=0.8, n_estimators=50;, score=(train=0.961, test=0.959) total time=  11.1s




[CV 2/5] END learning_rate=0.8, n_estimators=50;, score=(train=0.961, test=0.960) total time=  10.8s




[CV 3/5] END learning_rate=0.8, n_estimators=50;, score=(train=0.961, test=0.960) total time=   9.9s




[CV 4/5] END learning_rate=0.8, n_estimators=50;, score=(train=0.961, test=0.959) total time=  10.0s




[CV 5/5] END learning_rate=0.8, n_estimators=50;, score=(train=0.960, test=0.960) total time=  10.2s




[CV 1/5] END learning_rate=0.9, n_estimators=30;, score=(train=0.959, test=0.956) total time=   5.9s




[CV 2/5] END learning_rate=0.9, n_estimators=30;, score=(train=0.958, test=0.957) total time=   6.2s




[CV 3/5] END learning_rate=0.9, n_estimators=30;, score=(train=0.958, test=0.958) total time=   6.5s




[CV 4/5] END learning_rate=0.9, n_estimators=30;, score=(train=0.959, test=0.958) total time=   6.9s




[CV 5/5] END learning_rate=0.9, n_estimators=30;, score=(train=0.958, test=0.960) total time=   6.7s




[CV 1/5] END learning_rate=0.9, n_estimators=40;, score=(train=0.960, test=0.957) total time=   9.6s




[CV 2/5] END learning_rate=0.9, n_estimators=40;, score=(train=0.960, test=0.960) total time=   8.4s




[CV 3/5] END learning_rate=0.9, n_estimators=40;, score=(train=0.959, test=0.959) total time=   7.8s




[CV 4/5] END learning_rate=0.9, n_estimators=40;, score=(train=0.961, test=0.958) total time=   7.9s




[CV 5/5] END learning_rate=0.9, n_estimators=40;, score=(train=0.959, test=0.960) total time=   8.6s




[CV 1/5] END learning_rate=0.9, n_estimators=50;, score=(train=0.961, test=0.957) total time=  11.3s




[CV 2/5] END learning_rate=0.9, n_estimators=50;, score=(train=0.960, test=0.962) total time=  10.1s




[CV 3/5] END learning_rate=0.9, n_estimators=50;, score=(train=0.960, test=0.960) total time=  10.1s




[CV 4/5] END learning_rate=0.9, n_estimators=50;, score=(train=0.961, test=0.959) total time=  10.5s




[CV 5/5] END learning_rate=0.9, n_estimators=50;, score=(train=0.961, test=0.961) total time=  11.9s




[CV 1/5] END learning_rate=1.0, n_estimators=30;, score=(train=0.957, test=0.956) total time=   6.9s




[CV 2/5] END learning_rate=1.0, n_estimators=30;, score=(train=0.958, test=0.957) total time=   6.7s




[CV 3/5] END learning_rate=1.0, n_estimators=30;, score=(train=0.958, test=0.958) total time=   6.8s




[CV 4/5] END learning_rate=1.0, n_estimators=30;, score=(train=0.958, test=0.959) total time=   7.0s




[CV 5/5] END learning_rate=1.0, n_estimators=30;, score=(train=0.959, test=0.960) total time=   6.8s




[CV 1/5] END learning_rate=1.0, n_estimators=40;, score=(train=0.960, test=0.958) total time=   9.1s




[CV 2/5] END learning_rate=1.0, n_estimators=40;, score=(train=0.959, test=0.959) total time=   9.0s




[CV 3/5] END learning_rate=1.0, n_estimators=40;, score=(train=0.960, test=0.959) total time=   9.0s




[CV 4/5] END learning_rate=1.0, n_estimators=40;, score=(train=0.960, test=0.959) total time=  10.7s




[CV 5/5] END learning_rate=1.0, n_estimators=40;, score=(train=0.960, test=0.961) total time=   8.5s




[CV 1/5] END learning_rate=1.0, n_estimators=50;, score=(train=0.961, test=0.958) total time=   9.7s




[CV 2/5] END learning_rate=1.0, n_estimators=50;, score=(train=0.960, test=0.961) total time=  10.7s




[CV 3/5] END learning_rate=1.0, n_estimators=50;, score=(train=0.960, test=0.959) total time=   9.9s




[CV 4/5] END learning_rate=1.0, n_estimators=50;, score=(train=0.962, test=0.960) total time=  10.0s




[CV 5/5] END learning_rate=1.0, n_estimators=50;, score=(train=0.961, test=0.962) total time=   9.7s




In [44]:
# Best hyper-parameter combination found by the grid search, kept for reference.
# NOTE(review): best_params is not applied below -- the model is built with
# hard-coded values, and n_estimators=100 lies outside the searched grid
# (30/40/50). Confirm this is intended.
best_params = GridSearch.best_params_

# Seeded so the reported metrics are reproducible; the splits already use
# random_state=42, but the estimator itself was previously left unseeded.
bst = AdaBoostClassifier(n_estimators=100, learning_rate=0.8, random_state=42)
bst.fit(X_train, y_train)

# Score the held-out half: column 1 of predict_proba is the probability of the
# second class in bst.classes_.
pred_proba_validation = bst.predict_proba(X_test)[:, 1]
auc_final = roc_auc_score(y_test, pred_proba_validation)
loss_final = log_loss(y_test, pred_proba_validation)
print(f"AUC: {auc_final}, Loss: {loss_final}")



AUC: 0.990050070982229, Loss: 0.6561080622783709


In [46]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.calibration import CalibratedClassifierCV

In [48]:
# Same AdaBoost configuration as the uncalibrated model, wrapped in a sigmoid
# calibrator fitted with 3-fold cross-validation. The base estimator is seeded
# because this model produces the exported predictions; left unseeded, the
# output could differ from run to run.
bst = AdaBoostClassifier(n_estimators=100, learning_rate=0.8, random_state=42)
calibrated = CalibratedClassifierCV(bst, method='sigmoid', cv=3)
calibrated.fit(X_train, y_train)

# Evaluate the calibrated probabilities on the held-out half of the split.
pred_proba_calibrated = calibrated.predict_proba(X_test)[:, 1]
auc_calibrated = roc_auc_score(y_test, pred_proba_calibrated)
loss_calibrated = log_loss(y_test, pred_proba_calibrated)
print(f"AUC: {auc_calibrated}, Loss: {loss_calibrated}")



AUC: 0.9902760227231665, Loss: 0.09761545869891274


In [49]:
# Score the test frame with the calibrated model, using only the selected
# feature columns; column 1 of predict_proba is kept.
pred_proba_final = calibrated.predict_proba(test_df[selected_features])[:, 1]

csv_out_path = "Solution_files/Classification_AliAhmad_AdaBoost.csv"
# Create the output folder on demand so a fresh checkout does not fail with
# OSError at the very last step, after all the training has already run.
Path(csv_out_path).parent.mkdir(parents=True, exist_ok=True)

# One "position,probability" line per row, with no header row. header=False is
# the documented boolean for omitting the header.
pd.Series(pred_proba_final).to_csv(csv_out_path, sep=",", header=False)

In [39]:
features_out_path = "Solution_files/Classification_AliAhmad_Adaboost_VariableList.csv"

# Create the output folder on demand so the write does not fail with
# FileNotFoundError when the directory has not been created yet.
Path(features_out_path).parent.mkdir(parents=True, exist_ok=True)

# One selected feature name per line, each followed by a trailing comma
# (the line format is unchanged).
with open(features_out_path, "w") as f:
    f.writelines(f"{feature},\n" for feature in selected_features)