# First iteration of tree-based models.

The hyperparameter optimization process will be the following:
1. Choose first range for hyperparameters.
2. Train n models using HalvingRandomSearchCV from scikit-learn.
3. Pick the regions of the hyperparameter space that yield the best results.
4. Centre the hyperparameter search on those regions and iterate.

The scoring will consist of:
1. Numerical scores for the testing subset: recall, precision, roc_auc, f1.
2. Numerical scores for the whole data: recall, precision, roc_auc, f1.
3. Visualization of the ROC curve for the whole data.
4. Confusion matrix for the whole data.

In this way we can see whether each model over- or underfits by comparing its test scores with its whole-data scores. The ROC curve and the confusion matrix also show how each model performs on the whole dataset.


In [None]:
import pandas as pd
pd.set_option('display.max_colwidth', None)
# For random number generation
from scipy.stats import randint, uniform

from sklearn.experimental import enable_halving_search_cv # Allow importing the experimental HalvingRandomSearchCV

# Model-selection utilities from sklearn (data splitting, hyperparameter search and cross validation).
from sklearn.model_selection import train_test_split, HalvingRandomSearchCV, RepeatedStratifiedKFold

# Some auxiliary functions for scoring.
import scoring_utils

#DEV
import importlib as imp


In [None]:
# Load the dataframe that was cleaned and encoded during the feature importance process.
df_encoded = pd.read_csv(filepath_or_buffer='../data/df_encoded.csv')

In [None]:
target = 'Default'
# Every column except the target is a feature. The comparison has to be an exact
# match (`!=`): `feature not in target` would run a substring test against the
# string 'Default' and silently drop any column whose name is contained in it.
features_encoded = [feature for feature in df_encoded.columns if feature != target]
# Our target and features are the same that we employed during the importance analysis.
print(f"Target: {target}")
print(f"Features: {features_encoded}")

In [None]:
# Complete datasets
X_total = df_encoded[features_encoded]
y_total = df_encoded[target]

# This split will be used in every model, so that they are scored against the same subset.
# The seed is fixed so that re-running the notebook reproduces the very same split,
# and the split is stratified on the target so that the train and test subsets keep
# the class proportions of the whole data.
X_train, X_test, y_train, y_test = train_test_split(
    X_total,
    y_total,
    train_size=.9,
    random_state=9,
    stratify=y_total,
)

### Single Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Distributions the hyperparameters of a single tree are sampled from.
# scipy's randint(low, high) draws integers from [low, high): the upper bound is excluded.
param_grid = [
    {
        "min_samples_split": randint(155, 160),
        "max_depth": randint(16, 20),
        "min_samples_leaf": randint(50, 150),
        "max_leaf_nodes": randint(1300, 2000)
    }
]

# Stratified 3-fold cross validation, repeated 3 times, for the random search.
# The folds must be the same for every candidate, so the random state has to be fixed.
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=9)

# The estimator and the search are seeded as well: with only the CV splitter seeded,
# the sampled candidates change on every run and the results cannot be reproduced.
single_tree_search = HalvingRandomSearchCV(
    estimator=DecisionTreeClassifier(random_state=9),
    param_distributions=param_grid,
    scoring='recall',
    cv=cv,
    n_jobs=-1,
    verbose=1,
    n_candidates=1000,
    random_state=9
)

single_tree_search.fit(X_train, y_train)


In [None]:
# Show the first ten entries of the summary that scoring_utils builds from the search's CV results.
single_tree_cv_results = single_tree_search.cv_results_
scoring_utils.get_best_kernels(single_tree_cv_results).head(10)

In [None]:
# Take the best estimator found by the search and predict with it on both the
# test subset and the complete dataset (in that order).
single_tree_model = single_tree_search.best_estimator_

yhat_test, yhat_total = (
    single_tree_model.predict(features) for features in (X_test, X_total)
)

In [None]:
# Metrics on the test subset and on the whole data, concatenated into one table.
single_tree_test_score = scoring_utils.get_metrics(
    yhat_test, y_test, "Single Tree Test"
)
single_tree_total_score = scoring_utils.get_metrics(
    yhat_total, y_total, "Single Tree Total"
)

single_tree_score = pd.concat([single_tree_test_score, single_tree_total_score])
single_tree_score

In [None]:
# Metrics of the stored baseline predictions against the whole-data labels, for comparison.
# NOTE(review): read_csv returns a DataFrame, which is passed as-is as the predictions —
# confirm that get_metrics accepts it.
yhat_baseline = pd.read_csv(filepath_or_buffer='../data/baseline_predict.csv')
scoring_utils.get_metrics(yhat_baseline, y_total, "Baseline")

In [None]:
# The plot is titled "Whole Data" (and the ROC curve is meant to describe the whole
# dataset), so pass the whole-data predictions and labels instead of the test subset.
# NOTE(review): yhat_total holds the output of predict(); confirm that get_roc_plot
# does not expect probabilities/scores instead.
scoring_utils.get_roc_plot(yhat_total, y_total, "Single Tree Whole Data")

In [None]:
# Confusion matrix of the single tree's predictions on the whole data.
# NOTE(review): the labels are passed before the predictions here, the opposite of the
# get_metrics / get_roc_plot calls — confirm against the scoring_utils signature.
scoring_utils.get_confusion_matrix(
    y_total,
    yhat_total,
    "Single Tree Whole Data",
)

### Bagging classifier

In [None]:
from sklearn.ensemble import BaggingClassifier

# Alternative search space over the hyperparameters of the underlying trees,
# kept for reference from a previous iteration:
# param_grid = [
#     {
#         "estimator__min_samples_split": randint(155, 160),
#         "estimator__max_depth": randint(16, 20),
#         "estimator__min_samples_leaf": randint(50, 150),
#         "n_estimators": randint(220, 320)
#     }
#]

# Candidate values for the bagging ensemble itself; the search samples
# combinations of these lists.
param_grid = {
    'n_estimators': [300, 400, 500, 600, 700, 800],
    'max_features': [0.90, 0.92, 0.95, 1.0],
    'bootstrap': [True, False],
    'bootstrap_features': [True, False],
}

# Stratified 3-fold cross validation, repeated 3 times, for the random search.
# The folds must be the same for every candidate, so the random state has to be fixed.
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=9)

# The ensemble and the search are seeded as well: with only the CV splitter seeded,
# the sampled candidates and the bootstrap draws change on every run and the results
# cannot be reproduced.
bagging_search = HalvingRandomSearchCV(
    estimator=BaggingClassifier(DecisionTreeClassifier(), random_state=9),
    param_distributions=param_grid,
    scoring='recall',
    cv=cv,
    n_jobs=-1,
    verbose=1,
    n_candidates=50,
    random_state=9
)

bagging_search.fit(X_train, y_train)


In [None]:
# Show the summary that scoring_utils builds from the bagging search's CV results.
bagging_cv_results = bagging_search.cv_results_
scoring_utils.get_best_kernels(bagging_cv_results)

In [None]:
# Take the best estimator found by the search and predict with it on both the
# test subset and the complete dataset (in that order).
bagging_model = bagging_search.best_estimator_

yhat_test, yhat_total = (
    bagging_model.predict(features) for features in (X_test, X_total)
)

In [None]:
# Metrics on the test subset and on the whole data, concatenated into one table.
bagging_test_score = scoring_utils.get_metrics(
    yhat_test, y_test, "Bagging Test"
)
bagging_total_score = scoring_utils.get_metrics(
    yhat_total, y_total, "Bagging Total"
)

bagging_score = pd.concat([bagging_test_score, bagging_total_score])
bagging_score

In [None]:
# Metrics of the stored baseline predictions against the whole-data labels, for comparison.
# NOTE(review): read_csv returns a DataFrame, which is passed as-is as the predictions —
# confirm that get_metrics accepts it.
yhat_baseline = pd.read_csv(filepath_or_buffer='../data/baseline_predict.csv')
scoring_utils.get_metrics(yhat_baseline, y_total, "Baseline")

In [None]:
# The plot is titled "Whole Data" (and the ROC curve is meant to describe the whole
# dataset), so pass the whole-data predictions and labels instead of the test subset.
# NOTE(review): yhat_total holds the output of predict(); confirm that get_roc_plot
# does not expect probabilities/scores instead.
scoring_utils.get_roc_plot(yhat_total, y_total, "Bagging Whole Data")

In [None]:
# Confusion matrix of the bagging model's predictions on the whole data.
# NOTE(review): the labels are passed before the predictions here, the opposite of the
# get_metrics / get_roc_plot calls — confirm against the scoring_utils signature.
scoring_utils.get_confusion_matrix(
    y_total,
    yhat_total,
    "Bagging Whole Data",
)