# First iteration of tree-based models.

The hyperparameter optimization process will be the following:
1. Train with default hyperparameters.
2. Identify bias and variance.
3. Choose first range for hyperparameters based on results.
4. Train n models using HalvingRandomSearchCV from scikit learn.
5. Pick the regions of the hyperparameter space that yield the best results.
6. Centre the hyperparameter search on those regions and iterate.

The scoring will consist of:
1. Numerical scores for the testing subset: recall, precision, roc_auc, f1.
2. Numerical scores for the whole data: recall, precision, roc_auc, f1.
3. Visualization of the ROC curve for the whole data.
4. Confusion matrix for the whole data.

This way we can see whether each model over- or underfits by comparing the test and total scores. The ROC curve and the confusion matrix will also show how each model performs on the whole dataset.


In [None]:
import pandas as pd
pd.set_option('display.max_colwidth', None)
# For random number generation
from scipy.stats import randint, uniform

from sklearn.experimental import enable_halving_search_cv # Allow importing the experimental HalvingGridSearchCV

# Metrics and auxiliar libraries from sklearn.
from sklearn.model_selection import train_test_split, HalvingRandomSearchCV, RepeatedStratifiedKFold, HalvingGridSearchCV

# Some auxiliary functions for scoring.
import scoring_utils

#DEV
import importlib as imp


In [None]:
# Import the dataframe cleaned and encoded during the feature importance process.
# The path is relative, so it is resolved against the notebook's working directory.
df_encoded = pd.read_csv('../data/df_encoded_interest.csv')

In [None]:
target = 'Default'
# Compare with `!=`, not `not in`: `target` is a string, so `not in` would be a
# substring test and silently drop any column whose name is contained in 'Default'.
features_encoded = [feature for feature in df_encoded.columns if feature != target]
# Our target and features are the same that we employed during the importance analysis.
print(f"Target: {target}")
print(f"Features: {features_encoded}")

In [None]:
# Complete datasets
X_total = df_encoded[features_encoded]
y_total = df_encoded[target]

# This split will be used in every model, so that they are scored against the same subset.
# random_state pins the split across kernel restarts (otherwise "the same subset" only holds
# within one session); stratify keeps the class ratio identical in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X_total, y_total, train_size=.9, stratify=y_total, random_state=9
)

### Single Decision Tree Classifier

In [None]:
# Baseline: an untuned tree, scored on train and test to gauge bias/variance.
from sklearn.tree import DecisionTreeClassifier

# fit() returns the estimator itself, so construction and training can be chained.
default_tree = DecisionTreeClassifier().fit(X_train, y_train)

yhat_train, yhat_test = default_tree.predict(X_train), default_tree.predict(X_test)

default_tree_score = pd.concat([
    scoring_utils.get_metrics(y_train, yhat_train, "Default Tree Train"),
    scoring_utils.get_metrics(y_test, yhat_test, "Default Tree Test"),
])

default_tree_score

In [None]:
# Show the hyperparameters the untuned tree was built with.
default_tree.get_params()

It does overfit! Let's decrease the variance by tuning the hyperparameters.

In [None]:

# Search space for a single tree. Despite the name, these are scipy distributions
# that HalvingRandomSearchCV samples from, not a fixed grid.
param_grid = [
    {
        # randint's upper bound is exclusive: 100..199 and 14..17 respectively.
        "min_samples_split": randint(100, 200),
        "max_depth": randint(14, 18)
    }
]

# Cross validation in 3 folds for our grid search parameter selection.
# It must be consistent across the folds, so the random state has to be fixed.
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=9)

# NOTE(review): neither the search nor the tree sets random_state, so the sampled
# candidates can differ between runs even though the CV folds are fixed.
single_tree_search = HalvingRandomSearchCV(
    estimator=DecisionTreeClassifier(class_weight='balanced'),
    param_distributions=param_grid,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
    n_candidates=1000
)

single_tree_search.fit(X_train, y_train)


In [None]:
# Summarise the single-tree search results with the project's scoring_utils helper.
scoring_utils.get_best_kernels(single_tree_search.cv_results_)

In [None]:
# Full parameter set of the tree selected by the search.
single_tree_search.best_estimator_.get_params()

In [None]:
# Number of rows whose target equals 1 (the positive class).
len(df_encoded[df_encoded[target] == 1])

In [None]:

# Sweep the weight given to class 0 (class 1 receives the complement) on a tree
# with fixed depth and split size, printing metrics and profit for each setting.
weights = [.1, .15, .2, .25, .3]

for weight in weights:
    print(f"\nTraining for weight: {weight}...")
    class_weight = {0: weight, 1: (1 - weight)}
    model = DecisionTreeClassifier(max_depth=15, min_samples_split=130, class_weight=class_weight)
    model.fit(X_train, y_train)

    yhat_test = model.predict(X_test)
    yhat_train = model.predict(X_train)
    yhat_total = model.predict(X_total)

    # One metrics row per subset, stacked in train / test / total order.
    score = pd.concat([
        scoring_utils.get_metrics(y_true, y_pred, label)
        for y_true, y_pred, label in (
            (y_train, yhat_train, "Train"),
            (y_test, yhat_test, "Test"),
            (y_total, yhat_total, "Total"),
        )
    ])

    print(score)

    profit = scoring_utils.get_profit(X_total, y_total, yhat_total)
    print(f"We obtained a profit of ${profit:,.2f}")




In [None]:
# Let's use the best estimator to obtain test and overall scores.
# This rebinds yhat_test / yhat_total, which the class-weight sweep also assigns.
single_tree_model = single_tree_search.best_estimator_

yhat_test = single_tree_model.predict(X_test)
yhat_total = single_tree_model.predict(X_total)

In [None]:
# Metrics of the tuned tree on the test subset and on the whole dataset.
single_tree_test_score = scoring_utils.get_metrics(y_test, yhat_test, "Single Tree Test")
single_tree_total_score = scoring_utils.get_metrics(y_total, yhat_total, "Single Tree Total")

# Stacked so both rows can be compared side by side (and reused in later comparisons).
single_tree_score = pd.concat((single_tree_test_score, single_tree_total_score))
single_tree_score

In [None]:
# Profit over the whole dataset for whatever predictions yhat_total currently holds
# (the tuned single tree when the cells are run in order).
print(f"We obtained a profit of ${scoring_utils.get_profit(X_total, y_total, yhat_total):,.2f}")

In [None]:
# NOTE(review): yhat_total comes from predict(), i.e. hard class labels; if get_roc_plot
# expects scores/probabilities the curve will have a single operating point — confirm in scoring_utils.
scoring_utils.get_roc_plot(y_total, yhat_total, "Single Tree Whole Data")

In [None]:
# Confusion matrix of the current yhat_total against the true labels of the whole dataset.
scoring_utils.get_confusion_matrix(y_total, yhat_total, "Single Tree Whole Data")

### Bagging classifier

In [None]:
# Baseline: bagging over untuned trees, scored on train and test to gauge bias/variance.
from sklearn.ensemble import BaggingClassifier

# fit() returns the estimator itself, so construction and training can be chained.
default_bagging = BaggingClassifier(DecisionTreeClassifier(), n_jobs=-1).fit(X_train, y_train)

yhat_train, yhat_test = default_bagging.predict(X_train), default_bagging.predict(X_test)

default_bagging_score = pd.concat([
    scoring_utils.get_metrics(y_train, yhat_train, "Default Bagging Train"),
    scoring_utils.get_metrics(y_test, yhat_test, "Default Bagging Test"),
])

default_bagging_score

In [None]:
# Show the hyperparameters of the untuned bagging ensemble (including its base tree's).
default_bagging.get_params()

In [None]:

# Grid of parameters for a bagging model
# Only the fraction of training rows drawn for each base estimator is searched.
param_grid = [
    {
        "max_samples": [.8, .9, 1.0]
    }
]

# Cross validation in 3 folds for our grid search parameter selection.
# It must be consistent across the folds, so the random state has to be fixed.
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=9)

# The base estimator is the tuned single tree (single_tree_model), so the cell that
# binds it has to run first. Candidates are ranked by recall here, while the
# single-tree search ranked by f1.
bagging_search = HalvingGridSearchCV(
    estimator=BaggingClassifier(n_estimators=10, estimator=single_tree_model),
    param_grid=param_grid,
    scoring='recall',
    cv=cv,
    n_jobs=-1,
    verbose=1
)

bagging_search.fit(X_train, y_train)


In [None]:
# Summarise the bagging search results with the project's scoring_utils helper.
scoring_utils.get_best_kernels(bagging_search.cv_results_)

In [None]:
# Keep the best bagging ensemble and rebind yhat_test / yhat_total to its predictions.
bagging_model = bagging_search.best_estimator_

yhat_test = bagging_model.predict(X_test)
yhat_total = bagging_model.predict(X_total)

In [None]:
# Metrics of the bagging model on the test subset and on the whole dataset.
bagging_test_score = scoring_utils.get_metrics(y_test, yhat_test, "Bagging Test")
bagging_total_score = scoring_utils.get_metrics(y_total, yhat_total, "Bagging Total")

# The single-tree rows are appended so the two models can be compared in one table.
bagging_score = pd.concat((bagging_test_score, bagging_total_score, single_tree_score))
bagging_score

In [None]:
# Profit over the whole dataset for the current yhat_total
# (the bagging model when the cells are run in order).
print(f"We obtained a profit of ${scoring_utils.get_profit(X_total, y_total, yhat_total):,.2f}")

In [None]:
# NOTE(review): yhat_total holds hard class labels from predict(); confirm that
# get_roc_plot does not expect probabilities.
scoring_utils.get_roc_plot(y_total, yhat_total, "Bagging Whole Data")

In [None]:
# Confusion matrix of the current yhat_total against the true labels of the whole dataset.
scoring_utils.get_confusion_matrix(y_total, yhat_total, "Bagging Whole Data")

### Random forest

In [None]:
# Baseline: an untuned random forest, scored on train and test to gauge bias/variance.
from sklearn.ensemble import RandomForestClassifier

default_forest = RandomForestClassifier(n_jobs=-1)
default_forest.fit(X_train, y_train)

yhat_train, yhat_test = default_forest.predict(X_train), default_forest.predict(X_test)

default_forest_score = pd.concat([
    scoring_utils.get_metrics(y_train, yhat_train, 'Default Forest Train'),
    scoring_utils.get_metrics(y_test, yhat_test, 'Default Forest Test'),
])

default_forest_score


In [None]:
# Show the hyperparameters the untuned forest was built with.
default_forest.get_params()

In [None]:
# {'max_depth': 14, 'min_samples_split': 109} # One of my best trees!!!!!

# 3 x 3 grid over tree depth and minimum split size.
param_grid = {
    'max_depth': [50, 100, 150], # This has a smaller effect, but 100 seems to work out fine.
    'min_samples_split': [10, 100, 1000] # This should be around 100
}

# Cross validation in 3 folds for our grid search parameter selection.
# It must be consistent across the folds, so the random state has to be fixed.
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=9)

# max_features=1.0 lets every split consider all features, so the trees differ only
# through bootstrap sampling. Candidates are ranked by recall.
random_forest_search = HalvingGridSearchCV(
    estimator=RandomForestClassifier(n_estimators=50, max_features=1.0, class_weight='balanced'),
    param_grid=param_grid,
    scoring='recall',
    cv=cv,
    n_jobs=-1,
    verbose=1
)

random_forest_search.fit(X_train, y_train)


In [None]:
# Summarise the random-forest search results with the project's scoring_utils helper.
scoring_utils.get_best_kernels(random_forest_search.cv_results_)

In [None]:
# Keep the best forest and rebind yhat_test / yhat_total to its predictions.
random_forest = random_forest_search.best_estimator_

yhat_test = random_forest.predict(X_test)
yhat_total = random_forest.predict(X_total)

In [None]:
# Metrics of the random forest on the test subset and on the whole dataset.
forest_test_score = scoring_utils.get_metrics(y_test, yhat_test, "Random Forest Test")
forest_total_score = scoring_utils.get_metrics(y_total, yhat_total, "Random Forest Total")

# bagging_score already carries the bagging and single-tree rows, so this table
# compares all three models.
forest_score = pd.concat((forest_test_score, forest_total_score, bagging_score))
forest_score

In [None]:
# Profit over the whole dataset for the current yhat_total
# (the random forest when the cells are run in order).
print(f"We obtained a profit of ${scoring_utils.get_profit(X_total, y_total, yhat_total):,.2f}")

In [None]:
# NOTE(review): yhat_total holds hard class labels from predict(); confirm that
# get_roc_plot does not expect probabilities.
scoring_utils.get_roc_plot(y_total, yhat_total, "Forest Whole Data")

In [None]:
# Confusion matrix of the current yhat_total against the true labels of the whole dataset.
scoring_utils.get_confusion_matrix(y_total, yhat_total, "Random Forest Whole Data")

### Boosting

In [None]:
import xgboost as xgb

In [None]:
# Boosted model with hand-picked hyperparameters; the number of rounds is then
# chosen by 5-fold cross validation on AUC.
xgbmodel = xgb.XGBClassifier(
    learning_rate=0.1,
    n_estimators=100,
    verbosity=2,
    max_depth=11,
    min_child_weight=10,
    gamma=0.1,
    subsample=0.9,
    colsample_bytree=0.8,
    objective='binary:logistic',
    scale_pos_weight=3.3,
    seed=27)

xgb_param = xgbmodel.get_xgb_params()
xgtrain = xgb.DMatrix(X_train.values, label=y_train.values)
cvresult = xgb.cv(
    xgb_param,
    xgtrain,
    num_boost_round=xgbmodel.get_params()['n_estimators'],
    nfold=5,
    metrics='auc',
    early_stopping_rounds=50,
)

# With early stopping the returned history ends at the best round, so its length is
# the number of boosting rounds to keep.
# eval_metric is set on the estimator rather than passed to fit(): recent xgboost
# releases no longer accept it as a fit() argument, and the other cells of this
# notebook already configure it on the constructor.
xgbmodel.set_params(n_estimators=cvresult.shape[0], eval_metric='auc')

xgbmodel.fit(X_train, y_train)



In [None]:
# Train vs. test metrics of the CV-tuned boosting model, to check for overfitting.
yhat_train = xgbmodel.predict(X_train)
yhat_test = xgbmodel.predict(X_test)

cv_boost_score = pd.concat((
    scoring_utils.get_metrics(y_train, yhat_train, 'CV Boosting Train'),
    scoring_utils.get_metrics(y_test, yhat_test, 'CV Boosting Test')
))

cv_boost_score

In [None]:
# Rebind yhat_total to the boosting model's predictions and plot its confusion matrix.
yhat_total = xgbmodel.predict(X_total)
scoring_utils.get_confusion_matrix(y_total, yhat_total, 'CV Boosting')

In [None]:
# Profit over the whole dataset for the current yhat_total
# (the CV-tuned boosting model when the cells are run in order).
print(f"We obtained a profit of ${scoring_utils.get_profit(X_total, y_total, yhat_total):,.2f}")

In [None]:
# Passing the true labels as the predictions gives the profit of a perfect classifier,
# which serves as the upper bound for the figures printed for each model.
print(f"The maximum is ${scoring_utils.get_profit(X_total, y_total, y_total):,.2f}")

# This approach performs worse. Revisit if there is time.

In [None]:
from xgboost import XGBClassifier

# Everything except scale_pos_weight is held fixed across the sweep.
fixed_params = dict(
    learning_rate=0.1,
    n_estimators=100,
    verbosity=2,
    max_depth=11,
    min_child_weight=10,
    gamma=0.1,
    subsample=0.9,
    colsample_bytree=0.8,
    objective='binary:logistic',
    seed=27,
)

weights = [3.3, 3.5, 3.7, 3.9]

for weight in weights:
    print(f"\nTraining for weight: {weight}...")
    model = xgb.XGBClassifier(**fixed_params, scale_pos_weight=weight)
    model.fit(X_train, y_train)

    yhat_test = model.predict(X_test)
    yhat_train = model.predict(X_train)
    yhat_total = model.predict(X_total)

    # One metrics row per subset, stacked in train / test / total order.
    score = pd.concat([
        scoring_utils.get_metrics(y_true, y_pred, label)
        for y_true, y_pred, label in (
            (y_train, yhat_train, "Train"),
            (y_test, yhat_test, "Test"),
            (y_total, yhat_total, "Total"),
        )
    ])

    print(score)

    profit = scoring_utils.get_profit(X_total, y_total, yhat_total)
    print(f"We obtained a profit of ${profit:,.2f}")




In [None]:
from xgboost import XGBClassifier

# Only the positive-class weight is searched; all other hyperparameters stay fixed.
param_grid = [{
    'scale_pos_weight': [3, 4, 4.5, 5, 5.5, 6]
}]

# Same 3x3 repeated stratified scheme as the other searches, but seeded with 12 instead of 9.
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=12)

# Candidates are ranked by recall. No n_jobs is passed here, unlike the sklearn searches.
xgbsearch = HalvingGridSearchCV(
    xgb.XGBClassifier(
        learning_rate =0.1,
        n_estimators=100,
        verbosity=2,
        max_depth=11,
        min_child_weight=10,
        gamma=0.1,
        subsample=0.9,
        colsample_bytree=0.8,
        objective= 'binary:logistic',
        seed=27),
    param_grid=param_grid,
    verbose=1,
    cv=cv,
    scoring='recall'
)

xgbsearch.fit(X_train, y_train)

In [None]:
# Summarise the XGBoost search results with the project's scoring_utils helper.
scoring_utils.get_best_kernels(xgbsearch.cv_results_)

In [None]:
# Rebinds xgbmodel (previously the CV-tuned model) to the best estimator of the
# scale_pos_weight search, and yhat_test / yhat_total to its predictions.
xgbmodel = xgbsearch.best_estimator_

yhat_test = xgbmodel.predict(X_test)
yhat_total = xgbmodel.predict(X_total)

In [None]:
# Metrics of the selected XGBoost model on the test subset and on the whole dataset.
xgb_test_score = scoring_utils.get_metrics(y_test, yhat_test, "XGB Test")
xgb_total_score = scoring_utils.get_metrics(y_total, yhat_total, "XGB Total")

xgb_score = pd.concat((xgb_test_score, xgb_total_score))
xgb_score

In [None]:
# Confusion matrix of the current yhat_total against the true labels of the whole dataset.
scoring_utils.get_confusion_matrix(y_total, yhat_total, "XGB")

In [None]:
# Profit over the whole dataset for the current yhat_total
# (the searched XGBoost model when the cells are run in order).
print(f"We obtained a profit of ${scoring_utils.get_profit(X_total, y_total, yhat_total):,.2f}")

# Hyperopt

In [None]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [None]:

# Reference configuration with scale_pos_weight fixed at 5.
# NOTE(review): `modelfit` is not defined or imported in the visible part of this
# notebook; unless it is provided elsewhere this cell raises NameError — confirm.
xgb1 = xgb.XGBClassifier(
 learning_rate =0.1,
 n_estimators=100,
 verbosity=2,
 max_depth=11,
 min_child_weight=10,
 gamma=0.1,
 subsample=0.9,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 scale_pos_weight=5,
 seed=27)
modelfit(xgb1, X_train, y_train)

In [None]:
from sklearn.metrics import recall_score

In [None]:
# Hyperopt search space. hp.uniform draws continuous values; hp.quniform draws values
# rounded to a multiple of the last argument (here 1) but still returns floats, which
# is why the objective casts some of them with int().
# 'scale_pos_weight' and 'subsample' are plain constants, i.e. they are not searched.
space={
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
    'max_depth': hp.quniform("max_depth", 9, 12, 1),
        'gamma': hp.uniform ('gamma', 0, 1.5),
        'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
        'reg_lambda' : hp.uniform('reg_lambda', 0,1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 20, 1),
        'scale_pos_weight': 5,
        'subsample': .9
    }

In [None]:
from xgboost import XGBClassifier

In [None]:

# This cell repeats the earlier xgb1 / modelfit cell verbatim.
# NOTE(review): `modelfit` is not defined or imported in the visible part of this
# notebook; unless it is provided elsewhere this cell raises NameError — confirm.
xgb1 = xgb.XGBClassifier(
 learning_rate =0.1,
 n_estimators=100,
 verbosity=2,
 max_depth=11,
 min_child_weight=10,
 gamma=0.1,
 subsample=0.9,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 scale_pos_weight=5,
 seed=27)
modelfit(xgb1, X_train, y_train)

In [None]:
# Reference configuration; it is only constructed here and never fitted in this cell.
# NOTE(review): 'recall' does not appear among XGBoost's documented built-in
# eval_metric names, and early_stopping_rounds needs an eval_set at fit time —
# confirm both before calling fit() on this object.
model = XGBClassifier(
        learning_rate= 0.1, 
        max_depth = 11,
        gamma = 0.1,           
        colsample_bytree= .8,
        min_child_weight= 10,
        scale_pos_weight= 5,
        subsample= .9,
        n_estimators = 100, 
        eval_metric='recall',
        early_stopping_rounds=10 ,
        verbosity=2,
        objective= 'binary:logistic'
    )

In [None]:
from sklearn.metrics import f1_score

In [None]:
def hyperparameter_tuning(space):
    """Hyperopt objective: fit one XGBoost candidate and return its negated F1.

    Args:
        space: dict of sampled hyperparameters. Must contain 'learning_rate',
            'max_depth', 'gamma', 'reg_alpha', 'reg_lambda', 'colsample_bytree'
            and 'min_child_weight'. 'scale_pos_weight' and 'subsample' are
            optional and default to 5 and 0.9.

    Returns:
        dict with 'loss' (the negative F1 on a validation split, because fmin
        minimises), 'status' and the fitted 'model'.
    """
    # Hold the validation data out of the training set. Early stopping and the
    # objective must not see X_test, otherwise the test scores reported afterwards
    # are optimistically biased. The fixed seed gives every trial the same split.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, train_size=.9, stratify=y_train, random_state=27
    )

    model = XGBClassifier(
        learning_rate=space['learning_rate'],
        # quniform yields floats; these two are cast to integers.
        max_depth=int(space['max_depth']),
        gamma=space['gamma'],
        reg_alpha=int(space['reg_alpha']),
        reg_lambda=space['reg_lambda'],
        colsample_bytree=space['colsample_bytree'],
        min_child_weight=space['min_child_weight'],
        # Honour the values carried in the space, falling back to the previous constants.
        scale_pos_weight=space.get('scale_pos_weight', 5),
        subsample=space.get('subsample', .9),
        n_estimators=100,
        eval_metric='auc',
        early_stopping_rounds=10
    )

    # The last entry of eval_set is the one early stopping monitors.
    evaluation = [(X_fit, y_fit), (X_val, y_val)]

    model.fit(X_fit, y_fit,
              eval_set=evaluation,
              verbose=True)

    pred = model.predict(X_val)
    f1 = f1_score(y_val, pred)
    print("SCORE:", f1)
    # change the metric if you like
    return {'loss': -f1, 'status': STATUS_OK, 'model': model}

In [None]:
# Profit when the predictions equal the true labels, i.e. the best achievable figure.
scoring_utils.get_profit(X_total, y_total, y_total)

In [None]:
# Run 100 TPE-guided evaluations of hyperparameter_tuning over `space`;
# `trials` collects the result dict returned by each evaluation.
trials = Trials()
best = fmin(fn=hyperparameter_tuning,
            space=space,
            algo=tpe.suggest,
            max_evals=100,
            trials=trials)

print (best)

In [None]:
# Rebuild a classifier from the best point found by hyperopt. Only max_depth is cast
# to int here; scale_pos_weight and subsample are hard-coded to the values used
# during the search.
# NOTE(review): no early_stopping_rounds is set, so this model trains all 100 rounds
# whereas the searched candidates could stop early — confirm this is intended.
trial = XGBClassifier(
    objective='binary:logistic',
    colsample_bytree=best['colsample_bytree'],
    gamma=best['gamma'],
    learning_rate=best['learning_rate'],
    max_depth=int(best['max_depth']),
    min_child_weight=best['min_child_weight'],
    reg_alpha=best['reg_alpha'],
    reg_lambda=best['reg_lambda'],
    scale_pos_weight= 5,
    subsample= .9,
    n_estimators = 100, 
    eval_metric='auc'
)

# Hand-tuned configuration kept for comparison; it is only constructed here, not fitted.
old_version = XGBClassifier(
 learning_rate =0.1,
 n_estimators=100,
 verbosity=2,
 max_depth=11,
 min_child_weight=10,
 gamma=0.1,
 subsample=0.9,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 scale_pos_weight=5,
 seed=27)

In [None]:

# Fit the hyperopt-selected model, logging AUC on the train and test sets per round.
# `trial` has no early stopping configured, so eval_set is used for monitoring only.
evaluation = [( X_train, y_train), ( X_test, y_test)]

trial.fit(X_train, y_train,
            eval_set=evaluation,
            verbose=True)

In [None]:
# Rebind yhat_test / yhat_total to the predictions of the hyperopt-selected model.
yhat_test = trial.predict(X_test)
yhat_total = trial.predict(X_total)


In [None]:
# Metrics of the current yhat_test against the held-out test labels.
scoring_utils.get_metrics(y_test, yhat_test, "Trial Test")

In [None]:
# These metrics cover the whole dataset, so they are labelled "Total" instead of
# reusing the test label.
scoring_utils.get_metrics(y_total, yhat_total, "Trial Total")

In [None]:
# yhat_total holds the predictions of `trial` (old_version is never fitted in the
# visible cells), so the plot is titled after the model it actually shows.
scoring_utils.get_confusion_matrix(y_total, yhat_total, "Trial model")

In [None]:
# Profit over the whole dataset for the current yhat_total
# (the hyperopt-selected model when the cells are run in order).
scoring_utils.get_profit(X_total, y_total, yhat_total)