# Boosting Algorithms (Classification)

---




---

Imported Libraries

In [1]:
# Data processing
# ==================================================================================
import pandas as pd
import numpy as np

# Charts
# ==================================================================================
import matplotlib.pyplot as plt

# Preprocessing and modeling
# ==================================================================================
import xgboost
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import sklearn
from sklearn.model_selection import GridSearchCV

from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

# Warnings Configuration
# ==================================================================================
import warnings

def ignore_warn(*args, **kwargs):
    """Accept any arguments, do nothing and return None (stand-in for ``warnings.warn``)."""
    return None


# Rebind warnings.warn to the no-op above so calls made through that attribute
# are dropped (original intent: quiet the sklearn and seaborn warnings).
warnings.warn = ignore_warn
# Also install an "ignore" filter for the FutureWarning category.
warnings.filterwarnings(action="ignore", category=FutureWarning)

---

## Step 1: Decision making: which is the best dataset?



In [2]:
# Train data frames: the two feature-file variants and y_train, read in this order.
X_train_with_outliers_sel, X_train_without_outliers_sel, y_train = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/processed/X_train_with_outliers_sel.csv',
        '../data/processed/X_train_without_outliers_sel.csv',
        '../data/processed/y_train.csv',
    )
)

# Test data frames: same layout as the train files above.
X_test_with_outliers_sel, X_test_without_outliers_sel, y_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/processed/X_test_with_outliers_sel.csv',
        '../data/processed/X_test_without_outliers_sel.csv',
        '../data/processed/y_test.csv',
    )
)



In [3]:
# train_dicts (dict): variable name -> training feature frame
# =====================================================================================
train_dicts = {
    "X_train_with_outliers_sel": X_train_with_outliers_sel,
    "X_train_without_outliers_sel": X_train_without_outliers_sel,
}

# test_dicts (dict): variable name -> test feature frame
# =====================================================================================
test_dicts = {
    "X_test_with_outliers_sel": X_test_with_outliers_sel,
    "X_test_without_outliers_sel": X_test_without_outliers_sel,
}

# -.-.--.-.-.-.-.-.-.-.--.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.--.-.-.-.-.-.-

# train_dfs / test_dfs (list): the same frames addressed by position.
# Dicts keep insertion order, so index 0 is the "with outliers" variant and
# index 1 the "without outliers" one.
# =====================================================================================
train_dfs = list(train_dicts.values())
test_dfs = list(test_dicts.values())

# -.-.--.-.-.-.-.-.-.-.--.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.--.-.-.-.-.-.-

# Report the .shape of every loaded frame, one banner per group
# =====================================================================================
separator = "================================================================="

shape_report = [
    "|X_train|",
    separator,
    f"X_train_with_outliers_sel shape: {X_train_with_outliers_sel.shape} ",
    f"X_train_without_outliers_sel shape: {X_train_without_outliers_sel.shape}\n ",
    "|X_test|",
    separator,
    f"X_test_with_outliers_sel shape: {X_test_with_outliers_sel.shape} ",
    f"X_test_without_outliers_sel shape: {X_test_without_outliers_sel.shape}\n ",
    "|Y_train|",
    separator,
    f"y_train shape: {y_train.shape}\n ",
    "|Y_test|",
    separator,
    f"y_test shape: {y_test.shape} ",
]

# One print call per entry keeps the output line-for-line as before.
for report_line in shape_report:
    print(report_line)

|X_train|
X_train_with_outliers_sel shape: (614, 8) 
X_train_without_outliers_sel shape: (614, 8)
 
|X_test|
X_test_with_outliers_sel shape: (154, 8) 
X_test_without_outliers_sel shape: (154, 8)
 
|Y_train|
y_train shape: (614, 1)
 
|Y_test|
y_test shape: (154, 1) 


In [4]:
results = []

# Visit the dataset variants in parallel: dict key, training frame, matching test frame.
for df_index, (train_name, train_df, test_df) in enumerate(
    zip(train_dicts.keys(), train_dfs, test_dfs)
):
    # A fresh default classifier per variant, with a fixed seed.
    model = XGBClassifier(random_state=42)
    model.fit(train_df, y_train)

    # Predict on the test frame that belongs to the same variant.
    y_test_pred = model.predict(test_df)

    results.append(
        {
            "index": df_index,
            "train_df": train_name,
            "Accuracy_score": accuracy_score(y_test, y_test_pred),
        }
    )

# Highest accuracy (measured against y_test) first; `results` keeps its original order.
resultados = sorted(results, key=lambda entry: entry["Accuracy_score"], reverse=True)
resultados

[{'index': 1,
  'train_df': 'X_train_without_outliers_sel',
  'Accuracy_score': 0.7142857142857143},
 {'index': 0,
  'train_df': 'X_train_with_outliers_sel',
  'Accuracy_score': 0.7077922077922078}]

In [5]:
# The first entry of the sorted list is the top-scoring dataset.
best_entry = resultados[0]

summary_lines = (
    f"The best train dataframe is |{best_entry['train_df']}|.",
    "==========================================================      ",
    f"| Accuracy score: {best_entry['Accuracy_score']}   |",
    "========================================",
)
print("\n".join(summary_lines))

The best train dataframe is |X_train_without_outliers_sel|.
| Accuracy score: 0.7142857142857143   |


---

## Step 2: Model hyperparameter optimization

- ### 2.1 GridSearchCV

In [29]:
# Untuned reference model, trained on the frame at index 1 of train_dfs.
model = XGBClassifier(random_state=42).fit(train_dfs[1], y_train)

y_pred = model.predict(test_dfs[1])

# Accuracy of the default configuration; used later as the comparison baseline.
base_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
base_accuracy

0.7142857142857143

In [None]:
# Hand-picked grid of candidate values. GridSearchCV fits every combination
# once per fold, so each extra value multiplies the number of fits.
param_grid = {
    # Default GPU device selection from the list of available and supported
    # devices. Only cuda devices are supported currently.
    "device": ['gpu'],
    "eta": [0.05, 0.1, 0.3],  # learning rate
    "gamma": [0, 1],
    # 0.5 means XGBoost randomly samples half of the training data before
    # growing trees, which helps prevent overfitting.
    "subsample": [0.5],
    "sampling_method": ['uniform', 'gradient_based'],
    # Only 'default' is searched. 'update' starts from an existing model and
    # only updates its trees, so fitting from scratch with it fails; with it in
    # the grid, half of all fits ended in an error and scored NaN.
    # The key is kept so best_params_ still reports 'process_type'.
    "process_type": ['default'],
    "grow_policy": ['depthwise',   # split at nodes closest to the root.
                    'lossguide'],  # split at nodes with highest loss change.
    "multi_strategy": ['one_output_per_tree', 'multi_output_tree'],
    "lambda": [1, 3, 5],
    # NOTE(review): some tree_method / device / sampling_method pairings may be
    # unsupported by the installed XGBoost build - confirm against the docs.
    "tree_method": ['auto', 'exact', 'approx', 'hist'],
}

# Grid search over the baseline estimator: accuracy scoring, 6-fold CV.
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="accuracy",
    verbose=0,
    cv=6,
)
grid

In [31]:
# Run the search on the frame at index 1 of train_dfs.
grid.fit(train_dfs[1], y_train)

# Parameter combination with the best mean cross-validated score.
best_grid_params = grid.best_params_
print(f"Best hyperparameters: {best_grid_params}")

Best hyperparameters: {'device': 'gpu', 'eta': 0.3, 'gamma': 1, 'grow_policy': 'depthwise', 'lambda': 5, 'multi_strategy': 'one_output_per_tree', 'process_type': 'default', 'sampling_method': 'gradient_based', 'subsample': 0.9, 'tree_method': 'approx'}


> NOTE: We use `subsample = 0.5` instead of `0.9` because it performs better. The `0.9` shown in the output above is the result of a manual error in the previous step.

In [33]:
# Configuration taken from the grid-search result, with subsample set by hand.
# NOTE(review): the printed best parameters also list 'lambda': 5, which is not
# passed here, so the library default applies - confirm this is intended.
grid_search_params = {
    # Default GPU device selection from the list of available and supported
    # devices. Only cuda devices are supported currently.
    "device": 'gpu',
    # 0.5 means XGBoost randomly samples half of the training data before
    # growing trees, which helps prevent overfitting.
    "subsample": 0.5,
    "eta": 0.3,
    "gamma": 1,
    "tree_method": 'approx',
    "sampling_method": 'gradient_based',
    "process_type": 'default',
    "grow_policy": 'depthwise',
    "random_state": 42,
}

model_grid = XGBClassifier(**grid_search_params)
model_grid.fit(train_dfs[1], y_train)

y_pred = model_grid.predict(test_dfs[1])

# Test accuracy of the tuned model; compared against base_accuracy afterwards.
grid_accuracy = accuracy_score(y_test, y_pred)
grid_accuracy

0.7337662337662337

In [34]:
# Relative change of the tuned accuracy over the baseline, shown as a percentage.
relative_gain = (grid_accuracy - base_accuracy) / base_accuracy
print(f"We have an increment of {round(relative_gain * 100, 2)}%")

We have an increment of 2.73%


---

# Try with the Hyperopt library

---

In [None]:
# Sampling primitives used below:
#   hp.uniform(label, low, high)      -> a value uniformly between low and high
#   hp.quniform(label, low, high, q)  -> a value like round(uniform(low, high) / q) * q
#   (q is 1 for every quantised entry here)

# Dimensions explored by the optimiser; each label matches its dictionary key.
sampled_params = {
    'max_depth': hp.quniform('max_depth', 3, 18, 1),
    'gamma': hp.uniform('gamma', 1, 9),
    'reg_alpha': hp.quniform('reg_alpha', 40, 180, 1),
    'reg_lambda': hp.uniform('reg_lambda', 0, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'min_child_weight': hp.quniform('min_child_weight', 0, 10, 1),
}

# Values that stay constant across all evaluations.
fixed_params = {
    'n_estimators': 150,
    'seed': 42,
}

# Sampled entries first, fixed entries last - same key order as a single literal.
space = {**sampled_params, **fixed_params}

In [14]:
def objective(space):
    """Train one XGBClassifier from a sampled configuration and score it.

    Parameters
    ----------
    space : dict
        One sample drawn from the hyperopt search space. Must contain
        'n_estimators', 'max_depth', 'gamma', 'reg_alpha',
        'min_child_weight' and 'colsample_bytree'. 'reg_lambda' and 'seed'
        are used when present; otherwise the library defaults apply.

    Returns
    -------
    dict
        ``{'loss': -accuracy, 'status': STATUS_OK}``. The accuracy is negated
        because ``fmin`` minimises the loss.
    """
    clf = XGBClassifier(
        device='gpu',
        n_estimators=space['n_estimators'],
        # quniform yields floats such as 5.0; this parameter expects an integer.
        max_depth=int(space['max_depth']),
        gamma=space['gamma'],
        reg_alpha=int(space['reg_alpha']),
        # Sampled by the search space but previously never passed to the model.
        reg_lambda=space.get('reg_lambda'),
        min_child_weight=int(space['min_child_weight']),
        # A fraction: must stay a float. int() truncated every sample from
        # [0.5, 1) to 0, so this dimension was never actually explored.
        colsample_bytree=space['colsample_bytree'],
        # Fixed seed from the space so repeated evaluations are comparable.
        random_state=space.get('seed'),
    )

    # Monitoring only: no early stopping is configured, so this does not affect training.
    evaluation = [(train_dfs[1], y_train), (test_dfs[1], y_test)]

    clf.fit(train_dfs[1], y_train,
            eval_set=evaluation, verbose=False)

    # predict() already returns class labels, so no extra threshold is applied.
    # NOTE(review): scoring on the test split tunes the search to test data;
    # consider cross-validating on the training split instead.
    pred = clf.predict(test_dfs[1])
    accuracy = accuracy_score(y_test, pred)
    print ("SCORE:", accuracy)
    return {'loss': -accuracy, 'status': STATUS_OK }

In [None]:
# Container passed to fmin to record every evaluation of the objective.
trials = Trials()

fmin_settings = {
    "fn": objective,        # Objective function to optimize
    "space": space,         # Hyperparameter search space
    "algo": tpe.suggest,    # Optimization algorithm (representative TPE)
    "max_evals": 100,       # Number of optimization attempts
    "trials": trials,
}
best_hyperparams = fmin(**fmin_settings)

SCORE:                                                 
0.6428571428571429                                     
SCORE:                                                                            
0.6428571428571429                                                                
SCORE:                                                                            
0.6428571428571429                                                                
SCORE:                                                                            
0.6428571428571429                                                                
SCORE:                                                                            
0.6428571428571429                                                                
SCORE:                                                                            
0.6428571428571429                                                                
SCORE:                                                    

In [25]:
# Heading first (print's default separator adds a space before the extra
# newline), then the dictionary returned by fmin.
heading = "The best hyperparameters are : "
print(heading, "\n")
print(best_hyperparams)

The best hyperparameters are :  

{'colsample_bytree': 0.7124437387891622, 'gamma': 6.808679556357256, 'max_depth': 5.0, 'min_child_weight': 5.0, 'reg_alpha': 168.0, 'reg_lambda': 0.06685211067493846}


In [27]:
# Final model built from the values reported by the hyperopt search.
# Note: this reuses the names model_grid / grid_accuracy, so the values
# produced in the grid-search section are overwritten from here on.
hyperopt_params = {
    # Default GPU device selection from the list of available and supported
    # devices. Only cuda devices are supported currently.
    "device": 'gpu',
    # 0.5 means XGBoost randomly samples half of the training data before
    # growing trees, which helps prevent overfitting.
    "subsample": 0.5,
    "sampling_method": 'gradient_based',
    "colsample_bytree": 0.7124437387891622,
    "gamma": 6.808679556357256,
    "max_depth": 5,
    "min_child_weight": 5.0,
    "reg_alpha": 168.0,
    "reg_lambda": 0.06685211067493846,
    "random_state": 42,
}

model_grid = XGBClassifier(**hyperopt_params)
model_grid.fit(train_dfs[1], y_train)

y_pred = model_grid.predict(test_dfs[1])

# Test accuracy of the hyperopt-configured model.
grid_accuracy = accuracy_score(y_test, y_pred)
grid_accuracy

0.6428571428571429