# Boosting Algorithm

## Library Imports

In [None]:
import warnings
import pandas as pd

from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score

from pickle import dump

warnings.filterwarnings('ignore')

## Getting the Data

In [169]:
train = pd.read_csv('https://raw.githubusercontent.com/Diegomca98/4geeks-ml-template-prjs/random-forest/data/processed/clean_train.csv')
test = pd.read_csv('https://raw.githubusercontent.com/Diegomca98/4geeks-ml-template-prjs/random-forest/data/processed/clean_test.csv')

train.to_csv('../data/processed/clean_train.csv', index = False)
test.to_csv('../data/processed/clean_test.csv', index = False)

In [170]:
train_data = pd.read_csv('../data/processed/clean_train.csv')
test_data = pd.read_csv('../data/processed/clean_test.csv')

In [171]:
train_data.head()

Unnamed: 0,Pregnancies,Glucose,Insulin,BMI,Age,Outcome
0,0.333333,0.490323,0.159574,0.505112,0.15,0.0
1,0.666667,0.316129,0.07565,0.214724,0.033333,0.0
2,0.555556,0.0,0.0,0.139059,0.25,0.0
3,0.111111,0.303226,0.0,0.224949,0.0,0.0
4,0.666667,0.393548,0.0,0.292434,0.083333,0.0


In [172]:
test_data.head()

Unnamed: 0,Pregnancies,Glucose,Insulin,BMI,Age,Outcome
0,0.111111,0.483871,0.200946,0.554192,0.083333,0.0
1,0.0,0.529032,0.141844,0.188139,0.0,0.0
2,0.777778,0.593548,0.159574,0.159509,0.5,0.0
3,0.111111,0.277419,0.088652,0.388548,0.016667,0.0
4,0.333333,0.296774,0.0,0.501022,0.0,0.0


In [173]:
xtrain = train_data.drop('Outcome', axis = 1)
ytrain = train_data['Outcome']

xtest = test_data.drop('Outcome', axis = 1)
ytest = test_data['Outcome']

## Model Creation

In [198]:
XGB_model = XGBClassifier()
XGB_model.fit(xtrain, ytrain)

In [199]:
ypredict = XGB_model.predict(xtest)
ypredict

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 0])

In [200]:
XGB_model.get_params()

{'objective': 'binary:logistic',
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'device': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'gamma': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': None,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'multi_strategy': None,
 'n_estimators': None,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': 42,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [201]:
accuracy_score(ytest, ypredict)

0.697841726618705

<h2 style='text-align:center;'>Model Optimization</h2>

### Model Optimization - GridSearchCV

In [229]:
# hps = {
#     'learning_rate': [0.001, 0.01, 0.05, 0.1, 1],
#     'n_estimators': [100, 200, 300],
#     'max_depth': [1, 2, 3, 4, 5, 10, 15, 20],
#     'min_child_weight': [1, 2, 3],
#     'subsample': [0.6, 0.8, 1.0],
#     'colsample_bytree': [0.6, 0.8, 1.0]
# }

hps = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.001, 0.01, 0.1, 1],
}

grid_search = GridSearchCV(
    XGBClassifier(),
    param_grid = hps,
    scoring = 'accuracy',
    cv = 10,
)

grid_search.fit(xtrain, ytrain)

In [230]:
grid_search.best_params_

{'learning_rate': 0.01, 'n_estimators': 300}

In [231]:
model_grid_opt = XGBClassifier(
    learning_rate = 0.1,
    n_estimators = 300,
    random_state = 42
)

model_grid_opt.fit(xtrain, ytrain)

In [232]:
ypred_grid_opt = XGB_model.predict(xtest)
ypred_grid_opt

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 0])

### Model Optimization - RandomizedSearchCV

In [235]:
# hps = {
#     'learning_rate': [0.001, 0.01, 0.05, 0.1, 1],
#     'n_estimators': [100, 200, 300],
#     'max_depth': [1, 2, 3, 4, 5, 10, 15, 20],
#     'min_child_weight': [1, 2, 3],
#     'subsample': [0.6, 0.8, 1.0],
#     'colsample_bytree': [0.6, 0.8, 1.0]
# }
hps = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.001, 0.01, 0.1, 1],
}

random_search = RandomizedSearchCV(
    XGBClassifier(),
    param_distributions = hps,
    scoring = 'accuracy',
    cv = 10,
)

random_search.fit(xtrain, ytrain)

In [236]:
random_search.best_params_

{'n_estimators': 300, 'learning_rate': 0.01}

In [237]:
# model_grid_opt = XGBClassifier(
#     subsample = 0.6,
#     n_estimators = 300,
#     min_child_weight = 1,
#     max_depth = 10,
#     learning_rate = 0.01,
#     colsample_bytree = 0.8
# ).fit(xtrain, ytrain)

model_rand_opt = XGBClassifier(
    n_estimators = 300,
    learning_rate = 0.01,
    random_state = 42
).fit(xtrain, ytrain)

In [238]:
ypred_rand_opt = XGB_model.predict(xtest)
ypred_rand_opt

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 0])

In [244]:
print('Grid and Random Search Optimization:')
print(accuracy_score(ytest, ypred_grid_opt))
print(accuracy_score(ytest, ypred_rand_opt))

Grid and Random Search Optimization:
0.697841726618705
0.697841726618705


In my case, the Optimized `RandomForestClassifier()`(0.7553956834532374) seems to be more accurate than the Optimized `XGBClassifier()`(0.697841726618705)

In [245]:
dump(model_grid_opt, open("../models/boosting_algorithm-nest-300_learn-rate-01_rand-state-42.sav", "wb"))