# Tunability for XGBoostClassifier

# Intro

In [31]:
import pandas as pd
import warnings 
warnings.filterwarnings("ignore")

random_state = pd.read_csv("../../data/vars.csv")['random_state'].iloc[0]
n_iter = pd.read_csv("../../data/vars.csv")['n_iter'].iloc[0]
n_iter_BS = pd.read_csv("../../data/vars.csv")['n_iter_BS'].iloc[0]

In [32]:
df_college = pd.read_csv("../../data/processed/college.csv")
df_credit = pd.read_csv("../../data/processed/credit.csv")
df_diabetes = pd.read_csv("../../data/processed/diabetes.csv")
df_penguins = pd.read_csv("../../data/processed/penguins.csv")

In [33]:
print(df_college.info())
print()
print(df_credit.info())
print()
print(df_diabetes.info())
print()
print(df_penguins.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   type_school            1000 non-null   object 
 1   school_accreditation   1000 non-null   object 
 2   gender                 1000 non-null   object 
 3   interest               1000 non-null   object 
 4   residence              1000 non-null   object 
 5   parent_age             1000 non-null   int64  
 6   parent_salary          1000 non-null   int64  
 7   house_area             1000 non-null   float64
 8   average_grades         1000 non-null   float64
 9   parent_was_in_college  1000 non-null   bool   
 10  will_go_to_college     1000 non-null   int64  
dtypes: bool(1), float64(2), int64(3), object(5)
memory usage: 79.2+ KB
None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype 

In [34]:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.impute import SimpleImputer
import xgboost as xgb
import numpy as np


num_pipeline = Pipeline(steps=[
    ('num_impute', SimpleImputer(strategy='mean')),
    ('scale', MinMaxScaler())
])

cat_pipeline = Pipeline(steps=[
    ('cat_impute', SimpleImputer(strategy='most_frequent')),
    ('one-hot', OneHotEncoder(handle_unknown='ignore'))
])

col_trans = ColumnTransformer([
    ('num_pipeline', num_pipeline, make_column_selector(dtype_include = np.number)),
    ('cat_pipeline', cat_pipeline, make_column_selector(dtype_include = np.object_))
])

model_pipeline = Pipeline(steps=[
    ('preprocessing', col_trans),
    ('model', xgb.XGBClassifier())
])

# RandomizedSearchCV

In [35]:
from scipy.stats import uniform

xgboost_params = {
    'model__booster': ['gbtree', 'gblinear', 'dart'],  
    'model__learning_rate': uniform(0.01, 0.3),  
    'model__max_depth': range(3, 11),  
    # 'min_child_weight': uniform(1, 10),  
    # 'subsample': uniform(0.6, 0.4),  
    # 'colsample_bytree': uniform(0.6, 0.4),  
    # 'gamma': uniform(0, 0.5),  
    'model__lambda': uniform(0, 3),  
    # 'alpha': uniform(0, 3), 
    # 'n_estimators': range(50, 301),  
}

## Credit dataset

In [36]:
from sklearn.model_selection import RandomizedSearchCV

X = df_credit.iloc[:, 0:-1]
y = df_credit.iloc[:, -1]

random_search_model = RandomizedSearchCV(estimator = model_pipeline,
                                   param_distributions = xgboost_params,
                                   n_iter = n_iter,
                                   cv = 5,
                                   scoring = 'f1',
                                   random_state = random_state)

random_search_model.fit(X, y)

best_f1_credit = random_search_model.best_score_

print(f"Best f1 score: {best_f1_credit}")
print(f"Best params: {random_search_model.best_params_}")

random_search_model.best_estimator_

Best f1 score: 0.8484097223141853
Best params: {'model__booster': 'dart', 'model__lambda': 2.4623541591847404, 'model__learning_rate': 0.11389274604783184, 'model__max_depth': 6}


## Data frame with params, results and mean result

In [37]:
tested_params = random_search_model.cv_results_['params']

In [38]:
df_xgboost = pd.DataFrame(tested_params)
df_xgboost['credit_f1'] = random_search_model.cv_results_['mean_test_score']
df_xgboost

Unnamed: 0,model__booster,model__lambda,model__learning_rate,model__max_depth,credit_f1
0,dart,2.842428,0.072822,8,0.841026
1,dart,0.322970,0.188516,6,0.816028
2,gblinear,1.256422,0.110622,7,0.524043
3,dart,1.314424,0.230765,7,0.831328
4,gbtree,1.736576,0.203607,8,0.829256
...,...,...,...,...,...
95,gblinear,1.280339,0.129165,9,0.502844
96,gbtree,0.243740,0.145204,6,0.815526
97,gbtree,2.313443,0.238380,6,0.831142
98,gbtree,1.866815,0.152680,10,0.833254


## College dataset

In [39]:
from sklearn.model_selection import GridSearchCV

X = df_college.iloc[:, 0:-1]
y = df_college.iloc[:, -1]

formatted_params = []
for params in tested_params:
    formatted_param = {key: [value] for key, value in params.items()}
    formatted_params.append(formatted_param)

grid_search_model = GridSearchCV(model_pipeline,
                                 param_grid=formatted_params,
                                 scoring='f1',
                                 cv = 5)

grid_search_model.fit(X, y)

df_xgboost['college_f1'] = grid_search_model.cv_results_['mean_test_score']

best_f1_college = grid_search_model.best_score_

print(f"Best f1 score: {best_f1_college}")
print(f"Best params: {grid_search_model.best_params_}")
grid_search_model.best_estimator_

Best f1 score: 0.9004796102365622
Best params: {'model__booster': 'gbtree', 'model__lambda': 0.19759142942952823, 'model__learning_rate': 0.16677135315523955, 'model__max_depth': 7}


## Diabetes dataset

In [40]:
from sklearn.model_selection import GridSearchCV

X = df_diabetes.iloc[:, 0:-1]
y = df_diabetes.iloc[:, -1]

formatted_params = []
for params in tested_params:
    formatted_param = {key: [value] for key, value in params.items()}
    formatted_params.append(formatted_param)

grid_search_model = GridSearchCV(model_pipeline,
                                 param_grid=formatted_params,
                                 scoring='f1',
                                 cv = 5)

grid_search_model.fit(X, y)

df_xgboost['diabetes_f1'] = grid_search_model.cv_results_['mean_test_score']

best_f1_diabetes = grid_search_model.best_score_

print(f"Best f1 score: {best_f1_diabetes}")
print(f"Best params: {grid_search_model.best_params_}")
grid_search_model.best_estimator_

Best f1 score: 0.6507042649686119
Best params: {'model__booster': 'dart', 'model__lambda': 1.6564300258144156, 'model__learning_rate': 0.14870826209311114, 'model__max_depth': 3}


## Penguins dataset

In [41]:
from sklearn.model_selection import GridSearchCV

X = df_penguins.iloc[:, 0:-1]
y = df_penguins.iloc[:, -1]

formatted_params = []
for params in tested_params:
    formatted_param = {key: [value] for key, value in params.items()}
    formatted_params.append(formatted_param)

grid_search_model = GridSearchCV(model_pipeline,
                                 param_grid=formatted_params,
                                 scoring='f1',
                                 cv = 5)

grid_search_model.fit(X, y)

df_xgboost['penguins_f1'] = grid_search_model.cv_results_['mean_test_score']

best_f1_penguins = grid_search_model.best_score_


print(f"Best f1 score: {best_f1_penguins}")
print(f"Best params: {grid_search_model.best_params_}")
grid_search_model.best_estimator_

Best f1 score: 1.0
Best params: {'model__booster': 'gblinear', 'model__lambda': 0.002557428295219988, 'model__learning_rate': 0.14038355070715738, 'model__max_depth': 7}


## Tunability 

In [42]:
df_xgboost['mean_f1'] = df_xgboost[['credit_f1', 'college_f1', 'diabetes_f1', 'penguins_f1']].mean(axis=1)
df_xgboost

Unnamed: 0,model__booster,model__lambda,model__learning_rate,model__max_depth,credit_f1,college_f1,diabetes_f1,penguins_f1,mean_f1
0,dart,2.842428,0.072822,8,0.841026,0.883098,0.628308,0.989944,0.835594
1,dart,0.322970,0.188516,6,0.816028,0.892014,0.619144,0.989944,0.829282
2,gblinear,1.256422,0.110622,7,0.524043,0.685079,0.000000,0.710573,0.479924
3,dart,1.314424,0.230765,7,0.831328,0.889861,0.599954,0.989944,0.827772
4,gbtree,1.736576,0.203607,8,0.829256,0.894055,0.605083,0.989944,0.829584
...,...,...,...,...,...,...,...,...,...
95,gblinear,1.280339,0.129165,9,0.502844,0.685079,0.000000,0.710573,0.474624
96,gbtree,0.243740,0.145204,6,0.815526,0.885084,0.621163,0.989944,0.827929
97,gbtree,2.313443,0.238380,6,0.831142,0.895136,0.603728,0.989944,0.829987
98,gbtree,1.866815,0.152680,10,0.833254,0.881082,0.620562,0.989944,0.831210


In [43]:
# sigma star
sigma_star_row = df_xgboost[df_xgboost['mean_f1'] == max(df_xgboost['mean_f1'])]
sigma_star_row

Unnamed: 0,model__booster,model__lambda,model__learning_rate,model__max_depth,credit_f1,college_f1,diabetes_f1,penguins_f1,mean_f1
16,gbtree,2.294253,0.095158,4,0.842325,0.892766,0.640371,0.989944,0.841352


### (3.3) Measuring Overall Tunability of a ML Algorithm

In [44]:
d_credit = -sigma_star_row['credit_f1'].iloc[0] + best_f1_credit
d_college = -sigma_star_row['college_f1'].iloc[0] + best_f1_college
d_diabetes = -sigma_star_row['diabetes_f1'].iloc[0] + best_f1_diabetes
d_penguins = -sigma_star_row['penguins_f1'].iloc[0] + best_f1_penguins

d = pd.Series([d_credit, d_college, d_diabetes, d_penguins])
print(d)
print(f"Mean: {d.mean()}")


0    0.006085
1    0.007713
2    0.010333
3    0.010056
dtype: float64
Mean: 0.00854686176424077


# BayesSearchCV

In [45]:
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

n_iter_BS = pd.read_csv('../../data/vars.csv')['n_iter_BS'].iloc[0]
BS_iterations = pd.read_csv('../../data/vars.csv')['BS_iterations'].iloc[0]

xgboost_params = {
    'model__booster': Categorical(['gbtree', 'gblinear', 'dart']),  
    'model__learning_rate': Real(0.01, 0.3),  
    'model__max_depth': Integer(3, 10),  
    'model__lambda': Real(0, 3),  
}

## Credit dataset

In [46]:
X = df_credit.iloc[:, 0:-1]
y = df_credit.iloc[:, -1]
best_BS_f1_credit = 0

for i in range(BS_iterations):
    bayes_search_model = BayesSearchCV(estimator=model_pipeline,
                                    search_spaces=xgboost_params,
                                    n_iter=int(n_iter_BS),
                                    #random_state=random_state,
                                    cv=5,
                                    scoring='f1')
                                    
    bayes_search_model.fit(X, y)

    if (bayes_search_model.best_score_ > best_BS_f1_credit):
        best_BS_f1_credit = bayes_search_model.best_score_
        credit_best_estimator = bayes_search_model.best_estimator_

print(f"Best f1 score: {best_BS_f1_credit}")

credit_best_estimator

Best f1 score: 0.8505815453917156


## College dataset

In [47]:
X = df_college.iloc[:, 0:-1]
y = df_college.iloc[:, -1]

best_BS_f1_college = 0

for i in range(BS_iterations):
    bayes_search_model = BayesSearchCV(estimator=model_pipeline,
                                    search_spaces=xgboost_params,
                                    n_iter=int(n_iter_BS),
                                    #random_state=random_state,
                                    cv=5,
                                    scoring='f1')
                                    
    bayes_search_model.fit(X, y)

    if (bayes_search_model.best_score_ > best_BS_f1_college):
        best_BS_f1_college = bayes_search_model.best_score_
        college_best_estimator = bayes_search_model.best_estimator_

print(f"Best f1 score: {best_BS_f1_college}")

college_best_estimator

Best f1 score: 0.9029817721233737


## Diabetes dataset

In [48]:
X = df_diabetes.iloc[:, 0:-1]
y = df_diabetes.iloc[:, -1]

best_BS_f1_diabetes = 0

for i in range(BS_iterations):
    bayes_search_model = BayesSearchCV(estimator=model_pipeline,
                                    search_spaces=xgboost_params,
                                    n_iter=int(n_iter_BS),
                                    #random_state=random_state,
                                    cv=5,
                                    scoring='f1')
                                    
    bayes_search_model.fit(X, y)

    if (bayes_search_model.best_score_ > best_BS_f1_diabetes):
        best_BS_f1_diabetes = bayes_search_model.best_score_
        diabetes_best_estimator = bayes_search_model.best_estimator_

print(f"Best f1 score: {best_BS_f1_diabetes}")
diabetes_best_estimator

Best f1 score: 0.6611248262292537


## Penguins dataset

In [49]:
X = df_penguins.iloc[:, 0:-1]
y = df_penguins.iloc[:, -1]

best_BS_f1_penguins = 0

for i in range(BS_iterations):
    bayes_search_model = BayesSearchCV(estimator=model_pipeline,
                                    search_spaces=xgboost_params,
                                    n_iter=int(n_iter_BS),
                                    #random_state=random_state,
                                    cv=5,
                                    scoring='f1')
                                    
    bayes_search_model.fit(X, y)

    if (bayes_search_model.best_score_ > best_BS_f1_penguins):
        best_BS_f1_penguins = bayes_search_model.best_score_
        penguins_best_estimator = bayes_search_model.best_estimator_

print(f"Best f1 score: {best_BS_f1_penguins}")

penguins_best_estimator

Best f1 score: 1.0


## Tunability

### (3.3) Measuring Overall Tunability of a ML Algorithm

In [50]:
d_BS_credit = -sigma_star_row['credit_f1'].iloc[0] + best_BS_f1_credit
d_BS_college = -sigma_star_row['college_f1'].iloc[0] + best_BS_f1_college
d_BS_diabetes = -sigma_star_row['diabetes_f1'].iloc[0] + best_BS_f1_diabetes
d_BS_penguins = -sigma_star_row['penguins_f1'].iloc[0] + best_BS_f1_penguins

d_BS = pd.Series([d_BS_credit, d_BS_college, d_BS_diabetes, d_BS_penguins])
print(d_BS)
print(f"Mean: {d_BS.mean()}")

0    0.008256
1    0.010216
2    0.020754
3    0.010056
dtype: float64
Mean: 0.012320498320486678
