# System Requirements 

In [1]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [2]:
!brew install libomp

To reinstall 10.0.0, run `brew reinstall libomp`


# Library Imports

In [89]:
import pandas as pd

from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# Data Import

In [4]:
df = pd.read_csv('../data/processed/cleaned_dataframe.gz', compression='gzip')

# Data Preprocessing

In [39]:
X = df.drop(['Class'], axis=1)
y = df['Class']

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.3, 
                                                    stratify=y, 
                                                    random_state=1)


# Learn.co code

In [187]:
# Instantiate XGBClassifier
clf = XGBClassifier()

# Fit XGBClassifier
clf.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [191]:
param_grid = {
    'learning_rate': [0.3],
    'max_depth': [6],
    'min_child_weight': [1],
    'subsample': [0.5],
    'n_estimators': [30],
}

cv = None
n_jobs = -1

## Original code, with accuracy

In [189]:
# Create GridSearch Object
grid_clf = GridSearchCV(clf, 
                        param_grid, 
                        scoring='accuracy', 
                        cv=cv, 
                        n_jobs=n_jobs)

# Fit our GridSearch Object and pass  in the training data
grid_clf.fit(X_train, y_train)

# Store the best parameter combination found in the grid search
best_parameters = grid_clf.best_params_
print('Grid Search found the following optimal parameters: ')
for param_name in sorted(best_parameters.keys()):
    print('{}: {}'.format(param_name, best_parameters[param_name]))

# Predict on training and test data
training_preds = grid_clf.predict(X_train)
test_preds = grid_clf.predict(X_test)

# Accuracy of predictions on training and test data
training_accuracy = accuracy_score(y_train, training_preds)
test_accuracy = accuracy_score(y_test, test_preds)
print('')
print('Training Accuracy: {:.4}%'.format(training_accuracy * 100))
print('Validation accuracy: {:.4}%'.format(test_accuracy * 100))


Grid Search found the following optimal parameters: 
learning_rate: 0.1
max_depth: 6
min_child_weight: 1
n_estimators: 100
subsample: 0.7

Training Accuracy: 99.98%
Validation accuracy: 99.96%


## Adapted code to have f1 as scoring

In [241]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline



random_seed = 1

specify gbtree as its a classification.

In [205]:
# Instantiate XGBClassifier
clf = XGBClassifier(booster='gbtree', 
                    seed=random_seed
                   )

# Build Parameter Grid
param_grid = {
    'learning_rate': [0.3],
    'objective': ['binary:logistic', 'binary:hinge'],
    'max_depth': [6],
    'min_child_weight': [1],
}

In [214]:
def best_f1_score(clf, param_grid, scoring='f1', cv=3, n_jobs=-1):
    '''Conducts Gridsearch to return the best f1 score for a classifier as well as y_pred'''
    
    # Create GridSearch Object
    grid_clf = GridSearchCV(clf, 
                            param_grid, 
                            scoring='f1', 
                            cv=cv
                           )

    # Fit our GridSearch Object and pass  in the training data
    grid_clf.fit(X_train, y_train)

    # Best Score
    best_f1 = grid_clf.best_score_
    print('Best f1 score: {}'.format(round(best_f1, 5)))
    print('\nOptimal parameters:')
    best_parameters = grid_clf.best_estimator_.get_params()
    for param_name in sorted(param_grid.keys()):
        print('\t{}: {}'.format(param_name, best_parameters[param_name]))

    # Predict on test data
    y_pred = grid_clf.predict(X_test)
    # Classification report
    print('\n', classification_report(y_test, y_pred))
    
    return best_f1, y_pred

In [215]:
model_xgb = best_f1_score(clf, param_grid)

Best f1 score: 0.85238

Optimal parameters:
	learning_rate: 0.3
	max_depth: 6
	min_child_weight: 1
	objective: binary:logistic

               precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.93      0.83      0.88       148

    accuracy                           1.00     85443
   macro avg       0.97      0.92      0.94     85443
weighted avg       1.00      1.00      1.00     85443



In [238]:
# Build scores dataframe for comparison
scores_columns = ['Classifier', 'F1 Score']
scores_df = pd.DataFrame(columns=scores_columns)
scores_df

Unnamed: 0,Classifier,F1 Score


In [239]:
scores_df.loc[0] = ['XGBoost', model_xgb[0]]
scores_df

Unnamed: 0,Classifier,F1 Score
0,XGBoost,0.852381


In [240]:
# Build Confusion Matrix
confusion_matrix_xgb = confusion_matrix(y_test, model_xgb[1])
confusion_matrix_xgb

array([[85286,     9],
       [   25,   123]])

# Attempt 1

In [54]:
train = xgb.DMatrix(X_train, label=y_train)
test = xgb. DMatrix(X_test, label=y_test)

In [100]:
scores_columns = ['Accuracy', 'F1 Score', 'Hyperparameters']
scores_df = pd.DataFrame(columns=scores_columns)


In [105]:
param_grid = {'max_depth': [3,5,10,20], 
              'eta': [0.1,0.2,0.3], 
              'objective': 'multi:softmax', 
              'num_class': 2}
epochs = 10

In [106]:
# Train model
model = xgb.train(param_grid, train, epochs, verbose_eval=2)

# Use the trained model to predict classifications for the test data
predictions = model.predict(test)

# Measure accuracy of test data
xgb_acc = accuracy_score(y_test, predictions)
print('Latest Accuracy = {}'.format(round(xgb_acc, 6)))

xgb_f1 = f1_score(y_test, predictions)
print('Latest F1 Score = {}'.format(round(xgb_f1, 6)))

new_scores = pd.DataFrame([[xgb_acc, 
                            xgb_f1,
                            param_grid]],
                          columns=scores_columns)
scores_df = scores_df.append(new_scores)

scores_df

Latest Accuracy = 0.999602
Latest F1 Score = 0.879433


Unnamed: 0,Accuracy,F1 Score,Hyperparameters
0,0.999567,0.870175,"{'max_depth': 10, 'eta': 0.3, 'objective': 'mu..."
0,0.999614,0.883392,"{'max_depth': 10, 'eta': 0.3, 'objective': 'mu..."
0,0.999602,0.879433,"{'max_depth': 3, 'eta': 0.3, 'objective': 'mul..."


from Kaggle: https://www.kaggle.com/phunter/xgboost-with-gridsearchcv

In [110]:
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import RobustScaler

In [121]:
cv = 3
n_jobs = -1
verbose = 3
scoring='f1'

In [122]:
clf_xgb = xgb.XGBClassifier()

param_grid_xgb = [{'max_depth': [5,10], 
              'eta': [0.3], 
              'objective': ['multi:softmax'], 
              'num_class': [2]}]

gs_xgb = GridSearchCV(estimator=clf_xgb,
                     param_grid=param_grid_xgb,
                     scoring=scoring,
                     cv=cv,
                     n_jobs=n_jobs,
                     verbose=verbose)

In [123]:
gs_xgb.fit(X_train, y_train)

Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed:  2.1min remaining:  2.1min
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:  2.3min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:  2.3min finished


GridSearchCV(cv=3,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=None,
                                     tree_method=None,

In [126]:
best_params, score = max(clf_xgb.grid_scores_, key=lambda x:x[1])
print('Best F1 Score: {}'.format(score))

for param in sort(best_params.keys()):
    print('{}: {}%'.format(param, best_params[param]))

TypeError: 'method' object is not iterable

From kaggle(2): https://www.kaggle.com/tilii7/hyperparameter-grid-search-with-xgboost

In [176]:
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
import numpy as np

In [177]:
params = {
        'min_child_weight': [5],
        'gamma': [1],
        'subsample': [0.8],
        'colsample_bytree': [0.8, 1],
        'max_depth': [5]
        }
cv = 5
combinations = cv * np.prod([len(v) for v in params.values()])
print('There are {} combinations'.format(combinations))

There are 10 combinations


In [178]:
xgb = XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic', silent=True, nthread=1)

In [None]:
# grid = GridSearchCV(estimator=xgb, param_grid=params, scoring='roc_auc', n_jobs=4, cv=skf.split(X,Y), verbose=3 )
# grid.fit(X, Y)
# print('\n All results:')
# print(grid.cv_results_)
# print('\n Best estimator:')
# print(grid.best_estimator_)
# print('\n Best score:')
# print(grid.best_score_ * 2 - 1)
# print('\n Best parameters:')
# print(grid.best_params_)
# results = pd.DataFrame(grid.cv_results_)
# results.to_csv('xgb-grid-search-results-01.csv', index=False)

# y_test = grid.best_estimator_.predict_proba(test)
# results_df = pd.DataFrame(data={'id':test_df['id'], 'target':y_test[:,1]})
# results_df.to_csv('submission-grid-search-xgb-porto-01.csv', index=False)

In [182]:
# Set up stratified flds and grid search parameters.  
folds = 3
param_comb = 5

skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1)

random_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=param_comb, scoring='roc_auc', n_jobs=-1, cv=skf.split(X,y), verbose=3, random_state=1001 )

# Here we go

random_search.fit(X, y)


Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed:  5.1min remaining:  5.1min
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:  5.1min remaining:    0.0s


KeyboardInterrupt: 

In [None]:
print(random_search.cv_results_)
print('\n Best estimator:')
print(random_search.best_estimator_)
print('\n Best normalized gini score for %d-fold search with %d parameter combinations:' % (folds, param_comb))
print(random_search.best_score_ * 2 - 1)
print('\n Best hyperparameters:')
print(random_search.best_params_)

In [None]:
# Make prediction based on best parameters found in search
y_test = random_search.predict_proba(test)

In [81]:
# Can now tune hyperparameters OR/AND make it more efficient
# e.g. can you get away with fewer epochs? or fewer trees
