In [None]:
import pandas as pd
from xgboost import XGBClassifier
from utils import predict_and_evaluate

In [None]:
# Read the pre-split training and test sets from CSV files in the working
# directory (training file first, then the test file).
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ("train_data.csv", "test_data.csv")
)

In [None]:
# Shape of the training frame as a (rows, columns) tuple.
train_df.shape

In [None]:
# Shape of the test frame as a (rows, columns) tuple.
test_df.shape

In [None]:
# Preview the first rows of the training frame to inspect its columns.
train_df.head()

In [None]:
# Split each frame into a feature matrix (every column except 'Class')
# and the 'Class' target column.
X_train, y_train = train_df.drop(columns='Class'), train_df['Class']
X_test, y_test = test_df.drop(columns='Class'), test_df['Class']

# Hyperparameter Tuning
Hyperparameter tuning is the search for the hyperparameter values — settings chosen before training, such as the number of trees or the maximum tree depth — that give the model the best performance on a chosen metric (for example, accuracy or precision). Two of the most widely used search techniques are:

1. Grid search
2. Random search

## Grid Search

 - In this tuning technique, we build and evaluate a model for every combination of the candidate hyperparameter values. The combination that gives the highest accuracy wins.
 - The candidate values form a grid: each hyperparameter is one axis, and every point on the grid is one combination to try.
 - Each combination of parameters is evaluated in turn and its accuracy is recorded.
 - Once all the combinations have been evaluated, the model whose parameters give the top accuracy is considered the best.

In [None]:
# Example parameter grids for common tree ensembles (reference only; every entry below is commented out)

# # Parameter matrix for Random Forest
# rf_params = {'n_estimators': [100, 200, 300, 500, 800],
#                'max_features': ['sqrt', 'log2'],
#                'max_depth': [8, 10, 20],
#                'min_samples_split': [2, 5, 10]
#               }

# # Parameter matrix for Gradient Boost
# gbm_params = {  "n_estimators":[100,150,200],
#                 "learning_rate": [0.01, 0.025, 0.05],
#                 "max_depth":[3,5],
#                 "subsample":[ 0.8, 0.9,1.0], 
#             }

# # Parameter matrix for XGBoost
# xgb_params = {'n_estimators' : [100,400,800],
#          "learning_rate"    : [0.01,0.05, 0.10] ,
#          "max_depth"        : [ 3, 5, 7, 13],
#          "min_child_weight" : [ 3, 5],}

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Small search space for the XGBoost classifier: 2 x 2 = 4 candidate
# combinations, keyed by the estimator's parameter names.
xgb_params = dict(
    n_estimators=[100, 200],
    max_depth=[3, 4],
)

In [None]:
# Exhaustive search: evaluates a default-configured XGBClassifier on every
# combination in xgb_params, running up to 4 jobs in parallel.
gs_xgb = GridSearchCV(XGBClassifier(), xgb_params, n_jobs=4)

In [None]:
# Run the grid search on the training data (one candidate per combination
# in xgb_params).
gs_xgb.fit(X_train, y_train)

In [None]:
# Parameter combination that scored best during the grid search.
gs_xgb.best_params_

In [None]:
# Estimator configured with the best-scoring parameters from the grid search.
gs_xgb.best_estimator_

In [None]:
# Predictions: pass the grid-search winner and the test split to the project
# helper predict_and_evaluate (imported from utils; its exact output is not
# visible here -- presumably predictions plus metrics, verify against utils).
xgb_gs_hpo = predict_and_evaluate(gs_xgb.best_estimator_, X_test, y_test)

## Random Search

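Random search is a technique where, instead of trying every combination, a fixed number of randomly sampled hyperparameter combinations (set by `n_iter`) are evaluated, and the best-scoring one is kept. It is usually much cheaper than grid search when the search space is large, at the cost of possibly missing the single best combination.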

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Randomized search: draws n_iter parameter combinations from xgb_params
# instead of trying all of them. random_state is fixed so the same
# combinations are drawn on every run, keeping the notebook reproducible
# under Restart & Run All.
rs_xgb = RandomizedSearchCV(estimator=XGBClassifier(),
                            param_distributions=xgb_params,
                            n_iter=2,
                            n_jobs=2,
                            random_state=42)

In [None]:
# Fit the randomized search on the training data; the %time line magic
# reports how long the call took.
%time rs_xgb.fit(X_train, y_train)

In [None]:
# Parameter combination that scored best among the randomly sampled candidates.
rs_xgb.best_params_

In [None]:
# Estimator configured with the best-scoring parameters from the randomized search.
rs_xgb.best_estimator_

In [None]:
# Predictions: evaluate the randomized-search winner on the test split.
# This must use rs_xgb (not gs_xgb); otherwise xgb_rs_hpo would simply repeat
# the grid-search result instead of reflecting the random search.
xgb_rs_hpo = predict_and_evaluate(rs_xgb.best_estimator_, X_test, y_test)