### Fine-tuning

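Let's fine-tune both the Random Forest (RF) and XGBoost (XGB) classifiers with a randomized hyperparameter search scored by stratified cross-validation.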

***Random Forest Classifier Fine-tuning***

In [None]:
# Candidate hyperparameter values for the Random Forest randomized search.

# Number of trees in the forest: 4 evenly spaced values between 100 and 1000
n_estimators = list(map(int, np.linspace(start=100, stop=1000, num=4)))

# Number of features to consider at every split
max_features = [4, 6, 8]

# Maximum number of levels in a tree: 10 evenly spaced values between 10 and 110
# (truncated to int), plus None to allow unlimited depth
max_depth = [*map(int, np.linspace(10, 110, num=10)), None]

# Minimum number of samples required to split a node
min_samples_split = [20, 40, 60]

# Minimum number of samples required at each leaf node (>1 to reduce overfitting)
min_samples_leaf = [2, 5, 10, 20]

# Method of selecting samples for training each tree.
# Only bootstrap sampling is searched; [True, False] was the wider alternative.
bootstrap = [True]

# Assemble the search space consumed by the randomized search
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap,
}
pprint(random_grid)

In [None]:
# Set up a randomized hyperparameter search for the Random Forest classifier,
# sampling candidates from random_grid.

# Base estimator. random_state is fixed so the fitted forests, and therefore the
# selected hyperparameters, are reproducible after a kernel restart.
rf = RandomForestClassifier(n_jobs=-1, random_state=42)

# Search budget: 20 sampled parameter combinations, each scored with 3-fold CV
folds = 3
param_comb = 20

skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)

# Pass the splitter itself instead of skf.split(...). The split() call returns a
# one-shot generator that is consumed by the first fit, so re-running the fit
# cell (or cloning the search object) would fail. With a fixed random_state the
# splitter yields the same folds on every fit.
# NOTE(review): no `scoring` is given, so candidates are ranked by the
# estimator's default score, while the XGBoost search ranks by 'roc_auc' —
# confirm this difference is intended.
# NOTE(review): both the forest and the search use n_jobs=-1; confirm the nested
# parallelism does not oversubscribe the machine.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=param_comb,
    cv=skf,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

In [None]:
# Fit the random search model on the training data.
# `timer` is a helper defined elsewhere in the notebook; presumably calling it
# with no argument returns a start timestamp and calling it with that timestamp
# reports the elapsed time — confirm against its definition.
start_time = timer() #start
# The target is passed as a flattened array via .values.ravel()
rf_random.fit(x_train_final, y_train.values.ravel())
timer(start_time) #end

# Check the best parameters (last expression of the cell, so it is displayed)
rf_random.best_params_

# Author's runtime note: the search took about 30 minutes

In [None]:
# Get the model: keep the best estimator found by the Random Forest random
# search (RS) for the scoring step that follows
RF_RS = rf_random.best_estimator_

In [None]:
# Score the tuned Random Forest with 5-fold cross-validation on the training data
start_time = timer()  # start
RF_RS_scores = cross_validate(
    RF_RS,
    x_train_final,
    y_train.values.ravel(),
    cv=5,
    scoring=scoring,
)
timer(start_time)  # end

# Store results: mean of each metric over the folds, rounded to 3 decimals
RF_RS_AUC, RF_RS_Acc, RF_RS_Prec, RF_RS_Rec = (
    RF_RS_scores[key].mean().round(3)
    for key in ('test_roc_auc', 'test_accuracy', 'test_precision', 'test_recall')
)

# Author's runtime note: takes about 6 minutes

***XGBoost Classifier Fine-tuning***

In [None]:
# Set the parameter grid for the XGBoost randomized search
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5],
        'n_estimators':[200,400,600]}

# Base XGBoost classifier with a fixed learning rate; the remaining
# hyperparameters come from `params` during the search.
# `verbosity=0` and `n_jobs=1` replace the deprecated `silent=True` and
# `nthread=1`: recent XGBoost releases no longer accept `silent` and warn about
# it as an unused parameter on every fit. One thread per model is kept because
# the surrounding search already runs candidates in parallel.
xgb_c = xgb.XGBClassifier(learning_rate=0.02, objective='binary:logistic',
                          verbosity=0, n_jobs=1)

In [None]:
# Random search of parameters for XGBoost: 20 sampled parameter combinations,
# each scored by ROC AUC with 3-fold stratified cross-validation
folds = 3
param_comb = 20

skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)

# Pass the splitter itself instead of skf.split(...). The split() call returns a
# one-shot generator that is consumed by the first fit, so re-running the fit
# cell (or cloning the search object) would fail. With a fixed random_state the
# splitter yields the same folds on every fit.
xgb_random = RandomizedSearchCV(
    xgb_c,
    param_distributions=params,
    n_iter=param_comb,
    scoring='roc_auc',
    n_jobs=4,
    cv=skf,
    verbose=3,
    random_state=42,
)

In [None]:
# Fit the random search model on the training data.
# `timer` is a helper defined elsewhere in the notebook; presumably calling it
# with no argument returns a start timestamp and calling it with that timestamp
# reports the elapsed time — confirm against its definition.
start_time = timer() #start
# The target is passed as a flattened array via .values.ravel()
xgb_random.fit(x_train_final, y_train.values.ravel())
timer(start_time) #end

# Check the best parameters (last expression of the cell, so it is displayed)
xgb_random.best_params_

# Author's runtime note: the search took about 12 minutes

In [None]:
# Get the model: keep the best estimator found by the XGBoost random search (RS)
# for the scoring step that follows
XGB_RS = xgb_random.best_estimator_

In [None]:
# Score the tuned XGBoost model with 5-fold cross-validation on the training data
start_time = timer()  # start
XGB_RS_scores = cross_validate(
    XGB_RS,
    x_train_final,
    y_train.values.ravel(),
    cv=5,
    scoring=scoring,
)
timer(start_time)  # end

# Store results: mean of each metric over the folds, rounded to 3 decimals
XGB_RS_AUC, XGB_RS_Acc, XGB_RS_Prec, XGB_RS_Rec = (
    XGB_RS_scores[key].mean().round(3)
    for key in ('test_roc_auc', 'test_accuracy', 'test_precision', 'test_recall')
)

# Author's runtime note: takes about 6 minutes

In [None]:
# Check model outputs so far: one row per model with its cross-validated
# training metrics. "RS" marks the variants tuned by randomized search.
# The labels say "Classifier" because the tuned models are a
# RandomForestClassifier and an XGBClassifier and the metrics are
# classification metrics; the earlier "Regression"/"Regressor" labels were
# misleading.
models = [('Logistic Regression (train)', LogR_AUC, LogR_Acc, LogR_Prec, LogR_Rec),
          ('Random Forest Classifier (train)', RF_AUC, RF_Acc, RF_Prec, RF_Rec),
          ('Random Forest Classifier RS (train)', RF_RS_AUC, RF_RS_Acc, RF_RS_Prec, RF_RS_Rec),
          ('XG Boost Classifier (train)', XGB_AUC, XGB_Acc, XGB_Prec, XGB_Rec),
          ('XG Boost Classifier RS (train)', XGB_RS_AUC, XGB_RS_Acc, XGB_RS_Prec, XGB_RS_Rec)
        ]

result = pd.DataFrame(data = models, columns=['Model', 'AUC', 'Accuracy','Precision', 'Recall'])
result