In [1]:
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
import xgboost as xgb
import functions as f

In [2]:
# Read main.csv (path is relative to the notebook's working directory) into a DataFrame.
data_path = "main.csv"
df = pd.read_csv(data_path)

Split the data into the classic 80/20 train/test split using the split_train_test function, which is defined in the functions.py file.

In [3]:
# Separate features and labels into train and test parts. The split itself (ratio, shuffling)
# is implemented in functions.split_train_test and is not visible in this notebook.
split_parts = f.split_train_test(df, target_col='target')
X_train, X_test, y_train, y_test = split_parts

Here I train a baseline XGBoost model and report its metrics on the test set using the train_and_report_xgb function.

In [9]:
# Fit the baseline model on the training split and evaluate it on the held-out test split.
# The helper returns three values, unpacked here as the model, its predictions and its report.
baseline_result = f.train_and_report_xgb(X_train, y_train, X_test, y_test)
model, preds, report = baseline_result

              precision    recall  f1-score   support

           0       0.97      0.98      0.98     11904
           1       0.96      0.92      0.94      3810
           2       0.99      0.99      0.99      4286

    accuracy                           0.97     20000
   macro avg       0.97      0.97      0.97     20000
weighted avg       0.97      0.97      0.97     20000



Next, I define a hyperparameter grid and run an exhaustive grid search over it with 5-fold cross-validation.

In [15]:
# Hyperparameter grid: two candidate values for each of five settings -> 2**5 = 32 combinations.
param = dict(
    n_estimators=[100, 300],
    learning_rate=[0.05, 0.1],
    max_depth=[3, 6],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

# Untuned estimator that GridSearchCV re-fits for every candidate.
# NOTE(review): no random_state is passed here; consider pinning one so reruns are comparable.
XG = xgb.XGBClassifier()

# Exhaustive search scored by accuracy with 5-fold cross-validation (32 candidates x 5 folds),
# using all available cores and per-fit progress messages.
grid_search = GridSearchCV(XG, param, scoring='accuracy', cv=5, n_jobs=-1, verbose=2)

# Kept as the last expression so the fitted search object is displayed as the cell output.
grid_search.fit(X_train, y_train)



Fitting 5 folds for each of 32 candidates, totalling 160 fits


Here I take the best-performing model found by the grid search and print its classification report on the test set.

In [17]:
# Take the estimator that GridSearchCV refit with the best parameter combination.
# NOTE: the `*_rf` names are historical; the estimator is an xgb.XGBClassifier, not a random
# forest. The names are kept because other cells refer to `best_rf`.
best_rf = grid_search.best_estimator_
y_pred_best_rf = best_rf.predict(X_test)

# The printed object is a full classification report for the tuned XGBoost model, so label it
# as such, and start the table on its own line so its column headers stay aligned.
print("XGBoost Classification Report (After Tuning):")
print(classification_report(y_test, y_pred_best_rf))

Random Forest Model Accuracy (After Tuning):               precision    recall  f1-score   support

           0       0.97      0.98      0.98     11904
           1       0.96      0.92      0.94      3810
           2       0.99      0.99      0.99      4286

    accuracy                           0.97     20000
   macro avg       0.97      0.97      0.97     20000
weighted avg       0.97      0.97      0.97     20000



Finally, I run 5-fold cross-validation on the training set to check that the tuned model's accuracy is stable across several splits rather than depending on a single one.

In [19]:
# 5-fold cross-validated accuracy of the tuned estimator, computed on the training split.
# cross_val_score fits a fresh clone per fold, so `best_rf` itself is left untouched.
cv_scores_best_rf = cross_val_score(
    estimator=best_rf,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=5,
)
print(f'Cross-Validation Scores (After Tuning): {cv_scores_best_rf}')
print(f'Mean CV Accuracy (After Tuning): {cv_scores_best_rf.mean()}')

Cross-Validation Scores (After Tuning): [0.9746875  0.977625   0.977      0.9764375  0.97449841]
Mean CV Accuracy (After Tuning): 0.9760496812300768
