In [59]:
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
import xgboost as xgb

In [24]:
# Load the feature table from a CSV file (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="6_features.csv")

In [26]:
# Separate the label from the predictors: `y` is the 'target' column,
# `X` is every remaining column of `df`.
y = df['target']
X = df.drop(columns='target')

In [30]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [28]:
# Baseline XGBoost classifier: 200 estimators with a learning rate of 0.1;
# every other setting is left at the library default.
XG_B = xgb.XGBClassifier(
    learning_rate=0.1,
    n_estimators=200,
)

In [32]:
# Train the baseline model on the training split and predict the held-out rows
# (`fit` returns the fitted estimator, so the two calls can be chained).
y_hat_XG_B = XG_B.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the test split.
baseline_report = classification_report(y_true=y_test, y_pred=y_hat_XG_B)
print(baseline_report)

              precision    recall  f1-score   support

           0       0.97      0.98      0.98     11904
           1       0.96      0.92      0.94      3810
           2       0.99      0.99      0.99      4286

    accuracy                           0.97     20000
   macro avg       0.97      0.97      0.97     20000
weighted avg       0.97      0.97      0.97     20000



## Grid Search

In [47]:
# Hyper-parameter grid: two candidate values for each of five settings,
# i.e. 2**5 = 32 combinations in total.
param = dict(
    n_estimators=[100, 300],
    learning_rate=[0.05, 0.1],
    max_depth=[3, 6],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

# Estimator created with library defaults only; no grid values are set here.
XG = xgb.XGBClassifier()

In [49]:
# Exhaustive search over every combination in `param`, scored by accuracy
# with 5-fold cross-validation.
# NOTE(review): n_jobs=-1 runs the fits in parallel while XGBoost may also use
# several threads per fit; check for thread contention if the search is slow.
grid_search = GridSearchCV(
    XG,
    param,
    scoring='accuracy',
    cv=5,
    verbose=2,   # log progress while fitting
    n_jobs=-1,   # use all available cores
)

In [51]:
# Run the search on the training split only; the test split stays untouched
# so it can be used for the final evaluation.
grid_search.fit(X=X_train, y=y_train)


Fitting 5 folds for each of 32 candidates, totalling 160 fits


In [68]:
# Show the parameter combination with the best mean cross-validated accuracy.
tuned_params = grid_search.best_params_
print(tuned_params)

{'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 300, 'subsample': 0.8}


In [53]:
# Evaluate the tuned XGBoost model (the estimator GridSearchCV refit with the
# best parameters) on the held-out test split.
best_XG_B = grid_search.best_estimator_
y_pred_best_XG = best_XG_B.predict(X_test)

# The heading names the actual model (XGBoost, not Random Forest) and what is
# printed (a full classification report, not a single accuracy value). It goes
# on its own line so the report's column header stays aligned with its rows.
print("XGBoost Classification Report (After Tuning):")
print(classification_report(y_test, y_pred_best_XG))

Random Forest Model Accuracy (After Tuning):               precision    recall  f1-score   support

           0       0.97      0.98      0.98     11904
           1       0.96      0.92      0.94      3810
           2       0.99      0.99      0.99      4286

    accuracy                           0.97     20000
   macro avg       0.97      0.97      0.97     20000
weighted avg       0.97      0.97      0.97     20000



In [63]:
# Re-score the tuned model with 5-fold cross-validation on the training split.
# NOTE(review): the hyper-parameters were selected on this same data, so these
# scores are likely optimistic and may simply mirror grid_search.best_score_;
# confirm before reporting them as an independent estimate.
cv_scores_best_XG = cross_val_score(
    estimator=best_XG_B,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=5,
)
mean_cv_accuracy = cv_scores_best_XG.mean()

print(f'Cross-Validation Scores (After Tuning): {cv_scores_best_XG}')
print(f'Mean CV Accuracy (After Tuning): {mean_cv_accuracy}')

Cross-Validation Scores (After Tuning): [0.9746875  0.977625   0.977      0.9764375  0.97449841]
Mean CV Accuracy (After Tuning): 0.9760496812300768
