# BaggingClassifier

In [1]:
# Widen the notebook: force elements with the .container class to 90% width
from IPython.display import display, HTML, Markdown

wide_container_css = "<style>.container { width:90% !important; }</style>"
display(HTML(wide_container_css))

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
import joblib

In [2]:
# Read the train and test splits from CSV files in the working directory
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# For each split: features are every column except "Score"; "Score" is the target
X_train, y_train = train.drop(columns="Score"), train["Score"]
X_test, y_test = test.drop(columns="Score"), test["Score"]

## Transformations

In [3]:
# Column groups: numeric columns get standardized, categorical columns get one-hot encoded
numeric_features = ["WhiteElo", "EloDif"]
categorical_features = ["Opening_name", "Time_format", "Increment_binary"]

# Each group has its own single-step pipeline
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])
categorical_transformer = Pipeline(
    steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))]
)

# Route each column group through its transformer
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

## Base estimator and bagging classifier

In [4]:
# Hyperparameter grid; the "classifier__" prefix targets the pipeline's "classifier" step
# NOTE(review): bootstrap=False with max_samples=1.0 and max_features=1.0 presumably
# trains every tree on the same data -- confirm this grid point is worth searching
param_grid_bag = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_samples':  [0.6, 1.0],
    'classifier__max_features': [0.6, 1.0],
    'classifier__bootstrap':    [True, False]
}

# Base learner: entropy-criterion decision tree, wrapped in a bagging ensemble
dt = DecisionTreeClassifier(criterion='entropy', random_state=42)
bagging_model = BaggingClassifier(estimator=dt, random_state=42)

# Full pipeline: preprocessing followed by the bagging classifier
pipe_bag = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", bagging_model),
    ]
)

# 3-fold grid search scored on accuracy, using all available cores (n_jobs=-1)
grid_search_bag = GridSearchCV(
    estimator=pipe_bag,
    param_grid=param_grid_bag,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
)
grid_search_bag.fit(X_train, y_train)

# Report the winning parameter combination and its cross-validated score
print("Best Bagging:", grid_search_bag.best_params_)
print("Best CV accuracy: {:.3f}".format(grid_search_bag.best_score_))

Best Bagging: {'classifier__bootstrap': True, 'classifier__max_features': 0.6, 'classifier__max_samples': 0.6, 'classifier__n_estimators': 100}
Best CV accuracy: 0.523


In [5]:
# Evaluate the best pipeline found by the grid search on the held-out test set
y_pred_bag = grid_search_bag.predict(X_test)
print("Test accuracy (Bagging): {:.3f}".format(accuracy_score(y_test, y_pred_bag)))

# zero_division=0: a class that is never predicted has undefined precision;
# report it as 0.0 explicitly instead of emitting an UndefinedMetricWarning per average
print(
    "\nClassification Report:\n",
    classification_report(y_test, y_pred_bag, zero_division=0),
)

Test accuracy (Bagging): 0.517

Classification Report:
               precision    recall  f1-score   support

   Black Win       0.50      0.47      0.49      4524
        Draw       0.00      0.00      0.00       566
   White Win       0.53      0.62      0.57      4910

    accuracy                           0.52     10000
   macro avg       0.34      0.36      0.35     10000
weighted avg       0.49      0.52      0.50     10000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [6]:
# Persist the best pipeline found by the grid search to a joblib file in the working directory
joblib.dump(
    value=grid_search_bag.best_estimator_,
    filename='best_bagging_model.joblib',
)

['best_bagging_model.joblib']

### The Bagging Classifier achieved a test accuracy of 0.517, the lowest of all the models we tested. It never predicted the "Draw" class (precision and recall of 0.00 for that class).