# Voting using Various Models

In [1]:
# Inject a CSS rule that sets the notebook's `.container` element to 90% width,
# so wide outputs (e.g. long estimator reprs) have more room.
from IPython.display import HTML, Markdown, display

display(
    HTML("<style>.container { width:90% !important; }</style>")
)

In [2]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import VotingClassifier
import numpy as np
import joblib
from tensorflow.keras.models import load_model as load_keras_model

In [3]:
# Load the previously tuned estimators from the Models/ directory.
# NOTE(review): joblib.load unpickles arbitrary Python objects, so these files
# must come from a trusted source.

# Boosting / bagging ensembles
ada_boost = joblib.load("Models/best_adaboost.joblib")
best_gradient_boosting = joblib.load("Models/best_gradient_boosting.joblib")
random_forest = joblib.load("Models/best_random_forest.joblib")

# Single decision trees. The misspelled name `desicion_tree` is kept on purpose
# because the ensemble cells further down refer to it.
desicion_tree = joblib.load("Models/best_decision_tree.joblib")
decision_tree_pca = joblib.load("Models/best_decision_tree_pca.joblib")

# Linear models
lr_best_model = joblib.load("Models/best_logistic_regression_model.joblib")
sgd_best_model = joblib.load("Models/best_sgd.joblib")

# Support-vector classifiers
best_svc_linear = joblib.load("Models/best_linear_svc_model.joblib")
best_svc_poly = joblib.load("Models/best_svc_poly.joblib")
best_svc_rbf = joblib.load("Models/best_svc_rbf.joblib")

# Deliberately not loaded: Models/best_mlp_dropout.keras,
# Models/simple_mlp_chess.keras and Models/best_xgboost.joblib.
# NOTE(review): presumably the Keras networks are skipped because they are not
# scikit-learn estimators; the reason for skipping XGBoost is not visible
# here - confirm. If the .keras files are ever re-enabled they need
# load_keras_model(...), not joblib.load(...).

In [4]:
# Majority-vote ensemble over the ten loaded scikit-learn models.
# voting="hard" is the VotingClassifier default; it is spelled out here to
# contrast with the soft-voting ensemble built later in the notebook.
# Per the scikit-learn docs, VotingClassifier.fit refits clones of these
# estimators, so the loaded models contribute their hyperparameters only.
# NOTE(review): the fit output shows "dt" and "dt_pca" with identical
# hyperparameters (criterion='entropy', max_depth=5, random_state=42). Refit on
# the same feature matrix they will likely produce the same tree and therefore
# cast the same vote twice - confirm whether "dt_pca" should be wrapped in a
# PCA pipeline or dropped.
voting_clf = VotingClassifier(
    voting="hard",
    estimators=[
        ("ada", ada_boost),
        ("rf", random_forest),
        ("dt", desicion_tree),
        ("dt_pca", decision_tree_pca),
        ("gb", best_gradient_boosting),
        ("svc_poly", best_svc_poly),
        ("svc_rbf", best_svc_rbf),
        ("lr_best", lr_best_model),
        ("sgd_best", sgd_best_model),
        ("svc_linear", best_svc_linear),
    ],
)

In [5]:
# Read the pre-split train/test CSVs. "Score" is the target column; every
# other column is kept in the feature frame.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

X_train, y_train = train.drop(columns="Score"), train["Score"]
X_test, y_test = test.drop(columns="Score"), test["Score"]

In [6]:
from sklearn.pipeline import Pipeline

In [7]:
# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones. Columns not listed below are dropped, because
# ColumnTransformer defaults to remainder="drop".
numeric_features = ["WhiteElo", "EloDif"]
categorical_features = ["Opening_name", "Time_format", "Increment_binary"]

numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

# handle_unknown="ignore": categories not seen during fit are encoded as
# all-zero rows instead of raising at transform time.
categorical_transformer = Pipeline(
    steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

In [8]:
# Learn scaling statistics and category vocabularies from the training split
# only, then apply the fitted transformer unchanged to the test split.
X_train_transformed = preprocessor.fit_transform(X=X_train)
X_test_transformed = preprocessor.transform(X=X_test)

In [9]:
# Train the hard-voting ensemble on the transformed training data; the fitted
# estimator is the cell's last expression, so its repr is displayed.
voting_clf.fit(X=X_train_transformed, y=y_train)

VotingClassifier(estimators=[('ada',
                              AdaBoostClassifier(n_estimators=250,
                                                 random_state=42)),
                             ('rf',
                              RandomForestClassifier(max_depth=10,
                                                     min_samples_leaf=2,
                                                     n_estimators=200,
                                                     random_state=42)),
                             ('dt',
                              DecisionTreeClassifier(criterion='entropy',
                                                     max_depth=5,
                                                     random_state=42)),
                             ('dt_pca',
                              DecisionTreeClassifier(criterion='entropy',
                                                     max_depth=5,
                                                     random_state=42)),
         

In [10]:
# Majority-vote class predictions for the transformed test split.
y_pred = voting_clf.predict(X=X_test_transformed)

In [11]:
# Score the hard-voting predictions against the held-out test labels.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Voting Classifier Accuracy: {accuracy:.3f}")

Voting Classifier Accuracy: 0.541


In [12]:
import joblib

# Persist the hard-voting ensemble. The relative path resolves against the
# current working directory.
# NOTE(review): the base models are read from Models/, but this file is written
# next to the notebook - confirm that is the intended location.
joblib.dump(value=voting_clf, filename="voting_clf.joblib")

['voting_clf.joblib']

## Soft Voting

In [13]:
from sklearn.calibration import CalibratedClassifierCV

# Wrap the hinge-loss SGD classifier in a sigmoid calibrator so that it can
# supply class probabilities for soft voting.
# Note: with cv=3 the calibrator refits clones of the estimator on
# cross-validation folds; the already-trained instance is used as a template,
# not reused as-is.
sgd_cal = CalibratedClassifierCV(
    sgd_best_model,
    method="sigmoid",
    cv=3,
)

# Fit the calibrated model on the transformed training data.
sgd_cal.fit(X=X_train_transformed, y=y_train)


CalibratedClassifierCV(base_estimator=SGDClassifier(alpha=0.1, random_state=42),
                       cv=3)

In [14]:
# Give each support-vector model a sigmoid calibrator so it can supply class
# probabilities for soft voting.
# Assumption (not checked here): the poly and RBF SVCs were trained with
# probability=False. The fit output shows the linear model is a LinearSVC.
svc_poly_cal, svc_rbf_cal, svc_linear_cal = (
    CalibratedClassifierCV(svc_model, method="sigmoid", cv=3)
    for svc_model in (best_svc_poly, best_svc_rbf, best_svc_linear)
)

# Fit each calibrator on the transformed training data; the last call's return
# value is what the cell displays.
svc_poly_cal.fit(X=X_train_transformed, y=y_train)
svc_rbf_cal.fit(X=X_train_transformed, y=y_train)
svc_linear_cal.fit(X=X_train_transformed, y=y_train)

CalibratedClassifierCV(base_estimator=LinearSVC(C=0.1, dual=False,
                                                random_state=42),
                       cv=3)

In [15]:
# Soft-voting ensemble: averages the class probabilities of the same ten
# models, using the calibrated wrappers for the SGD and SVC estimators.
# `weights` is positional, so the leading 2, 2 apply to "ada" and "rf"; all
# other estimators get weight 1.
# NOTE(review): the notebook does not show the individual scores that justify
# treating AdaBoost and the random forest as the stronger models - confirm.
voting_clf_soft = VotingClassifier(
    voting="soft",
    estimators=[
        ("ada", ada_boost),
        ("rf", random_forest),
        ("dt", desicion_tree),
        ("dt_pca", decision_tree_pca),
        ("gb", best_gradient_boosting),
        ("svc_poly", svc_poly_cal),
        ("svc_rbf", svc_rbf_cal),
        ("lr_best", lr_best_model),
        ("sgd_best", sgd_cal),
        ("svc_linear", svc_linear_cal),
    ],
    weights=[2, 2, 1, 1, 1, 1, 1, 1, 1, 1],
)

In [16]:
# Train the soft-voting ensemble on the transformed training data; the fitted
# estimator is the cell's last expression, so its repr is displayed.
voting_clf_soft.fit(X=X_train_transformed, y=y_train)

VotingClassifier(estimators=[('ada',
                              AdaBoostClassifier(n_estimators=250,
                                                 random_state=42)),
                             ('rf',
                              RandomForestClassifier(max_depth=10,
                                                     min_samples_leaf=2,
                                                     n_estimators=200,
                                                     random_state=42)),
                             ('dt',
                              DecisionTreeClassifier(criterion='entropy',
                                                     max_depth=5,
                                                     random_state=42)),
                             ('dt_pca',
                              DecisionTreeClassifier(criterion='entropy',
                                                     max_depth=5,
                                                     random_state=42)),
         

In [17]:
# Weighted-probability class predictions for the transformed test split.
y_pred_soft = voting_clf_soft.predict(X=X_test_transformed)

In [18]:
# Score the soft-voting predictions against the held-out test labels.
# Note: this rebinds `accuracy`, replacing the hard-voting value.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_soft)
print(f"Soft Voting Classifier Accuracy: {accuracy:.3f}")

Soft Voting Classifier Accuracy: 0.540


### Soft voting scores about the same as hard voting (0.540 vs. 0.541)

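## Conclusion

The voting classifier does not outperform the best individual models. This strongly suggests that the models in the ensemble are not sufficiently diverse and are likely making similar types of errors. For example, the `dt` and `dt_pca` estimators appear with identical hyperparameters in the fit output and are refit on the same features, so they most likely cast the same vote.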