In [159]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, log_loss, roc_auc_score, matthews_corrcoef, cohen_kappa_score
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
import lightgbm as lgb
from skopt import BayesSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

In [160]:
# Load the data
data = pd.read_csv('Data1.csv')

In [161]:
# Separate features and target
X = data.drop(columns=['Rated_(T/F)', 'Winner', 'Game_Status'])
y = data['Winner']

In [162]:
# Preprocessing: Scaling the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [163]:
# PCA for dimensionality reduction
pca = PCA(n_components=15)  # Reducing to 15 components
X_pca = pca.fit_transform(X_scaled)

In [164]:
# Print explained variance
print("Explained Variance Ratio:", sum(pca.explained_variance_ratio_)*100)

Explained Variance Ratio: 95.49578201410569


In [165]:
# Split the data (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.20, random_state=42)

In [166]:
# SMOTE (Synthetic Minority Over-sampling Technique) to handle class imbalance
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [167]:
# 1. **RandomForestClassifier with Hyperparameter Tuning**
# Hyperparameter search space for RandomForest
search_space_rf = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [5, 10, 20, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5]
}

In [168]:
# Initialize Stratified K-Fold Cross-Validation
stratified_kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

In [169]:
# RandomForest model with BayesSearchCV for hyperparameter tuning
grid_search_rf = BayesSearchCV(
    estimator=RandomForestClassifier(class_weight='balanced', random_state=42, n_jobs=-1),
    search_spaces=search_space_rf,
    n_iter=20,  # Number of iterations for search
    cv=stratified_kfold,
    scoring="accuracy",
    n_jobs=-1,
    verbose=1
)
grid_search_rf.fit(X_resampled, y_resampled)
y_pred_rf = grid_search_rf.predict(X_test)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits


In [170]:
# 2. **LightGBM Classifier**
# Label encoding
label_mapping = {'black': 1, 'draw': 0, 'white': 2}
y_train_encoded = np.array([label_mapping.get(label, -1) for label in y_train])
y_test_encoded = np.array([label_mapping.get(label, -1) for label in y_test])

In [171]:
# Ensure no undefined labels
assert np.all(y_train_encoded != -1), "Some labels in y_train are undefined"
assert np.all(y_test_encoded != -1), "Some labels in y_test are undefined"

In [172]:
# Resample the training data using SMOTE
X_train_oversampled, y_train_oversampled = smote.fit_resample(X_train, y_train_encoded)

In [173]:
# LightGBM model parameters
params_lgb = {
    'objective': 'multiclass',
    'num_class': 3,
    'max_depth': 7,
    'learning_rate': 0.1,
    'num_leaves': 31,
    'metric': 'multi_logloss',
    'scale_pos_weight': 5,  # Adjusting for class imbalance
    'boosting_type': 'gbdt',
    'max_bin': 255
}

In [174]:
# Create LightGBM dataset and train
train_data = lgb.Dataset(X_train_oversampled, label=y_train_oversampled)
test_data = lgb.Dataset(X_test, label=y_test_encoded, reference=train_data)

In [175]:
model_lgb = lgb.train(params_lgb, train_data, num_boost_round=150, valid_sets=[test_data])
preds_lgb = model_lgb.predict(X_test, num_iteration=model_lgb.best_iteration)
pred_labels_lgb = np.argmax(preds_lgb, axis=1)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004511 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3825
[LightGBM] [Info] Number of data points in the train set: 10524, number of used features: 15
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


In [176]:
# 3. **Support Vector Machine (SVM)**
svm_model = SVC(kernel='rbf', class_weight='balanced', probability=True)  # Handling imbalance
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)

In [177]:
# 4. **XGBoost Classifier**
# Encode labels for XGBoost
y_train_encoded_xgb = np.array([label_mapping.get(label, -1) for label in y_train])
y_test_encoded_xgb = np.array([label_mapping.get(label, -1) for label in y_test])

In [178]:
# Ensure no undefined labels
assert np.all(y_train_encoded_xgb != -1), "Some labels in y_train are undefined"
assert np.all(y_test_encoded_xgb != -1), "Some labels in y_test are undefined"

In [179]:
# Calculate scale_pos_weight for XGBoost (for class imbalance)
scale_pos_weight = (np.sum(y_train_encoded_xgb == 1) + np.sum(y_train_encoded_xgb == 2)) / np.sum(y_train_encoded_xgb == 0)

In [None]:
# XGBoost parameters
params_xgb = {
    'objective': 'multi:softmax',  # Multi-class classification
    'num_class': 3,                # 3 classes: draw, black, white
    'max_depth': 5,
    'learning_rate': 0.1,
    'scale_pos_weight': scale_pos_weight
}

In [181]:
# Train XGBoost model
model_xgb = xgb.train(params=params_xgb, 
                      dtrain=xgb.DMatrix(X_train, label=y_train_encoded_xgb), 
                      num_boost_round=150)

Parameters: { "scale_pos_weight" } are not used.



In [182]:
# Make predictions with XGBoost
X_test_df = pd.DataFrame(X_test)
# Convert X_train to a DataFrame (if not already done)
X_train_df = pd.DataFrame(X_train)
X_test_df = X_test_df[X_train_df.columns]
xgb_test = xgb.DMatrix(X_test_df)
preds_xgb = model_xgb.predict(xgb_test)

In [183]:
# Evaluation metrics
models = {
    "Random Forest": grid_search_rf,
    "LightGBM": model_lgb,
    "SVM": svm_model,
    "XGBoost": model_xgb
}

In [190]:
def evaluate_model(model, model_name, X_test, y_test_encoded, X_test_df=None):
    # Predictions for different models
    if model_name == "LightGBM":
        # Get predictions and pick the class with the highest probability
        y_pred = np.argmax(model.predict(X_test, num_iteration=model.best_iteration), axis=1)
        y_pred_prob = model.predict(X_test, num_iteration=model.best_iteration)  # Probabilities for LightGBM
        
    elif model_name == "XGBoost":
        # XGBoost: Use model.predict() for probabilities (multi-class classification)
        xgb_test = xgb.DMatrix(X_test_df)
        y_pred_prob = model.predict(xgb_test, output_margin=False)  # Probabilities for each class
        
        # Check the shape of y_pred_prob to confirm it's a 2D array
        print(f"y_pred_prob shape: {y_pred_prob.shape}")
        
        # If y_pred_prob is 1D (only class labels), adjust accordingly
        if len(y_pred_prob.shape) == 1:  # 1D array (predicted labels)
            y_pred = y_pred_prob  # Use labels directly
            print("y_pred_prob is 1D. Using labels directly.")
        else:  # 2D array (probabilities for each class)
            print(f"Sum of probabilities for each sample: {np.sum(y_pred_prob, axis=1)}")  # Should sum to 1
            y_pred = np.argmax(y_pred_prob, axis=1)  # Convert probabilities to class labels
            print("y_pred_prob is 2D. Using np.argmax for class labels.")
                 
    elif model_name == "SVM":
        # SVM: Get probabilities for each class
        y_pred_prob = model.predict_proba(X_test)  # Get class probabilities
        y_pred = np.argmax(y_pred_prob, axis=1)  # Predicted class labels (choose class with highest probability)
        
    else:
        y_pred = model.predict(X_test)
        y_pred_prob = model.predict_proba(X_test)  # Probabilities for other models
    
    # If predictions are in string format, convert them to numeric labels using the label_encoder
    if isinstance(y_pred[0], str):
        label_encoder = LabelEncoder()
        y_pred = label_encoder.fit_transform(y_pred)
    
    print("y_pred_prob shape:", y_pred_prob.shape)
    print("Sum of probabilities for each sample:", np.sum(y_pred_prob, axis=1))  # This should be close to 1

    # Accuracy
    accuracy = accuracy_score(y_test_encoded, y_pred)
    print(f"\n{model_name} Accuracy: {accuracy * 100:.2f}%")

    # Classification Report
    print(f"\n{model_name} Classification Report:\n{classification_report(y_test_encoded, y_pred)}")

    # Confusion Matrix
    cm = confusion_matrix(y_test_encoded, y_pred)
    print(f"\n{model_name} Confusion Matrix:\n{cm}")

    # Log Loss (for all models, using predicted probabilities)
    log_loss_value = log_loss(y_test_encoded, y_pred_prob)
    print(f"\n{model_name} Log Loss: {log_loss_value:.4f}")

    # ROC-AUC (for all models, using predicted probabilities)
    roc_auc = roc_auc_score(y_test_encoded, y_pred_prob, multi_class='ovr')
    print(f"\n{model_name} ROC-AUC: {roc_auc:.4f}")

    # Matthews Correlation Coefficient (MCC)
    mcc_value = matthews_corrcoef(y_test_encoded, y_pred)
    print(f"\n{model_name} Matthews Correlation Coefficient (MCC): {mcc_value:.4f}")

    # Cohen's Kappa Score
    kappa_value = cohen_kappa_score(y_test_encoded, y_pred)
    print(f"\n{model_name} Cohen's Kappa Score: {kappa_value:.4f}")


for model_name, model in models.items():
    evaluate_model(model, model_name, X_test, y_test_encoded, X_test_df)

y_pred_prob shape: (1745, 3)
Sum of probabilities for each sample: [1. 1. 1. ... 1. 1. 1.]

Random Forest Accuracy: 37.36%

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.03      0.28      0.05        71
           1       0.48      0.09      0.15       803
           2       0.64      0.65      0.64       871

    accuracy                           0.37      1745
   macro avg       0.38      0.34      0.28      1745
weighted avg       0.54      0.37      0.39      1745


Random Forest Confusion Matrix:
[[ 20  22  29]
 [448  70 285]
 [254  55 562]]

Random Forest Log Loss: 1.3904

Random Forest ROC-AUC: 0.5304

Random Forest Matthews Correlation Coefficient (MCC): 0.1219

Random Forest Cohen's Kappa Score: 0.0972
y_pred_prob shape: (1745, 3)
Sum of probabilities for each sample: [1. 1. 1. ... 1. 1. 1.]

LightGBM Accuracy: 57.02%

LightGBM Classification Report:
              precision    recall  f1-score   support

     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


AxisError: axis 1 is out of bounds for array of dimension 1