In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score

In [2]:
# Reproducibility: a single seed shared by NumPy's global RNG and by every
# estimator that accepts `random_state`.
SEED = 111
# NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so assigning it
# here does not change hashing in the already-running kernel; it is only
# inherited by processes spawned afterwards.
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)

In [3]:
# Read the raw data into a DataFrame. The path is relative to the
# notebook's working directory.
DATA_PATH = "large_dataset.csv"
df = pd.read_csv(DATA_PATH)

In [10]:
# Per-feature overview of the raw data: number of rows in each feature column
# and how many of those rows are null.
feature_cols = ["age_diff", "kd_diff", "height_diff", "weight_diff", "reach_diff"]
column_stats = pd.DataFrame({
    "Column": feature_cols,
    "Total Values": [len(df[col]) for col in feature_cols],
    "Missing Values": [df[col].isna().sum() for col in feature_cols],
})

print(column_stats)

        Column  Total Values  Missing Values
0     age_diff          7439             213
1      kd_diff          7439               0
2  height_diff          7439               0
3  weight_diff          7439               0
4   reach_diff          7439            1038


In [11]:
# Mean-impute the two columns that contain nulls ('age_diff', 'reach_diff').
# NOTE(review): each mean is taken over every row of `df`, and the
# train/validation/test split only happens in a later cell, so held-out rows
# contribute to the fill value. Consider computing the means on the training
# split only.
for col in ("age_diff", "reach_diff"):
    col_mean = df[col].mean()
    df[col] = df[col].fillna(col_mean)

In [12]:
# Repeat the row-count / null-count summary for the feature columns so the
# effect of the mean fill on 'age_diff' and 'reach_diff' is visible.
stats_rows = []
for col_name in ["age_diff", "kd_diff", "height_diff", "weight_diff", "reach_diff"]:
    column_values = df[col_name]
    stats_rows.append({
        "Column": col_name,
        "Total Values": column_values.shape[0],
        "Missing Values": column_values.isnull().sum(),
    })
column_stats = pd.DataFrame(stats_rows)

print(column_stats)

        Column  Total Values  Missing Values
0     age_diff          7439               0
1      kd_diff          7439               0
2  height_diff          7439               0
3  weight_diff          7439               0
4   reach_diff          7439               0


In [13]:
# Feature matrix: the five fighter-difference columns, in a fixed order.
# Target: the 'winner' column. Both are pulled out via `.values`.
feature_names = ["age_diff", "kd_diff", "height_diff", "weight_diff", "reach_diff"]
X = df.loc[:, feature_names].values
y = df.loc[:, "winner"].values

In [14]:
# Two-stage split giving 80% train / 10% validation / 10% test:
#   1) hold out 20% of the rows as a temporary set,
#   2) cut that temporary set in half for validation and test.
# Both stages reuse SEED so the partition is repeatable.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=SEED
)

print(f"Train size: {len(y_train)}, Validation size: {len(y_val)}, Test size: {len(y_test)}")

Train size: 5951, Validation size: 744, Test size: 744


In [15]:
# Candidate classifiers, keyed by the display name used in the reports.
# Every estimator receives the shared SEED; iteration order follows the
# order of the keyword arguments below.
models = dict(
    SVM=SVC(kernel="linear", random_state=SEED),
    RandomForest=RandomForestClassifier(n_estimators=100, random_state=SEED),
    XGBoost=XGBClassifier(n_estimators=200, random_state=SEED),
    LogisticRegression=LogisticRegression(random_state=SEED),
)

In [17]:
# Map 'Red' and 'Blue' to 1 and 0 respectively.
#
# Two safeguards over a plain `Series.map({'Red': 1, 'Blue': 0})`:
#   * labels that are already 0/1 pass through unchanged, so running this cell
#     a second time no longer turns every label into NaN;
#   * any other label raises immediately instead of becoming NaN silently.
def _encode_winner(labels):
    """Encode winner labels as an int64 NumPy array ('Red' -> 1, 'Blue' -> 0).

    Parameters
    ----------
    labels : array-like
        Winner labels; each entry must be 'Red', 'Blue', 1 or 0.

    Returns
    -------
    numpy.ndarray
        int64 array with the same length as `labels`.

    Raises
    ------
    ValueError
        If any entry is not one of the accepted labels (including missing values).
    """
    codes = {'Red': 1, 'Blue': 0, 1: 1, 0: 0}
    raw = pd.Series(labels)
    # dict.get yields None for anything outside `codes`; those show up as nulls.
    encoded = raw.map(lambda label: codes.get(label))
    unmapped = encoded.isnull()
    if unmapped.any():
        unexpected = sorted({repr(value) for value in raw[unmapped]})
        raise ValueError(f"Unexpected winner label(s): {', '.join(unexpected)}")
    return encoded.astype("int64").values


y_train = _encode_winner(y_train)
y_val = _encode_winner(y_val)
y_test = _encode_winner(y_test)

In [18]:
# Fit every candidate on the training split, then score it on the validation
# split: F1 (default positive class) plus the full per-class report.
# Fitting happens in place, so the estimators stored in `models` are trained
# once this loop finishes.
for name, model in models.items():
    model.fit(X_train, y_train)
    y_val_pred = model.predict(X_val)

    f1 = f1_score(y_val, y_val_pred)
    val_report = classification_report(y_val, y_val_pred)

    print(f"{name} Validation F1-Score: {f1:.4f}")
    print(f"Classification Report for {name}:\n{val_report}")

SVM Validation F1-Score: 0.8260
Classification Report for SVM:
              precision    recall  f1-score   support

           0       0.80      0.35      0.49       263
           1       0.73      0.95      0.83       481

    accuracy                           0.74       744
   macro avg       0.77      0.65      0.66       744
weighted avg       0.75      0.74      0.71       744

RandomForest Validation F1-Score: 0.7761
Classification Report for RandomForest:
              precision    recall  f1-score   support

           0       0.59      0.53      0.56       263
           1       0.76      0.80      0.78       481

    accuracy                           0.70       744
   macro avg       0.67      0.66      0.67       744
weighted avg       0.70      0.70      0.70       744

XGBoost Validation F1-Score: 0.7793
Classification Report for XGBoost:
              precision    recall  f1-score   support

           0       0.59      0.45      0.51       263
           1       0.7

In [19]:
# Score the estimators held in `models` on the test split. No fitting happens
# here, so this cell depends on the validation cell having trained them.
print("\n--- Test Set Performance ---")
for name, model in models.items():
    y_test_pred = model.predict(X_test)

    f1 = f1_score(y_test, y_test_pred)
    test_report = classification_report(y_test, y_test_pred)

    print(f"{name} Test F1-Score: {f1:.4f}")
    print(f"Classification Report for {name}:\n{test_report}")


--- Test Set Performance ---
SVM Test F1-Score: 0.8219
Classification Report for SVM:
              precision    recall  f1-score   support

           0       0.72      0.35      0.47       252
           1       0.74      0.93      0.82       492

    accuracy                           0.73       744
   macro avg       0.73      0.64      0.65       744
weighted avg       0.73      0.73      0.70       744

RandomForest Test F1-Score: 0.7698
Classification Report for RandomForest:
              precision    recall  f1-score   support

           0       0.54      0.49      0.52       252
           1       0.75      0.79      0.77       492

    accuracy                           0.69       744
   macro avg       0.65      0.64      0.64       744
weighted avg       0.68      0.69      0.68       744

XGBoost Test F1-Score: 0.7950
Classification Report for XGBoost:
              precision    recall  f1-score   support

           0       0.60      0.44      0.51       252
          