In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score

In [2]:
# Set random seed for reproducibility
SEED = 111
np.random.seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)

In [3]:
# Load the dataset
df = pd.read_csv("large_dataset.csv")

In [5]:
# Use specific columns for features
X = df[["age_diff", "kd_diff", "height_diff", "weight_diff", "reach_diff"]].values
y = df["winner"].values

In [6]:
# Train/Validation/Test split (8:1:1 ratio)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=SEED)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=SEED)

print(f"Train size: {len(y_train)}, Validation size: {len(y_val)}, Test size: {len(y_test)}")

Train size: 5951, Validation size: 744, Test size: 744


In [7]:
# Initialize models
models = {
    "SVM": SVC(kernel="linear", random_state=SEED),
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=SEED),
    "XGBoost": XGBClassifier(n_estimators=200, random_state=SEED),
    "LogisticRegression": LogisticRegression(random_state=SEED)
}

In [8]:
# Train and evaluate models on validation set
for name, model in models.items():
    model.fit(X_train, y_train)
    y_val_pred = model.predict(X_val)
    f1 = f1_score(y_val, y_val_pred)
    print(f"{name} Validation F1-Score: {f1:.4f}")
    print(f"Classification Report for {name}:\n{classification_report(y_val, y_val_pred)}")

ValueError: Input X contains NaN.
SVC does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values