In [38]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score

In [39]:
# Load datasets
fighter_stats = pd.read_csv("fighter_stats.csv")
large_dataset = pd.read_csv("large_dataset.csv")

In [40]:
# Data Preprocessing
# Drop rows with missing 'name' in fighter_stats
fighter_stats = fighter_stats.dropna(subset=["name"])

# Fill missing numeric columns with mean, categorical with mode
numeric_columns_fighter_stats = fighter_stats.select_dtypes(include=[np.number]).columns
fighter_stats[numeric_columns_fighter_stats] = fighter_stats[numeric_columns_fighter_stats].fillna(
    fighter_stats[numeric_columns_fighter_stats].mean()
)

numeric_columns_large_dataset = large_dataset.select_dtypes(include=[np.number]).columns
categorical_columns_large_dataset = large_dataset.select_dtypes(exclude=[np.number]).columns
large_dataset[numeric_columns_large_dataset] = large_dataset[numeric_columns_large_dataset].fillna(
    large_dataset[numeric_columns_large_dataset].mean()
)
for column in categorical_columns_large_dataset:
    large_dataset[column] = large_dataset[column].fillna(large_dataset[column].mode()[0])

In [41]:
# Encode categorical features (e.g., stances)
stance_map = {stance: idx for idx, stance in enumerate(large_dataset["r_stance"].unique(), start=1)}
large_dataset["r_stance"] = large_dataset["r_stance"].map(stance_map)
large_dataset["b_stance"] = large_dataset["b_stance"].map(stance_map)

In [42]:
# Define features and labels for evaluation
feature_sets = {
    "selected_features": [
        "height_diff", "weight_diff", "reach_diff",
        "r_stance", "b_stance"
    ],
    "all_features": [
        "age_diff", "height_diff", "weight_diff", "reach_diff",
        "SLpM_total_diff", "SApM_total_diff", "sig_str_acc_total_diff",
        "td_acc_total_diff", "str_def_total_diff", "td_def_total_diff",
        "sub_avg_diff", "td_avg_diff"
    ]
}

# Define the target variable
y = large_dataset["winner"].apply(lambda x: 1 if x == "Red" else 0)

In [43]:
# Define models
models = {
    "RandomForest": RandomForestClassifier(n_estimators=50, random_state=42),
    "LogisticRegression": LogisticRegression(max_iter=50, random_state=42),
    "SVM": SVC(max_iter=50, random_state=42),
    "XGBoost": XGBClassifier(n_estimators=50, use_label_encoder=False, eval_metric="logloss", random_state=42)
}

In [44]:
# Define split ratios
split_ratios = [(0.8, 0.1, 0.1), (0.6, 0.3, 0.1), (0.6, 0.2, 0.2)]

In [45]:
# Define mapping for standardizing weight classes
weight_class_mapping = {
    "UFC Women's Flyweight Title": "Women's Flyweight",
    "UFC Women's Strawweight Title": "Women's Strawweight",
    "UFC Women's Bantamweight Title": "Women's Bantamweight",
    "UFC Flyweight Title": "Flyweight",
    "UFC Bantamweight Title": "Bantamweight",
    "UFC Featherweight Title": "Featherweight",
    "UFC Lightweight Title": "Lightweight",
    "UFC Welterweight Title": "Welterweight",
    "UFC Middleweight Title": "Middleweight",
    "UFC Light Heavyweight Title": "Light Heavyweight",
    "UFC Heavyweight Title": "Heavyweight"
}

# Standardize weight_class column
large_dataset["weight_class"] = large_dataset["weight_class"].replace(weight_class_mapping)

In [46]:
# Group by weight class
weight_classes = large_dataset["weight_class"].unique()
results = []

# Loop through each weight class
for weight_class in weight_classes:
    subset = large_dataset[large_dataset["weight_class"] == weight_class]
    if len(subset) < 10:  # Skip classes with fewer than 10 fights
        continue

    y = subset["winner"].apply(lambda x: 1 if x == "Red" else 0)
    for feature_set_name, features in feature_sets.items():
        X = subset[features].dropna()
        for train_ratio, valid_ratio, test_ratio in split_ratios:
            # Calculate split indices
            total_len = len(X)
            train_end = int(total_len * train_ratio)
            valid_end = train_end + int(total_len * valid_ratio)

            # Split the data
            X_train = X.iloc[:train_end]
            y_train = y.iloc[:train_end]

            X_valid = X.iloc[train_end:valid_end]
            y_valid = y.iloc[train_end:valid_end]

            X_test = X.iloc[valid_end:]
            y_test = y.iloc[valid_end:]

            # Skip if y_train contains only one class
            if len(y_train.unique()) < 2:
                print(f"Skipping weight class '{weight_class}' due to single-class training data.")
                continue

            # Train models and evaluate
            for model_name, model in models.items():
                model.fit(X_train, y_train)
                y_pred_test = model.predict(X_test)
                y_pred_valid = model.predict(X_valid)
                test_accuracy = accuracy_score(y_test, y_pred_test)
                test_f1 = f1_score(y_test, y_pred_test)
                valid_accuracy = accuracy_score(y_valid, y_pred_valid)
                valid_f1 = f1_score(y_valid, y_pred_valid)

                results.append({
                    "Weight Class": weight_class,
                    "Feature Set": feature_set_name,
                    "Model": model_name,
                    "Split Ratio": f"{int(train_ratio*10)}:{int(valid_ratio*10)}:{int(test_ratio*10)}",
                    "Validation Accuracy": valid_accuracy,
                    "Validation F1": valid_f1,
                    "Test Accuracy": test_accuracy,
                    "Test F1": test_f1
                })

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Parameters: { "use_label_encoder" } are not used.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Parameters: { "use_label_encoder" } are not us

Skipping weight class 'Open Weight' due to single-class training data.
Skipping weight class 'Open Weight' due to single-class training data.
Skipping weight class 'Open Weight' due to single-class training data.
Skipping weight class 'Open Weight' due to single-class training data.
Skipping weight class 'Open Weight' due to single-class training data.
Skipping weight class 'Open Weight' due to single-class training data.


In [47]:
# Convert results to DataFrame and display
results_df = pd.DataFrame(results)
# Set pandas options to display all rows and columns
pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns

# Print the entire DataFrame
print(results_df)

              Weight Class        Feature Set               Model Split Ratio  \
0        Women's Flyweight  selected_features        RandomForest       8:1:1   
1        Women's Flyweight  selected_features  LogisticRegression       8:1:1   
2        Women's Flyweight  selected_features                 SVM       8:1:1   
3        Women's Flyweight  selected_features             XGBoost       8:1:1   
4        Women's Flyweight  selected_features        RandomForest       6:3:1   
5        Women's Flyweight  selected_features  LogisticRegression       6:3:1   
6        Women's Flyweight  selected_features                 SVM       6:3:1   
7        Women's Flyweight  selected_features             XGBoost       6:3:1   
8        Women's Flyweight  selected_features        RandomForest       6:2:2   
9        Women's Flyweight  selected_features  LogisticRegression       6:2:2   
10       Women's Flyweight  selected_features                 SVM       6:2:2   
11       Women's Flyweight  