In [21]:
import pandas as pd

# Load the raw fighter profiles CSV.
fighters_profiles = pd.read_csv("../data/raw/fighter_profiles_raw.csv")

# Only the last expression of a cell is rendered automatically, so the preview
# and the shape must be displayed explicitly; as a bare mid-cell expression
# their values were computed and then silently discarded.
display(fighters_profiles.head())
display(fighters_profiles.shape)

# Fraction of missing values per column, most complete columns first.
# NOTE(review): the recorded output reports str_def as 100% missing -- confirm
# the source data before relying on that column.
fighters_profiles.isna().mean().sort_values()


fighter_url           0.000
fighter_name          0.000
dob                   0.000
slpm                  0.000
sapm                  0.000
str_acc               0.000
td_avg                0.000
td_acc                0.000
td_def                0.000
sub_avg               0.000
fighter_name_index    0.000
height_in             0.002
reach_in              0.002
stance                0.002
str_def               1.000
dtype: float64

In [1]:
import pandas as pd
import numpy as np

# Raw fight-level table; every later cell derives its data from `df`.
fights_csv = "../data/raw/ufc_fights_raw.csv"
df = pd.read_csv(fights_csv)

# Preview the first five rows.
# NOTE: in the recorded preview these rows have no winner or stats filled in.
df.head(5)


Unnamed: 0,event_url,fight_url,red_fighter,blue_fighter,weight_class,winner,red_kd,blue_kd,red_sig_str_landed,red_sig_str_attempted,blue_sig_str_landed,blue_sig_str_attempted,red_td_landed,red_td_attempted,blue_td_landed,blue_td_attempted
0,http://ufcstats.com/event-details/bd92cf5da541...,http://ufcstats.com/fight-details/4a0db214d972...,Merab Dvalishvili,Petr Yan,Bantamweight,,,,,,,,,,,
1,http://ufcstats.com/event-details/bd92cf5da541...,http://ufcstats.com/fight-details/dfa692db6d39...,Alexandre Pantoja,Joshua Van,Flyweight,,,,,,,,,,,
2,http://ufcstats.com/event-details/bd92cf5da541...,http://ufcstats.com/fight-details/fbbb9e72900b...,Brandon Moreno,Tatsuro Taira,Flyweight,,,,,,,,,,,
3,http://ufcstats.com/event-details/bd92cf5da541...,http://ufcstats.com/fight-details/1dc29f4c6fcd...,Henry Cejudo,Payton Talbott,Bantamweight,,,,,,,,,,,
4,http://ufcstats.com/event-details/bd92cf5da541...,http://ufcstats.com/fight-details/6d6ab10cbaa4...,Jan Blachowicz,Bogdan Guskov,Light Heavyweight,,,,,,,,,,,


In [2]:
# A fight is usable only if the outcome and every per-corner stat are present.
required_cols = [
    "winner",
    "red_kd", "blue_kd",
    "red_sig_str_landed", "red_sig_str_attempted",
    "blue_sig_str_landed", "blue_sig_str_attempted",
    "red_td_landed", "red_td_attempted",
    "blue_td_landed", "blue_td_attempted",
]
has_all_stats = df[required_cols].notna().all(axis=1)

# Keep only standard Red/Blue outcomes (any other winner label is excluded).
has_standard_winner = df["winner"].isin(["Red", "Blue"])

# Independent copy so later column assignments never touch `df`.
df_clean = df.loc[has_all_stats & has_standard_winner].copy()

# Binary target: 1 when the red corner won, 0 when the blue corner won.
df_clean["target"] = df_clean["winner"].eq("Red").astype(int)

df_clean.shape


(294, 17)

In [3]:
# Core red-minus-blue differentials: positive values favour the red corner.
df_clean = df_clean.assign(
    kd_diff=df_clean["red_kd"].sub(df_clean["blue_kd"]),
    sig_str_diff=df_clean["red_sig_str_landed"].sub(df_clean["blue_sig_str_landed"]),
    td_diff=df_clean["red_td_landed"].sub(df_clean["blue_td_landed"]),
)

# Preview the new features next to the target.
core_preview_cols = ["kd_diff", "sig_str_diff", "td_diff", "target"]
df_clean[core_preview_cols].head()


Unnamed: 0,kd_diff,sig_str_diff,td_diff,target
14,0.0,14.0,1.0,1
15,0.0,-3.0,0.0,0
16,1.0,11.0,0.0,1
17,-1.0,4.0,0.0,0
18,2.0,4.0,0.0,1


In [4]:
def _safe_ratio(landed, attempted):
    """Return landed / attempted element-wise, using 0.0 where attempted is not > 0.

    The division is evaluated for every row and then masked, so rows with zero
    attempts end up as 0.0 instead of inf/NaN.
    """
    return np.where(attempted > 0, landed / attempted, 0.0)


# Per-corner accuracies (landed / attempted; 0.0 when nothing was attempted).
df_clean["sig_str_acc_red"] = _safe_ratio(
    df_clean["red_sig_str_landed"], df_clean["red_sig_str_attempted"]
)
df_clean["sig_str_acc_blue"] = _safe_ratio(
    df_clean["blue_sig_str_landed"], df_clean["blue_sig_str_attempted"]
)
df_clean["td_acc_red"] = _safe_ratio(
    df_clean["red_td_landed"], df_clean["red_td_attempted"]
)
df_clean["td_acc_blue"] = _safe_ratio(
    df_clean["blue_td_landed"], df_clean["blue_td_attempted"]
)

# Red-minus-blue differentials for attempts ...
df_clean["sig_str_attempt_diff"] = df_clean["red_sig_str_attempted"].sub(
    df_clean["blue_sig_str_attempted"]
)
df_clean["td_attempt_diff"] = df_clean["red_td_attempted"].sub(
    df_clean["blue_td_attempted"]
)

# ... and for the accuracies computed above.
df_clean["sig_str_acc_diff"] = df_clean["sig_str_acc_red"].sub(df_clean["sig_str_acc_blue"])
df_clean["td_acc_diff"] = df_clean["td_acc_red"].sub(df_clean["td_acc_blue"])

# Preview every differential feature next to the target.
diff_preview_cols = [
    "kd_diff", "sig_str_diff", "td_diff",
    "sig_str_attempt_diff", "td_attempt_diff",
    "sig_str_acc_diff", "td_acc_diff", "target"
]
df_clean[diff_preview_cols].head()


Unnamed: 0,kd_diff,sig_str_diff,td_diff,sig_str_attempt_diff,td_attempt_diff,sig_str_acc_diff,td_acc_diff,target
14,0.0,14.0,1.0,10.0,3.0,0.383333,0.333333,1
15,0.0,-3.0,0.0,4.0,1.0,-0.088381,0.0,0
16,1.0,11.0,0.0,28.0,0.0,0.085714,0.0,1
17,-1.0,4.0,0.0,5.0,0.0,0.045455,0.0,0
18,2.0,4.0,0.0,6.0,0.0,0.07619,0.0,1


In [5]:
# Model inputs: the red-minus-blue differential features built in earlier cells.
feature_cols = [
    "kd_diff",
    "sig_str_diff",
    "td_diff",
    "sig_str_attempt_diff",
    "td_attempt_diff",
    "sig_str_acc_diff",
    "td_acc_diff",
]

# Sanity check: make sure no NaNs in features.
# The assertion makes a "Run All" stop here instead of relying on someone
# reading the displayed counts.
nan_counts = df_clean[feature_cols].isna().sum()
assert nan_counts.sum() == 0, (
    f"NaNs present in feature columns:\n{nan_counts[nan_counts > 0]}"
)

# Still show the per-column counts as the cell output.
nan_counts


kd_diff                 0
sig_str_diff            0
td_diff                 0
sig_str_attempt_diff    0
td_attempt_diff         0
sig_str_acc_diff        0
td_acc_diff             0
dtype: int64

In [6]:
from sklearn.model_selection import train_test_split

# Feature matrix and binary target.
X, y = df_clean[feature_cols], df_clean["target"]

# 80/20 hold-out split with a fixed seed; stratifying on y keeps the class
# balance similar in both parts.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# NOTE: per the recorded output the test part has only 59 rows, so the metrics
# reported below are coarse.
X_train.shape, X_test.shape


((235, 7), (59, 7))

In [7]:
# Logistic Regression Model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Fit on the training part; max_iter is set explicitly to 1000.
log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Score on the held-out part.
y_pred_lr = log_reg.predict(X_test)
lr_accuracy = accuracy_score(y_test, y_pred_lr)
lr_report = classification_report(y_test, y_pred_lr)

print("Logistic Regression accuracy:", lr_accuracy)
print("\nClassification report:\n")
print(lr_report)


Logistic Regression accuracy: 0.8305084745762712

Classification report:

              precision    recall  f1-score   support

           0       0.83      0.76      0.79        25
           1       0.83      0.88      0.86        34

    accuracy                           0.83        59
   macro avg       0.83      0.82      0.82        59
weighted avg       0.83      0.83      0.83        59



In [8]:
# Random Forest Model
from sklearn.ensemble import RandomForestClassifier

# 300 trees with a fixed seed so repeated runs build the same forest.
rf = RandomForestClassifier(n_estimators=300, random_state=42)
rf.fit(X_train, y_train)

# Score on the held-out part.
y_pred_rf = rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)

print("RandomForest accuracy:", rf_accuracy)
print("\nClassification report:\n")
print(rf_report)


RandomForest accuracy: 0.8135593220338984

Classification report:

              precision    recall  f1-score   support

           0       0.77      0.80      0.78        25
           1       0.85      0.82      0.84        34

    accuracy                           0.81        59
   macro avg       0.81      0.81      0.81        59
weighted avg       0.81      0.81      0.81        59



In [9]:
# Gradient Boosting Model
from sklearn.ensemble import GradientBoostingClassifier

# Library-default hyperparameters apart from the fixed seed.
gb = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)

# Score on the held-out part.
y_pred_gb = gb.predict(X_test)
gb_accuracy = accuracy_score(y_test, y_pred_gb)

print("GradientBoosting accuracy:", gb_accuracy)
# One print with a newline separator emits the same text as two separate prints.
print("\nClassification report:\n", classification_report(y_test, y_pred_gb), sep="\n")


GradientBoosting accuracy: 0.7288135593220338

Classification report:

              precision    recall  f1-score   support

           0       0.68      0.68      0.68        25
           1       0.76      0.76      0.76        34

    accuracy                           0.73        59
   macro avg       0.72      0.72      0.72        59
weighted avg       0.73      0.73      0.73        59



In [10]:
import joblib

from pathlib import Path

# Directory that receives the serialized estimators.
models_dir = Path("../models")

# joblib.dump does not create missing parent directories; create the folder so
# the cell also works on a fresh checkout where ../models does not exist yet.
models_dir.mkdir(parents=True, exist_ok=True)

# Output file name -> fitted estimator from the cells above.
trained_models = {
    "logistic_regression_v1.joblib": log_reg,
    "random_forest_v1.joblib": rf,
    "gradient_boosting_v1.joblib": gb,
}

for model_filename, fitted_model in trained_models.items():
    joblib.dump(fitted_model, models_dir / model_filename)

print("Models saved successfully!")


Models saved successfully!
