In [30]:
import pandas as pd

# Raw inputs, read in this order:
#   1. fights
#   2. fighter index (name <-> URL)
#   3. fighter profiles (URL -> stats)
fights, fighters_index, fighter_profiles = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/raw/ufc_fights_raw.csv",
        "../data/raw/fighters_index.csv",
        "../data/raw/fighter_profiles_raw.csv",
    )
)

# (rows, columns) of each table as a quick sanity check
tuple(table.shape for table in (fights, fighters_index, fighter_profiles))


((313, 16), (500, 2), (500, 15))

In [31]:
# Lookup Series: fighter name -> fighter URL.
# Only the first row per name is kept, so each name maps to exactly one URL.
unique_names = fighters_index.drop_duplicates(subset="fighter_name", keep="first")
name_to_url = unique_names.set_index("fighter_name")["fighter_url"]

# Add a URL column for each corner by looking up that corner's fighter name
for name_col, url_col in (
    ("red_fighter", "red_fighter_url"),
    ("blue_fighter", "blue_fighter_url"),
):
    fights[url_col] = fights[name_col].map(name_to_url)

fights.loc[:, ["red_fighter", "red_fighter_url", "blue_fighter", "blue_fighter_url"]].head()


Unnamed: 0,red_fighter,red_fighter_url,blue_fighter,blue_fighter_url
0,Merab Dvalishvili,http://ufcstats.com/fighter-details/c03520b5c8...,Petr Yan,http://ufcstats.com/fighter-details/d661ce4da7...
1,Alexandre Pantoja,http://ufcstats.com/fighter-details/a0f0004aad...,Joshua Van,http://ufcstats.com/fighter-details/17e9764940...
2,Brandon Moreno,http://ufcstats.com/fighter-details/792be9a24d...,Tatsuro Taira,http://ufcstats.com/fighter-details/4461d7e473...
3,Henry Cejudo,http://ufcstats.com/fighter-details/056c493bbd...,Payton Talbott,http://ufcstats.com/fighter-details/6e743a33d5...
4,Jan Blachowicz,http://ufcstats.com/fighter-details/99df7d0a2a...,Bogdan Guskov,http://ufcstats.com/fighter-details/ef5dcb10d2...


In [32]:
# Drop the broken str_def column for now.
# errors="ignore" keeps this cell re-runnable: fighter_profiles is rebound here,
# so on a second run the column is already gone and a plain drop raises KeyError.
fighter_profiles = fighter_profiles.drop(columns=["str_def"], errors="ignore")

# Per-corner copies of the profile table. Every column gets the corner prefix,
# so "fighter_url" becomes "red_fighter_url" / "blue_fighter_url" (the merge keys
# on the fights table) and the stat columns become "red_<stat>" / "blue_<stat>".
red_profiles = fighter_profiles.add_prefix("red_")
blue_profiles = fighter_profiles.add_prefix("blue_")

red_profiles.head()



Unnamed: 0,red_fighter_url,red_fighter_name,red_height_in,red_reach_in,red_stance,red_dob,red_slpm,red_sapm,red_str_acc,red_td_avg,red_td_acc,red_td_def,red_sub_avg,red_fighter_name_index
0,http://ufcstats.com/fighter-details/c03520b5c8...,Merab Dvalishvili,66.0,68.0,Orthodox,"Jan 10, 1991",4.33,2.55,42.0,6.4,37.0,82.0,0.3,Merab Dvalishvili
1,http://ufcstats.com/fighter-details/d661ce4da7...,Petr Yan,67.0,67.0,Switch,"Feb 11, 1993",5.12,4.14,54.0,1.58,48.0,85.0,0.1,Petr Yan
2,http://ufcstats.com/fighter-details/a0f0004aad...,Alexandre Pantoja,65.0,67.0,Orthodox,"Apr 16, 1990",4.36,3.88,50.0,2.8,47.0,69.0,1.0,Alexandre Pantoja
3,http://ufcstats.com/fighter-details/17e9764940...,Joshua Van,65.0,65.0,Orthodox,"Oct 10, 2001",8.86,6.36,56.0,0.85,63.0,81.0,0.0,Joshua Van
4,http://ufcstats.com/fighter-details/792be9a24d...,Brandon Moreno,67.0,70.0,Orthodox,"Dec 07, 1993",3.96,3.62,44.0,1.51,44.0,64.0,0.4,Brandon Moreno


In [33]:
# Start from fights that have both URLs.
# fights_with_urls is built in this cell so it runs on a fresh kernel; rows
# whose red or blue fighter name had no URL match (NaN) are removed.
fights_with_urls = fights.dropna(subset=["red_fighter_url", "blue_fighter_url"])

# Inner joins: a fight is kept only when both corners have a profile row
merged = (
    fights_with_urls
    .merge(red_profiles, on="red_fighter_url", how="inner")
    .merge(blue_profiles, on="blue_fighter_url", how="inner")
)

merged.shape


(313, 44)

In [34]:
# Side-by-side preview of a few red/blue profile columns on the merged table
preview_cols = [
    "red_fighter",
    "blue_fighter",
    "red_height_in",
    "blue_height_in",
    "red_reach_in",
    "blue_reach_in",
    "red_slpm",
    "blue_slpm",
    "red_td_avg",
    "blue_td_avg",
]
merged.loc[:, preview_cols].head()


Unnamed: 0,red_fighter,blue_fighter,red_height_in,blue_height_in,red_reach_in,blue_reach_in,red_slpm,blue_slpm,red_td_avg,blue_td_avg
0,Merab Dvalishvili,Petr Yan,66.0,67.0,68.0,67.0,4.33,5.12,6.4,1.58
1,Merab Dvalishvili,Cory Sandhagen,66.0,71.0,68.0,70.0,4.33,4.86,6.4,1.15
2,Merab Dvalishvili,Sean O'Malley,66.0,71.0,68.0,72.0,4.33,6.3,6.4,0.27
3,Alexandre Pantoja,Joshua Van,65.0,65.0,67.0,65.0,4.36,8.86,2.8,0.85
4,Alexandre Pantoja,Kai Kara-France,65.0,64.0,67.0,69.0,4.36,4.46,2.8,0.56


In [35]:
# Keep only fights whose winner is recorded as "Red" or "Blue", working on a
# copy so the merged table itself is left untouched.
decided = merged["winner"].isin(["Red", "Blue"])
model_df = merged.loc[decided].copy()

# Binary target: 1 when Red wins, 0 when Blue wins
model_df["target"] = model_df["winner"].eq("Red").astype(int)

model_df.loc[:, ["red_fighter", "blue_fighter", "winner", "target"]].head()


Unnamed: 0,red_fighter,blue_fighter,winner,target
1,Merab Dvalishvili,Cory Sandhagen,Red,1
2,Merab Dvalishvili,Sean O'Malley,Red,1
4,Alexandre Pantoja,Kai Kara-France,Red,1
10,Terrance McKinney,Viacheslav Borshchev,Red,1
11,Matheus Camilo,Viacheslav Borshchev,Blue,0


In [1]:
import pandas as pd
import numpy as np

# Read the raw fights CSV and preview its first rows
FIGHTS_CSV = "../data/raw/ufc_fights_raw.csv"
df = pd.read_csv(FIGHTS_CSV)
df.head(n=5)


Unnamed: 0,event_url,fight_url,red_fighter,blue_fighter,weight_class,winner,red_kd,blue_kd,red_sig_str_landed,red_sig_str_attempted,blue_sig_str_landed,blue_sig_str_attempted,red_td_landed,red_td_attempted,blue_td_landed,blue_td_attempted
0,http://ufcstats.com/event-details/bd92cf5da541...,http://ufcstats.com/fight-details/4a0db214d972...,Merab Dvalishvili,Petr Yan,Bantamweight,,,,,,,,,,,
1,http://ufcstats.com/event-details/bd92cf5da541...,http://ufcstats.com/fight-details/dfa692db6d39...,Alexandre Pantoja,Joshua Van,Flyweight,,,,,,,,,,,
2,http://ufcstats.com/event-details/bd92cf5da541...,http://ufcstats.com/fight-details/fbbb9e72900b...,Brandon Moreno,Tatsuro Taira,Flyweight,,,,,,,,,,,
3,http://ufcstats.com/event-details/bd92cf5da541...,http://ufcstats.com/fight-details/1dc29f4c6fcd...,Henry Cejudo,Payton Talbott,Bantamweight,,,,,,,,,,,
4,http://ufcstats.com/event-details/bd92cf5da541...,http://ufcstats.com/fight-details/6d6ab10cbaa4...,Jan Blachowicz,Bogdan Guskov,Light Heavyweight,,,,,,,,,,,


In [2]:
# Columns that must be non-null for a fight to be kept
required_cols = [
    "winner",
    "red_kd", "blue_kd",
    "red_sig_str_landed", "red_sig_str_attempted",
    "blue_sig_str_landed", "blue_sig_str_attempted",
    "red_td_landed", "red_td_attempted",
    "blue_td_landed", "blue_td_attempted",
]

# Remove incomplete rows, then keep only fights won by "Red" or "Blue"
complete = df.dropna(subset=required_cols)
df_clean = complete.loc[complete["winner"].isin(["Red", "Blue"])].copy()

# Binary target: 1 when Red wins, 0 when Blue wins
df_clean["target"] = df_clean["winner"].eq("Red").astype(int)

df_clean.shape


(294, 17)

In [3]:
# Core red-minus-blue differences: new column -> (red column, blue column)
diff_sources = {
    "kd_diff": ("red_kd", "blue_kd"),
    "sig_str_diff": ("red_sig_str_landed", "blue_sig_str_landed"),
    "td_diff": ("red_td_landed", "blue_td_landed"),
}
for diff_col, (red_col, blue_col) in diff_sources.items():
    df_clean[diff_col] = df_clean[red_col].sub(df_clean[blue_col])

df_clean.loc[:, ["kd_diff", "sig_str_diff", "td_diff", "target"]].head()


Unnamed: 0,kd_diff,sig_str_diff,td_diff,target
14,0.0,14.0,1.0,1
15,0.0,-3.0,0.0,0
16,1.0,11.0,0.0,1
17,-1.0,4.0,0.0,0
18,2.0,4.0,0.0,1


In [4]:
# Per-corner accuracy = landed / attempted, with 0.0 wherever attempts are not > 0.
# Each entry: (new column, landed column, attempted column).
accuracy_specs = [
    ("sig_str_acc_red", "red_sig_str_landed", "red_sig_str_attempted"),
    ("sig_str_acc_blue", "blue_sig_str_landed", "blue_sig_str_attempted"),
    ("td_acc_red", "red_td_landed", "red_td_attempted"),
    ("td_acc_blue", "blue_td_landed", "blue_td_attempted"),
]
for acc_col, landed_col, attempted_col in accuracy_specs:
    attempts = df_clean[attempted_col]
    df_clean[acc_col] = np.where(attempts > 0, df_clean[landed_col] / attempts, 0.0)

# Red-minus-blue differences: attempts first, then the accuracies computed above.
# Each entry: (new column, red column, blue column).
red_minus_blue = [
    ("sig_str_attempt_diff", "red_sig_str_attempted", "blue_sig_str_attempted"),
    ("td_attempt_diff", "red_td_attempted", "blue_td_attempted"),
    ("sig_str_acc_diff", "sig_str_acc_red", "sig_str_acc_blue"),
    ("td_acc_diff", "td_acc_red", "td_acc_blue"),
]
for diff_col, red_col, blue_col in red_minus_blue:
    df_clean[diff_col] = df_clean[red_col] - df_clean[blue_col]

engineered_preview = [
    "kd_diff", "sig_str_diff", "td_diff",
    "sig_str_attempt_diff", "td_attempt_diff",
    "sig_str_acc_diff", "td_acc_diff", "target"
]
df_clean.loc[:, engineered_preview].head()


Unnamed: 0,kd_diff,sig_str_diff,td_diff,sig_str_attempt_diff,td_attempt_diff,sig_str_acc_diff,td_acc_diff,target
14,0.0,14.0,1.0,10.0,3.0,0.383333,0.333333,1
15,0.0,-3.0,0.0,4.0,1.0,-0.088381,0.0,0
16,1.0,11.0,0.0,28.0,0.0,0.085714,0.0,1
17,-1.0,4.0,0.0,5.0,0.0,0.045455,0.0,0
18,2.0,4.0,0.0,6.0,0.0,0.07619,0.0,1


In [5]:
# Columns fed to the models: all red-minus-blue difference features
feature_cols = [
    "kd_diff", "sig_str_diff", "td_diff",
    "sig_str_attempt_diff", "td_attempt_diff",
    "sig_str_acc_diff", "td_acc_diff",
]

# Sanity check: number of missing values per feature (expected to be all zeros)
df_clean.loc[:, feature_cols].isnull().sum()


kd_diff                 0
sig_str_diff            0
td_diff                 0
sig_str_attempt_diff    0
td_attempt_diff         0
sig_str_acc_diff        0
td_acc_diff             0
dtype: int64

In [6]:
from sklearn.model_selection import train_test_split

# Feature matrix and binary target
X = df_clean.loc[:, feature_cols]
y = df_clean["target"]

# Hold out 20% of the rows, stratified on the target, with a fixed seed
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

tuple(part.shape for part in (X_train, X_test))


((235, 7), (59, 7))

In [7]:
# Logistic regression: fit on the training split, report on the test split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# fit() returns the estimator itself, so construction and fitting can be chained
log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred_lr = log_reg.predict(X_test)

lr_accuracy = accuracy_score(y_test, y_pred_lr)
lr_report = classification_report(y_test, y_pred_lr)

print("Logistic Regression accuracy:", lr_accuracy)
print("\nClassification report:\n")
print(lr_report)


Logistic Regression accuracy: 0.8305084745762712

Classification report:

              precision    recall  f1-score   support

           0       0.83      0.76      0.79        25
           1       0.83      0.88      0.86        34

    accuracy                           0.83        59
   macro avg       0.83      0.82      0.82        59
weighted avg       0.83      0.83      0.83        59



In [8]:
# Random forest (300 trees, fixed seed): fit on the training split, report on the test split
from sklearn.ensemble import RandomForestClassifier

# fit() returns the estimator itself, so construction and fitting can be chained
rf = RandomForestClassifier(n_estimators=300, random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)

print("RandomForest accuracy:", rf_accuracy)
print("\nClassification report:\n")
print(rf_report)


RandomForest accuracy: 0.8135593220338984

Classification report:

              precision    recall  f1-score   support

           0       0.77      0.80      0.78        25
           1       0.85      0.82      0.84        34

    accuracy                           0.81        59
   macro avg       0.81      0.81      0.81        59
weighted avg       0.81      0.81      0.81        59



In [9]:
# Gradient boosting (fixed seed): fit on the training split, report on the test split
from sklearn.ensemble import GradientBoostingClassifier

# fit() returns the estimator itself, so construction and fitting can be chained
gb = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)
y_pred_gb = gb.predict(X_test)

gb_accuracy = accuracy_score(y_test, y_pred_gb)
gb_report = classification_report(y_test, y_pred_gb)

print("GradientBoosting accuracy:", gb_accuracy)
print("\nClassification report:\n")
print(gb_report)


GradientBoosting accuracy: 0.7288135593220338

Classification report:

              precision    recall  f1-score   support

           0       0.68      0.68      0.68        25
           1       0.76      0.76      0.76        34

    accuracy                           0.73        59
   macro avg       0.72      0.72      0.72        59
weighted avg       0.73      0.73      0.73        59



In [10]:
from pathlib import Path

import joblib

# Make sure the output directory exists before writing, so a missing
# ../models folder (e.g. on a fresh checkout) does not make joblib.dump fail.
models_dir = Path("../models")
models_dir.mkdir(parents=True, exist_ok=True)

# Fitted estimators keyed by the file name each one is saved under
artifacts = {
    "logistic_regression_v1.joblib": log_reg,   # Logistic Regression
    "random_forest_v1.joblib": rf,              # Random Forest
    "gradient_boosting_v1.joblib": gb,          # Gradient Boosting
}
for filename, model in artifacts.items():
    joblib.dump(model, models_dir / filename)

print("Models saved successfully!")


Models saved successfully!
