In [17]:
import pandas as pd
import numpy as np

# Read the three raw CSV exports in one pass. Per the original notes:
#   fights           - fights with post-fight stats
#   fighters_index   - fighter name <-> profile URL
#   fighter_profiles - profile URL -> height/reach/SLpM/etc.
fights, fighters_index, fighter_profiles = map(
    pd.read_csv,
    (
        "../data/raw/ufc_fights_raw.csv",
        "../data/raw/fighters_index.csv",
        "../data/raw/fighter_profiles_raw.csv",
    ),
)

# (rows, columns) of each table, in load order
tuple(frame.shape for frame in (fights, fighters_index, fighter_profiles))


((313, 16), (500, 2), (500, 15))

In [18]:
# Lookup Series: index = fighter_name, value = fighter_url.
# When a name appears more than once in the index table, only its first row
# is kept, so the lookup has a unique index.
name_to_url = fighters_index.drop_duplicates(subset="fighter_name").set_index(
    "fighter_name"
)["fighter_url"]

# Resolve each corner's name to a profile URL; names absent from the lookup
# come back as NaN.
fights.loc[:, "red_fighter_url"] = fights["red_fighter"].map(name_to_url)
fights.loc[:, "blue_fighter_url"] = fights["blue_fighter"].map(name_to_url)

# Preview names next to the URLs they resolved to
fights.loc[
    :, ["red_fighter", "red_fighter_url", "blue_fighter", "blue_fighter_url"]
].head()


Unnamed: 0,red_fighter,red_fighter_url,blue_fighter,blue_fighter_url
0,Merab Dvalishvili,http://ufcstats.com/fighter-details/c03520b5c8...,Petr Yan,http://ufcstats.com/fighter-details/d661ce4da7...
1,Alexandre Pantoja,http://ufcstats.com/fighter-details/a0f0004aad...,Joshua Van,http://ufcstats.com/fighter-details/17e9764940...
2,Brandon Moreno,http://ufcstats.com/fighter-details/792be9a24d...,Tatsuro Taira,http://ufcstats.com/fighter-details/4461d7e473...
3,Henry Cejudo,http://ufcstats.com/fighter-details/056c493bbd...,Payton Talbott,http://ufcstats.com/fighter-details/6e743a33d5...
4,Jan Blachowicz,http://ufcstats.com/fighter-details/99df7d0a2a...,Bogdan Guskov,http://ufcstats.com/fighter-details/ef5dcb10d2...


In [19]:
# Keep only fights where BOTH corners resolved to a profile URL; work on an
# independent copy so later column additions do not touch `fights`.
fights_with_urls = fights[
    fights[["red_fighter_url", "blue_fighter_url"]].notna().all(axis=1)
].copy()

# Before/after shapes show how many fights were lost to unresolved names
(fights.shape, fights_with_urls.shape)


((313, 18), (313, 18))

In [20]:
# Drop str_def for now (the original note reports it as all NaN).
# errors="ignore" keeps this cell re-runnable: `fighter_profiles` is rebound
# to the result, so on a second execution the column is already gone and a
# plain drop would raise KeyError.
fighter_profiles = fighter_profiles.drop(columns=["str_def"], errors="ignore")

# Red profiles: prefix all columns with 'red_'. The join key `fighter_url`
# becomes `red_fighter_url`, matching the column name used on the fights table.
red_profiles = fighter_profiles.rename(
    columns=lambda c: f"red_{c}" if c != "fighter_url" else "red_fighter_url"
)

# Blue profiles: same treatment with the 'blue_' prefix, so the two profile
# frames can be joined onto one fight row without column-name collisions.
blue_profiles = fighter_profiles.rename(
    columns=lambda c: f"blue_{c}" if c != "fighter_url" else "blue_fighter_url"
)

red_profiles.head()


Unnamed: 0,red_fighter_url,red_fighter_name,red_height_in,red_reach_in,red_stance,red_dob,red_slpm,red_sapm,red_str_acc,red_td_avg,red_td_acc,red_td_def,red_sub_avg,red_fighter_name_index
0,http://ufcstats.com/fighter-details/c03520b5c8...,Merab Dvalishvili,66.0,68.0,Orthodox,"Jan 10, 1991",4.33,2.55,42.0,6.4,37.0,82.0,0.3,Merab Dvalishvili
1,http://ufcstats.com/fighter-details/d661ce4da7...,Petr Yan,67.0,67.0,Switch,"Feb 11, 1993",5.12,4.14,54.0,1.58,48.0,85.0,0.1,Petr Yan
2,http://ufcstats.com/fighter-details/a0f0004aad...,Alexandre Pantoja,65.0,67.0,Orthodox,"Apr 16, 1990",4.36,3.88,50.0,2.8,47.0,69.0,1.0,Alexandre Pantoja
3,http://ufcstats.com/fighter-details/17e9764940...,Joshua Van,65.0,65.0,Orthodox,"Oct 10, 2001",8.86,6.36,56.0,0.85,63.0,81.0,0.0,Joshua Van
4,http://ufcstats.com/fighter-details/792be9a24d...,Brandon Moreno,67.0,70.0,Orthodox,"Dec 07, 1993",3.96,3.62,44.0,1.51,44.0,64.0,0.4,Brandon Moreno


In [21]:
# Attach the red-corner profile, then the blue-corner profile. Both joins are
# inner joins, so a fight survives only if both URLs have a profile row.
merged = pd.merge(
    pd.merge(fights_with_urls, red_profiles, how="inner", on="red_fighter_url"),
    blue_profiles,
    how="inner",
    on="blue_fighter_url",
)

merged.shape


(313, 44)

In [22]:
# Modelling table: the profile-enriched fights restricted to rows whose
# `winner` is exactly "Red" or "Blue" (any other value is excluded).
model_df = merged.loc[merged["winner"].isin(["Red", "Blue"])].copy()

# Binary target: 1 when the red corner won, 0 when the blue corner won
model_df["target"] = model_df["winner"].eq("Red").astype(int)

model_df.loc[:, ["red_fighter", "blue_fighter", "winner", "target"]].head()


Unnamed: 0,red_fighter,blue_fighter,winner,target
1,Merab Dvalishvili,Cory Sandhagen,Red,1
2,Merab Dvalishvili,Sean O'Malley,Red,1
4,Alexandre Pantoja,Kai Kara-France,Red,1
10,Terrance McKinney,Viacheslav Borshchev,Red,1
11,Matheus Camilo,Viacheslav Borshchev,Blue,0


In [23]:
import datetime as dt

def compute_age(dob_str, as_of=None):
    """Return the age in years implied by a date-of-birth value.

    Parameters
    ----------
    dob_str : str or datetime-like
        Date of birth in any form ``pd.to_datetime`` can parse
        (e.g. ``"Jan 10, 1991"``).
    as_of : str or datetime-like, optional
        Date the age is measured at. Defaults to the current local time,
        which is what this function always used; note that this default
        makes the result change from day to day. Pass a fixed date (for
        example the fight date) to get reproducible ages.

    Returns
    -------
    float
        Whole elapsed days divided by 365.25, or ``np.nan`` when the date of
        birth is missing or cannot be parsed.

    Raises
    ------
    ValueError
        If ``as_of`` is given but cannot be parsed as a date.
    """
    # An unparseable `as_of` is a caller error, so it is resolved outside the
    # `try` block and allowed to propagate.
    reference = pd.Timestamp("today") if as_of is None else pd.to_datetime(as_of)
    try:
        dob = pd.to_datetime(dob_str)
        return (reference - dob).days / 365.25
    except (ValueError, TypeError, OverflowError):
        # Only data problems (unparseable text, unsupported type, out-of-range
        # date) map to NaN; a bare `except` would also hide KeyboardInterrupt
        # and genuine programming errors.
        return np.nan

# Per-corner age in years, derived element-wise from each date-of-birth column
model_df["red_age"] = model_df["red_dob"].map(compute_age)
model_df["blue_age"] = model_df["blue_dob"].map(compute_age)

# Positive age_diff means the red-corner fighter is the older one
model_df["age_diff"] = model_df["red_age"].sub(model_df["blue_age"])

model_df.loc[:, ["red_age", "blue_age", "age_diff"]].head()


Unnamed: 0,red_age,blue_age,age_diff
1,34.888433,33.612594,1.275838
2,34.888433,31.101985,3.786448
4,35.624914,32.681725,2.94319
10,31.208761,33.894593,-2.685832
11,24.908966,33.894593,-8.985626


In [24]:
# Integer label for each known stance; any other value (including a missing
# stance) is coded as -1 below.
stance_map = dict(Orthodox=0, Southpaw=1, Switch=2)

model_df["red_stance_code"] = model_df["red_stance"].map(stance_map).fillna(value=-1)
model_df["blue_stance_code"] = model_df["blue_stance"].map(stance_map).fillna(value=-1)

# stance_diff = red stance code - blue stance code
# NOTE(review): the codes are arbitrary labels, so this difference treats
# stance as an ordered quantity - confirm that is intended for the model.
model_df["stance_diff"] = model_df["red_stance_code"].sub(model_df["blue_stance_code"])

model_df.loc[:, ["red_stance", "blue_stance", "stance_diff"]].head()


Unnamed: 0,red_stance,blue_stance,stance_diff
1,Orthodox,Switch,-2.0
2,Orthodox,Switch,-2.0
4,Orthodox,Orthodox,0.0
10,Switch,Orthodox,2.0
11,Orthodox,Orthodox,0.0


In [25]:
# Appearances per fighter, tallied separately for each corner
red_counts = model_df.loc[:, "red_fighter"].value_counts()
blue_counts = model_df.loc[:, "blue_fighter"].value_counts()

# Total appearances = red + blue; fill_value=0 covers fighters who only ever
# appear in one corner.
# NOTE(review): the tally covers only rows present in `model_df` (every fight
# in the table, including the row being scored) - confirm this is the
# intended notion of "experience".
fight_counts = red_counts.add(blue_counts, fill_value=0).astype(int)

fight_counts.head(5)


Aaron Pico                1
Abdul Rakhman Yakhyaev    1
Abus Magomedov            1
Adam Fugitt               1
Aiemann Zahabi            1
Name: count, dtype: int32

In [26]:
# Look up each corner's total appearance count by fighter name
model_df["red_num_fights"] = model_df["red_fighter"].map(fight_counts)
model_df["blue_num_fights"] = model_df["blue_fighter"].map(fight_counts)

# Positive experience_diff means the red corner has more recorded fights
model_df["experience_diff"] = model_df["red_num_fights"].sub(model_df["blue_num_fights"])

model_df.loc[
    :,
    [
        "red_fighter", "red_num_fights",
        "blue_fighter", "blue_num_fights",
        "experience_diff",
    ],
].head()


Unnamed: 0,red_fighter,red_num_fights,blue_fighter,blue_num_fights,experience_diff
1,Merab Dvalishvili,2,Cory Sandhagen,1,1
2,Merab Dvalishvili,2,Sean O'Malley,1,1
4,Alexandre Pantoja,1,Kai Kara-France,1,0
10,Terrance McKinney,1,Viacheslav Borshchev,2,-1
11,Matheus Camilo,2,Viacheslav Borshchev,2,0


In [27]:
# Every feature below is "red minus blue" on the matching profile column.

# Body measurements
model_df["height_diff"] = model_df["red_height_in"].sub(model_df["blue_height_in"])
model_df["reach_diff"] = model_df["red_reach_in"].sub(model_df["blue_reach_in"])

# Striking rates (slpm / sapm profile columns)
model_df["slpm_diff"] = model_df["red_slpm"].sub(model_df["blue_slpm"])
model_df["sapm_diff"] = model_df["red_sapm"].sub(model_df["blue_sapm"])

# Takedown and submission stats
model_df["td_avg_diff"] = model_df["red_td_avg"].sub(model_df["blue_td_avg"])
model_df["td_acc_diff"] = model_df["red_td_acc"].sub(model_df["blue_td_acc"])
model_df["td_def_diff"] = model_df["red_td_def"].sub(model_df["blue_td_def"])
model_df["sub_avg_diff"] = model_df["red_sub_avg"].sub(model_df["blue_sub_avg"])

model_df.loc[
    :,
    [
        "red_fighter", "blue_fighter",
        "height_diff", "reach_diff",
        "slpm_diff", "sapm_diff",
        "td_avg_diff", "td_acc_diff", "td_def_diff", "sub_avg_diff",
        "target",
    ],
].head()


Unnamed: 0,red_fighter,blue_fighter,height_diff,reach_diff,slpm_diff,sapm_diff,td_avg_diff,td_acc_diff,td_def_diff,sub_avg_diff,target
1,Merab Dvalishvili,Cory Sandhagen,-5.0,-2.0,-0.53,-0.92,5.25,3.0,26.0,0.1,1
2,Merab Dvalishvili,Sean O'Malley,-5.0,-4.0,-1.97,-0.93,6.13,-5.0,21.0,0.0,1
4,Alexandre Pantoja,Kai Kara-France,1.0,-2.0,-0.1,0.64,2.24,17.0,-14.0,1.0,1
10,Terrance McKinney,Viacheslav Borshchev,-1.0,4.0,0.68,-1.88,3.59,40.0,34.0,2.1,1
11,Matheus Camilo,Viacheslav Borshchev,-1.0,0.0,-2.91,-1.21,4.43,53.0,-43.0,0.5,0


In [28]:
# Model inputs: all red-minus-blue difference features built above
feature_cols = [
    "height_diff", "reach_diff",
    "slpm_diff", "sapm_diff",
    "td_avg_diff", "td_acc_diff", "td_def_diff", "sub_avg_diff",
    "age_diff",
    "stance_diff",
    "experience_diff",
]

# Missing-value count per feature before cleaning
print(model_df.loc[:, feature_cols].isna().sum())

# Discard any row lacking a feature value or the target
model_df = model_df.dropna(subset=[*feature_cols, "target"]).copy()

print("\nAfter dropping NaNs:")
print(model_df.loc[:, feature_cols].isna().sum())
model_df.shape


height_diff        1
reach_diff         1
slpm_diff          0
sapm_diff          0
td_avg_diff        0
td_acc_diff        0
td_def_diff        0
sub_avg_diff       0
age_diff           0
stance_diff        0
experience_diff    0
dtype: int64

After dropping NaNs:
height_diff        0
reach_diff         0
slpm_diff          0
sapm_diff          0
td_avg_diff        0
td_acc_diff        0
td_def_diff        0
sub_avg_diff       0
age_diff           0
stance_diff        0
experience_diff    0
dtype: int64


(293, 62)

In [29]:
from sklearn.model_selection import train_test_split

# Feature matrix and binary target (1 = red corner won)
X = model_df.loc[:, feature_cols]
y = model_df.loc[:, "target"]

# 80/20 hold-out split; stratifying on y keeps the red/blue win ratio the
# same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

(X_train.shape, X_test.shape)


((234, 11), (59, 11))

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Baseline classifier on the pre-fight features. `fit` returns the estimator
# itself, so construction and training are chained.
pre_log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Hard class predictions on the held-out fights
y_pred = pre_log_reg.predict(X_test)

print("Pre-fight Logistic Regression accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification report:\n")
print(classification_report(y_test, y_pred))


Pre-fight Logistic Regression accuracy: 0.6271186440677966

Classification report:

              precision    recall  f1-score   support

           0       0.61      0.42      0.50        26
           1       0.63      0.79      0.70        33

    accuracy                           0.63        59
   macro avg       0.62      0.61      0.60        59
weighted avg       0.62      0.63      0.61        59

