# Step 1: Combine ELO Ratings with Fighter Details

This notebook merges ELO ratings with fighter demographic data to create a unified fighter profile for predictive analysis.

## Overview
- Load ELO ratings from `build/elo_ratings_current.csv`
- Load fighter details from `data/ufc_fighter_tott.csv`
- Match fighters by URL and name
- Parse demographic data (age, reach, height, weight)
- Create comprehensive fighter profiles

## Output
- `build/fighter_profiles.csv` with 16 features per fighter

In [10]:
import pandas as pd
import numpy as np
from datetime import datetime
import re
from pathlib import Path

#2 Helper Functions

In [11]:
###Helper Functions

def normalize_name(name):
    """Normalize fighter name for matching"""
    if pd.isna(name):
        return ""
    return str(name).strip().lower()

def parse_height(height_str):
    """Convert height string to inches"""
    if pd.isna(height_str) or height_str == "--":
        return None
    
    # Handle formats like "5' 11""", "6' 0""", etc.
    match = re.search(r"(\d+)'\s*(\d+)\"", str(height_str))
    if match:
        feet = int(match.group(1))
        inches = int(match.group(2))
        return feet * 12 + inches
    
    return None

def parse_weight(weight_str):
    """Convert weight string to pounds"""
    if pd.isna(weight_str) or weight_str == "--":
        return None
    
    # Extract number from "155 lbs." format
    match = re.search(r"(\d+)", str(weight_str))
    if match:
        return int(match.group(1))
    
    return None

def parse_reach(reach_str):
    """Convert reach string to inches"""
    if pd.isna(reach_str) or reach_str == "--":
        return None
    
    # Extract number from "76"" format
    match = re.search(r"(\d+)\"", str(reach_str))
    if match:
        return int(match.group(1))
    
    return None

def parse_dob(dob_str):
    """Parse date of birth and calculate current age"""
    if pd.isna(dob_str) or dob_str == "--":
        return None, None
    
    try:
        # Handle formats like "Jul 13, 1978", "Sep 02, 1981"
        dob = datetime.strptime(str(dob_str), "%b %d, %Y")
        current_date = datetime.now()
        age = current_date.year - dob.year - ((current_date.month, current_date.day) < (dob.month, dob.day))
        return dob, age
    except:
        return None, None

def get_weight_class(weight):
    """Determine weight class from weight in pounds"""
    if pd.isna(weight):
        return "Unknown"
    if weight <= 125:
        return "Flyweight"
    elif weight <= 135:
        return "Bantamweight"
    elif weight <= 145:
        return "Featherweight"
    elif weight <= 155:
        return "Lightweight"
    elif weight <= 170:
        return "Welterweight"
    elif weight <= 185:
        return "Middleweight"
    elif weight <= 205:
        return "Light Heavyweight"
    elif weight <= 265:
        return "Heavyweight"
    else:
        return "Super Heavyweight"



## 3. Load Data Files

In [12]:
import os
print(os.getcwd())
os.chdir("/Users/niko/Desktop/UFC-elo-ranking")
### Load ELO Data
print("Loading ELO ratings...")
elo_df = pd.read_csv("build/elo_ratings_current.csv")
print(f"Loaded {len(elo_df)} fighters with ELO ratings")
print(f"Columns: {list(elo_df.columns)}")
elo_df.head()

/Users/niko/Desktop/UFC-elo-ranking
Loading ELO ratings...
Loaded 2623 fighters with ELO ratings
Columns: ['fighter_id', 'fighter_name', 'rating', 'fights', 'wins', 'losses', 'draws', 'first_date', 'last_date']


Unnamed: 0,fighter_id,fighter_name,rating,fights,wins,losses,draws,first_date,last_date
0,http://ufcstats.com/fighter-details/07f72a2a75...,Jon Jones,1738.435793,23,22,1,0,2008-08-09,2024-11-16
1,http://ufcstats.com/fighter-details/6506c1d34d...,Georges St-Pierre,1710.009316,22,20,2,0,2004-01-31,2017-11-04
2,http://ufcstats.com/fighter-details/275aca31f6...,Islam Makhachev,1685.573623,16,15,1,0,2015-05-23,2025-01-18
3,http://ufcstats.com/fighter-details/150ff4cc64...,Max Holloway,1676.828349,29,22,7,0,2012-02-04,2025-07-19
4,http://ufcstats.com/fighter-details/f1b2aa7853...,Kamaru Usman,1670.634206,18,16,2,0,2015-07-12,2025-06-14


In [13]:
### Load fighter details
print("Loading fighter details...")
fighter_details_df = pd.read_csv("data/ufc_fighter_tott.csv")
print(f"Loaded {len(fighter_details_df)} fighter details")
print(f"Columns: {list(fighter_details_df.columns)}")
fighter_details_df.head()

Loading fighter details...
Loaded 4404 fighter details
Columns: ['FIGHTER', 'HEIGHT', 'WEIGHT', 'REACH', 'STANCE', 'DOB', 'URL']


Unnamed: 0,FIGHTER,HEIGHT,WEIGHT,REACH,STANCE,DOB,URL
0,Tom Aaron,--,155 lbs.,--,,"Jul 13, 1978",http://ufcstats.com/fighter-details/93fe7332d1...
1,Danny Abbadi,"5' 11""",155 lbs.,--,Orthodox,"Jul 03, 1983",http://ufcstats.com/fighter-details/15df64c02b...
2,David Abbott,"6' 0""",265 lbs.,--,Switch,--,http://ufcstats.com/fighter-details/b361180739...
3,Shamil Abdurakhimov,"6' 3""",235 lbs.,"76""",Orthodox,"Sep 02, 1981",http://ufcstats.com/fighter-details/2f5cbecbbe...
4,Hiroyuki Abe,"5' 6""",145 lbs.,--,Orthodox,--,http://ufcstats.com/fighter-details/c0ed7b2081...


## 4. Prepare Fighter Details Data

In [14]:
# Create full names for fighter details (using single FIGHTER column) Makes the column names consistent between both datasets
fighter_details_df['fighter_name'] = fighter_details_df['FIGHTER'].fillna('')
print(f"Created fighter names for {len(fighter_details_df)} fighters")
fighter_details_df[['FIGHTER', 'fighter_name']].head()

Created fighter names for 4404 fighters


Unnamed: 0,FIGHTER,fighter_name
0,Tom Aaron,Tom Aaron
1,Danny Abbadi,Danny Abbadi
2,David Abbott,David Abbott
3,Shamil Abdurakhimov,Shamil Abdurakhimov
4,Hiroyuki Abe,Hiroyuki Abe


In [15]:
# Parse demographic data
print("Parsing demographic data...")
fighter_details_df['height_inches'] = fighter_details_df['HEIGHT'].apply(parse_height)
fighter_details_df['weight_lbs'] = fighter_details_df['WEIGHT'].apply(parse_weight)
fighter_details_df['reach_inches'] = fighter_details_df['REACH'].apply(parse_reach)
fighter_details_df['stance'] = fighter_details_df['STANCE'].fillna('Unknown')

# Parse date of birth and calculate ageThis code transforms raw text dates like "Jul 13, 1978" into usable numeric data like age: 46
dob_age_data = fighter_details_df['DOB'].apply(parse_dob)
fighter_details_df['date_of_birth'] = [dob for dob, age in dob_age_data]
fighter_details_df['age'] = [age for dob, age in dob_age_data]

print("Demographic data parsed!")

Parsing demographic data...
Demographic data parsed!


In [16]:
# Show parsing results
print("Sample of parsed data:")
sample_cols = ['FIGHTER', 'HEIGHT', 'height_inches', 'WEIGHT', 'weight_lbs', 
               'REACH', 'reach_inches', 'DOB', 'age']
fighter_details_df[sample_cols].head(10)

Sample of parsed data:


Unnamed: 0,FIGHTER,HEIGHT,height_inches,WEIGHT,weight_lbs,REACH,reach_inches,DOB,age
0,Tom Aaron,--,,155 lbs.,155.0,--,,"Jul 13, 1978",47.0
1,Danny Abbadi,"5' 11""",71.0,155 lbs.,155.0,--,,"Jul 03, 1983",42.0
2,David Abbott,"6' 0""",72.0,265 lbs.,265.0,--,,--,
3,Shamil Abdurakhimov,"6' 3""",75.0,235 lbs.,235.0,"76""",76.0,"Sep 02, 1981",43.0
4,Hiroyuki Abe,"5' 6""",66.0,145 lbs.,145.0,--,,--,
5,Daichi Abe,"5' 11""",71.0,170 lbs.,170.0,"71""",71.0,"Nov 27, 1991",33.0
6,Papy Abedi,"5' 11""",71.0,185 lbs.,185.0,--,,"Jun 30, 1978",47.0
7,Ricardo Abreu,"5' 11""",71.0,185 lbs.,185.0,--,,"Apr 27, 1984",41.0
8,Klidson Abreu,"6' 0""",72.0,205 lbs.,205.0,"74""",74.0,"Dec 24, 1992",32.0
9,Daniel Acacio,"5' 8""",68.0,180 lbs.,180.0,--,,"Dec 27, 1977",47.0


## 5. Match Fighters by URL

In [17]:
# Normalize names for matching (Consistent comparison: All names in same format)
elo_df['fighter_name_normalized'] = elo_df['fighter_name'].apply(normalize_name)
fighter_details_df['fighter_name_normalized'] = fighter_details_df['fighter_name'].apply(normalize_name)

# Extract uniquefighter IDs from URLs for matching
elo_df['fighter_id_clean'] = elo_df['fighter_id'].str.extract(r'/([^/]+)$')
fighter_details_df['fighter_id_clean'] = fighter_details_df['URL'].str.extract(r'/([^/]+)$')

print(f"Extracted fighter IDs from URLs")
print(f"ELO fighters: {len(elo_df)}")
print(f"Fighter details: {len(fighter_details_df)}")

Extracted fighter IDs from URLs
ELO fighters: 2623
Fighter details: 4404


In [18]:
##combine the ELO ratings with fighter demographic data.
print("Matching fighters by URL...")
merged_df = elo_df.merge(
    fighter_details_df[['fighter_id_clean', 'fighter_name', 'height_inches', 'weight_lbs', 
                       'reach_inches', 'stance', 'date_of_birth', 'age']],
    on='fighter_id_clean',
    how='left',
    suffixes=('_elo', '_details')
)

print(f"Merged {len(merged_df)} fighters")
print(f"Fighters with details: {merged_df['fighter_name_details'].notna().sum()}")
print(f"Unmatched fighters: {merged_df['fighter_name_details'].isna().sum()}")

Matching fighters by URL...
Merged 2624 fighters
Fighters with details: 2571
Unmatched fighters: 53


##6. Name-Based Matching for Unmatched Fighters

In [19]:
# For unmatched fighters, try name matching
unmatched = merged_df[merged_df['fighter_name_details'].isna()]
print(f"Found {len(unmatched)} unmatched fighters, attempting name matching...")

# Create a mapping for name matching
name_mapping = {}
for _, row in fighter_details_df.iterrows():
    if pd.notna(row['fighter_name']):
        name_mapping[normalize_name(row['fighter_name'])] = row

print(f"Created name mapping for {len(name_mapping)} fighters")

Found 53 unmatched fighters, attempting name matching...
Created name mapping for 4395 fighters


In [20]:
###Try to match unmatched fighters by name
matched_count = 0
for idx, row in unmatched.iterrows():
    elo_name = row['fighter_name_normalized']
    if elo_name in name_mapping:
        details_row = name_mapping[elo_name]
        merged_df.loc[idx, 'fighter_name_details'] = details_row['fighter_name']
        merged_df.loc[idx, 'height_inches'] = details_row['height_inches']
        merged_df.loc[idx, 'weight_lbs'] = details_row['weight_lbs']
        merged_df.loc[idx, 'reach_inches'] = details_row['reach_inches']
        merged_df.loc[idx, 'stance'] = details_row['stance']
        merged_df.loc[idx, 'date_of_birth'] = details_row['date_of_birth']
        merged_df.loc[idx, 'age'] = details_row['age']
        matched_count += 1

print(f"Successfully matched {matched_count} additional fighters by name")
print(f"Total matched: {merged_df['fighter_name_details'].notna().sum()}")
print(f"Still unmatched: {merged_df['fighter_name_details'].isna().sum()}")

Successfully matched 0 additional fighters by name
Total matched: 2571
Still unmatched: 53


## 7. Calculate Additional Features

In [21]:
# Calculate win rate
merged_df['win_rate'] = merged_df['wins'] / merged_df['fights']
print(f"Calculated win rates for {len(merged_df)} fighters")

# Determine weight class from weight
merged_df['weight_class'] = merged_df['weight_lbs'].apply(get_weight_class)
print(f"Assigned weight classes for {len(merged_df)} fighters")

# Show weight class distribution
print("\nWeight class distribution:")
print(merged_df['weight_class'].value_counts())

Calculated win rates for 2624 fighters
Assigned weight classes for 2624 fighters

Weight class distribution:
weight_class
Welterweight         430
Lightweight          401
Middleweight         324
Flyweight            307
Bantamweight         295
Featherweight        267
Heavyweight          255
Light Heavyweight    249
Unknown               76
Super Heavyweight     20
Name: count, dtype: int64


8. Create Final Output

In [22]:
# Select and rename columns for output
output_columns = {
    'fighter_id': 'fighter_id',
    'fighter_name_elo': 'fighter_name',
    'rating': 'current_elo_rating',
    'fights': 'total_fights',
    'wins': 'total_wins',
    'losses': 'total_losses',
    'draws': 'total_draws',
    'win_rate': 'win_rate',
    'age': 'age',
    'reach_inches': 'reach_inches',
    'height_inches': 'height_inches',
    'weight_lbs': 'weight_lbs',
    'weight_class': 'weight_class',
    'stance': 'stance',
    'date_of_birth': 'date_of_birth',
    'first_date': 'debut_date',
    'last_date': 'last_fight_date'
}

# Create final output dataframe
output_df = merged_df[list(output_columns.keys())].copy()
output_df.columns = list(output_columns.values())

print(f"Created output dataframe with {len(output_df)} fighters and {len(output_df.columns)} columns")

Created output dataframe with 2624 fighters and 17 columns


In [23]:
# Step 1: Keep as float (proper NaN handling)
numeric_columns = ['age', 'reach_inches', 'height_inches', 'weight_lbs']
for col in numeric_columns:
    if col in output_df.columns:
        output_df[col] = pd.to_numeric(output_df[col], errors='coerce')

# Step 2: Add missing value indicators (crucial for ML)
output_df['has_age'] = output_df['age'].notna().astype(int)
output_df['has_reach'] = output_df['reach_inches'].notna().astype(int)
output_df['has_height'] = output_df['height_inches'].notna().astype(int)
output_df['has_weight'] = output_df['weight_lbs'].notna().astype(int)

print("\nTotal missing values:", output_df.isna().sum().sum())
print("Total data cells:", len(output_df) * len(output_df.columns))
print("Missing percentage:", (output_df.isna().sum().sum() / (len(output_df) * len(output_df.columns))) * 100, "%") 
# Count missing values in each column
print("Missing values per column:")
print(output_df.isna().sum())

print("\nTotal missing values:", output_df.isna().sum().sum())
print("Total data cells:", len(output_df) * len(output_df.columns))
print("Missing percentage:", (output_df.isna().sum().sum() / (len(output_df) * len(output_df.columns))) * 100, "%") 



Total missing values: 1910
Total data cells: 55104
Missing percentage: 3.4661730545876885 %
Missing values per column:
fighter_id              0
fighter_name            0
current_elo_rating      0
total_fights            0
total_wins              0
total_losses            0
total_draws             0
win_rate                0
age                   253
reach_inches          990
height_inches         285
weight_lbs             76
weight_class            0
stance                 53
date_of_birth         253
debut_date              0
last_fight_date         0
has_age                 0
has_reach               0
has_height              0
has_weight              0
dtype: int64

Total missing values: 1910
Total data cells: 55104
Missing percentage: 3.4661730545876885 %


## 9. Save Results

In [24]:
# Save to file
output_path = Path("build/fighter_profiles.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
output_df.to_csv(output_path, index=False)

print(f"Output saved to: {output_path}")

Output saved to: build/fighter_profiles.csv


10. Summary Statistics

In [25]:
# Show top 10 fighters
print("\nTop 10 fighters by ELO rating:")
top_10 = output_df[['fighter_name', 'current_elo_rating', 'age', 'weight_class']].head(10)
display(top_10)


Top 10 fighters by ELO rating:


Unnamed: 0,fighter_name,current_elo_rating,age,weight_class
0,Jon Jones,1738.435793,38.0,Light Heavyweight
1,Georges St-Pierre,1710.009316,44.0,Middleweight
2,Islam Makhachev,1685.573623,33.0,Lightweight
3,Max Holloway,1676.828349,33.0,Featherweight
4,Kamaru Usman,1670.634206,38.0,Welterweight
5,Khabib Nurmagomedov,1666.583185,36.0,Lightweight
6,Alexander Volkanovski,1652.877752,36.0,Featherweight
7,Amanda Nunes,1652.839011,37.0,Bantamweight
8,Francis Ngannou,1647.231858,38.0,Heavyweight
9,Merab Dvalishvili,1646.728219,34.0,Bantamweight


In [26]:
# Show sample of complete data
print("\nSample of fighter profiles:")
sample_cols = ['fighter_name', 'current_elo_rating', 'age', 'reach_inches', 
               'height_inches', 'weight_lbs', 'weight_class', 'win_rate']
display(output_df[sample_cols].head())


Sample of fighter profiles:


Unnamed: 0,fighter_name,current_elo_rating,age,reach_inches,height_inches,weight_lbs,weight_class,win_rate
0,Jon Jones,1738.435793,38.0,84.0,76.0,205.0,Light Heavyweight,0.956522
1,Georges St-Pierre,1710.009316,44.0,76.0,71.0,185.0,Middleweight,0.909091
2,Islam Makhachev,1685.573623,33.0,70.0,70.0,155.0,Lightweight,0.9375
3,Max Holloway,1676.828349,33.0,69.0,71.0,145.0,Featherweight,0.758621
4,Kamaru Usman,1670.634206,38.0,76.0,72.0,170.0,Welterweight,0.888889


## 11. Data Quality Analysis

In [27]:
# Analyze data quality
print("Data Quality Analysis:")
print("=" * 50)

# Missing data analysis
missing_data = output_df.isnull().sum()
print("\nMissing data by column:")
for col, missing in missing_data.items():
    if missing > 0:
        percentage = missing / len(output_df) * 100
        print(f"{col}: {missing} ({percentage:.1f}%)")

# Weight class distribution
print("\nWeight class distribution:")
weight_class_counts = output_df['weight_class'].value_counts()
for weight_class, count in weight_class_counts.items():
    percentage = count / len(output_df) * 100
    print(f"{weight_class}: {count} ({percentage:.1f}%)")

# Age distribution
print(f"\nAge statistics:")
age_data = output_df['age'].dropna()
if len(age_data) > 0:
    print(f"Average age: {age_data.mean():.1f}")
    print(f"Age range: {age_data.min()} - {age_data.max()}")
    print(f"Median age: {age_data.median():.1f}")

Data Quality Analysis:

Missing data by column:
age: 253 (9.6%)
reach_inches: 990 (37.7%)
height_inches: 285 (10.9%)
weight_lbs: 76 (2.9%)
stance: 53 (2.0%)
date_of_birth: 253 (9.6%)

Weight class distribution:
Welterweight: 430 (16.4%)
Lightweight: 401 (15.3%)
Middleweight: 324 (12.3%)
Flyweight: 307 (11.7%)
Bantamweight: 295 (11.2%)
Featherweight: 267 (10.2%)
Heavyweight: 255 (9.7%)
Light Heavyweight: 249 (9.5%)
Unknown: 76 (2.9%)
Super Heavyweight: 20 (0.8%)

Age statistics:
Average age: 38.9
Age range: 20.0 - 82.0
Median age: 38.0


In [28]:
# Step 1: Keep as float (proper NaN handling)
numeric_columns = ['age', 'reach_inches', 'height_inches', 'weight_lbs']
for col in numeric_columns:
    if col in output_df.columns:
        output_df[col] = pd.to_numeric(output_df[col], errors='coerce')

# Step 2: Add missing value indicators (crucial for ML)
output_df['has_age'] = output_df['age'].notna().astype(int)
output_df['has_reach'] = output_df['reach_inches'].notna().astype(int)
output_df['has_height'] = output_df['height_inches'].notna().astype(int)
output_df['has_weight'] = output_df['weight_lbs'].notna().astype(int)

print("\nTotal missing values:", output_df.isna().sum().sum())
print("Total data cells:", len(output_df) * len(output_df.columns))
print("Missing percentage:", (output_df.isna().sum().sum() / (len(output_df) * len(output_df.columns))) * 100, "%") 
# Count missing values in each column
print("Missing values per column:")
print(output_df.isna().sum())

print("\nTotal missing values:", output_df.isna().sum().sum())
print("Total data cells:", len(output_df) * len(output_df.columns))
print("Missing percentage:", (output_df.isna().sum().sum() / (len(output_df) * len(output_df.columns))) * 100, "%") 



Total missing values: 1910
Total data cells: 55104
Missing percentage: 3.4661730545876885 %
Missing values per column:
fighter_id              0
fighter_name            0
current_elo_rating      0
total_fights            0
total_wins              0
total_losses            0
total_draws             0
win_rate                0
age                   253
reach_inches          990
height_inches         285
weight_lbs             76
weight_class            0
stance                 53
date_of_birth         253
debut_date              0
last_fight_date         0
has_age                 0
has_reach               0
has_height              0
has_weight              0
dtype: int64

Total missing values: 1910
Total data cells: 55104
Missing percentage: 3.4661730545876885 %
