# Step 4: Feature Engineering

**Goal:** Create features using differentials to remove corner bias and prepare for modeling.

**Date:** 1/3/2026

**Approach:** Use ONLY differential features (Red - Blue) to predict which fighter has the advantage.

**Key Features to Engineer:**
1. Win percentage differential
2. Experience ratios
3. Stance matchup indicator
4. Odds differential

In [25]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load cleaned data from Step 3
df = pd.read_csv('../data/processed/ufc_cleaned.csv')
df['Date'] = pd.to_datetime(df['Date'])

print(f" Loaded cleaned data: {df.shape[0]:,} fights, {df.shape[1]} features")
[print(col) for col in df.columns]

 Loaded cleaned data: 6,290 fights, 89 features
RedFighter
BlueFighter
RedOdds
BlueOdds
RedExpectedValue
BlueExpectedValue
Date
Location
Country
Winner
TitleBout
WeightClass
Gender
NumberOfRounds
BlueCurrentLoseStreak
BlueCurrentWinStreak
BlueDraws
BlueAvgSigStrLanded
BlueAvgSigStrPct
BlueAvgSubAtt
BlueAvgTDLanded
BlueAvgTDPct
BlueLongestWinStreak
BlueLosses
BlueTotalRoundsFought
BlueTotalTitleBouts
BlueWinsByDecisionMajority
BlueWinsByDecisionSplit
BlueWinsByDecisionUnanimous
BlueWinsByKO
BlueWinsBySubmission
BlueWinsByTKODoctorStoppage
BlueWins
BlueStance
BlueHeightCms
BlueReachCms
BlueWeightLbs
RedCurrentLoseStreak
RedCurrentWinStreak
RedDraws
RedAvgSigStrLanded
RedAvgSigStrPct
RedAvgSubAtt
RedAvgTDLanded
RedAvgTDPct
RedLongestWinStreak
RedLosses
RedTotalRoundsFought
RedTotalTitleBouts
RedWinsByDecisionMajority
RedWinsByDecisionSplit
RedWinsByDecisionUnanimous
RedWinsByKO
RedWinsBySubmission
RedWinsByTKODoctorStoppage
RedWins
RedStance
RedHeightCms
RedReachCms
RedWeightLbs
RedAge
Bl

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [23]:
# TODO 1: Create binary target variable
# y = 1 means the fighter with the ADVANTAGE (positive differentials) won
# In our case, Red corner has the advantage when differential > 0

# Your code here to create y
# Hint: y = (df['Winner'] == 'Red').astype(int)
y = (df['Winner'] == 'Red').astype(int)

print(f"Target variable created:")
print(f"  Advantage won (y=1): {y.sum()} fights")  # Fill in
print(f"  Disadvantage won (y=0): {(df['Winner']=='Blue').sum()} fights")  # Fill in

Target variable created:
  Advantage won (y=1): 3656 fights
  Disadvantage won (y=0): 2634 fights


In [24]:
# TODO 2: Find all existing differential columns

# Find columns that end with 'Dif'
# Your code here
# Hint: existing_dif = [col for col in df.columns if col.endswith('Dif')]

existing_dif = [col for col in df.columns if col.endswith('Dif')]
print(f"Found {len(existing_dif)} existing differential columns:")
print(existing_dif)

Found 15 existing differential columns:
['LoseStreakDif', 'WinStreakDif', 'LongestWinStreakDif', 'WinDif', 'LossDif', 'TotalRoundDif', 'TotalTitleBoutDif', 'KODif', 'SubDif', 'HeightDif', 'ReachDif', 'AgeDif', 'SigStrDif', 'AvgSubAttDif', 'AvgTDDif']


In [29]:
# Create WinPctDif (not in dataset yet)

# Calculate Red win %
df['red_fights'] = df['RedWins'] + df['RedLosses']
df['red_winpct'] = (df['RedWins']/df['red_fights']).fillna(0.5)

# Calculate Blue win %
df['blue_fights'] = df['BlueWins'] + df['BlueLosses']
df['blue_winpct'] = (df['BlueWins']/df['blue_fights']).fillna(0.5)

# Create differential
df['WinPctDif'] = df['red_winpct'] - df['blue_winpct']

print(f"Created WinPctDif")
print(f"  Range: {df['WinPctDif'].min():.3f} to {df['WinPctDif'].max():.3f}")

Created WinPctDif
  Range: -1.000 to 1.000


In [30]:
# TODO 4: Create stance matchup indicator

# Same stance = 1, different stance = 0
# Your code here
# Hint: df['SameStance'] = (df['RedStance'] == df['BlueStance']).astype(int)
df['SameStance'] = (df['RedStance']==df['BlueStance']).astype(int)

print(f"Created SameStance")
print(f"  Same stance fights: {df['SameStance'].sum()} ({df['SameStance'].mean()*100:.1f}%)")

Created SameStance
  Same stance fights: 3824 (60.8%)


In [33]:
# TODO 5: Ensure TitleBout is binary (0/1)

# Check current dtype
print(f"TitleBout dtype: {df['TitleBout'].dtype}")
print(f"Unique values: {df['TitleBout'].unique()}")

# If it's boolean, convert to int
# Your code here
# Hint: if df['TitleBout'].dtype == 'bool': df['TitleBout'] = df['TitleBout'].astype(int)
if df['TitleBout'].dtype =='bool':
    df['TitleBout'] = df['TitleBout'].astype(int)

print(f"✓ TitleBout is binary")
print(f"  Title bouts: {df['TitleBout'].sum()} ({df['TitleBout'].mean()*100:.1f}%)")

TitleBout dtype: bool
Unique values: [ True False]
✓ TitleBout is binary
  Title bouts: 294 (4.7%)


In [34]:
# TODO 6: Create final feature set using ONLY differentials

# Columns to definitely EXCLUDE (leakage or not useful)
exclude_cols = [
    # Leakage (known after fight)
    'Winner', 'Finish', 'FinishDetails', 'FinishRound', 'FinishRoundTime', 'TotalFightTimeSecs',
    
    # Metadata (not features)
    'RedFighter', 'BlueFighter', 'Date', 'Location', 'Country',
    
    # Temp columns we created
    'red_fights', 'blue_fights', 'red_winpct', 'blue_winpct',
    
    # Categorical that need encoding (we'll skip for now)
    'WeightClass', 'Gender', 'RedStance', 'BlueStance'
]

# Add all Red___ and Blue___ columns to exclude list
red_blue_cols = [col for col in df.columns if col.startswith('Red') or col.startswith('Blue')]
exclude_cols = exclude_cols + red_blue_cols

# Keep only differentials and engineered features
X = df.drop(columns=[col for col in exclude_cols if col in df.columns])

print(f"Final feature set: {X.shape[1]} features")
print(f"Features: {X.columns.tolist()}")

Final feature set: 24 features
Features: ['TitleBout', 'NumberOfRounds', 'LoseStreakDif', 'WinStreakDif', 'LongestWinStreakDif', 'WinDif', 'LossDif', 'TotalRoundDif', 'TotalTitleBoutDif', 'KODif', 'SubDif', 'HeightDif', 'ReachDif', 'AgeDif', 'SigStrDif', 'AvgSubAttDif', 'AvgTDDif', 'EmptyArena', 'RSubOdds', 'BSubOdds', 'RKOOdds', 'BKOOdds', 'WinPctDif', 'SameStance']


In [35]:
# TODO 7: Save final datasets

# Save full featured dataset
df.to_csv('../data/processed/ufc_engineered.csv', index=False)

# Save feature matrix (X) and target (y) separately  
X.to_csv('../data/processed/X_features.csv', index=False)
y.to_csv('../data/processed/y_target.csv', index=False, header=True)

print("✓ Saved all files")
print(f"  - Full dataset: data/processed/ufc_engineered.csv")
print(f"  - X shape: {X.shape}")
print(f"  - y shape: {len(y)}")

✓ Saved all files
  - Full dataset: data/processed/ufc_engineered.csv
  - X shape: (6290, 24)
  - y shape: 6290
