In [4]:
# notebook globals
import pandas as pd
import numpy as np
from dateutil.relativedelta import relativedelta

# load data
fighters = pd.read_csv('../tuf/fighters.csv', parse_dates=['dob'])
# Trim stray whitespace from every text column. The check is done on the dtype
# (not `== 'object'`) so it also matches pandas' dedicated string dtype; otherwise
# fighter names could stay unstripped while the fight names below are stripped,
# and the name-based join in the next cell would silently miss.
fighters = fighters.apply(
    lambda col: col.str.strip() if pd.api.types.is_string_dtype(col.dtype) else col
)

fights = pd.read_csv('../tuf/fights.csv', parse_dates=['event_date'])
# fighter names are the join keys against `fighters`, so normalise them the same way
fights[['f1_name', 'f2_name']] = fights[['f1_name', 'f2_name']].apply(lambda col: col.str.strip())

# calculate fighter age at the time of the fight
def calculate_age(dob, event_date):
    """Return the whole-year part of the calendar gap between ``dob`` and ``event_date``.

    Args:
        dob: date of birth (datetime-like), or a missing value.
        event_date: date of the fight (datetime-like), or a missing value.

    Returns:
        The ``years`` component of ``relativedelta(event_date, dob)``, or ``None``
        when either argument is missing according to ``pd.isna``.
    """
    has_both_dates = not (pd.isna(dob) or pd.isna(event_date))
    if has_both_dates:
        elapsed = relativedelta(event_date, dob)
        return elapsed.years
    return None

In [5]:
# add fighter metrics
fighter_attrs = ['height', 'reach', 'stance', 'dob']

# Build a lookup with at most one row per fighter name. Exact duplicate rows are
# collapsed; names that still map to conflicting rows are removed entirely, so those
# fighters get missing attributes instead of multiplying fight rows in the left join.
fighter_lookup = (
    fighters[['name'] + fighter_attrs]
    .drop_duplicates()
    .drop_duplicates(subset='name', keep=False)
)

for fighter_num in ['f1', 'f2']:
    name_col = f'{fighter_num}_name'
    dob_col = f'{fighter_num}_dob'
    age_col = f'{fighter_num}_age'
    renamed = {attr: f'{fighter_num}_{attr}' for attr in fighter_attrs}

    # Drop columns left over from a previous run of this cell so that re-executing it
    # does not create duplicate f1_*/f2_* columns, then join on the fighter's name.
    # The lookup is renamed up front, so no rename/drop of a helper `name` column is
    # needed afterwards.
    fights = fights.drop(columns=[*renamed.values(), age_col], errors='ignore').merge(
        fighter_lookup.rename(columns={'name': name_col, **renamed}),
        on=name_col,
        how='left',
        validate='many_to_one',  # guard: a fight row must never be duplicated
    )

    # calculate age at time of fight
    ages = [
        calculate_age(dob, event_date)
        for dob, event_date in zip(fights[dob_col], fights['event_date'])
    ]
    # to_numeric turns an all-missing (object dtype) result into NaN floats so the
    # differentials below can always subtract the two age columns
    fights[age_col] = pd.to_numeric(pd.Series(ages, index=fights.index))

# fighter differentials (absolute gaps between the two fighters)
fights['reach_diff'] = (fights['f1_reach'] - fights['f2_reach']).abs()
fights['height_diff'] = (fights['f1_height'] - fights['f2_height']).abs()
fights['age_diff'] = (fights['f1_age'] - fights['f2_age']).abs()

In [6]:
# columns kept in the enhanced dataset, grouped by topic
# (group order and order within each group define the output column order)
column_groups = {
    'event details': ['event_name', 'event_date'],
    'outcome': ['outcome', 'winner', 'loser', 'weight_class'],
    'fighter 1 details': [
        'f1_name', 'f1_strikes', 'f1_td', 'f1_td_def',
        'f1_height', 'f1_reach', 'f1_stance', 'f1_age',
    ],
    'fighter 2 details': [
        'f2_name', 'f2_strikes', 'f2_td', 'f2_td_def',
        'f2_height', 'f2_reach', 'f2_stance', 'f2_age',
    ],
    'win details': ['method', 'method_details'],
    'duration details': ['end_round', 'time', 'total_time'],
    'fighter differentials': ['reach_diff', 'height_diff', 'age_diff'],
}
column_labels = [label for group in column_groups.values() for label in group]

# keep only those columns and make sure event_date is a pandas datetime
fights = fights[column_labels].assign(
    event_date=lambda frame: pd.to_datetime(frame['event_date'])
)

# get takedown completion / defence rate
def _success_rate(successes, failures):
    """Return successes / (successes + failures) element-wise.

    Rows whose total is zero, negative or missing yield NaN: the denominator is
    masked before dividing, so no division by zero takes place.
    """
    attempts = successes + failures
    return successes / attempts.where(attempts > 0)

# This notebook counts a fighter's takedown attempts as the takedowns they landed
# (*_td) plus the ones their opponent defended (*_td_def); the defence rate uses the
# opponent's attempts as its denominator.
fights['f1_td_rate'] = _success_rate(fights['f1_td'], fights['f2_td_def'])
fights['f2_td_rate'] = _success_rate(fights['f2_td'], fights['f1_td_def'])
fights['f1_td_def_rate'] = _success_rate(fights['f1_td_def'], fights['f2_td'])
fights['f2_td_def_rate'] = _success_rate(fights['f2_td_def'], fights['f1_td'])

# more recent events first; the stable sort keeps fights that share an event_date
# in their existing relative order instead of an algorithm-dependent one
fights_sorted = fights.sort_values('event_date', ascending=False, kind='stable')

# output new csv file (row index omitted)
fights_sorted.to_csv('enhanced_fights.csv', index=False)