In [1]:
import pandas as pd
import numpy as np
import os
from pandas import read_excel

In [4]:
# Load the per-fighter statistics table and the fight-level master table.
raw_fighters_set = pd.read_csv('UFCData/raw_fighter_details.csv')
ufc_master = pd.read_csv('UFCData/ufc-master.csv')

# Fighters are looked up by name in the merges below. If a name occurred more than
# once in the details table, every fight of that fighter would be matched several
# times and the fight rows would silently multiply. Keep one details row per name
# so each fight stays a single row.
# NOTE(review): keep="first" is an arbitrary tie-break between same-named rows --
# confirm whether the details file contains namesakes that need a better key.
fighter_details = raw_fighters_set.drop_duplicates(subset="fighter_name", keep="first")

# Build one copy of the details per corner: every statistic column is prefixed with
# the corner, and the name column becomes the join key used by the master table.
red_details = fighter_details.rename(
    columns=lambda x: f"Red_{x}" if x != "fighter_name" else "RedFighter"
)
blue_details = fighter_details.rename(
    columns=lambda x: f"Blue_{x}" if x != "fighter_name" else "BlueFighter"
)

# Attach red-corner details. validate= makes pandas raise instead of duplicating
# fights if the right-hand keys are ever not unique.
ufc_master_with_red = ufc_master.merge(
    red_details, how="left", on="RedFighter", validate="many_to_one"
)

# Attach blue-corner details.
ufc_master_combined = ufc_master_with_red.merge(
    blue_details, how="left", on="BlueFighter", validate="many_to_one"
)

# NOTE(review): the output name has no ".csv" extension; kept unchanged because
# other code may read this exact path.
ufc_master_combined.to_csv('ufc_master_combined', index=False)


In [5]:
# Preview the first five merged fights.
ufc_master_combined.head(n=5)

Unnamed: 0,RedFighter,BlueFighter,RedOdds,BlueOdds,RedExpectedValue,BlueExpectedValue,Date,Location,Country,Winner,...,Blue_Stance,Blue_DOB,Blue_SLpM,Blue_Str_Acc,Blue_SApM,Blue_Str_Def,Blue_TD_Avg,Blue_TD_Acc,Blue_TD_Def,Blue_Sub_Avg
0,Alexandre Pantoja,Kai Asakura,-250.0,215.0,40.0,215.0,2024-12-07,"Las Vegas, Nevada, USA",USA,Red,...,,,,,,,,,,
1,Shavkat Rakhmonov,Ian Machado Garry,-210.0,295.0,47.619,295.0,2024-12-07,"Las Vegas, Nevada, USA",USA,Red,...,,,,,,,,,,
2,Ciryl Gane,Alexander Volkov,-380.0,300.0,26.3158,300.0,2024-12-07,"Las Vegas, Nevada, USA",USA,Red,...,Orthodox,"Oct 24, 1988",4.88,59%,2.48,56%,0.66,70%,66%,0.2
3,Bryce Mitchell,Kron Gracie,-950.0,625.0,10.5263,625.0,2024-12-07,"Las Vegas, Nevada, USA",USA,Red,...,Southpaw,"Jul 11, 1988",5.15,41%,8.13,53%,0.88,25%,0%,0.9
4,Nate Landwehr,Dooho Choi,-130.0,110.0,76.9231,110.0,2024-12-07,"Las Vegas, Nevada, USA",USA,Blue,...,Orthodox,"Apr 10, 1991",5.32,51%,6.38,53%,0.41,20%,50%,0.0


In [6]:
# Count missing entries column by column, then report only the columns that
# actually contain at least one missing value.
null_values_per_feature = ufc_master_combined.isna().sum()
print("Null values per feature:")
print(null_values_per_feature.loc[null_values_per_feature.gt(0)])

Null values per feature:
RedOdds                227
BlueOdds               226
RedExpectedValue       227
BlueExpectedValue      226
BlueAvgSigStrLanded    930
                      ... 
Blue_Str_Def           735
Blue_TD_Avg            735
Blue_TD_Acc            735
Blue_TD_Def            735
Blue_Sub_Avg           735
Length: 81, dtype: int64


In [7]:
# Grand total of missing cells across the whole frame.
total_nulls = ufc_master_combined.isna().to_numpy().sum()
print(f"Total null values in the dataset: {total_nulls}")

Total null values in the dataset: 216075


## Dropping rows with missing fighter statistics

In [8]:
# Statistics that must be present for BOTH corners before a fight is kept.
# One shared list is expanded for Red and Blue so neither corner is filtered more
# leniently than the other (Height and Weight were previously required for Blue only).
required_stats = [
    'Height', 'Weight', 'Reach', 'Stance', 'SLpM', 'Str_Acc', 'SApM', 'Str_Def',
    'TD_Avg', 'TD_Acc', 'TD_Def', 'Sub_Avg'
]
columns_to_check = [
    f"{corner}_{stat}" for corner in ('Red', 'Blue') for stat in required_stats
]

# Drop every fight that lacks any required statistic. The explicit copy gives later
# cells an independent frame to assign new columns to.
ufc_master_combined = ufc_master_combined.dropna(subset=columns_to_check).copy()

ufc_master_combined.head()


Unnamed: 0,RedFighter,BlueFighter,RedOdds,BlueOdds,RedExpectedValue,BlueExpectedValue,Date,Location,Country,Winner,...,Blue_Stance,Blue_DOB,Blue_SLpM,Blue_Str_Acc,Blue_SApM,Blue_Str_Def,Blue_TD_Avg,Blue_TD_Acc,Blue_TD_Def,Blue_Sub_Avg
2,Ciryl Gane,Alexander Volkov,-380.0,300.0,26.3158,300.0,2024-12-07,"Las Vegas, Nevada, USA",USA,Red,...,Orthodox,"Oct 24, 1988",4.88,59%,2.48,56%,0.66,70%,66%,0.2
3,Bryce Mitchell,Kron Gracie,-950.0,625.0,10.5263,625.0,2024-12-07,"Las Vegas, Nevada, USA",USA,Red,...,Southpaw,"Jul 11, 1988",5.15,41%,8.13,53%,0.88,25%,0%,0.9
4,Nate Landwehr,Dooho Choi,-130.0,110.0,76.9231,110.0,2024-12-07,"Las Vegas, Nevada, USA",USA,Blue,...,Orthodox,"Apr 10, 1991",5.32,51%,6.38,53%,0.41,20%,50%,0.0
5,Dominick Reyes,Anthony Smith,-380.0,300.0,26.3158,300.0,2024-12-07,"Las Vegas, Nevada, USA",USA,Red,...,Orthodox,"Jul 26, 1988",2.94,46%,4.38,42%,0.49,28%,51%,0.7
7,Movsar Evloev,Aljamain Sterling,-238.0,195.0,42.0168,195.0,2024-12-07,"Las Vegas, Nevada, USA",USA,Red,...,Orthodox,"Jul 31, 1989",4.84,49%,2.2,63%,1.77,24%,41%,0.9


## Converting percentage features to fractions and scaling features

In [9]:
from sklearn.preprocessing import StandardScaler

# All fighter statistics prepared by this cell.
features_to_scale = [
    'Red_SLpM', 'Red_Str_Acc', 'Red_SApM', 'Red_Str_Def', 'Red_TD_Avg', 'Red_TD_Acc',
    'Red_TD_Def', 'Red_Sub_Avg', 'Blue_SLpM', 'Blue_Str_Acc', 'Blue_SApM',
    'Blue_Str_Def', 'Blue_TD_Avg', 'Blue_TD_Acc', 'Blue_TD_Def', 'Blue_Sub_Avg'
]

# Accuracy/defence columns are stored as percentage strings such as "59%". They are
# turned into fractions and deliberately NOT standardized: the expected-accuracy
# features are computed as acc * (1 - def), which is only meaningful for values in
# [0, 1]. Feeding z-scores into that product yields negative or arbitrary values.
percentage_features = [
    name for name in features_to_scale if name.endswith(('_Acc', '_Def'))
]
# The remaining columns are average counts and are the only ones standardized.
rate_features = [
    name for name in features_to_scale if name not in percentage_features
]

# Work on an independent frame so the column assignments below never write into a
# view of an earlier filtering result.
ufc_master_combined = ufc_master_combined.copy()

# Strip the percent sign and rescale to a fraction. The numeric check (instead of
# comparing against the 'object' dtype) also covers pandas' dedicated string dtype
# and makes re-running the cell a no-op for columns that are already converted.
for feature in percentage_features:
    if not pd.api.types.is_numeric_dtype(ufc_master_combined[feature]):
        ufc_master_combined[feature] = (
            ufc_master_combined[feature].str.replace('%', '', regex=False).astype(float) / 100
        )

# Zero-mean / unit-variance scaling for the average-count columns only.
scaler = StandardScaler()
ufc_master_combined[rate_features] = scaler.fit_transform(ufc_master_combined[rate_features])

ufc_master_combined[features_to_scale] = ufc_master_combined[features_to_scale].round(2)

ufc_master_combined.to_excel('ufc_master_combined_statistics_diff.xlsx', index=False)

In [21]:
def _clipped_sqrt_rate(accuracy, defence):
    """Return sqrt(max(accuracy * (1 - defence), 0)) element-wise.

    The product is clamped at 0 before the square root so a negative product
    yields 0 rather than NaN.
    """
    return np.sqrt(np.maximum(accuracy * (1 - defence), 0))


# Red-minus-Blue difference features, added to the frame in a fixed column order.
# NOTE(review): accuracy * (1 - defence) is only a meaningful rate when both inputs
# are fractions in [0, 1] -- confirm the columns reaching this cell are not z-scores.
# NOTE(review): each corner's accuracy is paired with its OWN defence column here;
# confirm that pairing (rather than the opponent's defence) is intended.
fights = ufc_master_combined  # alias of the same frame, not a copy

fights['SLpM Diff'] = fights['Red_SLpM'].sub(fights['Blue_SLpM'])

fights['Exp. Str. Acc. Diff.'] = _clipped_sqrt_rate(
    fights['Red_Str_Acc'], fights['Red_Str_Def']
).sub(_clipped_sqrt_rate(fights['Blue_Str_Acc'], fights['Blue_Str_Def']))

fights['TD Avg. Diff.'] = fights['Red_TD_Avg'].sub(fights['Blue_TD_Avg'])

fights['Exp. TD Acc. Diff.'] = _clipped_sqrt_rate(
    fights['Red_TD_Acc'], fights['Red_TD_Def']
).sub(_clipped_sqrt_rate(fights['Blue_TD_Acc'], fights['Blue_TD_Def']))

fights['Sub Avg. Diff.'] = fights['Red_Sub_Avg'].sub(fights['Blue_Sub_Avg'])

# Persist the final feature table.
ufc_master_combined.to_excel('ufc_master_final.xlsx', index=False)