## Calculation of ELO rating for UFC fighters (current and retired fighters)  

### An ELO rating was first used in professional chess and then later used for other sports to measure an individual or teams abilities and ranking. After each match the score is updated, the winner gains points and the loser loses points. The loser always loses the same amount the winner gets. 

In [None]:
import pandas as pd

# Load the dataset
df = pd.read_csv("Data/combined_ufc_dataset.csv")

# Remove exact duplicates
df = df.drop_duplicates()

# Create sorted versions of fighters for duplicate match filtering (but keep original RED/BLUE info)
df['F1'] = df[['REDFIGHTER', 'BLUEFIGHTER']].min(axis=1)
df['F2'] = df[['REDFIGHTER', 'BLUEFIGHTER']].max(axis=1)

# Remove mirrored/repeated matchups
df = df.drop_duplicates(subset=['F1', 'F2', 'WINNER'])

# Drop temp columns used for deduplication
df.drop(columns=['F1', 'F2'], inplace=True)

# Initialize Elo ratings
initial_elo = 1000
elo_ratings = {}
K = 32  # Sensitivity of Elo change

def expected_score(rating_a, rating_b):
    return 1 / (1 + 10 ** ((rating_b - rating_a) / 400))

def update_elo(winner, loser, finish_type=None):
    if winner not in elo_ratings:
        elo_ratings[winner] = initial_elo
    if loser not in elo_ratings:
        elo_ratings[loser] = initial_elo

    Ra = elo_ratings[winner]
    Rb = elo_ratings[loser]

    Ea = expected_score(Ra, Rb)
    Eb = expected_score(Rb, Ra)

    Ra_new = Ra + K * (1 - Ea)
    Rb_new = Rb + K * (0 - Eb)

    # Bonus for KO or Submission wins
    if finish_type in ['KO', 'SUBMISSION']:
        Ra_new += 10

    elo_ratings[winner] = Ra_new
    elo_ratings[loser] = Rb_new

# Convert 'DATE' to datetime and sort by date
df['DATE'] = pd.to_datetime(df['DATE'], errors='coerce')
df = df.dropna(subset=['DATE'])
df = df.sort_values('DATE')

# Apply Elo updates fight by fight
for _, row in df.iterrows():
    red = str(row['REDFIGHTER']).strip()
    blue = str(row['BLUEFIGHTER']).strip()
    winner_color = str(row['WINNER']).strip().upper()

    if winner_color == 'RED':
        winner = red
        loser = blue
    elif winner_color == 'BLUE':
        winner = blue
        loser = red
    else:
        continue  # Skip draws, No Contests, etc.

    finish = str(row.get("FINISH", "")).strip().upper()
    update_elo(winner, loser, finish_type=finish)

# Convert Elo dict to DataFrame
elo_df = pd.DataFrame.from_dict(elo_ratings, orient='index', columns=['ELO'])
elo_df.reset_index(inplace=True)
elo_df.rename(columns={'index': 'FIGHTER'}, inplace=True)

# Optional: Show top 10 fighters by Elo
print(elo_df.sort_values(by='ELO', ascending=False).head(10))


  exec(code_obj, self.user_global_ns, self.user_ns)


                   FIGHTER          ELO
14               jon jones  1233.966639
942        islam makhachev  1199.955761
413    khabib nurmagomedov  1191.247043
1058        belal muhammad  1186.338649
419           max holloway  1184.054598
858           leon edwards  1182.188791
612           amanda nunes  1179.187259
1013  valentina shevchenko  1171.026743
181       charles oliveira  1169.112836
267     demetrious johnson  1158.300077


In [66]:
import pandas as pd

# Load the dataset
df = pd.read_csv("Data/combined_ufc_dataset.csv")

# Remove exact duplicates
df = df.drop_duplicates()

# Create sorted versions of fighters for duplicate match filtering
df['F1'] = df[['REDFIGHTER', 'BLUEFIGHTER']].min(axis=1)
df['F2'] = df[['REDFIGHTER', 'BLUEFIGHTER']].max(axis=1)

# Remove mirrored/repeated matchups
df = df.drop_duplicates(subset=['F1', 'F2', 'WINNER'])

# Drop temp columns used for deduplication
df.drop(columns=['F1', 'F2'], inplace=True)

# Initialize Elo ratings
initial_elo = 1000
elo_ratings = {}
K = 32  # Sensitivity of Elo change

# Expected score calculation
def expected_score(rating_a, rating_b):
    return 1 / (1 + 10 ** ((rating_b - rating_a) / 400))

# Update Elo with bonuses
def update_elo(winner, loser, finish_type=None, round_ended=None, stats=None):
    if winner not in elo_ratings:
        elo_ratings[winner] = initial_elo
    if loser not in elo_ratings:
        elo_ratings[loser] = initial_elo

    Ra = elo_ratings[winner]
    Rb = elo_ratings[loser]

    Ea = expected_score(Ra, Rb)
    Eb = expected_score(Rb, Ra)

    Ra_new = Ra + K * (1 - Ea)
    Rb_new = Rb + K * (0 - Eb)

    # --- Bonus logic ---
    finish_type = (finish_type or "").upper()

    bonus = 0
    if "KO" in finish_type:
        bonus = 10
    elif "SUB" in finish_type:
        bonus = 7
    elif "DEC" in finish_type or "DECISION" in finish_type:
        bonus = 3

    # Extra bonus for early finishes
    try:
        round_ended = int(round_ended)
        if round_ended == 1:
            bonus += 5
        elif round_ended == 2:
            bonus += 3
    except:
        pass

    # Stat-based bonuses
    if stats:
        sig_strikes = stats.get('SIG_STRIKES', 0)
        knockdowns = stats.get('KNOCKDOWNS', 0)
        control_time = stats.get('CONTROL_TIME', 0)  # in seconds

        bonus += min(knockdowns * 2, 6)
        bonus += min(sig_strikes // 50, 3)
        bonus += min(control_time // 60, 2)  # e.g. 2 points for > 2 min control

    Ra_new += bonus
    elo_ratings[winner] = Ra_new
    elo_ratings[loser] = Rb_new

# Convert 'DATE' to datetime and sort
df['DATE'] = pd.to_datetime(df['DATE'], errors='coerce')
df = df.dropna(subset=['DATE'])
df = df.sort_values('DATE')

# Apply Elo updates fight-by-fight
for _, row in df.iterrows():
    red = str(row['REDFIGHTER']).strip()
    blue = str(row['BLUEFIGHTER']).strip()
    winner_color = str(row['WINNER']).strip().upper()

    if winner_color == 'RED':
        winner = red
        loser = blue
    elif winner_color == 'BLUE':
        winner = blue
        loser = red
    else:
        continue  # Skip draws, No Contests, etc.

    finish = row.get('FINISH', '')
    round_ended = row.get('ROUND', None)

    # Optional stats dictionary (adjust column names as needed)
    stats = {
        'SIG_STRIKES': row.get(f'SIG_STRIKES_{winner_color}', 0),
        'KNOCKDOWNS': row.get(f'KNOCKDOWNS_{winner_color}', 0),
        'CONTROL_TIME': row.get(f'CONTROL_TIME_{winner_color}', 0),  # assumed in seconds
    }

    update_elo(winner, loser, finish_type=finish, round_ended=round_ended, stats=stats)

# Convert Elo ratings to DataFrame
elo_df = pd.DataFrame.from_dict(elo_ratings, orient='index', columns=['ELO'])
elo_df.reset_index(inplace=True)
elo_df.rename(columns={'index': 'FIGHTER'}, inplace=True)

# Show top 10 fighters by Elo
print(elo_df.sort_values(by='ELO', ascending=False).head(10))


  exec(code_obj, self.user_global_ns, self.user_ns)


                  FIGHTER          ELO
14              jon jones  1407.811758
419          max holloway  1388.434963
181      charles oliveira  1379.403907
942       islam makhachev  1369.900891
256        dustin poirier  1351.747944
413   khabib nurmagomedov  1331.825771
1058       belal muhammad  1328.600130
858          leon edwards  1326.748032
1015      francis ngannou  1317.891964
612          amanda nunes  1315.423918


In [113]:
import pandas as pd

# Load dataset
df = pd.read_csv("Data/combined_ufc_dataset.csv")

# Remove exact duplicates
df = df.drop_duplicates()

# Create sorted versions of fighters to identify duplicate matchups
df['F1'] = df[['REDFIGHTER', 'BLUEFIGHTER']].min(axis=1)
df['F2'] = df[['REDFIGHTER', 'BLUEFIGHTER']].max(axis=1)

# Remove mirrored/repeated matchups
df = df.drop_duplicates(subset=['F1', 'F2', 'WINNER'])

# Drop helper columns
df.drop(columns=['F1', 'F2'], inplace=True)

# Initialize Elo ratings
initial_elo = 1000
elo_ratings = {}
K = 32  # Standard Elo sensitivity

# Expected outcome function
def expected_score(rating_a, rating_b):
    return 1 / (1 + 10 ** ((rating_b - rating_a) / 400))

# Update Elo ratings without bonuses
def update_elo(winner, loser):
    if winner not in elo_ratings:
        elo_ratings[winner] = initial_elo
    if loser not in elo_ratings:
        elo_ratings[loser] = initial_elo

    Ra = elo_ratings[winner]
    Rb = elo_ratings[loser]

    Ea = expected_score(Ra, Rb)
    Eb = expected_score(Rb, Ra)

    Ra_new = Ra + K * (1 - Ea)
    Rb_new = Rb + K * (0 - Eb)

    elo_ratings[winner] = Ra_new
    elo_ratings[loser] = Rb_new

# Process fights chronologically
df['DATE'] = pd.to_datetime(df['DATE'], errors='coerce')
df = df.dropna(subset=['DATE'])
df = df.sort_values('DATE')

# Loop over fights and update Elo
for _, row in df.iterrows():
    red = str(row['REDFIGHTER']).strip()
    blue = str(row['BLUEFIGHTER']).strip()
    winner_color = str(row['WINNER']).strip().upper()

    if winner_color == 'RED':
        winner = red
        loser = blue
    elif winner_color == 'BLUE':
        winner = blue
        loser = red
    else:
        continue  # Skip if draw or invalid

    update_elo(winner, loser)

# Convert Elo ratings to DataFrame
elo_df = pd.DataFrame.from_dict(elo_ratings, orient='index', columns=['ELO'])
elo_df.reset_index(inplace=True)
elo_df.rename(columns={'index': 'FIGHTER'}, inplace=True)

elo_df.to_csv("Data/elo_ratings.csv", index=False)

# Show top 10 fighters
print(elo_df.sort_values(by='ELO', ascending=False).head(10))


  exec(code_obj, self.user_global_ns, self.user_ns)


                   FIGHTER          ELO
4                jon jones  1233.745055
925        islam makhachev  1198.898307
428    khabib nurmagomedov  1191.056247
1075        belal muhammad  1186.455398
419           max holloway  1183.836658
831           leon edwards  1181.371710
553           amanda nunes  1179.101872
1016  valentina shevchenko  1171.060516
16        charles oliveira  1168.005424
265     demetrious johnson  1157.095049


In [93]:
import pandas as pd

# Load dataset
df = pd.read_csv("Data/fighter_fpi_normalized.csv")

# Remove exact duplicates (if any)
df = df.drop_duplicates()

# Initialize Elo ratings for all fighters
initial_elo = 1000
elo_ratings = {}
K = 32  # Standard Elo sensitivity

# Expected outcome function
def expected_score(rating_a, rating_b):
    return 1 / (1 + 10 ** ((rating_b - rating_a) / 400))

# Update Elo ratings
def update_elo(winner, loser):
    if winner not in elo_ratings:
        elo_ratings[winner] = initial_elo
    if loser not in elo_ratings:
        elo_ratings[loser] = initial_elo

    Ra = elo_ratings[winner]
    Rb = elo_ratings[loser]

    Ea = expected_score(Ra, Rb)
    Eb = expected_score(Rb, Ea)

    Ra_new = Ra + K * (1 - Ea)
    Rb_new = Rb + K * (0 - Eb)

    elo_ratings[winner] = Ra_new
    elo_ratings[loser] = Rb_new

# Loop over the dataset to calculate Elo for each fight
for _, row in df.iterrows():
    # Identify fighters and their opponent
    fighter = str(row['Fighter']).strip()
    opponent = str(row['Opponent']).strip()
    
    # Identify the winner based on the column (assuming 'Winner' column stores the color of the winner - Red or Blue)
    winner_color = str(row['Winner']).strip().upper()  # 'RED' or 'BLUE'

    if winner_color == 'RED':
        winner = fighter
        loser = opponent
    elif winner_color == 'BLUE':
        winner = opponent
        loser = fighter
    else:
        continue  # Skip if the winner column has no valid outcome

    # Update Elo ratings for the winner and loser
    update_elo(winner, loser)

# Convert Elo ratings to DataFrame
elo_df = pd.DataFrame.from_dict(elo_ratings, orient='index', columns=['ELO'])
elo_df.reset_index(inplace=True)
elo_df.rename(columns={'index': 'FIGHTER'}, inplace=True)

# Show top 10 fighters by Elo rating
print(elo_df.sort_values(by='ELO', ascending=False).head(10))

# Optionally, save the Elo ratings to a CSV
elo_df.to_csv("Data/fighter_elo_ratings.csv", index=False)


KeyError: 'Opponent'

In [68]:
import pandas as pd
import numpy as np

# Load dataset
df = pd.read_csv("Data/combined_ufc_dataset.csv")

# Step 1: Compute base stats for both RED and BLUE fighters (individual performance)
def safe_div(a, b):
    return np.where(b == 0, 0, a / b)

# Strike and takedown accuracy
for corner in ["RED", "BLUE"]:
    df[f"{corner}_SIGSTR_ACC"] = safe_div(df[f"{corner}AVGSIGSTRLANDED"], df[f"{corner}AVGSIGSTRPCT"])
    df[f"{corner}_TD_ACC"] = safe_div(df[f"{corner}AVGTDLANDED"], df[f"{corner}AVGTDPCT"])
    df[f"{corner}_FINISH_RATE"] = safe_div(
        df[f"{corner}WINSBYKO"] + df[f"{corner}WINSBYSUBMISSION"],
        df[f"{corner}WINS"] + df[f"{corner}LOSSES"]
    )

# Step 2: Calculate Strength of Opponent (opponent Elo or win record)
def compute_opponent_quality(row):
    blue_strength = row['BLUEWINS'] / (row['BLUEWINS'] + row['BLUELOSSES'] + 1)
    red_strength = row['REDWINS'] / (row['REDWINS'] + row['REDLOSSES'] + 1)
    return pd.Series([blue_strength, red_strength], index=['BLUE_OPP_QUALITY', 'RED_OPP_QUALITY'])

df[['BLUE_OPP_QUALITY', 'RED_OPP_QUALITY']] = df.apply(compute_opponent_quality, axis=1)

# Step 3: Fighter Performance Index (FPI) — aggregate score
# Normalize relevant stats for each fighter
from sklearn.preprocessing import MinMaxScaler

fpi_features_red = [
    'REDAVGSIGSTRLANDED', 'REDAVGSUBATT', 'REDAVGTDLANDED', 'RED_SIGSTR_ACC', 'RED_TD_ACC',
    'REDHEIGHTCMS', 'REDREACHCMS', 'REDAGE', 'REDWINS', 'REDLONGESTWINSTREAK', 'REDTOTALTITLEBOUTS',
    'RED_FINISH_RATE', 'BLUE_OPP_QUALITY'
]

fpi_features_blue = [
    'BLUEAVGSIGSTRLANDED', 'BLUEAVGSUBATT', 'BLUEAVGTDLANDED', 'BLUE_SIGSTR_ACC', 'BLUE_TD_ACC',
    'BLUEHEIGHTCMS', 'BLUEREACHCMS', 'BLUEAGE', 'BLUEWINS', 'BLUELONGESTWINSTREAK', 'BLUETOTALTITLEBOUTS',
    'BLUE_FINISH_RATE', 'RED_OPP_QUALITY'
]

# Drop rows with NaNs in any FPI feature to ensure clean normalization
df = df.dropna(subset=fpi_features_red + fpi_features_blue)

scaler = MinMaxScaler()
df[fpi_features_red] = scaler.fit_transform(df[fpi_features_red])
df[fpi_features_blue] = scaler.transform(df[fpi_features_blue])

# Weighted score for each fighter (you can adjust the weights as needed)
df['RED_FPI'] = df[fpi_features_red].mean(axis=1)
df['BLUE_FPI'] = df[fpi_features_blue].mean(axis=1)

# Save result
df[['REDFIGHTER', 'RED_FPI', 'BLUEFIGHTER', 'BLUE_FPI']].to_csv("Data/ilo_fpi_scores.csv", index=False)
print("FPI scores for ILO saved to Data/ilo_fpi_scores.csv")

  exec(code_obj, self.user_global_ns, self.user_ns)
Feature names unseen at fit time:
- BLUEAGE
- BLUEAVGSIGSTRLANDED
- BLUEAVGSUBATT
- BLUEAVGTDLANDED
- BLUEHEIGHTCMS
- ...
Feature names seen at fit time, yet now missing:
- BLUE_OPP_QUALITY
- REDAGE
- REDAVGSIGSTRLANDED
- REDAVGSUBATT
- REDAVGTDLANDED
- ...



FPI scores for ILO saved to Data/ilo_fpi_scores.csv


In [70]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Load dataset
df = pd.read_csv("Data/combined_ufc_dataset.csv")

# Step 1: Define safe division
def safe_div(a, b):
    return np.where(b == 0, 0, a / b)

# Step 2: Compute individual fighter statistics
for corner in ["RED", "BLUE"]:
    df[f"{corner}_SIGSTR_ACC"] = safe_div(df[f"{corner}AVGSIGSTRLANDED"], df[f"{corner}AVGSIGSTRPCT"])
    df[f"{corner}_TD_ACC"] = safe_div(df[f"{corner}AVGTDLANDED"], df[f"{corner}AVGTDPCT"])
    df[f"{corner}_FINISH_RATE"] = safe_div(
        df[f"{corner}WINSBYKO"] + df[f"{corner}WINSBYSUBMISSION"],
        df[f"{corner}WINS"] + df[f"{corner}LOSSES"]
    )

# Step 3: Calculate opponent quality
df["RED_OPP_QUALITY"] = df['BLUEWINS'] / (df['BLUEWINS'] + df['BLUELOSSES'] + 1)
df["BLUE_OPP_QUALITY"] = df['REDWINS'] / (df['REDWINS'] + df['REDLOSSES'] + 1)

# Step 4: Define FPI features
fpi_features_red = [
    'REDAVGSIGSTRLANDED', 'REDAVGSUBATT', 'REDAVGTDLANDED',
    'RED_SIGSTR_ACC', 'RED_TD_ACC', 'REDHEIGHTCMS', 'REDREACHCMS',
    'REDAGE', 'REDWINS', 'REDLONGESTWINSTREAK', 'REDTOTALTITLEBOUTS',
    'RED_FINISH_RATE', 'RED_OPP_QUALITY'
]

fpi_features_blue = [
    'BLUEAVGSIGSTRLANDED', 'BLUEAVGSUBATT', 'BLUEAVGTDLANDED',
    'BLUE_SIGSTR_ACC', 'BLUE_TD_ACC', 'BLUEHEIGHTCMS', 'BLUEREACHCMS',
    'BLUEAGE', 'BLUEWINS', 'BLUELONGESTWINSTREAK', 'BLUETOTALTITLEBOUTS',
    'BLUE_FINISH_RATE', 'BLUE_OPP_QUALITY'
]

# Remove NaNs for clean normalization
df = df.dropna(subset=fpi_features_red + fpi_features_blue)

# Step 5: Normalize features
scaler = MinMaxScaler()
df[fpi_features_red] = scaler.fit_transform(df[fpi_features_red])
df[fpi_features_blue] = scaler.transform(df[fpi_features_blue])

# Step 6: Calculate FPI (aggregated index)
df['RED_FPI'] = df[fpi_features_red].mean(axis=1)
df['BLUE_FPI'] = df[fpi_features_blue].mean(axis=1)

# Step 7: Aggregate to fighter level
red_fpi = df.groupby('REDFIGHTER')['RED_FPI'].mean().reset_index().rename(columns={'REDFIGHTER':'FIGHTER','RED_FPI':'FPI'})
blue_fpi = df.groupby('BLUEFIGHTER')['BLUE_FPI'].mean().reset_index().rename(columns={'BLUEFIGHTER':'FIGHTER','BLUE_FPI':'FPI'})

# Combine red and blue fighter scores
fpi_df = pd.concat([red_fpi, blue_fpi]).groupby('FIGHTER')['FPI'].mean().reset_index()

# Save FPI scores
fpi_df.to_csv("Data/fighter_fpi_scores.csv", index=False)

# Display top fighters
print(fpi_df.sort_values(by='FPI', ascending=False).head(10))

  exec(code_obj, self.user_global_ns, self.user_ns)
Feature names unseen at fit time:
- BLUEAGE
- BLUEAVGSIGSTRLANDED
- BLUEAVGSUBATT
- BLUEAVGTDLANDED
- BLUEHEIGHTCMS
- ...
Feature names seen at fit time, yet now missing:
- REDAGE
- REDAVGSIGSTRLANDED
- REDAVGSUBATT
- REDAVGTDLANDED
- REDHEIGHTCMS
- ...



                FIGHTER       FPI
714   georges st-pierre  0.511206
105      anderson silva  0.477707
1640      randy couture  0.464253
1939         tito ortiz  0.428194
1363        matt hughes  0.427849
683       frankie edgar  0.425312
1027          jose aldo  0.421995
831     jailton almeida  0.416724
390       chuck liddell  0.415372
280      cain velasquez  0.414245


In [74]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Load dataset
df = pd.read_csv("Data/combined_ufc_dataset.csv")

# Step 1: Define safe division
def safe_div(a, b):
    return np.where(b == 0, 0, a / b)

# Step 2: Compute individual fighter statistics and average them
for corner in ["RED", "BLUE"]:
    df[f"{corner}_SIGSTR_ACC"] = safe_div(df[f"{corner}AVGSIGSTRLANDED"], df[f"{corner}AVGSIGSTRPCT"])
    df[f"{corner}_TD_ACC"] = safe_div(df[f"{corner}AVGTDLANDED"], df[f"{corner}AVGTDPCT"])
    df[f"{corner}_FINISH_RATE"] = safe_div(
        df[f"{corner}WINSBYKO"] + df[f"{corner}WINSBYSUBMISSION"],
        df[f"{corner}WINS"] + df[f"{corner}LOSSES"]
    )

# Step 3: Calculate opponent quality, averaging over fights
df["RED_OPP_QUALITY"] = df['BLUEWINS'] / (df['BLUEWINS'] + df['BLUELOSSES'] + 1)
df["BLUE_OPP_QUALITY"] = df['REDWINS'] / (df['REDWINS'] + df['REDLOSSES'] + 1)

# Check intermediate results
print(df[['REDFIGHTER', 'RED_SIGSTR_ACC', 'RED_TD_ACC', 'RED_FINISH_RATE', 'RED_OPP_QUALITY']].head())

# Step 4: Define FPI features
fpi_features_red = [
    'REDAVGSIGSTRLANDED', 'REDAVGSUBATT', 'REDAVGTDLANDED',
    'RED_SIGSTR_ACC', 'RED_TD_ACC', 'REDHEIGHTCMS', 'REDREACHCMS',
    'REDAGE', 'REDWINS', 'REDLONGESTWINSTREAK', 'REDTOTALTITLEBOUTS',
    'RED_FINISH_RATE', 'RED_OPP_QUALITY'
]

fpi_features_blue = [
    'BLUEAVGSIGSTRLANDED', 'BLUEAVGSUBATT', 'BLUEAVGTDLANDED',
    'BLUE_SIGSTR_ACC', 'BLUE_TD_ACC', 'BLUEHEIGHTCMS', 'BLUEREACHCMS',
    'BLUEAGE', 'BLUEWINS', 'BLUELONGESTWINSTREAK', 'BLUETOTALTITLEBOUTS',
    'BLUE_FINISH_RATE', 'BLUE_OPP_QUALITY'
]

# Remove NaNs for clean normalization
df = df.dropna(subset=fpi_features_red + fpi_features_blue)

# Step 5: Normalize features for both red and blue fighters
scaler = MinMaxScaler()

# Apply normalization to both RED and BLUE fighter stats
df[fpi_features_red] = scaler.fit_transform(df[fpi_features_red])
df[fpi_features_blue] = scaler.transform(df[fpi_features_blue])

# Check normalized values for sanity
print(df[fpi_features_red].head())
print(df[fpi_features_blue].head())

# Step 6: Calculate FPI (aggregated index for each fighter)
df['RED_FPI'] = df[fpi_features_red].mean(axis=1)
df['BLUE_FPI'] = df[fpi_features_blue].mean(axis=1)

# Step 7: Aggregate stats to fighter level (average over all available fights)
red_fpi = df.groupby('REDFIGHTER')['RED_FPI'].mean().reset_index().rename(columns={'REDFIGHTER':'FIGHTER','RED_FPI':'FPI'})
blue_fpi = df.groupby('BLUEFIGHTER')['BLUE_FPI'].mean().reset_index().rename(columns={'BLUEFIGHTER':'FIGHTER','BLUE_FPI':'FPI'})

# Check FPI for each fighter before combining
print(red_fpi.head())
print(blue_fpi.head())

# Step 8: Combine the results for both RED and BLUE fighters
fpi_df = pd.concat([red_fpi, blue_fpi]).groupby('FIGHTER')['FPI'].mean().reset_index()

# Check the combined FPI values
print(fpi_df.head())

# Step 9: Save FPI scores to CSV
fpi_df.to_csv("Data/fighter_fpi_scores.csv", index=False)

# Display top 10 fighters by FPI
print(fpi_df.sort_values(by='FPI', ascending=False).head(10))


  exec(code_obj, self.user_global_ns, self.user_ns)


     REDFIGHTER  RED_SIGSTR_ACC  RED_TD_ACC  RED_FINISH_RATE  RED_OPP_QUALITY
0  alex pereira        8.666667        0.14         0.666667         0.600000
1  alex pereira        8.666667        0.14         0.625000         0.666667
2  alex pereira        8.301587        0.17         0.571429         0.777778
3  alex pereira        8.301587        0.17         0.750000         0.800000
4  alex pereira        8.301587        0.17         1.000000         0.750000


Feature names unseen at fit time:
- BLUEAGE
- BLUEAVGSIGSTRLANDED
- BLUEAVGSUBATT
- BLUEAVGTDLANDED
- BLUEHEIGHTCMS
- ...
Feature names seen at fit time, yet now missing:
- REDAGE
- REDAVGSIGSTRLANDED
- REDAVGSUBATT
- REDAVGTDLANDED
- REDHEIGHTCMS
- ...



   REDAVGSIGSTRLANDED  REDAVGSUBATT  REDAVGTDLANDED  RED_SIGSTR_ACC  \
0            0.038723      0.035714          0.0112        0.026398   
1            0.038723      0.035714          0.0112        0.026398   
2            0.037092      0.035714          0.0136        0.025286   
3            0.037092      0.035714          0.0136        0.025286   
4            0.037092      0.035714          0.0136        0.025286   

   RED_TD_ACC  REDHEIGHTCMS  REDREACHCMS    REDAGE   REDWINS  \
0      0.0056      0.915663     0.934911  0.787234  0.242424   
1      0.0056      0.915663     0.934911  0.765957  0.212121   
2      0.0068      0.915663     0.934911  0.765957  0.181818   
3      0.0068      0.915663     0.934911  0.744681  0.121212   
4      0.0068      0.915663     0.934911  0.723404  0.030303   

   REDLONGESTWINSTREAK  REDTOTALTITLEBOUTS  RED_FINISH_RATE  RED_OPP_QUALITY  
0             0.222222              0.3125         0.333333         0.650000  
1             0.222222        

In [75]:
print(df[fpi_features_red].var())
print(df[fpi_features_blue].var())

REDAVGSIGSTRLANDED     0.020171
REDAVGSUBATT           0.005087
REDAVGTDLANDED         0.009314
RED_SIGSTR_ACC         0.020406
RED_TD_ACC             0.013129
REDHEIGHTCMS           0.018311
REDREACHCMS            0.019418
REDAGE                 0.018101
REDWINS                0.024346
REDLONGESTWINSTREAK    0.019721
REDTOTALTITLEBOUTS     0.015544
RED_FINISH_RATE        0.014943
RED_OPP_QUALITY        0.078961
dtype: float64
BLUEAVGSIGSTRLANDED     0.019657
BLUEAVGSUBATT           0.004853
BLUEAVGTDLANDED         0.009782
BLUE_SIGSTR_ACC         0.019474
BLUE_TD_ACC             0.013842
BLUEHEIGHTCMS           0.018249
BLUEREACHCMS            0.019259
BLUEAGE                 0.016974
BLUEWINS                0.017727
BLUELONGESTWINSTREAK    0.014886
BLUETOTALTITLEBOUTS     0.008267
BLUE_FINISH_RATE        0.018074
BLUE_OPP_QUALITY        0.056716
dtype: float64


In [82]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Load dataset
df = pd.read_csv("Data/combined_ufc_dataset.csv")

# Step 1: Define safe division
def safe_div(a, b):
    return np.where(b == 0, 0, a / b)

# Step 2: Compute individual fighter statistics and average them
for corner in ["RED", "BLUE"]:
    df[f"{corner}_SIGSTR_ACC"] = safe_div(df[f"{corner}AVGSIGSTRLANDED"], df[f"{corner}AVGSIGSTRPCT"])
    df[f"{corner}_TD_ACC"] = safe_div(df[f"{corner}AVGTDLANDED"], df[f"{corner}AVGTDPCT"])
    df[f"{corner}_FINISH_RATE"] = safe_div(
        df[f"{corner}WINSBYKO"] + df[f"{corner}WINSBYSUBMISSION"],
        df[f"{corner}WINS"] + df[f"{corner}LOSSES"]
    )

# Step 3: Calculate opponent quality, averaging over fights
df["RED_OPP_QUALITY"] = df['BLUEWINS'] / (df['BLUEWINS'] + df['BLUELOSSES'] + 1)
df["BLUE_OPP_QUALITY"] = df['REDWINS'] / (df['REDWINS'] + df['REDLOSSES'] + 1)

# Step 4: Define FPI features
fpi_features_red = [
    'REDAVGSIGSTRLANDED', 'REDAVGSUBATT', 'REDAVGTDLANDED',
    'RED_SIGSTR_ACC', 'RED_TD_ACC', 'REDHEIGHTCMS', 'REDREACHCMS',
    'REDWINS', 'REDLONGESTWINSTREAK', 'REDTOTALTITLEBOUTS',
    'RED_FINISH_RATE', 'RED_OPP_QUALITY'
]

fpi_features_blue = [
    'BLUEAVGSIGSTRLANDED', 'BLUEAVGSUBATT', 'BLUEAVGTDLANDED',
    'BLUE_SIGSTR_ACC', 'BLUE_TD_ACC', 'BLUEHEIGHTCMS', 'BLUEREACHCMS',
    'BLUEWINS', 'BLUELONGESTWINSTREAK', 'BLUETOTALTITLEBOUTS',
    'BLUE_FINISH_RATE', 'BLUE_OPP_QUALITY'
]

# Step 5: Remove NaNs for clean normalization
df = df.dropna(subset=fpi_features_red + fpi_features_blue)

# Step 6: Normalize features for both red and blue fighters
scaler = MinMaxScaler()

# Apply normalization to both RED and BLUE fighter stats
df[fpi_features_red] = scaler.fit_transform(df[fpi_features_red])
df[fpi_features_blue] = scaler.transform(df[fpi_features_blue])

# Step 7: Define feature weights (higher weight for more important features)
weights = {
    'REDAVGSIGSTRLANDED': 1.2,
    'REDAVGSUBATT': 1.0,
    'REDAVGTDLANDED': 1.1,
    'RED_SIGSTR_ACC': 1.5,
    'RED_TD_ACC': 1.3,
    'REDHEIGHTCMS': 0.1,
    'REDREACHCMS': 0.1,
    'REDWINS': 1.4,
    'REDLONGESTWINSTREAK': 0.9,
    'REDTOTALTITLEBOUTS': 0.5,
    'RED_FINISH_RATE': 1.4,
    'RED_OPP_QUALITY': 1.6,
    
    'BLUEAVGSIGSTRLANDED': 1.2,
    'BLUEAVGSUBATT': 1.0,
    'BLUEAVGTDLANDED': 1.1,
    'BLUE_SIGSTR_ACC': 1.5,
    'BLUE_TD_ACC': 1.3,
    'BLUEHEIGHTCMS': 0.1,
    'BLUEREACHCMS': 0.1,
    'BLUEWINS': 1.4,
    'BLUELONGESTWINSTREAK': 0.9,
    'BLUETOTALTITLEBOUTS': 0.5,
    'BLUE_FINISH_RATE': 1.4,
    'BLUE_OPP_QUALITY': 1.6,
}

# Step 8: Weighted FPI calculation for each fighter
df['RED_FPI'] = df.apply(lambda row: sum(row[feature] * weights.get(feature, 1) for feature in fpi_features_red), axis=1)
df['BLUE_FPI'] = df.apply(lambda row: sum(row[feature] * weights.get(feature, 1) for feature in fpi_features_blue), axis=1)

# Step 9: Aggregate stats to fighter level (average over all available fights)
red_fpi = df.groupby('REDFIGHTER')['RED_FPI'].mean().reset_index().rename(columns={'REDFIGHTER':'FIGHTER','RED_FPI':'FPI'})
blue_fpi = df.groupby('BLUEFIGHTER')['BLUE_FPI'].mean().reset_index().rename(columns={'BLUEFIGHTER':'FIGHTER','BLUE_FPI':'FPI'})

# Step 10: Combine the results for both RED and BLUE fighters
fpi_df = pd.concat([red_fpi, blue_fpi]).groupby('FIGHTER')['FPI'].mean().reset_index()

# Step 11: Save FPI scores to CSV
fpi_df.to_csv("Data/fighter_fpi_scores.csv", index=False)

# Display top 10 fighters by FPI
print(fpi_df.sort_values(by='FPI', ascending=False).head(10))


  exec(code_obj, self.user_global_ns, self.user_ns)
Feature names unseen at fit time:
- BLUEAVGSIGSTRLANDED
- BLUEAVGSUBATT
- BLUEAVGTDLANDED
- BLUEHEIGHTCMS
- BLUELONGESTWINSTREAK
- ...
Feature names seen at fit time, yet now missing:
- REDAVGSIGSTRLANDED
- REDAVGSUBATT
- REDAVGTDLANDED
- REDHEIGHTCMS
- REDLONGESTWINSTREAK
- ...



                FIGHTER       FPI
714   georges st-pierre  4.989363
105      anderson silva  4.212233
683       frankie edgar  4.143791
280      cain velasquez  3.953589
831     jailton almeida  3.912636
1027          jose aldo  3.861214
999           jon fitch  3.775481
1111       kamaru usman  3.741841
997     johny hendricks  3.741133
219           bo nickal  3.736651


In [87]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Load dataset
df = pd.read_csv("Data/combined_ufc_dataset.csv")

# Convert the 'DATE' column to datetime
df['DATE'] = pd.to_datetime(df['DATE'], errors='coerce')

# Step 1: Get the most recent fight date for each fighter (Red and Blue fighters)
last_fight_date = df.groupby('REDFIGHTER')['DATE'].max().reset_index().rename(columns={'REDFIGHTER': 'FIGHTER', 'DATE': 'LAST_FIGHT_DATE'})
last_fight_date_blue = df.groupby('BLUEFIGHTER')['DATE'].max().reset_index().rename(columns={'BLUEFIGHTER': 'FIGHTER', 'DATE': 'LAST_FIGHT_DATE'})

# Combine both RED and BLUE fighter data
last_fight_date = pd.concat([last_fight_date, last_fight_date_blue])

# Remove duplicates (if a fighter appears as both RED and BLUE)
last_fight_date = last_fight_date.drop_duplicates(subset=['FIGHTER'])

# Step 2: Calculate the time difference between today and the last fight date
current_date = pd.to_datetime('today')
last_fight_date['INACTIVITY_PERIOD'] = (current_date - last_fight_date['LAST_FIGHT_DATE']).dt.days / 365  # In years

# Step 3: Filter out fighters who have been inactive for more than 4 years
active_fighters = last_fight_date[last_fight_date['INACTIVITY_PERIOD'] <= 4]

# Step 4: Get the active fighter list
active_fighter_list = active_fighters['FIGHTER'].tolist()

# Step 5: Filter the main dataset to include only active fighters
active_df = df[df['REDFIGHTER'].isin(active_fighter_list) | df['BLUEFIGHTER'].isin(active_fighter_list)]

# Step 6: Define safe division
def safe_div(a, b):
    return np.where(b == 0, 0, a / b)

# Step 7: Compute individual fighter statistics and average them
for corner in ["RED", "BLUE"]:
    active_df[f"{corner}_SIGSTR_ACC"] = safe_div(active_df[f"{corner}AVGSIGSTRLANDED"], active_df[f"{corner}AVGSIGSTRPCT"])
    active_df[f"{corner}_TD_ACC"] = safe_div(active_df[f"{corner}AVGTDLANDED"], active_df[f"{corner}AVGTDPCT"])
    active_df[f"{corner}_FINISH_RATE"] = safe_div(
        active_df[f"{corner}WINSBYKO"] + active_df[f"{corner}WINSBYSUBMISSION"],
        active_df[f"{corner}WINS"] + active_df[f"{corner}LOSSES"]
    )

# Step 8: Calculate opponent quality
active_df["RED_OPP_QUALITY"] = active_df['BLUEWINS'] / (active_df['BLUEWINS'] + active_df['BLUELOSSES'] + 1)
active_df["BLUE_OPP_QUALITY"] = active_df['REDWINS'] / (active_df['REDWINS'] + active_df['REDLOSSES'] + 1)

# Step 9: Define FPI features
fpi_features_red = [
    'REDAVGSIGSTRLANDED', 'REDAVGSUBATT', 'REDAVGTDLANDED',
    'RED_SIGSTR_ACC', 'RED_TD_ACC', 'REDHEIGHTCMS', 'REDREACHCMS',
    'REDWINS', 'REDLONGESTWINSTREAK', 'REDTOTALTITLEBOUTS',
    'RED_FINISH_RATE', 'RED_OPP_QUALITY'
]

fpi_features_blue = [
    'BLUEAVGSIGSTRLANDED', 'BLUEAVGSUBATT', 'BLUEAVGTDLANDED',
    'BLUE_SIGSTR_ACC', 'BLUE_TD_ACC', 'BLUEHEIGHTCMS', 'BLUEREACHCMS',
    'BLUEWINS', 'BLUELONGESTWINSTREAK', 'BLUETOTALTITLEBOUTS',
    'BLUE_FINISH_RATE', 'BLUE_OPP_QUALITY'
]

# Step 10: Remove NaNs for clean normalization
active_df = active_df.dropna(subset=fpi_features_red + fpi_features_blue)

# Step 11: Normalize features for both red and blue fighters
scaler = MinMaxScaler()

# Apply normalization to both RED and BLUE fighter stats
active_df[fpi_features_red] = scaler.fit_transform(active_df[fpi_features_red])
active_df[fpi_features_blue] = scaler.transform(active_df[fpi_features_blue])

# Step 12: Weighted FPI calculation (You can adjust the weights as needed)
weights = {
    'REDAVGSIGSTRLANDED': 1.2,
    'REDAVGSUBATT': 1.0,
    'REDAVGTDLANDED': 1.1,
    'RED_SIGSTR_ACC': 1.5,
    'RED_TD_ACC': 1.3,
    'REDHEIGHTCMS': 0.1,
    'REDREACHCMS': 0.1,
    'REDWINS': 1.7,
    'REDLONGESTWINSTREAK': 1.2,
    'REDTOTALTITLEBOUTS': 1.0,
    'RED_FINISH_RATE': 1.6,
    'RED_OPP_QUALITY': 1.4,
    
    'BLUEAVGSIGSTRLANDED': 1.2,
    'BLUEAVGSUBATT': 1.0,
    'BLUEAVGTDLANDED': 1.1,
    'BLUE_SIGSTR_ACC': 1.5,
    'BLUE_TD_ACC': 1.3,
    'BLUEHEIGHTCMS': 0.1,
    'BLUEREACHCMS': 0.1,
    'BLUEWINS': 1.7,
    'BLUELONGESTWINSTREAK': 1.2,
    'BLUETOTALTITLEBOUTS': 1.0,
    'BLUE_FINISH_RATE': 1.6,
    'BLUE_OPP_QUALITY': 1.4,
}

# Step 13: Calculate the FPI for each fighter using weighted features
active_df['RED_FPI'] = active_df.apply(lambda row: sum(row[feature] * weights.get(feature, 1) for feature in fpi_features_red), axis=1)
active_df['BLUE_FPI'] = active_df.apply(lambda row: sum(row[feature] * weights.get(feature, 1) for feature in fpi_features_blue), axis=1)

# Step 14: Aggregate to fighter level (average over all available fights)
red_fpi = active_df.groupby('REDFIGHTER')['RED_FPI'].mean().reset_index().rename(columns={'REDFIGHTER':'FIGHTER','RED_FPI':'FPI'})
blue_fpi = active_df.groupby('BLUEFIGHTER')['BLUE_FPI'].mean().reset_index().rename(columns={'BLUEFIGHTER':'FIGHTER','BLUE_FPI':'FPI'})

# Step 15: Combine the results for both RED and BLUE fighters
fpi_df = pd.concat([red_fpi, blue_fpi]).groupby('FIGHTER')['FPI'].mean().reset_index()

# Step 16: Add the fighter's age to the FPI DataFrame
fpi_df['AGE'] = fpi_df['FIGHTER'].apply(lambda fighter: active_df[active_df['REDFIGHTER'] == fighter]['REDAGE'].values[0] 
                                          if fighter in active_df['REDFIGHTER'].values else 
                                          active_df[active_df['BLUEFIGHTER'] == fighter]['BLUEAGE'].values[0])

# Step 17: Save FPI scores to CSV (without inactive fighters)
fpi_df.to_csv("Data/fighter_fpi_scores_active_with_age.csv", index=False)

# Step 18: Display top 10 active fighters by FPI and their age
print(fpi_df.sort_values(by='FPI', ascending=False).head(10))


  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentatio

                  FIGHTER       FPI   AGE
586     georges st-pierre  5.232220  31.0
85         anderson silva  4.886703  41.0
433    demetrious johnson  4.708194  30.0
232        cain velasquez  4.583670  36.0
1128      michael bisping  4.397177  38.0
559         frankie edgar  4.324431  41.0
462         dominick cruz  4.259395  31.0
825             jose aldo  4.227658  38.0
929   khabib nurmagomedov  4.209068  32.0
1568         tj dillashaw  4.168206  32.0


In [88]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Load dataset
df = pd.read_csv("Data/combined_ufc_dataset.csv")

# Convert the 'DATE' column to datetime
df['DATE'] = pd.to_datetime(df['DATE'], errors='coerce')

# Step 1: Get the most recent fight date for each fighter (Red and Blue fighters)
last_fight_date = df.groupby('REDFIGHTER')['DATE'].max().reset_index().rename(columns={'REDFIGHTER': 'FIGHTER', 'DATE': 'LAST_FIGHT_DATE'})
last_fight_date_blue = df.groupby('BLUEFIGHTER')['DATE'].max().reset_index().rename(columns={'BLUEFIGHTER': 'FIGHTER', 'DATE': 'LAST_FIGHT_DATE'})

# Combine both RED and BLUE fighter data
last_fight_date = pd.concat([last_fight_date, last_fight_date_blue])

# Remove duplicates (if a fighter appears as both RED and BLUE)
last_fight_date = last_fight_date.drop_duplicates(subset=['FIGHTER'])

# Step 2: Calculate the time difference between today and the last fight date
current_date = pd.to_datetime('today')
last_fight_date['INACTIVITY_PERIOD'] = (current_date - last_fight_date['LAST_FIGHT_DATE']).dt.days / 365  # In years

# Step 3: Filter out fighters who have been inactive for more than 4 years
active_fighters = last_fight_date[last_fight_date['INACTIVITY_PERIOD'] <= 4]

# Step 4: Get the active fighter list
active_fighter_list = active_fighters['FIGHTER'].tolist()

# Step 5: Filter the main dataset to include only active fighters
active_df = df[df['REDFIGHTER'].isin(active_fighter_list) | df['BLUEFIGHTER'].isin(active_fighter_list)]

# Step 6: Define safe division
def safe_div(a, b):
    return np.where(b == 0, 0, a / b)

# Step 7: Compute individual fighter statistics and average them
for corner in ["RED", "BLUE"]:
    active_df[f"{corner}_SIGSTR_ACC"] = safe_div(active_df[f"{corner}AVGSIGSTRLANDED"], active_df[f"{corner}AVGSIGSTRPCT"])
    active_df[f"{corner}_TD_ACC"] = safe_div(active_df[f"{corner}AVGTDLANDED"], active_df[f"{corner}AVGTDPCT"])
    active_df[f"{corner}_FINISH_RATE"] = safe_div(
        active_df[f"{corner}WINSBYKO"] + active_df[f"{corner}WINSBYSUBMISSION"],
        active_df[f"{corner}WINS"] + active_df[f"{corner}LOSSES"]
    )

# Step 8: Calculate opponent quality
active_df["RED_OPP_QUALITY"] = active_df['BLUEWINS'] / (active_df['BLUEWINS'] + active_df['BLUELOSSES'] + 1)
active_df["BLUE_OPP_QUALITY"] = active_df['REDWINS'] / (active_df['REDWINS'] + active_df['REDLOSSES'] + 1)

# Step 9: Define FPI features
fpi_features_red = [
    'REDAVGSIGSTRLANDED', 'REDAVGSUBATT', 'REDAVGTDLANDED',
    'RED_SIGSTR_ACC', 'RED_TD_ACC', 'REDHEIGHTCMS', 'REDREACHCMS',
    'REDWINS', 'REDLONGESTWINSTREAK', 'REDTOTALTITLEBOUTS',
    'RED_FINISH_RATE', 'RED_OPP_QUALITY'
]

fpi_features_blue = [
    'BLUEAVGSIGSTRLANDED', 'BLUEAVGSUBATT', 'BLUEAVGTDLANDED',
    'BLUE_SIGSTR_ACC', 'BLUE_TD_ACC', 'BLUEHEIGHTCMS', 'BLUEREACHCMS',
    'BLUEWINS', 'BLUELONGESTWINSTREAK', 'BLUETOTALTITLEBOUTS',
    'BLUE_FINISH_RATE', 'BLUE_OPP_QUALITY'
]

# Step 10: Remove NaNs for clean normalization
active_df = active_df.dropna(subset=fpi_features_red + fpi_features_blue)

# Step 11: Normalize features for both red and blue fighters
scaler = MinMaxScaler()

# Apply normalization to both RED and BLUE fighter stats
active_df[fpi_features_red] = scaler.fit_transform(active_df[fpi_features_red])
active_df[fpi_features_blue] = scaler.transform(active_df[fpi_features_blue])

# Step 12: Weighted FPI calculation (You can adjust the weights as needed)
weights = {
    'REDAVGSIGSTRLANDED': 1.2,
    'REDAVGSUBATT': 1.0,
    'REDAVGTDLANDED': 1.1,
    'RED_SIGSTR_ACC': 1.5,
    'RED_TD_ACC': 1.3,
    'REDHEIGHTCMS': 0.1,
    'REDREACHCMS': 0.1,
    'REDWINS': 1.7,
    'REDLONGESTWINSTREAK': 1.2,
    'REDTOTALTITLEBOUTS': 1.0,
    'RED_FINISH_RATE': 1.6,
    'RED_OPP_QUALITY': 1.4,
    
    'BLUEAVGSIGSTRLANDED': 1.2,
    'BLUEAVGSUBATT': 1.0,
    'BLUEAVGTDLANDED': 1.1,
    'BLUE_SIGSTR_ACC': 1.5,
    'BLUE_TD_ACC': 1.3,
    'BLUEHEIGHTCMS': 0.1,
    'BLUEREACHCMS': 0.1,
    'BLUEWINS': 1.7,
    'BLUELONGESTWINSTREAK': 1.2,
    'BLUETOTALTITLEBOUTS': 1.0,
    'BLUE_FINISH_RATE': 1.6,
    'BLUE_OPP_QUALITY': 1.4,
}

# Step 13: Calculate the FPI for each fighter using weighted features
active_df['RED_FPI'] = active_df.apply(lambda row: sum(row[feature] * weights.get(feature, 1) for feature in fpi_features_red), axis=1)
active_df['BLUE_FPI'] = active_df.apply(lambda row: sum(row[feature] * weights.get(feature, 1) for feature in fpi_features_blue), axis=1)

# Step 14: Aggregate to fighter level (average over all available fights)
red_fpi = active_df.groupby('REDFIGHTER')['RED_FPI'].mean().reset_index().rename(columns={'REDFIGHTER':'FIGHTER','RED_FPI':'FPI'})
blue_fpi = active_df.groupby('BLUEFIGHTER')['BLUE_FPI'].mean().reset_index().rename(columns={'BLUEFIGHTER':'FIGHTER','BLUE_FPI':'FPI'})

# Step 15: Combine the results for both RED and BLUE fighters
fpi_df = pd.concat([red_fpi, blue_fpi]).groupby('FIGHTER')['FPI'].mean().reset_index()

# Step 16: Add the fighter's age (max age) to the FPI DataFrame
fpi_df['AGE'] = fpi_df['FIGHTER'].apply(lambda fighter: max(
    active_df[active_df['REDFIGHTER'] == fighter]['REDAGE'].max(),
    active_df[active_df['BLUEFIGHTER'] == fighter]['BLUEAGE'].max()
))

# Step 17: Save FPI scores to CSV (without inactive fighters)
fpi_df.to_csv("Data/fighter_fpi_scores_active_with_max_age.csv", index=False)

# Step 18: Display top 10 active fighters by FPI and their age
print(fpi_df.sort_values(by='FPI', ascending=False).head(10))

  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentatio

                  FIGHTER       FPI   AGE
586     georges st-pierre  5.232220  31.0
85         anderson silva  4.886703  45.0
433    demetrious johnson  4.708194  30.0
232        cain velasquez  4.583670   NaN
1128      michael bisping  4.397177  38.0
559         frankie edgar  4.324431  41.0
462         dominick cruz  4.259395  37.0
825             jose aldo  4.227658  38.0
929   khabib nurmagomedov  4.209068  32.0
1568         tj dillashaw  4.168206  36.0


In [115]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Load dataset
df = pd.read_csv("Data/fighter_fpi_normalized.csv")

# Step 1: Convert 'RedDate' to datetime
df['RedDate'] = pd.to_datetime(df['RedDate'], format='%d/%m/%Y', errors='coerce')

# Step 2: Calculate the inactivity period (in years) based on RedDate
current_date = pd.to_datetime('today')
df['INACTIVITY_PERIOD'] = (current_date - df['RedDate']).dt.days / 365  # In years

# Step 3: Filter out fighters who have been inactive for more than 4 years
active_fighters_df = df[df['INACTIVITY_PERIOD'] <= 3]

# Step 4: Define safe division to avoid division by zero
def safe_div(a, b):
    return np.where(b == 0, 0, a / b)

# Step 5: Compute individual fighter statistics (no "Red" or "Blue" dynamic)
active_fighters_df['SIGSTR_ACC'] = safe_div(active_fighters_df['RedAvgSigStrLanded'] + active_fighters_df['BlueAvgSigStrLanded'], active_fighters_df['RedAvgSigStrPct'] + active_fighters_df['BlueAvgSigStrPct'])
active_fighters_df['TD_ACC'] = safe_div(active_fighters_df['RedAvgTDLanded'] + active_fighters_df['BlueAvgTDLanded'], active_fighters_df['RedAvgTDPct'] + active_fighters_df['BlueAvgTDPct'])
active_fighters_df['FINISH_RATE'] = safe_div(
    active_fighters_df['RedWinsByKO'] + active_fighters_df['BlueWinsByKO'] + active_fighters_df['RedWinsBySubmission'] + active_fighters_df['BlueWinsBySubmission'],
    active_fighters_df['RedWins'] + active_fighters_df['BlueWins'] + active_fighters_df['RedLosses'] + active_fighters_df['BlueLosses']
)

# Step 6: Calculate opponent quality (combine Red and Blue opponents)
active_fighters_df['OPP_QUALITY'] = (active_fighters_df['RedWins'] + active_fighters_df['BlueWins']) / (active_fighters_df['RedWins'] + active_fighters_df['BlueWins'] + active_fighters_df['RedLosses'] + active_fighters_df['BlueLosses'] + 1)

# Step 7: Define FPI features (combining Red and Blue fighter stats)
fpi_features = [
    'RedAvgSigStrLanded', 'BlueAvgSigStrLanded', 'RedAvgSubAtt', 'BlueAvgSubAtt', 'RedAvgTDLanded', 'BlueAvgTDLanded',
    'SIGSTR_ACC', 'TD_ACC', 'RedHeightCms', 'BlueHeightCms', 'RedReachCms', 'BlueReachCms', 'RedWins', 'BlueWins',
    'RedLongestWinStreak', 'BlueLongestWinStreak', 'RedTotalTitleBouts', 'BlueTotalTitleBouts', 'FINISH_RATE', 'OPP_QUALITY'
]

# Step 8: Remove NaNs for clean normalization
active_fighters_df = active_fighters_df.dropna(subset=fpi_features)

# Step 9: Normalize features (for all numeric features)
scaler = MinMaxScaler()

# Apply normalization to the features
active_fighters_df[fpi_features] = scaler.fit_transform(active_fighters_df[fpi_features])

# Step 10: Define feature weights (higher weight for more important features)
weights = {
    'RedAvgSigStrLanded': 1.2,
    'BlueAvgSigStrLanded': 1.2,
    'RedAvgSubAtt': 1.0,
    'BlueAvgSubAtt': 1.0,
    'RedAvgTDLanded': 1.1,
    'BlueAvgTDLanded': 1.1,
    'SIGSTR_ACC': 1.5,
    'TD_ACC': 1.3,
    'RedHeightCms': 0.1,
    'BlueHeightCms': 0.1,
    'RedReachCms': 0.1,
    'BlueReachCms': 0.1,
    'RedWins': 1.4,
    'BlueWins': 1.4,
    'RedLongestWinStreak': 0.9,
    'BlueLongestWinStreak': 0.9,
    'RedTotalTitleBouts': 0.5,
    'BlueTotalTitleBouts': 0.5,
    'FINISH_RATE': 1.4,
    'OPP_QUALITY': 2
}

# Step 11: Weighted FPI calculation for each fighter
active_fighters_df['FPI'] = active_fighters_df.apply(lambda row: sum(row[feature] * weights.get(feature, 1) for feature in fpi_features), axis=1)

# Step 12: Aggregate stats to fighter level (average over all available fights)
fpi_df = active_fighters_df.groupby('Fighter')['FPI'].mean().reset_index()

# Step 13: Save FPI scores to CSV
fpi_df.to_csv("Data/fighter_fpi_scores_active.csv", index=False)

# Step 14: Display top 10 fighters by FPI
print(fpi_df.sort_values(by='FPI', ascending=False).head(20))


                   Fighter       FPI
297              Jon Jones  9.527688
302              Jose Aldo  9.350019
415           Max Holloway  8.605211
498       Rafael Dos Anjos  8.429580
518          Robbie Lawler  8.240213
114          Chris Weidman  8.143888
328           Kamaru Usman  7.866836
26       Aljamain Sterling  7.677319
22   Alexander Volkanovski  7.595881
174      Dricus Du Plessis  7.564885
214        Glover Teixeira  7.504076
590          Tony Ferguson  7.486631
347             King Green  7.465324
176         Dustin Poirier  7.437736
105       Charles Oliveira  7.427149
412           Mauricio Rua  7.402977
342              Kevin Lee  7.397354
456              Nate Diaz  7.354178
531         Rose Namajunas  7.275599
563       Stephen Thompson  7.272934


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

In [116]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Load dataset
df = pd.read_csv("Data/combined_ufc_dataset.csv")

# Convert the 'DATE' column to datetime
df['DATE'] = pd.to_datetime(df['DATE'], format='%d/%m/%Y', errors='coerce')

# Step 2: Calculate the inactivity period (in years) based on RedDate
current_date = pd.to_datetime('today')
df['INACTIVITY_PERIOD'] = (current_date - df['DATE']).dt.days / 365  # In years

# Step 3: Filter out fighters who have been inactive for more than 4 years
active_df = df[df['INACTIVITY_PERIOD'] <= 4]

# Step 1: Get the most recent fight date for each fighter (Red and Blue fighters)
#last_fight_date_red = df.groupby('REDFIGHTER')['DATE'].max().reset_index().rename(columns={'REDFIGHTER': 'FIGHTER', 'DATE': 'LAST_FIGHT_DATE'})
#last_fight_date_blue = df.groupby('BLUEFIGHTER')['DATE'].max().reset_index().rename(columns={'BLUEFIGHTER': 'FIGHTER', 'DATE': 'LAST_FIGHT_DATE'})

# Combine both RED and BLUE fighter data
#last_fight_date = pd.concat([last_fight_date_red, last_fight_date_blue])

# Remove duplicates (if a fighter appears as both RED and BLUE)
active_df = active_df.drop_duplicates(subset=['FIGHTER'])

# Step 2: Calculate the time difference between today and the last fight date
##current_date = pd.to_datetime('today')
#last_fight_date['INACTIVITY_PERIOD'] = (current_date - last_fight_date['LAST_FIGHT_DATE']).dt.days / 365  # In years

# Step 3: Filter out fighters who have been inactive for more than 4 years
#active_fighters = last_fight_date[last_fight_date['INACTIVITY_PERIOD'] <= 4]

# Step 4: Get the active fighter list
active_fighter_list = active_df['FIGHTER'].tolist()

# Step 5: Filter the main dataset to include only active fighters
#active_df = df[df['REDFIGHTER'].isin(active_fighter_list) | df['BLUEFIGHTER'].isin(active_fighter_list)]

# Step 6: Define safe division to avoid division by zero
def safe_div(a, b):
    return np.where(b == 0, 0, a / b)

# Step 7: Compute individual fighter statistics and average them
for corner in ["RED", "BLUE"]:
    active_df[f"{corner}_SIGSTR_ACC"] = safe_div(active_df[f"{corner}AVGSIGSTRLANDED"], active_df[f"{corner}AVGSIGSTRPCT"])
    active_df[f"{corner}_TD_ACC"] = safe_div(active_df[f"{corner}AVGTDLANDED"], active_df[f"{corner}AVGTDPCT"])
    active_df[f"{corner}_FINISH_RATE"] = safe_div(
        active_df[f"{corner}WINSBYKO"] + active_df[f"{corner}WINSBYSUBMISSION"],
        active_df[f"{corner}WINS"] + active_df[f"{corner}LOSSES"]
    )

# Step 8: Calculate opponent quality (combine Red and Blue opponents)
active_df["RED_OPP_QUALITY"] = active_df['BLUEWINS'] / (active_df['BLUEWINS'] + active_df['BLUELOSSES'] + 1)
active_df["BLUE_OPP_QUALITY"] = active_df['REDWINS'] / (active_df['REDWINS'] + active_df['REDLOSSES'] + 1)

# Step 9: Define FPI features (combining Red and Blue fighter stats)
fpi_features_red = [
    'REDAVGSIGSTRLANDED', 'REDAVGSUBATT', 'REDAVGTDLANDED',
    'RED_SIGSTR_ACC', 'RED_TD_ACC', 'REDHEIGHTCMS', 'REDREACHCMS',
    'REDWINS', 'REDLONGESTWINSTREAK', 'REDTOTALTITLEBOUTS',
    'RED_FINISH_RATE', 'RED_OPP_QUALITY'
]

fpi_features_blue = [
    'BLUEAVGSIGSTRLANDED', 'BLUEAVGSUBATT', 'BLUEAVGTDLANDED',
    'BLUE_SIGSTR_ACC', 'BLUE_TD_ACC', 'BLUEHEIGHTCMS', 'BLUEREACHCMS',
    'BLUEWINS', 'BLUELONGESTWINSTREAK', 'BLUETOTALTITLEBOUTS',
    'BLUE_FINISH_RATE', 'BLUE_OPP_QUALITY'
]

# Step 10: Remove NaNs for clean normalization
active_df = active_df.dropna(subset=fpi_features_red + fpi_features_blue)

# Step 11: Normalize features for both red and blue fighters
scaler = MinMaxScaler()

# Apply normalization to both RED and BLUE fighter stats
active_df[fpi_features_red] = scaler.fit_transform(active_df[fpi_features_red])
active_df[fpi_features_blue] = scaler.transform(active_df[fpi_features_blue])

# Step 12: Weighted FPI calculation (You can adjust the weights as needed)
weights = {
    'REDAVGSIGSTRLANDED': 1.2,
    'REDAVGSUBATT': 1.0,
    'REDAVGTDLANDED': 1.1,
    'RED_SIGSTR_ACC': 1.5,
    'RED_TD_ACC': 1.3,
    'REDHEIGHTCMS': 0.1,
    'REDREACHCMS': 0.1,
    'REDWINS': 1.7,
    'REDLONGESTWINSTREAK': 1.2,
    'REDTOTALTITLEBOUTS': 1.0,
    'RED_FINISH_RATE': 1.6,
    'RED_OPP_QUALITY': 1.4,
    
    'BLUEAVGSIGSTRLANDED': 1.2,
    'BLUEAVGSUBATT': 1.0,
    'BLUEAVGTDLANDED': 1.1,
    'BLUE_SIGSTR_ACC': 1.5,
    'BLUE_TD_ACC': 1.3,
    'BLUEHEIGHTCMS': 0.1,
    'BLUEREACHCMS': 0.1,
    'BLUEWINS': 1.7,
    'BLUELONGESTWINSTREAK': 1.2,
    'BLUETOTALTITLEBOUTS': 1.0,
    'BLUE_FINISH_RATE': 1.6,
    'BLUE_OPP_QUALITY': 1.4,
}

# Step 13: Calculate the FPI for each fighter using weighted features
active_df['RED_FPI'] = active_df.apply(lambda row: sum(row[feature] * weights.get(feature, 1) for feature in fpi_features_red), axis=1)
active_df['BLUE_FPI'] = active_df.apply(lambda row: sum(row[feature] * weights.get(feature, 1) for feature in fpi_features_blue), axis=1)

# Step 14: Aggregate to fighter level (average over all available fights)
red_fpi = active_df.groupby('REDFIGHTER')['RED_FPI'].mean().reset_index().rename(columns={'REDFIGHTER':'FIGHTER','RED_FPI':'FPI'})
blue_fpi = active_df.groupby('BLUEFIGHTER')['BLUE_FPI'].mean().reset_index().rename(columns={'BLUEFIGHTER':'FIGHTER','BLUE_FPI':'FPI'})

# Step 15: Combine the results for both RED and BLUE fighters
fpi_df = pd.concat([red_fpi, blue_fpi]).groupby('FIGHTER')['FPI'].mean().reset_index()

# Step 16: Save FPI scores to CSV (without inactive fighters)
fpi_df.to_csv("Data/fighter_fpi_scores_active.csv", index=False)

# Step 17: Display top 10 active fighters by FPI
print(fpi_df.sort_values(by='FPI', ascending=False).head(10))

  exec(code_obj, self.user_global_ns, self.user_ns)


                   FIGHTER       FPI
459              jon jones  7.065783
368          igor severino  6.286534
47            amanda nunes  6.218470
639           max holloway  6.178670
503           kamaru usman  6.142934
65       anthony hernandez  6.101456
468              jose aldo  6.045072
708              nick diaz  5.961110
36   alexander volkanovski  5.921017
897           tom aspinall  5.899773


Feature names unseen at fit time:
- BLUEAVGSIGSTRLANDED
- BLUEAVGSUBATT
- BLUEAVGTDLANDED
- BLUEHEIGHTCMS
- BLUELONGESTWINSTREAK
- ...
Feature names seen at fit time, yet now missing:
- REDAVGSIGSTRLANDED
- REDAVGSUBATT
- REDAVGTDLANDED
- REDHEIGHTCMS
- REDLONGESTWINSTREAK
- ...



In [121]:
import pandas as pd

# Load Elo and FPI data
elo_df = pd.read_csv("Data/elo_ratings.csv")  # Elo scores
fpi_df = pd.read_csv("Data/fighter_fpi_scores_active.csv")  # FPI scores

# Merge the Elo and FPI scores on the 'FIGHTER' column
merged_df = pd.merge(elo_df, fpi_df, on='FIGHTER', how='inner')

# Step 1: Normalize Elo and FPI scores to the range [0, 1]
merged_df['Elo_normalized'] = (merged_df['ELO'] - merged_df['ELO'].min()) / (merged_df['ELO'].max() - merged_df['ELO'].min())
merged_df['FPI_normalized'] = (merged_df['FPI'] - merged_df['FPI'].min()) / (merged_df['FPI'].max() - merged_df['FPI'].min())

# Step 2: Define the weights for Elo and FPI
Weight_Elo = 0.5
Weight_FPI = 0.5

# Step 3: Calculate ILO score (weighted average of Elo and FPI)
merged_df['ILO'] = (Weight_Elo * merged_df['Elo_normalized']) + (Weight_FPI * merged_df['FPI_normalized'])

# Step 4: Save ILO scores to CSV
merged_df.to_csv("Data/fighter_ilo_scores.csv", index=False)

# Step 5: Display top 10 fighters by ILO
print(merged_df.sort_values(by='ILO', ascending=False).head(20))


                   FIGHTER          ELO       FPI  Elo_normalized  \
0                jon jones  1233.745055  7.065783        1.000000   
40            max holloway  1183.836658  6.178670        0.844093   
55            amanda nunes  1179.101872  6.218470        0.829302   
150        islam makhachev  1198.898307  5.528914        0.891143   
1         charles oliveira  1168.005424  5.817112        0.794638   
211  alexander volkanovski  1151.562666  5.921017        0.743273   
170           kamaru usman  1134.128399  6.142934        0.688811   
20          dustin poirier  1150.561087  5.758512        0.740144   
246      merab dvalishvili  1146.714930  5.749791        0.728130   
99       aljamain sterling  1154.547409  5.457942        0.752597   
473        khamzat chimaev  1133.877817  5.889480        0.688028   
174   valentina shevchenko  1171.060516  5.051808        0.804182   
479           tom aspinall  1126.582865  5.899773        0.665240   
192         belal muhammad  1186.4