In [None]:
import os
# NOTE(review): hard-coded, machine-specific working directory. The relative
# "resources/..." paths used by later cells are resolved against it, so the
# notebook only runs where this directory exists.
os.chdir(r"D:\PythonApps\ufc_complete_dataset")

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

from tqdm import tqdm
from IPython.core.display import HTML
from IPython.display import display

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Historical features
- wins
- losses
- total fights
- wins ratio

In [None]:
def get_wins_lost_streak(past_df, fighter) -> tuple[int, int]:
    """Return the fighter's current (wins_streak, lost_streak).

    The last row of ``past_df`` is treated as the most recent fight and the
    frame is walked backwards from there. Exactly one of the two returned
    counters is non-zero, unless ``past_df`` is empty, in which case both
    are 0.

    Args:
        past_df: Fights of ``fighter`` with at least the columns
            ``r_fighter`` and ``winner`` (``"Red"`` / ``"Blue"``).
        fighter: Name matched against ``r_fighter``; on rows where it does
            not match, the fighter is assumed to be in the blue corner.

    Returns:
        ``(wins_streak, lost_streak)``. Any ``winner`` value other than the
        fighter's own corner counts as a lost fight.
    """
    if len(past_df) == 0:
        return 0, 0

    # Materialise both columns once instead of on every loop iteration.
    red_names = past_df["r_fighter"].to_list()
    winners = past_df["winner"].to_list()

    # True where the fighter won, ordered from most recent to oldest.
    outcomes = [
        winner == ("Red" if red_name == fighter else "Blue")
        for red_name, winner in zip(reversed(red_names), reversed(winners))
    ]

    # Count how many of the latest fights share the latest fight's outcome.
    latest = outcomes[0]
    streak = 0
    for outcome in outcomes:
        if outcome != latest:
            break
        streak += 1

    return (streak, 0) if latest else (0, streak)

In [None]:
def get_historical_features(
        fighter: str,
        current_event_id: int,
        df: pd.DataFrame
) -> tuple[int, int, int, int, int, int]:
    """Summarise a fighter's record in all events before the given one.

    Only rows whose ``event_id`` is strictly lower than ``current_event_id``
    and in which ``fighter`` appears as ``r_fighter`` or ``b_fighter`` are
    considered.

    Args:
        fighter: Fighter name to look up.
        current_event_id: Events with a lower id count as history.
        df: Fight table with the columns ``event_id``, ``r_fighter``,
            ``b_fighter``, ``winner`` and ``is_title_bout``.

    Returns:
        ``(wins, looses, total, title_fights, wins_streak, lost_streak)``;
        all zeros when the fighter has no earlier fights. Any fight the
        fighter did not win is counted in ``looses``.
    """
    is_red = df["r_fighter"] == fighter
    is_blue = df["b_fighter"] == fighter
    past_df = df[(df["event_id"] < current_event_id) & (is_red | is_blue)]
    if len(past_df) == 0:
        return 0, 0, 0, 0, 0, 0

    won_as_red = (past_df["r_fighter"] == fighter) & (past_df["winner"] == "Red")
    won_as_blue = (past_df["b_fighter"] == fighter) & (past_df["winner"] == "Blue")
    wins = int((won_as_red | won_as_blue).sum())
    total = len(past_df)
    looses = total - wins
    # Cast so every returned value is a plain int, not a numpy scalar.
    title_fights = int(past_df["is_title_bout"].astype(int).sum())
    wins_streak, lost_streak = get_wins_lost_streak(past_df, fighter)
    return wins, looses, total, title_fights, wins_streak, lost_streak


## Prepare base df

In [None]:
# Load the fight table, keep only the columns used below and reverse the row
# order (presumably the CSV lists the newest fights first, so that after the
# reversal event_id grows with time — TODO confirm against the data).
df = (
    pd.read_csv("resources/large_dataset.csv", sep=",")
    [["event_name", "r_fighter", "b_fighter", "winner", "weight_class", "is_title_bout"]]
    .reset_index()
    .sort_values("index", ascending=False)
    .drop("index", axis=1)
    .reset_index(drop=True)
)
# event_id is the position of the event in order of first appearance. A dict
# lookup replaces the former list.index() call, which rescanned the whole
# event list for every row.
events = list(df["event_name"].unique())
event_ids = {name: position for position, name in enumerate(events)}
df["event_id"] = df["event_name"].map(event_ids)
df

## Get base historical features

In [None]:
# One accumulator list per historical feature and corner.
rf_hist_wins, rf_hist_looses, rf_hist_total, rf_title_fights, r_wins_streak, r_lost_streak = ([] for _ in range(6))
bf_hist_wins, bf_hist_looses, bf_hist_total, bf_title_fights, b_wins_streak, b_lost_streak = ([] for _ in range(6))

# Accumulators listed in the order get_historical_features returns its values.
red_history = [rf_hist_wins, rf_hist_looses, rf_hist_total, rf_title_fights, r_wins_streak, r_lost_streak]
blue_history = [bf_hist_wins, bf_hist_looses, bf_hist_total, bf_title_fights, b_wins_streak, b_lost_streak]

for row in tqdm(range(len(df))):
    event_id = df["event_id"].iloc[row]
    # Red corner first, then blue corner, for the same fight.
    for fighter_col, history in (("r_fighter", red_history), ("b_fighter", blue_history)):
        stats = get_historical_features(
            fighter=df[fighter_col].iloc[row],
            current_event_id=event_id,
            df=df,
        )
        for bucket, value in zip(history, stats):
            bucket.append(value)

# Every accumulator must hold the same number of values.
for bucket in red_history + blue_history:
    assert len(rf_hist_wins) == len(bucket)

# Attach the accumulated values as new columns (red block, then blue block).
new_columns = {
    "r_fighter_hist_wins": rf_hist_wins,
    "r_fighter_hist_looses": rf_hist_looses,
    "r_fighter_hist_total": rf_hist_total,
    "r_fighter_title_fights": rf_title_fights,
    "r_wins_streak": r_wins_streak,
    "r_lost_streak": r_lost_streak,
    "b_fighter_hist_wins": bf_hist_wins,
    "b_fighter_hist_looses": bf_hist_looses,
    "b_fighter_hist_total": bf_hist_total,
    "b_fighter_title_fights": bf_title_fights,
    "b_wins_streak": b_wins_streak,
    "b_lost_streak": b_lost_streak,
}
for column, values in new_columns.items():
    df[column] = values

In [None]:
# Preview five randomly sampled rows, including the new historical columns.
df.sample(5)

## Count historical ratio features

In [None]:
def _safe_ratio(part: pd.Series, whole: pd.Series) -> pd.Series:
    """Element-wise ``part / whole``, with 0 wherever ``whole`` is 0."""
    return (part / whole).where(whole != 0, 0)


# Share of past fights won, and share of past fights that were title bouts,
# per corner. Fighters without any past fight get a ratio of 0. The columns
# are computed in one vectorised pass instead of a per-row df.iloc loop, so
# all four results have the length of df by construction.
r_wins_ratio = _safe_ratio(df["r_fighter_hist_wins"], df["r_fighter_hist_total"])
b_wins_ratio = _safe_ratio(df["b_fighter_hist_wins"], df["b_fighter_hist_total"])
r_tf_ratio = _safe_ratio(df["r_fighter_title_fights"], df["r_fighter_hist_total"])
b_tf_ratio = _safe_ratio(df["b_fighter_title_fights"], df["b_fighter_hist_total"])

df["r_wins_ratio"] = r_wins_ratio
df["b_wins_ratio"] = b_wins_ratio
df["r_tf_ratio"] = r_tf_ratio
df["b_tf_ratio"] = b_tf_ratio

In [None]:
# Preview five randomly sampled rows, including the new ratio columns.
df.sample(5)

# Relation between historical features and wins

## Create a dataframe with:
- diffs between Red and Blue fighter historical features
```
X_Row = features(RED) - features(BLUE)
```
- final winner
```
Y = 1 if Red wins else 0
```

## Results
**Based on the Cohen's d statistic, all effects are weak**
- winners have a better win ratio
- winners have fewer fights (younger?)
- winners have fewer historical losses
- winners have longer win streaks
- losers have longer losing streaks


In [None]:
features = [
    "fighter_hist_wins",
    "fighter_hist_looses",
    "fighter_hist_total",
    "fighter_title_fights",
    "wins_ratio",
    "tf_ratio",
    "wins_streak",
    "lost_streak"
]
r_features = [f"r_{x}" for x in features]
b_features = [f"b_{x}" for x in features]

# Red-minus-blue difference of every feature, one row per fight. Subtracting
# the two column blocks as float arrays replaces the former per-row df.iloc
# loop and yields the same values.
diff_df = pd.DataFrame(
    data=df[r_features].to_numpy(dtype=float) - df[b_features].to_numpy(dtype=float),
    columns=features,
)
# Keep the corner label and a binary target: 1 if Red won, 0 otherwise.
diff_df["winner"] = df["winner"].to_numpy()
diff_df["winner_rank"] = (df["winner"] == "Red").to_numpy().astype("int64")

In [None]:
fig = go.Figure()

# One mean ± 1 SD band per outcome: (winning corner, legend label, colour).
for corner, label, color in (("Red", "Wins", "green"), ("Blue", "Looses", "red")):
    graph_df = diff_df[diff_df.winner == corner].reset_index(drop=True)
    center = graph_df[features].mean()
    spread = graph_df[features].std()

    # Upper edge of the band.
    fig.add_trace(
        go.Scatter(
            name=f"{label} (+1SD)",
            x=features,
            y=(center + spread).to_list(),
            marker_color=color,
        )
    )
    # Lower edge; 'tonexty' fills the area up to the previous trace.
    fig.add_trace(
        go.Scatter(
            name=f"{label} (-1SD)",
            x=features,
            y=(center - spread).to_list(),
            marker_color=color,
            fill='tonexty',
        )
    )

fig.update_layout(
    title="<b>Wins and looses</b><br>Features have relatively big variance",
    width=1200,
    height=500
)

fig.show()



In [None]:
def cohens_d(serie_a: pd.Series, serie_b: pd.Series) -> float:
    """Cohen's d effect size between two samples, using the pooled SD.

    Missing values are dropped first, so the sample sizes used for pooling
    match the observations that ``mean`` and ``std`` are computed from
    (``len`` would also count NaN rows, which pandas' ``mean``/``std`` skip).

    Args:
        serie_a: First sample.
        serie_b: Second sample.

    Returns:
        ``(mean_a - mean_b) / pooled_std``; positive when ``serie_a`` has the
        larger mean. The standard deviations use pandas' default (ddof=1).
    """
    serie_a, serie_b = serie_a.dropna(), serie_b.dropna()
    ma, mb = serie_a.mean(), serie_b.mean()
    stda, stdb = serie_a.std(), serie_b.std()
    na, nb = len(serie_a), len(serie_b)

    pooled_std = np.sqrt(((na - 1) * stda**2 + (nb - 1) * stdb**2) / (na + nb - 2))
    return (ma - mb) / pooled_std


In [None]:
# Split the fights once by winning corner, then compare each feature's
# red-minus-blue difference between the two groups.
red_won = diff_df[diff_df.winner == "Red"]
blue_won = diff_df[diff_df.winner == "Blue"]
cohens_d_values = [cohens_d(red_won[f], blue_won[f]) for f in features]

diffs_stats = pd.DataFrame(
    {"feature": features, "cohens_d": cohens_d_values}
).round(2)

In [None]:
# Horizontal bar per feature, labelled with its rounded Cohen's d.
fig = px.bar(
    data_frame=diffs_stats,
    x="cohens_d",
    y="feature",
    text="cohens_d",
    orientation="h",
    title="<b>Historical features show small diffs between winners and loosers</b>",
    width=800,
)
fig.show()

# Fighting stats (per fight)

## Base Features
- "SLpM" -> Significant Strikes Landed per Minute,
- "sig_str_acc" -> Significant Striking Accuracy
- "SApM" -> Significant Strikes Absorbed per Minute
- "str_def" -> Significant Strike Defence (the % of opponents strikes that did not land)
- "td_avg" -> Average Takedowns Landed per 15 minutes
- "td_acc" -> Takedown Accuracy
- "td_def" -> Takedown Defense (the % of opponents TD attempts that did not land)
- "sub_avg" -> Average Submissions Attempted per 15 minutes


## New aggregation features
- significant_strikes: mean of "SLpM", "sig_str_acc"
- damage_defense: mean of "SApM", "str_def"
- offensive_grappling: mean of "td_avg", "td_acc"
- defensive_grappling: td_def
- submissions: "sub_avg" 

In [None]:
# Load the fighter statistics file; later cells join it to the fight table
# through its "name" column.
df_fighters = pd.read_csv("resources/fighter_stats.csv")
df_fighters

In [None]:
# Fighters without a recorded stance are grouped under "Other". The result is
# assigned back to the column: calling fillna(inplace=True) on a selected
# column is chained in-place mutation, which recent pandas versions warn
# about and which no longer updates the frame under Copy-on-Write.
df_fighters["stance"] = df_fighters["stance"].fillna("Other")
df_fighters

In [None]:
# Raw fighter statistics grouped by the skill they describe.

# Striking offence: significant strikes landed per minute, and significant
# striking accuracy.
damage = ["SLpM", "sig_str_acc"]

# Striking defence: significant strikes absorbed per minute, and significant
# strike defence (the % of opponents strikes that did not land).
defense = ["SApM", "str_def"]

# Takedown offence: average takedowns landed per 15 minutes, and takedown
# accuracy.
offensive_wrestling = ["td_avg", "td_acc"]

# Takedown defense (the % of opponents TD attempts that did not land).
defensive_wrestling = "td_def"

# Average submissions attempted per 15 minutes.
bjj = "sub_avg"

# Flat list of every raw statistic, in group order.
base_style_features = [
    *damage,
    *defense,
    *offensive_wrestling,
    defensive_wrestling,
    bjj,
]

In [None]:
# Name of the min-max scaled copy of a raw statistic.
norm_name = "{}_norm".format

# Scale every raw statistic with MinMaxScaler and keep three decimals.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df_fighters[base_style_features])
df_fighters[[norm_name(x) for x in base_style_features]] = np.round(scaled_values, 3)

# Aggregated skills: row-wise mean of the scaled statistics in each group.
for skill, members in (
    ("significant_strikes", damage),
    ("damage_defense", defense),
    ("offensive_grappling", offensive_wrestling),
):
    df_fighters[skill] = df_fighters[[norm_name(x) for x in members]].mean(axis=1)

# The two single-statistic skills are just the scaled columns under a new name.
df_fighters.rename(
    columns={"td_def_norm": "defensive_grappling", "sub_avg_norm": "submissions"},
    inplace=True,
)
df_fighters

In [None]:
# Scaled raw statistics (without the last two, which were renamed to
# defensive_grappling / submissions) followed by the aggregated skills.
feature_cols = [
    *(f"{x}_norm" for x in base_style_features[:-2]),
    "significant_strikes",
    "damage_defense",
    "offensive_grappling",
    "defensive_grappling",
    "submissions",
]


def _corner_stats(fighter_col: str, prefix: str) -> pd.DataFrame:
    """Look up the style features of the fighter in ``fighter_col``.

    Left-joins the fight table to ``df_fighters`` on the fighter name and
    returns the event name, the fighter column and the feature columns
    renamed to ``{prefix}_{feature}``.
    """
    keys = ["event_name", fighter_col]
    joined = df[keys].merge(
        df_fighters[["name"] + feature_cols],
        how="left",
        left_on=fighter_col,
        right_on="name",
    )
    return joined[keys + feature_cols].rename(
        columns={x: f"{prefix}_{x}" for x in feature_cols}
    )


df_red = _corner_stats("r_fighter", "r")
df_blue = _corner_stats("b_fighter", "b")

In [None]:
# Attach the red-corner stats, then the blue-corner stats, to every fight
# (left joins keyed on the event name and the corner's fighter name).
for corner_stats, fighter_col in ((df_red, "r_fighter"), (df_blue, "b_fighter")):
    df = df.merge(
        corner_stats,
        how="left",
        on=["event_name", fighter_col],
    )

In [None]:
r_features = [f"r_{x}" for x in feature_cols]
b_features = [f"b_{x}" for x in feature_cols]

# Red-minus-blue difference of every style feature, one row per fight.
# Subtracting the two column blocks as float arrays replaces the former
# per-row df.iloc loop; missing stats stay NaN, as before.
diff_df = pd.DataFrame(
    data=df[r_features].to_numpy(dtype=float) - df[b_features].to_numpy(dtype=float),
    columns=feature_cols,
)
# Keep the corner label and a binary target: 1 if Red won, 0 otherwise.
diff_df["winner"] = df["winner"].to_numpy()
diff_df["winner_rank"] = (df["winner"] == "Red").to_numpy().astype("int64")

In [None]:
# Display the red-minus-blue style-feature differences built above.
diff_df

In [None]:
fig = go.Figure()

# One mean ± 1 SD band per outcome: (winning corner, legend label, colour).
for corner, label, color in (("Red", "Wins", "green"), ("Blue", "Looses", "red")):
    graph_df = diff_df[diff_df.winner == corner].reset_index(drop=True)
    center = graph_df[feature_cols].mean()
    spread = graph_df[feature_cols].std()

    # Upper edge of the band.
    fig.add_trace(
        go.Scatter(
            name=f"{label} (+1SD)",
            x=feature_cols,
            y=(center + spread).to_list(),
            marker_color=color,
        )
    )
    # Lower edge; 'tonexty' fills the area up to the previous trace.
    fig.add_trace(
        go.Scatter(
            name=f"{label} (-1SD)",
            x=feature_cols,
            y=(center - spread).to_list(),
            marker_color=color,
            fill='tonexty',
        )
    )

fig.update_layout(
    title="<b>Wins and looses</b><br>Features have relatively big variance",
    width=1500,
    height=500
)

fig.show()

In [None]:
# Split the fights once by winning corner, then compare each style feature's
# red-minus-blue difference between the two groups.
red_won = diff_df[diff_df.winner == "Red"]
blue_won = diff_df[diff_df.winner == "Blue"]
cohens_d_values = [cohens_d(red_won[f], blue_won[f]) for f in feature_cols]

diffs_stats = pd.DataFrame(
    {"feature": feature_cols, "cohens_d": cohens_d_values}
).round(2)

In [None]:
# Horizontal bar per style feature, labelled with its rounded Cohen's d.
fig = px.bar(
    data_frame=diffs_stats,
    x="cohens_d",
    y="feature",
    text="cohens_d",
    orientation="h",
    title="<b>Offensive damage and deffensive wrestling skills have the biggest impact on a fight result</b><br>Damage defense and offensive wrestling are slightly less important",
    width=1000,
    height=700,
)
fig.show()

In [None]:
# Persist the fight table with all engineered features.
# NOTE(review): no index argument is passed, so with pandas' default the row
# index is written as an extra first column — confirm downstream readers
# expect that.
df.to_csv("resources/df_features.csv")