In [None]:
import os

# Run the notebook from the project root so the relative paths used by later
# cells ("requirements.txt", "resources/df_features.csv") resolve.
# The location can be overridden with the UFC_PROJECT_DIR environment variable;
# the original hard-coded path is kept as the fallback, so behaviour is
# unchanged when the variable is not set.
PROJECT_DIR = os.environ.get("UFC_PROJECT_DIR", r"D:\PythonApps\ufc_complete_dataset")
os.chdir(PROJECT_DIR)

In [None]:
# Install `uv`, then use it to install the packages listed in requirements.txt
# plus matplotlib. `--system` makes uv install into a system interpreter
# rather than a virtual environment.
# NOTE(review): `!pip` and `uv --system` act on whichever Python is found on
# PATH, which is not necessarily the interpreter this kernel runs on - confirm
# they match before relying on these installs.
# NOTE(review): matplotlib is installed separately, presumably because it is
# missing from requirements.txt - consider adding it there.
!pip install uv
!uv pip install -r requirements.txt --system
!uv pip install matplotlib --system

In [None]:
import shap
import mlflow

import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler

from lightgbm import LGBMClassifier

from tqdm import tqdm
from IPython.core.display import HTML
from IPython.display import display

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
import warnings
warnings.filterwarnings('ignore')

# Prepare dataset

In [None]:
# Load the feature table; the path is relative to the current working
# directory and the first CSV column is used as the DataFrame index.
df = pd.read_csv("resources/df_features.csv", index_col=0)

In [None]:
# Base feature names. Each one is looked up in `df` twice, once with an "r_"
# prefix and once with a "b_" prefix, so the spelling here (including
# "looses") has to match the column names of the loaded table exactly.
feature_cols = [
    "fighter_hist_wins", "fighter_hist_looses", "fighter_hist_total",
    "fighter_title_fights",
    "wins_ratio", "tf_ratio",
    "wins_streak", "lost_streak",
    "SLpM_norm", "sig_str_acc_norm", "SApM_norm",
    "str_def_norm", "td_avg_norm", "td_acc_norm",
    "significant_strikes", "damage_defense",
    "offensive_grappling", "defensive_grappling",
    "submissions",
]

# Prefixed column names, in the same order as `feature_cols`.
r_features = ["r_" + name for name in feature_cols]
b_features = ["b_" + name for name in feature_cols]

In [None]:
# Build one row per fight holding, for every base feature, the value of the
# "r_" column minus the value of the "b_" column, followed by the raw `winner`
# label and a binary target.
#
# The subtraction is done column-wise in a single NumPy operation. The
# previous row-by-row loop called `df.iloc[i]` several times per feature per
# row, each call materialising a new Series, which made this cell needlessly
# slow; the resulting frame (columns, order, values) is the same.
red_values = df[r_features].to_numpy(dtype=float)
blue_values = df[b_features].to_numpy(dtype=float)

diff_df = pd.DataFrame(red_values - blue_values, columns=feature_cols)
diff_df["winner"] = df["winner"].to_numpy()
# 1 when `winner` is exactly "Red"; every other value (including missing
# ones) maps to 0. int64 is requested explicitly so the dtype does not depend
# on the platform's default integer width.
diff_df["winner_rank"] = (df["winner"] == "Red").to_numpy().astype("int64")

## Delete records with no data

In [None]:
# Keep only the rows in which at least one feature difference is non-zero;
# a row whose feature differences are all exactly 0 is treated as carrying no
# data and is dropped.
# NaN compares unequal to 0, so rows containing missing values are kept -
# the same outcome as the truthiness test (`bool(x)`) this replaces.
# The mask is computed in one vectorized pass, without the temporary
# `is_empty` column, so re-running the cell leaves `diff_df` unchanged.
has_signal = diff_df[feature_cols].ne(0).any(axis=1)
diff_df = diff_df.loc[has_signal].reset_index(drop=True)

## Normalization

In [None]:
# Min-max scale the feature differences.
# The scaler is fitted on the training rows only (everything except the last
# HOLDOUT_ROWS rows) and then applied to the whole frame, so the min/max of
# the held-out test rows do not leak into the preprocessing. Test-row values
# may therefore fall slightly outside [0, 1].
# HOLDOUT_ROWS must stay equal to the number of rows the train/test split
# cell reserves for the test set.
HOLDOUT_ROWS = 200
scaler = MinMaxScaler()
scaler.fit(diff_df[feature_cols].iloc[:-HOLDOUT_ROWS])
diff_df[feature_cols] = scaler.transform(diff_df[feature_cols])

## Train / test split

In [None]:
# Positional hold-out: the last 200 rows form the test set, everything before
# them is used for training.
# NOTE(review): presumably the rows are in chronological order so the test set
# is the most recent fights - confirm against how the CSV was produced.
train_df, test_df = diff_df.iloc[:-200], diff_df.iloc[-200:]

# Feature matrices and binary targets as plain NumPy arrays.
x_train, y_train = train_df[feature_cols].to_numpy(), train_df["winner_rank"].to_numpy()
x_test, y_test = test_df[feature_cols].to_numpy(), test_df["winner_rank"].to_numpy()

# Report the shape of every split.
for label, array in (
    ("x_train", x_train),
    ("x_test", x_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(label, array.shape)

# Train LGBM

In [None]:
# Look up a previously logged MLflow run whose hyperparameters are reused to
# train the model below.
mlflow_run_id = "fbc3d4d9233e4dbe8b29b60d8154b0c0"
# NOTE(review): the tracking store is a hard-coded local file path - confirm
# it exists on the machine running this notebook.
mlflow.set_tracking_uri("file:///tmp/mlflow_2")
# All runs of experiment "0", returned as a DataFrame.
runs_df = mlflow.search_runs(experiment_ids=["0"])
# Pick the row for the run id above; `.iloc[0]` raises IndexError when no run
# with that id is present in the store.
best_run = runs_df[runs_df["run_id"] == mlflow_run_id].iloc[0]
best_run

In [None]:
# Hyperparameters taken from the selected MLflow run, each converted to the
# numeric type listed here before being handed to LightGBM.
param_types = {
    "num_leaves": int,
    "max_depth": int,
    "learning_rate": float,
    "n_estimators": int,
    "subsample": float,
    "colsample_bytree": float,
    "reg_alpha": float,
    "reg_lambda": float,
}
params = {
    name: convert(best_run[f"params.{name}"])
    for name, convert in param_types.items()
}

# verbose=-1 keeps LightGBM's training log quiet.
model = LGBMClassifier(**params, verbose=-1)
model.fit(x_train, y_train)

# SHAP interpretation

In [None]:
# Load SHAP's JavaScript support into the notebook (used by its interactive plots).
shap.initjs()

In [None]:
# Explain the fitted LightGBM model with SHAP's tree explainer.
# The attributions are computed on the training matrix, not on the held-out
# test rows.
explainer = shap.TreeExplainer(model)
shap_values = explainer(x_train)

In [None]:
# Summary plot of the SHAP values for the training rows. The feature values
# are passed as a DataFrame rather than as `x_train`, presumably so the plot
# is labelled with the column names.
shap.summary_plot(shap_values, train_df[feature_cols])