In [None]:
import os
# Run the notebook from the dataset root so that relative paths such as
# "resources/df_features.csv" resolve. The default is the original author's
# machine; set the UFC_DATASET_DIR environment variable to run the notebook
# elsewhere without editing this cell.
os.chdir(os.environ.get("UFC_DATASET_DIR", r"D:\PythonApps\ufc_complete_dataset"))

In [None]:
import mlflow

import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from tqdm import tqdm
from IPython.core.display import HTML
from IPython.display import display

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
import warnings
# Suppress every Python warning for the rest of the session to keep cell
# output short. NOTE(review): this is a blanket filter, so deprecation and
# library warnings are hidden too — remove it when debugging.
warnings.filterwarnings('ignore')

In [None]:
# Point MLflow at a local file-based tracking store (file:///tmp/mlflow_2)
# before any run is started in the cells below.
mlflow.set_tracking_uri("file:///tmp/mlflow_2")

# Prepare dataset

In [None]:
# Load the feature table; the first CSV column becomes the row index.
df = pd.read_csv("resources/df_features.csv", index_col=0)
# Show five random rows as a quick sanity check of the loaded data.
df.sample(5)

In [None]:
# List the column names available in the loaded feature table.
df.columns

In [None]:
# Base names of the per-fighter features. Each base name is looked up in `df`
# twice, once with an "r_" prefix and once with a "b_" prefix (presumably the
# red and blue corner — confirm against the CSV schema).
feature_cols = [
    # record / streak counters
    'fighter_hist_wins', 'fighter_hist_looses', 'fighter_hist_total',
    'fighter_title_fights', 'wins_ratio', 'tf_ratio',
    'wins_streak', 'lost_streak',
    # "*_norm" statistics
    'SLpM_norm', 'sig_str_acc_norm', 'SApM_norm',
    'str_def_norm', 'td_avg_norm', 'td_acc_norm',
    # remaining skill columns
    'significant_strikes', 'damage_defense', 'offensive_grappling',
    'defensive_grappling', 'submissions',
]

# Prefixed column names, kept in the same order as `feature_cols` so that
# position j in each list refers to the same underlying feature.
r_features = ["r_" + name for name in feature_cols]
b_features = ["b_" + name for name in feature_cols]

In [None]:
# Build the modelling table: one row per row of `df`, holding the r-minus-b
# difference of every feature as a float, plus the raw `winner` label and a
# binary target.
#
# The subtraction is done for all rows and features in a single NumPy
# operation instead of a nested Python loop over `df.iloc[i][...]`, which
# materialises a new row Series on every access.
diff_values = (
    df[r_features].to_numpy(dtype=float) - df[b_features].to_numpy(dtype=float)
)
diff_df = pd.DataFrame(diff_values, columns=feature_cols)

# Assign by position (`to_numpy`) so the values land on the fresh 0..n-1 index
# of `diff_df` rather than being aligned on whatever index `df` carries.
diff_df["winner"] = df["winner"].to_numpy()
# 1 when `winner` equals "Red", 0 for every other value (including missing).
diff_df["winner_rank"] = (df["winner"] == "Red").to_numpy().astype("int64")

In [None]:
# Preview five random rows of the difference table.
diff_df.sample(5)

In [None]:
# Print column dtypes and non-null counts of the difference table.
diff_df.info()

## Delete records with no data

In [None]:
# A row is treated as empty when every feature difference is exactly 0
# (presumably because neither fighter has any recorded history — confirm).
# NaN compares unequal to 0, so rows containing NaN are kept, exactly as with
# a per-value truthiness test.
#
# The mask is computed column-wise in one pass; no temporary "is_empty"
# column is added to the frame.
has_signal = diff_df[feature_cols].ne(0).any(axis=1)
diff_df = diff_df.loc[has_signal].reset_index(drop=True)

In [None]:
# Display the filtered difference table (rows with all-zero features removed).
diff_df

## Normalization

In [None]:
# Fit the scaler on the training rows only — everything except the last
# `holdout_rows` rows, which the train/test split section below reserves for
# evaluation — and then apply it to the whole table. Fitting on all rows would
# let the hold-out rows influence the min/max used to scale the training data
# (data leakage). As a consequence, hold-out values may fall slightly outside
# [0, 1].
holdout_rows = 200  # must match the size of the test slice used below
scaler = MinMaxScaler()
scaler.fit(diff_df[feature_cols].iloc[:-holdout_rows])
diff_df[feature_cols] = scaler.transform(diff_df[feature_cols])

In [None]:
# Preview five random rows after scaling.
diff_df.sample(5)

## Train / test split

In [None]:
# Positional (not random) split: the final `holdout` rows form the test set,
# every row before them the training set.
holdout = 200
train_df = diff_df.iloc[:-holdout]
test_df = diff_df.iloc[-holdout:]

# Feature matrices and binary targets as plain NumPy arrays.
x_train, y_train = train_df[feature_cols].to_numpy(), train_df["winner_rank"].to_numpy()
x_test, y_test = test_df[feature_cols].to_numpy(), test_df["winner_rank"].to_numpy()

# Report the shape of each array as a sanity check.
for label, arr in (
    ("x_train", x_train),
    ("x_test", x_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(label, arr.shape)

# ML: model selection

In [None]:
# (label, estimator class) pairs for the baseline comparison. Despite the
# name this is a tuple of pairs, iterated in order. The classes are stored
# uninstantiated; the training loop constructs each one with its defaults.
models_dict = (
    ("logistic_regression", LogisticRegression),
    ("decision_tree", DecisionTreeClassifier),
    ("random_forest", RandomForestClassifier),
    ("gradient_boost", GradientBoostingClassifier),
    ("xgboost", XGBClassifier),
    ("lgbm", LGBMClassifier),
)

In [None]:
# Train every baseline model with default hyper-parameters, one MLflow run
# per model, logging the model label plus train/test accuracy.
for model_name, model_cls in models_dict:
    with mlflow.start_run():
        print()
        print("-"*100)
        print(f'WORKING ON MODEL: {model_name}')

        model = model_cls()
        model.fit(x_train, y_train)

        train_pred = model.predict(x_train)
        test_pred = model.predict(x_test)
        train_acc = accuracy_score(y_train, train_pred)
        test_acc = accuracy_score(y_test, test_pred)

        mlflow.log_param("model", model_name)
        mlflow.log_metric("train_acc", train_acc)
        mlflow.log_metric("test_acc", test_acc)

In [None]:
# Fetch the runs of experiment "0" and keep the first len(models_dict) rows,
# intended to be one row per baseline model trained above.
# NOTE(review): this presumably relies on search_runs returning the newest
# runs first and on no other runs having been logged since — confirm.
runs_df = mlflow.search_runs(experiment_ids=["0"])[:len(models_dict)]
runs_df

In [None]:
# Grouped bar chart of train vs. test accuracy for every baseline model.
fig = go.Figure()

# One bar series per data split; both use the model labels on the x axis.
for series_name, metric_col, colour in (
    ("Train", "metrics.train_acc", "teal"),
    ("Test", "metrics.test_acc", "orange"),
):
    x_data = runs_df["params.model"].to_list()
    y_data = runs_df[metric_col].to_list()
    fig.add_trace(
        go.Bar(
            name=series_name,
            x=x_data,
            y=y_data,
            # Bar labels rounded to three decimals.
            text=[round(y, 3) for y in y_data],
            marker_color=colour,
        )
    )

# Fixed y range slightly above 1 so labels on tall bars are not clipped.
fig.update_layout(width=1000, height=600, yaxis=dict(range=(0, 1.1)))

fig.show()

# ML: hyperparameter tuning

In [None]:
# Scratch list; none of the cells visible in this notebook append to or read it.
history = []
# Number of random hyper-parameter configurations tried by the tuning loop,
# and the number of most recent runs read back afterwards.
loops = 1000

In [None]:
def return_params() -> dict:
    """Draw one random hyper-parameter configuration for ``LGBMClassifier``.

    All values are drawn from NumPy's global random state, so results are
    reproducible only if the caller seeds ``np.random`` beforehand.

    Returns:
        dict: keyword arguments with these keys and ranges:
            num_leaves        int in [10, 250]
            max_depth         int in [10, 500]
            learning_rate     0.001 .. 0.999 in steps of 0.001
            n_estimators      int in [10, 500]
            subsample         0.90 .. 1 (draws above 1 are clamped to 1)
            colsample_bytree  0.90 .. 1 (draws above 1 are clamped to 1)
            reg_alpha         0.001 .. 0.099 in steps of 0.001
            reg_lambda        0.001 .. 0.099 in steps of 0.001
    """
    randint = np.random.randint

    def capped_fraction():
        # Integer percent in [90, 109] turned into a fraction and clamped at
        # 1, so half of the possible draws end up as exactly 1.
        return min(randint(90, 110) / 100, 1)

    # Keys are filled in a fixed order so the sequence of random draws is
    # stable for a given seed.
    params = {}
    params["num_leaves"] = randint(10, 251)
    params["max_depth"] = randint(10, 501)
    params["learning_rate"] = randint(1, 1000) / 1000
    params["n_estimators"] = randint(10, 501)
    params["subsample"] = capped_fraction()
    params["colsample_bytree"] = capped_fraction()
    params["reg_alpha"] = randint(1, 100) / 1000
    params["reg_lambda"] = randint(1, 100) / 1000
    return params

In [None]:
# Random search over LightGBM hyper-parameters: `loops` independent draws,
# one MLflow run each, logging the drawn parameters and both accuracies.
# NOTE(review): test_acc is later used to rank the runs, so the hold-out set
# doubles as a model-selection set.
for i in tqdm(range(loops)):
    with mlflow.start_run():
        params = return_params()
        model = LGBMClassifier(verbose=-1, **params)
        model.fit(x_train, y_train)

        train_acc = accuracy_score(y_train, model.predict(x_train))
        test_acc = accuracy_score(y_test, model.predict(x_test))

        # Log the run label and every drawn hyper-parameter in one call.
        mlflow.log_params({"model": f"lgbm_tuning_{i}", **params})
        mlflow.log_metrics({"train_acc": train_acc, "test_acc": test_acc})

In [None]:
# Hyper-parameter columns of the MLflow runs table that are plotted below.
params_cols = [
    'params.n_estimators', 'params.reg_alpha', 'params.subsample',
    'params.num_leaves', 'params.max_depth', 'params.colsample_bytree',
    'params.reg_lambda', 'params.learning_rate',
]

# Full column selection: run id, both accuracies, the hyper-parameters above
# and the run label.
cols = ['run_id', 'metrics.test_acc', 'metrics.train_acc', *params_cols, 'params.model']

# Keep the first `loops` rows returned for experiment "0".
# NOTE(review): presumably these are the tuning runs because search_runs
# lists the newest runs first — confirm.
runs_df = mlflow.search_runs(experiment_ids=["0"]).iloc[:loops][cols]

# Gap between train and test accuracy, added as an extra column.
runs_df["train_test_diff"] = runs_df['metrics.train_acc'].sub(runs_df['metrics.test_acc'])

In [None]:
# One scatter plot per hyper-parameter: parameter value against test accuracy.
for col in params_cols:
    # The parameter column is cast to float before plotting (presumably
    # because MLflow hands logged params back as strings — confirm).
    scatter = go.Scattergl(
        x=runs_df[col].astype(float),
        y=runs_df["metrics.test_acc"],
        mode="markers",
        marker=dict(size=3),
    )

    fig = go.Figure(data=[scatter])
    fig.update_layout(title=f"{col} x test acc", width=500, height=400)
    fig.show()

In [None]:
# Rank the runs: highest test accuracy first, ties broken by the smallest
# train-minus-test accuracy gap.
runs_df.sort_values(["metrics.test_acc", "train_test_diff"], ascending=[False, True])