In [None]:
import pickle
from itertools import cycle

import matplotlib.pyplot as plt
import numpy
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.metrics import (
    auc,
    classification_report,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import train_test_split

In [None]:
# Load the pre-processed sample data: home-team stats, away-team stats and the
# match info that carries the labels.
# The directory is defined once so relocating the data is a one-line change.
# NOTE(review): this is still an absolute, machine-specific path -- consider
# making it relative to the repository or reading it from configuration.
DATA_DIR = "/Users/paraspokharel/Programming/pitchProphet/pitchProphet/data/pre_processing/processed"

home_stat = pd.read_csv(f"{DATA_DIR}/home_stats_432rows.csv")
away_stat = pd.read_csv(f"{DATA_DIR}/away_stats_432rows.csv")
match_stat = pd.read_csv(f"{DATA_DIR}/match_info_with_labels_432rows.csv")

In [None]:
# Prepare training data: build the feature matrix and the label frame, then
# split both into train and test partitions.

# pd.concat(axis=1) aligns on the index and pads with NaN when the inputs have
# different lengths, so fail loudly if the three files disagree on row count.
assert len(home_stat) == len(away_stat) == len(match_stat), (
    "home_stat, away_stat and match_stat must have the same number of rows"
)

# Drop the "Unnamed: 0" column when present (presumably an index column that
# was written out with the CSV -- confirm) and prefix the column names so home
# and away features stay distinguishable after concatenation.
# New frames are created instead of mutating home_stat / away_stat, so
# re-running this cell does not stack another "h" / "a" prefix onto the names.
home_features = home_stat.drop(columns=["Unnamed: 0"], errors="ignore").add_prefix("h")
away_features = away_stat.drop(columns=["Unnamed: 0"], errors="ignore").add_prefix("a")

# Coerce everything to numeric dtypes; pd.to_numeric raises on values it
# cannot parse.
x_df = pd.concat([home_features, away_features], axis=1).apply(pd.to_numeric)
y_df = match_stat[["label"]].apply(pd.to_numeric)

# Split features and labels in a single call so both are guaranteed to share
# the same row partition (70% train / 30% test, fixed seed).
x_train, x_test, y_train, y_test = train_test_split(
    x_df, y_df, test_size=0.3, random_state=42
)

# Show the split sizes and a small preview instead of dumping whole frames.
print(x_train.shape, y_train.shape)
x_train.head()

In [None]:
# Configure a 3-class XGBoost classifier. The "multi:softprob" objective makes
# the model produce one probability per class.
xgb_params = {
    "objective": "multi:softprob",
    "num_class": 3,
    "learning_rate": 0.1,
    "max_depth": 6,
    "n_estimators": 100,
}
model = xgb.XGBClassifier(**xgb_params)

# Fit on the training split. Both the train and the test split are passed as
# evaluation sets so their metrics can be read back via evals_result() later;
# no early stopping or explicit eval metric is configured here.
evaluation_sets = [(x_train, y_train), (x_test, y_test)]
model.fit(x_train, y_train, eval_set=evaluation_sets, verbose=False)

In [None]:
# Evaluate the fitted model on the held-out test split.

# Evaluation history for the eval_set entries passed to fit().
eval_result = model.evals_result()

# Hard class predictions and per-class probabilities for the test split.
y_pred = model.predict(x_test)
y_pred_proba = model.predict_proba(x_test)

# Flatten the single-column label DataFrame into a 1-D numpy array.
y_test_np = y_test.values.ravel()

# Macro-averaged one-vs-rest ROC AUC. Only ValueError is caught: that is what
# roc_auc_score raises for unusable inputs (e.g. a class that never occurs in
# y_test). Anything else is a genuine bug and should surface as a traceback.
try:
    auc_ovr = roc_auc_score(y_test_np, y_pred_proba, multi_class="ovr", average="macro")
    print(f"One-vs-Rest ROC AUC: {auc_ovr:.3f}")
except ValueError as e:
    print(f"Error calculating ROC AUC: {str(e)}")

# Accuracy is independent of the AUC computation, so it lives outside the try
# block: an AUC failure no longer hides it, and an accuracy failure is no
# longer misreported as a ROC AUC error.
accuracy = (y_pred == y_test_np).mean()
print(f"Accuracy: {accuracy:.3f}")

In [None]:
# Save the trained model to disk (pickle serialises it to a byte stream).
# The context manager guarantees the file handle is flushed and closed, even
# if pickle.dump raises part-way through.
with open(r"../xgb_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

In [None]:
# Plots

# Confusion matrix of true vs. predicted labels, drawn as an annotated heatmap
cm = confusion_matrix(y_test_np, y_pred)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_title("Confusion Matrix")
ax.set_ylabel("True Label")
ax.set_xlabel("Predicted Label")
plt.show()

# Feature importance: horizontal bar chart of the 20 highest-scoring features
importance_table = pd.DataFrame(
    {"feature": x_train.columns, "importance": model.feature_importances_}
)
# Sort descending and keep the first 20 rows.
feature_importance = importance_table.sort_values("importance", ascending=False).head(20)
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x="importance", y="feature", data=feature_importance, ax=ax)
ax.set_title("Top 20 Feature Importance")
plt.show()

# Text report of per-class metrics on the test split
class_labels = ["Home Win", "Draw", "Away Win"]
print("\nClassification Report:")
report_text = classification_report(y_test_np, y_pred, target_names=class_labels)
print(report_text)

# Probability distribution: one histogram of predicted probabilities per class,
# laid out side by side
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for class_idx, (ax, label) in enumerate(zip(axes, ["Home Win", "Draw", "Away Win"])):
    # Column class_idx of y_pred_proba holds the probabilities for that class.
    sns.histplot(y_pred_proba[:, class_idx], bins=20, ax=ax)
    ax.set_title(f"Probability Distribution for {label}")
    ax.set_xlabel("Predicted Probability")
fig.tight_layout()
plt.show()

# ROC curves: one one-vs-rest curve per class, all on a single figure
plt.figure(figsize=(10, 8))
# Display names and line colours, indexed by class id.
# NOTE(review): assumes labels are encoded 0/1/2 in this order -- confirm
# against the pre-processing step.
classes = ["Home Win", "Draw", "Away Win"]
colors = ["blue", "red", "green"]
n_classes = len(classes)
fpr = {}
tpr = {}
roc_auc = {}

# Calculate the ROC curve and the area under it for each class. The class is
# binarised against the rest, and the 1-D label array (y_test_np) is used so
# this matches the other metrics instead of relying on roc_curve flattening an
# (n, 1) DataFrame.
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve((y_test_np == i).astype(int), y_pred_proba[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot one curve per class, with its AUC in the legend.
for i, color in enumerate(colors):
    plt.plot(
        fpr[i],
        tpr[i],
        color=color,
        lw=2,
        label=f"ROC curve of {classes[i]} (AUC = {roc_auc[i]:0.2f})",
    )

# Diagonal reference line.
plt.plot([0, 1], [0, 1], "k--", lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curves for Multi-class")
plt.legend(loc="lower right")
plt.grid(True)
plt.show()