In [86]:
# match_pipeline.py
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# -------------------------
# Configuration and paths
# -------------------------
# NOTE(review): absolute, machine-specific root directory; anyone else running
# this notebook has to point BASE_DIR at their own copy of the data.
BASE_DIR = Path(r"C:\Users\viaud\Downloads\scrape_ufc_stats")

# Input CSVs, all located under BASE_DIR/data.
fighters_train_csv = BASE_DIR.joinpath("data", "train", "fighter_stats_train.csv")
fighters_test_csv = BASE_DIR.joinpath("data", "test", "fighter_stats_test.csv")
matches_csv = BASE_DIR.joinpath("data", "matchups.csv")

In [87]:
# Fighter-level columns used to build the per-match "<stat>_diff" features.
# Each name must exist as a column in the fighter stats table.
stat_features = [
    "wins",
    "losses",
    "draws",
    "height_cm",
    "weight_in_kg",
    "reach_in_cm",
    "significant_strikes_landed_per_minute",
    "significant_striking_accuracy",
    "significant_strikes_absorbed_per_minute",
    "significant_strike_defence",
    "average_takedowns_landed_per_15_minutes",
    "takedown_accuracy",
    "takedown_defense",
    "average_submissions_attempted_per_15_minutes",
]

In [88]:
def normalize_name(s):
    """Return a join-friendly form of a name.

    The value is converted to ``str``, lowercased, trimmed, and every run of
    whitespace is collapsed to a single space. Missing values (anything
    ``pd.isna`` reports as missing) become the empty string.
    """
    # str.split() with no arguments already drops leading/trailing whitespace
    # and splits on any whitespace run, so no separate strip() is needed.
    return "" if pd.isna(s) else " ".join(str(s).lower().split())

In [89]:
def load_data(fighters_path, matches_path):
    """Read the fighter and match CSVs and add normalized-name join keys.

    Parameters
    ----------
    fighters_path : path to a CSV that has a ``name`` column.
    matches_path : path to a CSV with ``Fighter_A_Name``/``Fighter_B_Name``
        columns (renamed here to ``Fighter_A``/``Fighter_B``), or with
        ``Fighter_A``/``Fighter_B`` already present.

    Returns
    -------
    (fighters, matches) : two DataFrames. ``fighters`` gains ``name_norm``;
    ``matches`` gains ``Fighter_A_name_norm`` and ``Fighter_B_name_norm``.
    """
    # Adjust this mapping if the match file uses different column names.
    column_aliases = {
        "Fighter_A_Name": "Fighter_A",
        "Fighter_B_Name": "Fighter_B",
    }

    fighters = pd.read_csv(fighters_path)
    matches = pd.read_csv(matches_path).rename(columns=column_aliases)

    # Normalized names are the keys used later to join stats onto matches.
    fighters = fighters.assign(name_norm=fighters["name"].apply(normalize_name))
    matches = matches.assign(
        Fighter_A_name_norm=matches["Fighter_A"].apply(normalize_name),
        Fighter_B_name_norm=matches["Fighter_B"].apply(normalize_name),
    )
    return fighters, matches



In [90]:
# Merge fighter stats into matches
# -------------------------
def attach_fighter_stats_to_matches(fighters, matches, stat_features):
    """Join each fighter's stats onto the match rows and add A-minus-B differences.

    NOTE: a later cell in this notebook defines a function with the same name;
    whichever definition is executed last is the one callers get.

    Parameters
    ----------
    fighters : DataFrame with a ``name_norm`` column and every column listed in
        ``stat_features`` (``name`` is only needed to report duplicates).
    matches : DataFrame with ``Fighter_A_name_norm`` and ``Fighter_B_name_norm``.
    stat_features : list of stat column names to attach.

    Returns
    -------
    A new DataFrame: ``matches`` plus ``Fighter_A_<stat>``, ``Fighter_B_<stat>``
    and ``<stat>_diff`` columns. Fighters missing from ``fighters`` produce NaN.
    """
    # The merges below use validate="m:1", which raises MergeError when the
    # right-hand key is not unique. Warning alone is therefore not enough:
    # duplicates must be removed before merging.
    dup_mask = fighters["name_norm"].duplicated(keep=False)
    if dup_mask.any():
        print("Warning duplicate fighter names found. Keeping the last row per name. "
              "Consider disambiguating by id or event.")
        print(fighters.loc[dup_mask, ["name", "name_norm"]].head(10))
        # drop_duplicates returns a new frame, so the caller's data is untouched.
        fighters = fighters.drop_duplicates(subset="name_norm", keep="last")

    stats_by_name = fighters.set_index("name_norm")[stat_features]

    # Same merge for both corners; only the column prefix differs.
    for side in ("Fighter_A", "Fighter_B"):
        side_stats = stats_by_name.add_prefix(f"{side}_").reset_index()
        matches = matches.merge(
            side_stats,
            left_on=f"{side}_name_norm",
            right_on="name_norm",
            how="left",
            validate="m:1",
        ).drop(columns=["name_norm"])

    # compute difference features (A - B); non-numeric values become NaN
    for s in stat_features:
        matches[s + "_diff"] = (
            pd.to_numeric(matches[f"Fighter_A_{s}"], errors="coerce")
            - pd.to_numeric(matches[f"Fighter_B_{s}"], errors="coerce")
        )

    return matches


In [91]:
# Label handling
# -------------------------
def extract_label(matches):
    """Add a ``_label_binary`` column: 1.0 if Fighter_A won, 0.0 if Fighter_B won.

    The label source is ``Winner_A`` when present, otherwise the first column
    whose lowercased name is one of ``winner``, ``result``, ``outcome``,
    ``label``. Values are resolved in this order:

    1. numeric values (including numeric strings) are used as-is;
    2. the codes ``a`` / ``b`` / ``fighter_a`` / ``fighter_b`` (any case) map
       to 1 / 0;
    3. any other text is treated as a winner name and compared against
       ``Fighter_A_name_norm`` / ``Fighter_B_name_norm``.

    Rows that cannot be resolved (missing winner, or a winner that names
    neither fighter, e.g. a draw) are left as NaN so callers can drop them
    instead of training on a fabricated "B won" label.

    The column is written onto ``matches`` itself, which is also returned.
    """
    if "Winner_A" in matches.columns:
        source = "Winner_A"
    else:
        candidates = [
            c for c in matches.columns
            if str(c).lower() in ("winner", "result", "outcome", "label")
        ]
        source = candidates[0] if candidates else None

    if source is None:
        matches["_label_binary"] = np.nan
        return matches

    raw = matches[source]
    label = pd.to_numeric(raw, errors="coerce").astype(float)

    if not pd.api.types.is_numeric_dtype(raw):
        # Resolve A/B codes on the raw text. Doing this after to_numeric (as
        # the previous version did) can never work, because coercion has
        # already replaced those strings with NaN.
        code_map = {"a": 1.0, "b": 0.0, "fighter_a": 1.0, "fighter_b": 0.0}
        codes = raw.astype(str).str.strip().str.lower().map(code_map)
        label = label.fillna(codes)

        # Whatever is still unresolved but not missing is treated as a name.
        pending = label.isna() & raw.notna()
        if pending.any() and "Fighter_A_name_norm" in matches.columns:
            winner_norm = raw.where(pending).apply(normalize_name)
            named = pending & (winner_norm != "")
            a_won = named & (winner_norm == matches["Fighter_A_name_norm"])
            if "Fighter_B_name_norm" in matches.columns:
                b_won = named & ~a_won & (winner_norm == matches["Fighter_B_name_norm"])
            else:
                # Without B's name to check against, fall back to the previous
                # rule: any named winner that is not A counts as B.
                b_won = named & ~a_won
            label = label.mask(a_won, 1.0).mask(b_won, 0.0)

    matches["_label_binary"] = label
    return matches

In [92]:
# Train model
# -------------------------
def train_model(matches, stat_features, test_size=0.2, random_state=42):
    """Fit an XGBoost classifier on the ``<stat>_diff`` features and report validation metrics.

    Parameters
    ----------
    matches : DataFrame containing a ``<stat>_diff`` column for every entry in
        ``stat_features`` plus a label source understood by ``extract_label``.
    stat_features : list of base stat names.
    test_size : fraction of usable rows held out for validation.
    random_state : seed for both the split and the model.

    Returns
    -------
    (model, feature_cols) : the fitted classifier and the feature column names
    in the order used for training.

    Raises
    ------
    ValueError
        If no row has complete features and a 0/1 label.
    """
    feature_cols = [f"{stat}_diff" for stat in stat_features]
    matches = extract_label(matches)

    # drop rows with missing features or label
    df = matches.dropna(subset=feature_cols + ["_label_binary"])
    # Keep only genuine binary outcomes; any other value would be silently
    # truncated by astype(int) below and train the model on a wrong label.
    df = df[df["_label_binary"].isin([0, 1])]
    if df.empty:
        raise ValueError(
            "No usable training rows: every match is missing a feature or a 0/1 label."
        )

    X = df[feature_cols].astype(float)
    y = df["_label_binary"].astype(int)

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # `use_label_encoder` is intentionally not passed: the installed XGBoost
    # reports it as an unused parameter and emits a warning on every fit.
    model = XGBClassifier(
        n_estimators=200,
        max_depth=4,
        eval_metric="logloss",
        random_state=random_state,
    )
    model.fit(X_train, y_train)

    preds = model.predict(X_val)
    print("Validation Accuracy:", accuracy_score(y_val, preds))
    print(classification_report(y_val, preds))

    return model, feature_cols

In [93]:
# Predict function
# -------------------------
def _lookup_fighter(fighters_df, name_norm, query_norm):
    """Return rows of ``fighters_df`` for ``query_norm``: exact match, else first-token substring match."""
    exact = fighters_df[name_norm == query_norm]
    if not exact.empty:
        return exact

    tokens = query_norm.split()
    if not tokens:
        # Empty/blank name: nothing to search for (previously an IndexError).
        return exact

    # regex=False: the token is plain text, not a pattern, so characters such
    # as "." or "(" in a name cannot change or break the search.
    partial = fighters_df[name_norm.str.contains(tokens[0], regex=False)].head(1)
    if not partial.empty:
        # A substring hit can be a different person; make the substitution visible.
        print(f"Warning: no exact match for '{query_norm}'; "
              f"using partial match '{partial.iloc[0]['name']}'.")
    return partial


def predict_winner(fighter_a_name, fighter_b_name, fighters_all_df, model, stat_features=stat_features):
    """Predict the winner of a single matchup.

    Inputs:
      - fighter_a_name, fighter_b_name: strings (names as in fighters_all_df 'name' column)
      - fighters_all_df: fighter-level DataFrame with a 'name' column and the stat columns
      - model: trained classifier; ``predict_proba`` is used when available,
        otherwise ``predict``
      - stat_features: stat columns, in the same order used for training
    Returns:
      - winner_name: predicted winner string (fighter_a_name or fighter_b_name)
      - confidence: probability of predicted class (float between 0 and 1);
        1.0 when the model only offers hard predictions
    Raises:
      - ValueError if either fighter cannot be found.
    """
    # Normalized names are computed on the fly so the caller's frame is not modified.
    name_norm = fighters_all_df["name"].apply(normalize_name)
    row_a = _lookup_fighter(fighters_all_df, name_norm, normalize_name(fighter_a_name))
    row_b = _lookup_fighter(fighters_all_df, name_norm, normalize_name(fighter_b_name))

    missing = [
        name
        for name, rows in ((fighter_a_name, row_a), (fighter_b_name, row_b))
        if rows.empty
    ]
    if missing:
        raise ValueError(f"Could not find fighter stats for: {missing}. Check names or provide normalized aliases.")

    # build feature vector (A - B)
    a_row = row_a.iloc[0]
    b_row = row_b.iloc[0]
    feat_values = []
    for s in stat_features:
        a_val = pd.to_numeric(a_row.get(s, np.nan), errors="coerce")
        b_val = pd.to_numeric(b_row.get(s, np.nan), errors="coerce")
        # NOTE(review): a missing stat is treated as 0 here, whereas training
        # drops rows with missing features; a fighter with missing stats can
        # therefore yield extreme differences. Confirm this is intended.
        diff = (a_val if not pd.isna(a_val) else 0.0) - (b_val if not pd.isna(b_val) else 0.0)
        feat_values.append(diff)

    X = np.array(feat_values, dtype=float).reshape(1, -1)

    if hasattr(model, "predict_proba"):
        prob = model.predict_proba(X)[0]
        # class 1 => Fighter_A wins
        prob_a = prob[1] if model.classes_[1] == 1 else prob[0]
    else:
        # No probabilities available: derive a hard 0/1 from the predicted
        # class. (Previously this was always 1.0, so Fighter_A always "won".)
        pred = model.predict(X)[0]
        prob_a = 1.0 if pred == 1 else 0.0

    pred_label = 1 if prob_a >= 0.5 else 0
    winner = fighter_a_name if pred_label == 1 else fighter_b_name
    confidence = prob_a if pred_label == 1 else 1 - prob_a
    return winner, float(confidence)

In [94]:
def attach_fighter_stats_to_matches(fighters, matches, stat_features):
    """Join fighter stats onto matches for both corners and add ``<stat>_diff`` columns.

    ``fighters`` needs ``name_norm`` and every column in ``stat_features``
    (plus ``name`` when duplicates have to be reported). ``matches`` needs
    ``Fighter_A_name_norm`` and ``Fighter_B_name_norm``. When ``name_norm`` is
    duplicated, a warning and a sample are printed and only the last row per
    name is kept. The caller's ``fighters`` frame is not modified.

    Returns a new DataFrame holding the match columns, the prefixed stats for
    each fighter, and ``<stat>_diff`` = A minus B (NaN where a value is missing
    or non-numeric).
    """
    # Work on a private copy so the caller's frame is left alone.
    roster = fighters.copy()

    repeated = roster["name_norm"].duplicated(keep=False)
    if repeated.any():
        sample = roster[repeated][["name", "name_norm"]]
        print("Warning: duplicate fighter names found in fighters_all_df. Deduplicating by name_norm (keeping last).")
        print(sample.head(10))
        # keep="last" is a choice, not a requirement; "first" works equally well
        roster = roster.drop_duplicates(subset="name_norm", keep="last")

    # name_norm is unique from here on, which is what validate="m:1" requires.
    for prefix in ("Fighter_A_", "Fighter_B_"):
        side_stats = roster.set_index("name_norm")[stat_features]
        side_stats = side_stats.add_prefix(prefix)
        side_stats = side_stats.reset_index()

        merged = matches.merge(
            side_stats,
            left_on=prefix + "name_norm",
            right_on="name_norm",
            how="left",
            validate="m:1",
        )
        # The right-hand key is redundant once the join is done.
        matches = merged.drop(columns=["name_norm"])

    # Difference features, always A minus B.
    for stat in stat_features:
        a_values = pd.to_numeric(matches[f"Fighter_A_{stat}"], errors="coerce")
        b_values = pd.to_numeric(matches[f"Fighter_B_{stat}"], errors="coerce")
        matches[stat + "_diff"] = a_values - b_values

    return matches


In [95]:
def load_fighters_all(train_path, test_path):
    """Read the train and test fighter CSVs and stack them into one table.

    The result has a fresh 0..n-1 index and a ``name_norm`` column built from
    ``name`` with ``normalize_name``. Rows are not deduplicated, so a fighter
    present in both files appears twice.
    """
    frames = [pd.read_csv(csv_path) for csv_path in (train_path, test_path)]
    combined = pd.concat(frames, ignore_index=True)
    combined["name_norm"] = combined["name"].apply(normalize_name)
    return combined


In [96]:
# End-to-end run: build match features, train the model, then predict one matchup.
# The names bound here (`model`, `fighters_all_df`, ...) are notebook globals
# that the later prediction cells rely on.
if __name__ == "__main__":
    # 1. Load fighters for training (only train file) + matches
    fighters_train_df, matches_df = load_data(fighters_train_csv, matches_csv)

    # Only the train-file fighters are joined here, so a match involving a
    # fighter absent from that file gets NaN features and is dropped by
    # train_model's dropna.
    matches_with_features = attach_fighter_stats_to_matches(
        fighters_train_df,
        matches_df,
        stat_features
    )
    # Relative filename: written to the current working directory, not BASE_DIR.
    matches_with_features.to_csv("matches_with_features.csv", index=False)
    print("Saved matches_with_features.csv")

    # Prints validation accuracy and a classification report as a side effect.
    model, feature_cols = train_model(matches_with_features, stat_features)

    # 2. Load ALL fighters (train + test) for prediction lookups
    fighters_all_df = load_fighters_all(fighters_train_csv, fighters_test_csv)

    # 3. Use fighters_all_df in predict_winner
    winner, confidence = predict_winner(
        "Merab Dvalishvili",
        "Petr Yan",
        fighters_all_df,
        model
    )
    print(f"Predicted winner: {winner} with {confidence*100:.1f}% confidence")


               name        name_norm
0      Istela Nunes     istela nunes
1         Mark Weir        mark weir
3        Joe Warren       joe warren
4     Fernie Garcia    fernie garcia
5      Steve Berger     steve berger
6     Nick Osipczak    nick osipczak
7       Chris Price      chris price
8          CJ Marsh         cj marsh
12   Rashad Coulter   rashad coulter
13  Talita Nogueira  talita nogueira
Saved matches_with_features.csv
Validation Accuracy: 0.623175965665236


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


              precision    recall  f1-score   support

           0       0.61      0.62      0.61       566
           1       0.64      0.63      0.63       599

    accuracy                           0.62      1165
   macro avg       0.62      0.62      0.62      1165
weighted avg       0.62      0.62      0.62      1165

Predicted winner: Merab Dvalishvili with 90.3% confidence


In [97]:
# Ad-hoc matchup prediction. Relies on the globals `fighters_all_df` and `model`
# created by the pipeline cell, so that cell must have been run first.
winner, confidence = predict_winner(
    "Kennedy Nzechukwu",
    "Marcus Buchecha",
    fighters_all_df,
    model
)
print(f"Predicted winner: {winner} with {confidence*100:.1f}% confidence")


Predicted winner: Kennedy Nzechukwu with 94.8% confidence


In [98]:
# Ad-hoc matchup prediction. Relies on the globals `fighters_all_df` and `model`
# created by the pipeline cell, so that cell must have been run first.
winner, confidence = predict_winner(
    "Cesar Almeida",
    "Cezary Oleksiejczuk",
    fighters_all_df,
    model
)
print(f"Predicted winner: {winner} with {confidence*100:.1f}% confidence")


Predicted winner: Cezary Oleksiejczuk with 99.3% confidence


In [99]:
# Ad-hoc matchup prediction. Relies on the globals `fighters_all_df` and `model`
# created by the pipeline cell, so that cell must have been run first.
winner, confidence = predict_winner(
    "Melquizael Costa",
    "Morgan Charriere",
    fighters_all_df,
    model
)
print(f"Predicted winner: {winner} with {confidence*100:.1f}% confidence")


Predicted winner: Melquizael Costa with 84.5% confidence


In [101]:
# Ad-hoc matchup prediction. Relies on the globals `fighters_all_df` and `model`
# created by the pipeline cell, so that cell must have been run first.
winner, confidence = predict_winner(
    "Brandon Royval",
    "Manel Kape",
    fighters_all_df,
    model
)
print(f"Predicted winner: {winner} with {confidence*100:.1f}% confidence")


Predicted winner: Brandon Royval with 50.1% confidence
