In [2]:
from collections import defaultdict, deque
from pathlib import Path

import joblib
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

import xgboost as xgb
from xgboost import XGBClassifier

In [17]:
def load_matches(folder="atp_matches"):
    """Read every ``*.csv`` file in ``folder`` and stack them into one frame.

    Files are read in sorted path order and ``tourney_date`` is cast to int
    in each file before concatenation.

    Parameters
    ----------
    folder : str or Path
        Directory that holds the match CSV files.

    Returns
    -------
    pandas.DataFrame
        All files concatenated with a fresh 0..n-1 index.

    Raises
    ------
    FileNotFoundError
        If the folder contains no CSV files.
    """
    root = Path(folder)
    csv_paths = sorted(root.glob("*.csv"))
    if not csv_paths:
        raise FileNotFoundError(f"Ingen CSV-filer fundet i {root.resolve()}")

    def _read_one(path):
        # Normalise the date column to int (YYYYMMDD) per file
        frame = pd.read_csv(path)
        frame["tourney_date"] = frame["tourney_date"].astype(int)
        return frame

    return pd.concat([_read_one(path) for path in csv_paths], ignore_index=True)

# Seasons 2000-2023 come from the folder; the 2024 season is stored in a
# separate file outside that folder and is appended afterwards.
df_train = load_matches("atp_matches")
df_2024 = pd.read_csv("atp_matches_2024.csv").assign(
    tourney_date=lambda frame: frame["tourney_date"].astype(int)
)

df = pd.concat([df_train, df_2024], ignore_index=True)

date_lo, date_hi = df["tourney_date"].min(), df["tourney_date"].max()
print("Loaded rows:", len(df))
print("Date range:", date_lo, "to", date_hi)


Loaded rows: 74906
Date range: 20000103 to 20241218


In [18]:
def add_date_features(df):
    """Return a copy of ``df`` with ``date_year`` and ``date_doy`` added.

    ``tourney_date`` is parsed as a ``YYYYMMDD`` date; values that do not
    parse are coerced to NaT, so their derived features are missing.
    """
    parsed = pd.to_datetime(
        df["tourney_date"].astype(str), format="%Y%m%d", errors="coerce"
    )
    return df.assign(date_year=parsed.dt.year, date_doy=parsed.dt.dayofyear)

def add_lastN_overall_and_surface(df, k=10):
    """Add rolling last-``k`` win-rate features, overall and per surface.

    Matches are processed in ``tourney_date`` (then ``match_num``, if present)
    order. For every match the features describe each player's previous
    results only; the current match is added to the history afterwards.

    Added columns (for both ``winner`` and ``loser``):
    ``*_last{k}_wr`` / ``*_last{k}_n`` and ``*_surf_last{k}_wr`` /
    ``*_surf_last{k}_n``. A player with no history gets win rate 0 and n 0.

    Returns a sorted copy of ``df`` with a fresh index.
    """
    order_by = ["tourney_date"]
    if "match_num" in df.columns:
        order_by.append("match_num")
    df = df.copy().sort_values(order_by).reset_index(drop=True)

    # Bounded histories of 1 (win) / 0 (loss), keyed by player and (player, surface)
    overall = defaultdict(lambda: deque(maxlen=k))
    on_surface = defaultdict(lambda: deque(maxlen=k))

    sides = ("w_ov", "l_ov", "w_sf", "l_sf")
    wins = {side: [] for side in sides}
    seen = {side: [] for side in sides}

    n_rows = len(df)
    # A missing surface column falls back to the "UNK" bucket for every match
    surfaces = df["surface"].tolist() if "surface" in df.columns else ["UNK"] * n_rows

    for wid, lid, raw_surface in zip(df["winner_id"].tolist(), df["loser_id"].tolist(), surfaces):
        surf = str(raw_surface)
        current = {
            "w_ov": overall[wid],
            "l_ov": overall[lid],
            "w_sf": on_surface[(wid, surf)],
            "l_sf": on_surface[(lid, surf)],
        }

        # Snapshot the pre-match state first ...
        for side, history in current.items():
            wins[side].append(sum(history))
            seen[side].append(len(history))

        # ... and only then record the result of this match
        current["w_ov"].append(1)
        current["l_ov"].append(0)
        current["w_sf"].append(1)
        current["l_sf"].append(0)

    def _win_rate(side):
        # max(1, n) keeps the rate at 0 instead of dividing by zero
        return np.array(wins[side]) / np.maximum(1, np.array(seen[side]))

    df[f"winner_last{k}_wr"] = _win_rate("w_ov")
    df[f"winner_last{k}_n"]  = seen["w_ov"]
    df[f"loser_last{k}_wr"]  = _win_rate("l_ov")
    df[f"loser_last{k}_n"]   = seen["l_ov"]

    df[f"winner_surf_last{k}_wr"] = _win_rate("w_sf")
    df[f"winner_surf_last{k}_n"]  = seen["w_sf"]
    df[f"loser_surf_last{k}_wr"]  = _win_rate("l_sf")
    df[f"loser_surf_last{k}_n"]   = seen["l_sf"]
    return df

def add_global_elo(df, base=1500.0, rho=0.85):
    """Add pre-match global Elo ratings (``winner_elo_pre`` / ``loser_elo_pre``).

    Matches are processed in ``tourney_date`` (then ``match_num``, if present)
    order. Every player starts at ``base``. Whenever the calendar year of the
    processed match changes, all known ratings are pulled towards ``base`` by
    the factor ``rho``. The update step K depends on the year and is scaled
    up for best-of-5 matches and for tournament levels G, M and A.

    Returns a sorted copy of ``df`` with a fresh index.
    """
    order_by = ["tourney_date"] + [col for col in ("match_num",) if col in df.columns]
    df = df.copy().sort_values(order_by).reset_index(drop=True)

    # (first year, K) pairs, newest first; the first pair whose year is reached wins
    k_schedule = ((2022, 64.0), (2018, 48.0), (2010, 40.0), (2000, 32.0))
    level_bonus = {"G": 1.25, "M": 1.15, "A": 1.05}

    def _base_k(year):
        return next((float(kval) for start, kval in k_schedule if year >= start), 48.0)

    n_rows = len(df)
    # Optional columns fall back to best-of-3 and an unknown tournament level
    best_ofs = df["best_of"].tolist() if "best_of" in df.columns else [3] * n_rows
    levels = df["tourney_level"].tolist() if "tourney_level" in df.columns else [""] * n_rows

    ratings = defaultdict(lambda: base)
    winner_pre, loser_pre = [], []
    season = None

    rows = zip(df["tourney_date"].tolist(), df["winner_id"].tolist(),
               df["loser_id"].tolist(), best_ofs, levels)
    for date, wid, lid, best_of, level in rows:
        year = int(str(date)[:4])
        if season is None:
            season = year
        elif year != season:
            # New season: regress every known rating towards the baseline
            for pid, rating in list(ratings.items()):
                ratings[pid] = base + rho * (rating - base)
            season = year

        r_win, r_lose = ratings[wid], ratings[lid]
        winner_pre.append(r_win)
        loser_pre.append(r_lose)

        # Expected score of the eventual winner before the match
        p_win = 1.0 / (1.0 + 10 ** ((r_lose - r_win) / 400.0))
        k_eff = _base_k(year) * (1.25 if best_of == 5 else 1.0) * level_bonus.get(str(level), 1.0)

        # Zero-sum update: the winner gains exactly what the loser gives up
        shift = k_eff * (1 - p_win)
        ratings[wid] = r_win + shift
        ratings[lid] = r_lose - shift

    df["winner_elo_pre"] = winner_pre
    df["loser_elo_pre"]  = loser_pre
    return df

def add_surface_elo(df, base=1500.0, rho=0.85, k_surface_scale=0.70):
    """Add pre-match surface-specific Elo (``winner_surfelo_pre`` / ``loser_surfelo_pre``).

    Same scheme as a plain Elo pass, but ratings are tracked per
    ``(player, surface)`` pair and the year-dependent K is additionally
    multiplied by ``k_surface_scale``. Matches are processed in
    ``tourney_date`` (then ``match_num``, if present) order, and all ratings
    are pulled towards ``base`` by ``rho`` when the calendar year changes.

    Returns a sorted copy of ``df`` with a fresh index.
    """
    order_by = ["tourney_date"] + [col for col in ("match_num",) if col in df.columns]
    df = df.copy().sort_values(order_by).reset_index(drop=True)

    # (first year, K) pairs, newest first; the first pair whose year is reached wins
    k_schedule = ((2022, 48.0), (2018, 36.0), (2010, 32.0), (2000, 24.0))
    level_bonus = {"G": 1.25, "M": 1.15, "A": 1.05}

    def _base_k(year):
        return next((float(kval) for start, kval in k_schedule if year >= start), 32.0)

    n_rows = len(df)
    # Optional columns fall back to best-of-3, unknown level and the "UNK" surface
    best_ofs = df["best_of"].tolist() if "best_of" in df.columns else [3] * n_rows
    levels = df["tourney_level"].tolist() if "tourney_level" in df.columns else [""] * n_rows
    surfaces = df["surface"].tolist() if "surface" in df.columns else ["UNK"] * n_rows

    ratings = defaultdict(lambda: base)
    winner_pre, loser_pre = [], []
    season = None

    rows = zip(df["tourney_date"].tolist(), df["winner_id"].tolist(),
               df["loser_id"].tolist(), best_ofs, levels, surfaces)
    for date, wid, lid, best_of, level, raw_surface in rows:
        year = int(str(date)[:4])
        if season is None:
            season = year
        elif year != season:
            # New season: regress every known (player, surface) rating to the baseline
            for key, rating in list(ratings.items()):
                ratings[key] = base + rho * (rating - base)
            season = year

        surf = str(raw_surface)
        w_key, l_key = (wid, surf), (lid, surf)
        r_win, r_lose = ratings[w_key], ratings[l_key]
        winner_pre.append(r_win)
        loser_pre.append(r_lose)

        # Expected score of the eventual winner before the match
        p_win = 1.0 / (1.0 + 10 ** ((r_lose - r_win) / 400.0))
        k_eff = (_base_k(year) * k_surface_scale
                 * (1.25 if best_of == 5 else 1.0)
                 * level_bonus.get(str(level), 1.0))

        # Zero-sum update on this surface only
        shift = k_eff * (1 - p_win)
        ratings[w_key] = r_win + shift
        ratings[l_key] = r_lose - shift

    df["winner_surfelo_pre"] = winner_pre
    df["loser_surfelo_pre"]  = loser_pre
    return df

# Feature pipeline: calendar features, global Elo, surface Elo, then rolling
# form over the last 3 and the last 10 matches.
df = (
    df.pipe(add_date_features)
      .pipe(add_global_elo)
      .pipe(add_surface_elo)
      .pipe(add_lastN_overall_and_surface, k=3)
      .pipe(add_lastN_overall_and_surface, k=10)
)

elo_cols = [col for col in df.columns if "elo" in col]
print("Features built. Example cols:", elo_cols[:6])


Features built. Example cols: ['winner_elo_pre', 'loser_elo_pre', 'winner_surfelo_pre', 'loser_surfelo_pre']


In [19]:
def build_pairwise_dataset(df):
    """Turn winner/loser rows into a symmetric player-1 / player-2 dataset.

    Every match produces two rows: one with the winner as ``p1`` (``y = 1``)
    and one mirrored row with the loser as ``p1`` (``y = 0``). The first half
    of the result holds the winner-first rows, the second half the mirrored
    rows, under a fresh 0..2n-1 index.

    Match-level columns are copied when present; ``date_year`` and
    ``date_doy`` are required.
    """
    match_cols = [
        col
        for col in ["surface", "tourney_level", "best_of", "round", "tourney_date", "match_num"]
        if col in df.columns
    ]
    # The date features are always carried along
    match_cols += ["date_year", "date_doy"]

    # feature name -> (winner column, loser column)
    player_cols = {
        "age": ("winner_age", "loser_age"),
        "ht": ("winner_ht", "loser_ht"),
        "rank": ("winner_rank", "loser_rank"),
        "rank_points": ("winner_rank_points", "loser_rank_points"),
        "ioc": ("winner_ioc", "loser_ioc"),
        "hand": ("winner_hand", "loser_hand"),
        "last3_wr": ("winner_last3_wr", "loser_last3_wr"),
        "last3_n": ("winner_last3_n", "loser_last3_n"),
        "elo": ("winner_elo_pre", "loser_elo_pre"),
        "surfelo": ("winner_surfelo_pre", "loser_surfelo_pre"),
        "last10_wr": ("winner_last10_wr", "loser_last10_wr"),
        "last10_n": ("winner_last10_n", "loser_last10_n"),
        "surf_last10_wr": ("winner_surf_last10_wr", "loser_surf_last10_wr"),
        "surf_last10_n": ("winner_surf_last10_n", "loser_surf_last10_n"),
    }

    def _view(p1_side, label):
        # p1_side selects which tuple slot (0 = winner, 1 = loser) becomes player 1
        p2_side = 1 - p1_side
        view = pd.DataFrame(index=df.index)
        for col in match_cols:
            view[col] = df[col]
        for name, sides in player_cols.items():
            view[f"p1_{name}"] = df[sides[p1_side]]
            view[f"p2_{name}"] = df[sides[p2_side]]
        view["y"] = label
        return view

    as_played = _view(p1_side=0, label=1)
    mirrored = _view(p1_side=1, label=0)
    return pd.concat([as_played, mirrored], ignore_index=True)

def elo_prob_from_diff(diff):
    """Map a rating difference to a win probability with the Elo logistic curve.

    ``diff`` is (rating of player 1) - (rating of player 2); a difference of
    0 gives 0.5 and +400 gives 10/11. Works element-wise on arrays/Series.
    """
    exponent = -diff / 400.0
    return 1.0 / (1.0 + 10 ** exponent)

pair = build_pairwise_dataset(df)

# Difference features (p1 minus p2), mirroring the idea from the PyTorch version
for feat in ["age", "ht", "rank", "rank_points", "last3_wr", "last3_n", "elo", "surfelo"]:
    pair[f"diff_{feat}"] = pair[f"p1_{feat}"] - pair[f"p2_{feat}"]

# Elo-implied win probabilities for p1
pair["elo_p"]     = elo_prob_from_diff(pair["diff_elo"])
pair["surfelo_p"] = elo_prob_from_diff(pair["diff_surfelo"])

for feat in ["last10_wr", "last10_n", "surf_last10_wr", "surf_last10_n"]:
    pair[f"diff_{feat}"] = pair[f"p1_{feat}"] - pair[f"p2_{feat}"]

pair_dates = pair["tourney_date"]
print("Pair rows:", len(pair), "Date range:", pair_dates.min(), "to", pair_dates.max())


Pair rows: 149812 Date range: 20000103 to 20241218


In [20]:
# Categorical inputs (one-hot encoded downstream)
cat_cols = [
    "surface", "tourney_level", "best_of", "round",
    "p1_ioc", "p2_ioc", "p1_hand", "p2_hand",
]

# Numeric inputs: p1-minus-p2 differences, Elo probabilities and date features
num_cols = [
    "diff_age", "diff_ht", "diff_rank", "diff_rank_points",
    "diff_last3_wr", "diff_last3_n",
    "diff_elo", "diff_surfelo", "elo_p", "surfelo_p",
    "date_year", "date_doy",
    "diff_last10_wr", "diff_last10_n",
    "diff_surf_last10_wr", "diff_surf_last10_n",
]

feature_cols = cat_cols + num_cols
X = pair[feature_cols]
y = pair["y"].astype(int)

# Chronological split: everything before 2024 trains, the 2024 season tests
match_dates = pair["tourney_date"]
train_mask = match_dates < 20240101
test_mask  = (match_dates >= 20240101) & (match_dates < 20250101)

print("Train rows:", train_mask.sum(), "Test rows:", test_mask.sum())
test_dates = pair.loc[test_mask, "tourney_date"]
print("Test date range:", test_dates.min(), "-", test_dates.max())

X_train, X_test = X[train_mask], X[test_mask]
y_train, y_test = y[train_mask], y[test_mask]


Train rows: 143660 Test rows: 6152
Test date range: 20240101 - 20241218


In [None]:
# Preprocessing: median-impute the numeric features; mode-impute and one-hot
# encode the categorical ones (categories unseen during fit are ignored).
preprocess = ColumnTransformer(
    transformers=[
        ("num", Pipeline([("imp", SimpleImputer(strategy="median"))]), num_cols),
        ("cat", Pipeline([
            ("imp", SimpleImputer(strategy="most_frequent")),
            ("oh", OneHotEncoder(handle_unknown="ignore"))
        ]), cat_cols),
    ]
)

rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=18,
    min_samples_leaf=4,
    max_features="sqrt",
    n_jobs=-1,
    random_state=42,
    class_weight="balanced"
)

rf_model = Pipeline([("prep", preprocess), ("rf", rf)])

# Fit the pipeline built above. The original cell called `model.fit(...)`,
# but no `model` is defined in this notebook, so it only worked with leftover
# kernel state and fails on Restart & Run All.
rf_model.fit(X_train, y_train)

# Persist the fitted pipeline so it can be evaluated later without retraining
joblib.dump(rf_model, "x_RF_tennis.joblib", compress=3)
print("Model trained + saved.")


Model trained + saved.


In [44]:
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss, confusion_matrix, classification_report

# Probability of class 1 (p1 wins), thresholded at 0.5 for the hard prediction
proba = rf_model.predict_proba(X_test)[:, 1]
pred  = (proba >= 0.5).astype(int)

# Hard-label accuracy; AUC and log loss use the probabilities
for label, metric, scores in (
    ("ACC:", accuracy_score, pred),
    ("AUC:", roc_auc_score, proba),
    ("LogLoss:", log_loss, proba),
):
    print(label, metric(y_test, scores))

for report in (confusion_matrix, classification_report):
    print(report(y_test, pred))


ACC: 0.6518205461638491
AUC: 0.7162738792378935
LogLoss: 0.6146389533595797
[[2002 1074]
 [1068 2008]]
              precision    recall  f1-score   support

           0       0.65      0.65      0.65      3076
           1       0.65      0.65      0.65      3076

    accuracy                           0.65      6152
   macro avg       0.65      0.65      0.65      6152
weighted avg       0.65      0.65      0.65      6152



XGBoost

In [45]:
# Chronological three-way split: train < 2023, validation = 2023, test = 2024.
# NOTE(review): X_train / y_train were sliced earlier with the previous
# train_mask, so they still contain every match before 2024 (including 2023).
split_dates = pair["tourney_date"]
train_mask = split_dates < 20230101
val_mask   = (split_dates >= 20230101) & (split_dates < 20240101)
test_mask  = (split_dates >= 20240101) & (split_dates < 20250101)


In [46]:
# Fit an independent copy of the preprocessor for the train/val/test matrices.
# `preprocess` is the very object used as the "prep" step of rf_model, so
# calling fit_transform on it directly would refit the preprocessing of the
# already-trained RF pipeline on a different (pre-2023) sample.
prep_split = clone(preprocess)

Xtr = prep_split.fit_transform(X[train_mask])
ytr = y[train_mask].to_numpy()

# Validation and test are only transformed, never fitted on
Xva = prep_split.transform(X[val_mask])
yva = y[val_mask].to_numpy()

Xte = prep_split.transform(X[test_mask])
yte = y[test_mask].to_numpy()


In [47]:

xgb_clf = XGBClassifier(
    n_estimators=2000,
    learning_rate=0.03,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=2,
    reg_lambda=1.0,
    reg_alpha=0.0,
    objective="binary:logistic",
    eval_metric="logloss",
    random_state=42,

    # GPU training: "gpu_hist" is deprecated and triggered the warning this
    # cell used to print; the supported spelling is tree_method="hist"
    # combined with device="cuda".
    tree_method="hist",
    device="cuda"
)

# NOTE(review): this pipeline reuses the same `preprocess` object as rf_model;
# fitting below refits it on X_train, the same data the RF pipeline was fit on.
xgb_model = Pipeline([
    ("prep", preprocess),
    ("xgb", xgb_clf)
])

# NOTE(review): all 2000 trees are trained on X_train (everything before 2024);
# the 2023 validation split defined earlier is not used for early stopping.
xgb_model.fit(X_train, y_train)

# Persist the fitted pipeline so it can be evaluated later without retraining
joblib.dump(xgb_model, "xgb_tennis_pipeline.joblib", compress=3)



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



['xgb_tennis_pipeline.joblib']

In [48]:


# Probability of class 1 (p1 wins in the pairwise setup), thresholded at 0.5
p_xgb = xgb_model.predict_proba(X_test)[:, 1]
pred  = (p_xgb >= 0.5).astype(int)

# Hard-label accuracy; AUC and log loss use the probabilities
for label, metric, scores in (
    ("XGB ACC:", accuracy_score, pred),
    ("XGB AUC:", roc_auc_score, p_xgb),
    ("XGB LogLoss:", log_loss, p_xgb),
):
    print(label, metric(y_test, scores))

for report in (confusion_matrix, classification_report):
    print(report(y_test, pred))


XGB ACC: 0.6532834850455137
XGB AUC: 0.7219994956549383
XGB LogLoss: 0.6105772282944976
[[2006 1070]
 [1063 2013]]
              precision    recall  f1-score   support

           0       0.65      0.65      0.65      3076
           1       0.65      0.65      0.65      3076

    accuracy                           0.65      6152
   macro avg       0.65      0.65      0.65      6152
weighted avg       0.65      0.65      0.65      6152



Ensemble: stemme / gennemsnit af sandsynligheder (mest almindeligt)

Du træner både Random Forest og XGBoost hver for sig, og kombinerer deres output:

soft voting: gennemsnit af predict_proba

hard voting: flertalsafstemning på klasser

Soft voting er næsten altid bedst, især når du vil have gode sandsynligheder (odds/betting).



Kombinér fx som simpelt gennemsnit, p = 0.5·p_rf + 0.5·p_xgb, eller med vægte (fx 0.3/0.7) baseret på validering.

In [49]:


# Class-1 probabilities on the test set from each fitted pipeline
p_rf, p_xgb = (
    fitted.predict_proba(X_test)[:, 1] for fitted in (rf_model, xgb_model)
)

def eval_probs(y_true, p, name="MODEL"):
    """Print accuracy, ROC AUC and log loss for predicted probabilities.

    Parameters
    ----------
    y_true : array-like
        True binary labels.
    p : numpy array (or similar supporting ``>=`` and ``.astype``)
        Predicted probability of class 1; thresholded at 0.5 for accuracy.
    name : str
        Prefix printed in front of every metric line.
    """
    hard = (p >= 0.5).astype(int)
    # Each metric is computed right before it is printed
    report = (
        (f"{name} ACC:", lambda: accuracy_score(y_true, hard)),
        (f"{name} AUC:", lambda: roc_auc_score(y_true, p)),
        (f"{name} LogLoss:", lambda: log_loss(y_true, p)),
    )
    for label, compute in report:
        print(label, compute())
    print()

# Individual models
for model_name, model_probs in (("RF", p_rf), ("XGB", p_xgb)):
    eval_probs(y_test, model_probs, model_name)



RF ACC: 0.6518205461638491
RF AUC: 0.7162738792378935
RF LogLoss: 0.6146389533595797

XGB ACC: 0.6532834850455137
XGB AUC: 0.7219994956549383
XGB LogLoss: 0.6105772282944976



In [50]:
# Grid-search the XGB weight w in [0, 1] (step 0.05) for the blend
# (1 - w) * p_rf + w * p_xgb, keeping the first weight with the lowest log loss.
# NOTE(review): the weight is selected on y_test, i.e. on the same data the
# ensemble is later scored on, so the reported ensemble metrics are optimistic.
candidates = []
for w in np.linspace(0, 1, 21):
    p = (1 - w) * p_rf + w * p_xgb
    ll = log_loss(y_test, p)
    candidates.append((ll, w))

best = min(candidates, key=lambda cand: cand[0])

print("Best w (min LogLoss):", best[1], "LogLoss:", best[0])


Best w (min LogLoss): 0.7000000000000001 LogLoss: 0.6089793309487922


In [51]:
# Weighted soft vote with a fixed XGB weight of 0.7 (RF gets the remaining 0.3)
w = 0.7
p_ens = (1 - w) * p_rf + w * p_xgb
pred_ens = (p_ens >= 0.5).astype(int)

for label, metric, scores in (
    ("ENS ACC:", accuracy_score, pred_ens),
    ("ENS AUC:", roc_auc_score, p_ens),
    ("ENS LogLoss:", log_loss, p_ens),
):
    print(label, metric(y_test, scores))


ENS ACC: 0.6553966189856957
ENS AUC: 0.7224162778742595
ENS LogLoss: 0.6089793304979688
