# ATP match prediction (PyTorch, **no embeddings**)

Denne notebook viser en *embedding-fri* baseline, hvor vi:
- loader ATP-match CSV'er (fx `atp_matches/atp_matches_2000.csv` ... `atp_matches_2024.csv`)
- laver **player1/player2**-eksempler + label (1 hvis player1 vinder)
- beregner **performance sidste 3 kampe** *kun ud fra tidligere kampe* (ingen leakage)
- one-hot encoder kategorier (surface, level, lande) og standardiserer numeriske features
- træner en 2-lags MLP (TwoLayerNet-stil) med `BCEWithLogitsLoss`

> Split: train = 2000–2023, test = 2024


In [960]:

#importere bare filer her og vælger device

import pandas as pd
import numpy as np
from pathlib import Path
from collections import defaultdict, deque
import math


import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device


device(type='cuda')

## 1) Load data

min struktur:
```
projekt uge/
  atp_matches/
    atp_matches_2000.csv
    ...
    atp_matches_2023.csv
  atp_matches_2024.csv

Hvis jere filer hedder noget andet, justér `load_matches(...)`.


In [961]:
# en function som gå in i mappen atp_matches 
def load_matches(folder="atp_matches"):
    folder = Path(folder)
    all_dfs = []

    #et for loop og function som går igennem alle csv filer i mappen
    for f in sorted(folder.glob("*.csv")):
        df = pd.read_csv(f)
        df["tourney_date"] = df["tourney_date"].astype(int)
        all_dfs.append(df)
    out = pd.concat(all_dfs, ignore_index=True)
    return out

df = load_matches("atp_matches")
df.shape, df.columns[:20]

# printet ses en index med alle de forsekllige emner som er columns. 

((71830, 49),
 Index(['tourney_id', 'tourney_name', 'surface', 'draw_size', 'tourney_level',
        'tourney_date', 'match_num', 'winner_id', 'winner_seed', 'winner_entry',
        'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age',
        'loser_id', 'loser_seed', 'loser_entry', 'loser_name', 'loser_hand'],
       dtype='object'))

## 2) Sortér kronologisk + beregn *sidste 3 kampe* performance

Vi skal sikre at "sidste 3" kun bruger **kampe før den aktuelle kamp**.
Vi sorterer derfor efter `tourney_date` og `match_num` (hvis findes).


In [962]:
# funktion der altså udregner vores last_3, last_10, og surf_last_10 parametre
def add_form_features(df: pd.DataFrame, k_short: int = 3, k_long: int = 10) -> pd.DataFrame:
    """Beregn pre-match form-features uden leakage.

    Vi gemmer én historik pr spiller (maxlen=k_long) og tager så last-k_short fra den.
    Surface-form gemmes separat pr (spiller, surface) med maxlen=k_long.
    """
    df = df.sort_values(["tourney_date", "match_num"] if "match_num" in df.columns else ["tourney_date"],
                        kind="mergesort").reset_index(drop=True)

    hist = defaultdict(lambda: deque(maxlen=k_long))                 # player -> last results (0/1)
    hist_surf = defaultdict(lambda: deque(maxlen=k_long))            # (player, surface) -> last results

    
    # opretter lister for hvor mange sejre sidste (3,10) games, hvor mange tabte og hvor mange spillet
    w_last3_wr, w_last3_n = [], []
    l_last3_wr, l_last3_n = [], []
    w_last10_wr, w_last10_n = [], []
    l_last10_wr, l_last10_n = [], []
    w_slast10_wr, w_slast10_n = [], []
    l_slast10_wr, l_slast10_n = [], []

    for _, row in df.iterrows():
        wid, lid = row["winner_id"], row["loser_id"]
        surf = row.get("surface") or "UNK"
        if pd.isna(surf):
            surf = "UNK"

        wh = hist[wid]
        lh = hist[lid]

        # last3 (fra last10 deque)
        wh3 = list(wh)[-k_short:]
        lh3 = list(lh)[-k_short:]
        # winrate calculations
        w_last3_n.append(len(wh3)); w_last3_wr.append((sum(wh3) / len(wh3)) if wh3 else 0.0)
        l_last3_n.append(len(lh3)); l_last3_wr.append((sum(lh3) / len(lh3)) if lh3 else 0.0)

        # last10 (hele deque)
        # winrate calculations
        w_last10_n.append(len(wh)); w_last10_wr.append((sum(wh) / len(wh)) if wh else 0.0)
        l_last10_n.append(len(lh)); l_last10_wr.append((sum(lh) / len(lh)) if lh else 0.0)

        # surface last10
        wsh = hist_surf[(wid, surf)]
        lsh = hist_surf[(lid, surf)]
        # winrate calculations
        w_slast10_n.append(len(wsh)); w_slast10_wr.append((sum(wsh) / len(wsh)) if wsh else 0.0)
        l_slast10_n.append(len(lsh)); l_slast10_wr.append((sum(lsh) / len(lsh)) if lsh else 0.0)

        # opdatér historik efter vi har læst features
        wh.append(1); lh.append(0)
        wsh.append(1); lsh.append(0)

    out = df.copy()
    out["winner_last3_n"] = w_last3_n  # er antal kampe spillet maks 3
    out["winner_last3_wr"] = w_last3_wr  # er antal vundede kampe ud af de 3
    out["loser_last3_n"] = l_last3_n   # antal kampe modstandere har vundet ud af de 3
    out["loser_last3_wr"] = l_last3_wr   # antal kampe modstandere har spillet maks 3

    out["winner_last10_n"] = w_last10_n
    out["winner_last10_wr"] = w_last10_wr
    out["loser_last10_n"] = l_last10_n
    out["loser_last10_wr"] = l_last10_wr

    out["winner_surf_last10_n"] = w_slast10_n
    out["winner_surf_last10_wr"] = w_slast10_wr
    out["loser_surf_last10_n"] = l_slast10_n
    out["loser_surf_last10_wr"] = l_slast10_wr
    return out


Her udregnes ELO, både overordnet ELO og surface sepcefik ELO som hænger sammen med parametren "surface". 

In [963]:
def add_elo_features(
    df: pd.DataFrame,
    base: float = 1500.0,
    rho: float = 0.85,
    k_by_year = None,
    k_default_global: float = 48.0,
    k_default_surface: float = 32.0,
    k_surface_scale: float = 0.70,
    use_best_of: bool = True,
    use_level: bool = True,
) -> pd.DataFrame:

    #Season reset ved årsskifte: elo = base + rho*(elo-base)
    #K kan afhænge af år + justeres af best_of og turneringsniveau
    df = df.sort_values(["tourney_date", "match_num"] if "match_num" in df.columns else ["tourney_date"],
                        kind="mergesort").reset_index(drop=True)

    if k_by_year is None:
        # nyere kampe vægtes højere
        k_by_year = [(2022, 64.0), (2018, 48.0), (2010, 40.0), (2000, 32.0)]

    def K_for_year(year: int, default_k: float) -> float:
        for y0, kval in k_by_year:
            if year >= y0:
                return float(kval)
        return float(default_k)

    def best_of_mult(row) -> float:
        if not use_best_of:
            return 1.0
        try:
            bo = int(row.get("best_of", 3))
        except Exception:
            return 1.0
        return 1.10 if bo >= 5 else 1.0

    def level_mult(row) -> float:
        if not use_level:
            return 1.0
        lvl = row.get("tourney_level", None)
        return {"G": 1.10, "M": 1.05, "A": 1.02, "B": 1.00}.get(str(lvl), 1.0)

    elo_g = defaultdict(lambda: float(base))
    elo_s = defaultdict(lambda: float(base))  # (player, surface) -> elo

    current_year = None
    w_g_pre, l_g_pre = [], []
    w_s_pre, l_s_pre = [], []

    for _, row in df.iterrows():
        y = int(row["tourney_date"]) // 10000
        if current_year is None:
            current_year = y
        elif y != current_year:
            # season reset
            for pid in list(elo_g.keys()):
                elo_g[pid] = base + rho * (elo_g[pid] - base)
            for key in list(elo_s.keys()):
                elo_s[key] = base + rho * (elo_s[key] - base)
            current_year = y

        wid, lid = row["winner_id"], row["loser_id"]
        surf = row.get("surface") or "UNK"
        if pd.isna(surf):
            surf = "UNK"

        # global pre
        egw, egl = elo_g[wid], elo_g[lid]
        w_g_pre.append(egw); l_g_pre.append(egl)

        p_w = 1.0 / (1.0 + 10 ** ((egl - egw) / 400.0))    # formel for at udregne sandsynligheden for den med højest ELO vinder. 
        K_g = K_for_year(y, k_default_global) * best_of_mult(row) * level_mult(row)  #her er hvor den finder hvor stor en ændring i ELO der skal laves
        elo_g[wid] = egw + K_g * (1 - p_w)  # Her opdatere den så endeligt ELO for vinder og nedenunder taber
        elo_g[lid] = egl + K_g * (0 - (1 - p_w))

        # surface pre
        esw, esl = elo_s[(wid, surf)], elo_s[(lid, surf)]
        w_s_pre.append(esw); l_s_pre.append(esl)

        p_sw = 1.0 / (1.0 + 10 ** ((esl - esw) / 400.0))
        K_s = K_for_year(y, k_default_surface) * k_surface_scale * best_of_mult(row) * level_mult(row)
        elo_s[(wid, surf)] = esw + K_s * (1 - p_sw)
        elo_s[(lid, surf)] = esl + K_s * (0 - (1 - p_sw))

    out = df.copy()
    out["winner_elo_pre"] = w_g_pre
    out["loser_elo_pre"] = l_g_pre
    out["winner_surfelo_pre"] = w_s_pre
    out["loser_surfelo_pre"] = l_s_pre
    return out


## 3) Byg et rigtigt supervised datasæt (player1 vs player2)

Vigtig pointe: hvis du bruger rækkerne direkte ("winner_*" og "loser_*"), så er label altid 1 → modellen lærer intet.

Løsning: lav **to** eksempler pr. kamp:
- (player1 = winner, player2 = loser, y=1)
- (player1 = loser, player2 = winner, y=0)

Så får du et balanceret datasæt.


In [964]:

def build_pairwise_dataset(df):
    base = df.copy()

    # feature-kilder (kun pre-match features!)
    # OBS: brug IKKE w_ace, w_svpt osv. (det er efter-match stats = leakage)
    # parametre om spillerne som bruges i modellen
    cols_p = {
        "age": ("winner_age", "loser_age"),
        "ht": ("winner_ht", "loser_ht"),
        "rank": ("winner_rank", "loser_rank"),
        "rank_points": ("winner_rank_points", "loser_rank_points"),
        "ioc": ("winner_ioc", "loser_ioc"),
        "hand": ("winner_hand", "loser_hand"),
        "last3_wr": ("winner_last3_wr", "loser_last3_wr"),
        "last3_n": ("winner_last3_n", "loser_last3_n"),
        "elo": ("winner_elo_pre", "loser_elo_pre"),
        "surfelo": ("winner_surfelo_pre", "loser_surfelo_pre"),
        "last10_wr": ("winner_last10_wr", "loser_last10_wr"),
        "last10_n":  ("winner_last10_n",  "loser_last10_n"),
        "surf_last10_wr": ("winner_surf_last10_wr", "loser_surf_last10_wr"),
        "surf_last10_n":  ("winner_surf_last10_n",  "loser_surf_last10_n"),

    }

    # match-level features, parametre om turneringen som bruges i modellen. 
    MATCH_COLS = ["surface", "tourney_level", "best_of", "round", "tourney_date", ]
    match_cols = [c for c in MATCH_COLS if c in base.columns]
    out = base[match_cols].copy()


    def make_view(swap=False):
        out = pd.DataFrame()
        # match features kopieres over i out
        for c in match_cols:
            out[c] = base[c]

        # Her gør vi modellen uafhængig fra vinder og taber. men til player 1 og player 2. 
        # trikket er både at køre dataen på at p1 vinder og at p1 taber ved at bytte på player id. 
        # så den stadig ved hvem der vandt men ved ikke at det er id nr 1 hver gang. 
        for name, (cw, cl) in cols_p.items():
            if swap:
                out[f"p1_{name}"] = base[cl]
                out[f"p2_{name}"] = base[cw]
            else:
                out[f"p1_{name}"] = base[cw]
                out[f"p2_{name}"] = base[cl]

        out["p1_id"] = base["loser_id"] if swap else base["winner_id"]
        out["p2_id"] = base["winner_id"] if swap else base["loser_id"]
        # y værdien brugt i swap y=1 betyder p1 vandt med swap=False, ved swap=True er y=0 p1 vandt.
        # derfor gætter modellen ikke bare y=1 hver gang. 
        out["y"] = 0 if swap else 1
        return out

    a = make_view(swap=False)
    b = make_view(swap=True)
    out = pd.concat([a,b], ignore_index=True)
    return out


## 4) Train/test split (2000–2023 / 2024)

Vi udleder år fra `tourney_date` (format YYYYMMDD).


In [None]:
EVAL_MODE = "houston"   # her vælger du ift hvad modellen skal teste på du kan indtaste disse valgmuligheder: "wimbledon", "full2024", "canadian", "estoril", "miami", "marrakech"

PATH_WIM  = "2024_wimbeldon.csv"
PATH_2024 = "atp_matches_2024.csv"
PATH_CAN  = "canadian_open.csv"   
PATH_EST = "estoril_open_2024_with_bet365.csv"
PATH_HOU = "houston_open_2024_with_bet365.csv"
PATH_MIA = "Miami_odds.csv"
PATH_MAR = "marrakech_open_2024_with_bet365.csv"

# load historik
df_all = load_matches("atp_matches")
df_all["tourney_date"] = df_all["tourney_date"].astype(int)

# vælg test raw (uden odds)
if EVAL_MODE == "wimbledon":
    df_test_raw = pd.read_csv(PATH_WIM)
elif EVAL_MODE == "full2024":
    df_test_raw = pd.read_csv(PATH_2024)
elif EVAL_MODE == "canadian":
    df_test_raw = pd.read_csv(PATH_CAN)
elif EVAL_MODE == "estoril":
    df_test_raw = pd.read_csv(PATH_EST)
elif EVAL_MODE == "houston":
    df_test_raw = pd.read_csv(PATH_HOU)
elif EVAL_MODE == "miami":
    df_test_raw = pd.read_csv(PATH_MIA)
elif EVAL_MODE == "marrakech":
    df_test_raw = pd.read_csv(PATH_MAR)
else:
    raise ValueError("EVAL_MODE skal være 'wimbledon', 'full2024' eller 'canadian'")

df_test_raw["tourney_date"] = df_test_raw["tourney_date"].astype(int)

# kombiner for at få pre-match features (sort stabilt)
df = pd.concat([df_all, df_test_raw], ignore_index=True)
df = df.sort_values(["tourney_date"], kind="mergesort").reset_index(drop=True)

# nye samlede feature-funktioner
df = add_form_features(df, k_short=3, k_long=10)
df = add_elo_features(df, rho=0.85, k_surface_scale=0.7)

# split i RAW space
df["year"] = (df["tourney_date"] // 10000).astype(int)
train_raw = df[df["year"].between(2000, 2023)].copy()

if EVAL_MODE in ("wimbledon", "canadian"):
    # samme match-logik som før
    if "tourney_id" in df_test_raw.columns and "tourney_id" in df.columns:
        ids = set(df_test_raw["tourney_id"].astype(str).unique())
        test_raw = df[df["tourney_id"].astype(str).isin(ids)].copy()
    elif "tourney_name" in df_test_raw.columns and "tourney_name" in df.columns:
        keys = set(zip(df_test_raw["tourney_name"].astype(str),
                       df_test_raw["tourney_date"].astype(int)))
        mask = [k in keys for k in zip(df["tourney_name"].astype(str),
                                       df["tourney_date"].astype(int))]
        test_raw = df[pd.Series(mask, index=df.index)].copy()
    else:
        raise ValueError("Kan ikke matche test-turnering: mangler tourney_id og tourney_name.")
else:
    test_raw = df[df["year"] == 2024].copy()

# byg pairwise separat (samme som før)
train_df = build_pairwise_dataset(train_raw).reset_index(drop=True)
test_df  = build_pairwise_dataset(test_raw).reset_index(drop=True)

print(
    f"EVAL_MODE={EVAL_MODE} | train matches={len(train_raw)} | test matches={len(test_raw)} | "
    f"train rows={len(train_df)} | test rows={len(test_df)}"
)


EVAL_MODE=houston | train matches=71830 | test matches=27 | train rows=143660 | test rows=54


In [966]:
def add_elo_diff_features(df):
    # diff features
    df["diff_elo"]      = df["p1_elo"]      - df["p2_elo"]
    df["diff_surfelo"]  = df["p1_surfelo"]  - df["p2_surfelo"]

    # elo win-prob (vectoriseret)
    df["elo_p"]      = 1.0 / (1.0 + np.power(10.0, -df["diff_elo"]     / 400.0))
    df["surfelo_p"]  = 1.0 / (1.0 + np.power(10.0, -df["diff_surfelo"] / 400.0))
    return df

train_df = add_elo_diff_features(train_df)
test_df  = add_elo_diff_features(test_df)

## 5) Preprocessing uden embeddings

- **Kategorier**: one-hot (surface, tourney_level, round, hand, ioc)
- **Dato**: brug fx *år* og *dag-i-året* i stedet for rå YYYYMMDD
- **Numeriske**: fill NaN + standardiser (fit på train)

> OBS: Player-ID som one-hot kan gøre modellen stor og “memoriserende”. Start uden ID; tilføj først hvis nødvendigt.


In [967]:
# ---------- date-features ----------
def add_date_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    d = pd.to_datetime(
        df["tourney_date"].astype(int).astype(str),
        format="%Y%m%d",
        errors="coerce",
    )
    df["date_year"] = d.dt.year.fillna(0).astype(int)
    df["date_doy"]  = d.dt.dayofyear.fillna(0).astype(int)
    return df

train_df = add_date_features(train_df)
test_df  = add_date_features(test_df)

# ---------- diffs (spillerfeatures) ----------
DIFF_BASE = [
    "age", "ht", "rank", "rank_points",
    "last3_wr", "last3_n",
    "elo", "surfelo",
    "last10_wr", "last10_n",
    "surf_last10_wr", "surf_last10_n",
]

def add_diff_features(df: pd.DataFrame, cols=DIFF_BASE) -> pd.DataFrame:
    df = df.copy()
    for c in cols:
        df[f"diff_{c}"] = df[f"p1_{c}"] - df[f"p2_{c}"]
    return df

train_df = add_diff_features(train_df)
test_df  = add_diff_features(test_df)


def add_elo_prob_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df["elo_p"]     = 1.0 / (1.0 + np.power(10.0, -df["diff_elo"]     / 400.0))
    df["surfelo_p"] = 1.0 / (1.0 + np.power(10.0, -df["diff_surfelo"] / 400.0))
    return df

train_df = add_elo_prob_features(train_df)
test_df  = add_elo_prob_features(test_df)

# ---------- feature-valg ----------
use_player_id_onehot = False  # True hvis du vil one-hot p1_id/p2_id

cat_cols = [
    "surface", "tourney_level", "best_of", "round",
    "p1_ioc", "p2_ioc", "p1_hand", "p2_hand",
]
if use_player_id_onehot:
    cat_cols += ["p1_id", "p2_id"]



num_cols = [
    "diff_age","diff_ht","diff_rank","diff_rank_points",
    "diff_last3_wr","diff_last3_n",
    "diff_elo","diff_surfelo",
    "elo_p","surfelo_p",
    "date_year","date_doy",
    "diff_last10_wr","diff_last10_n",
    "diff_surf_last10_wr","diff_surf_last10_n",
]

# ---------- NaN-handling ----------
for c in cat_cols:
    train_df[c] = train_df[c].fillna("UNK").astype(str)
    test_df[c]  = test_df[c].fillna("UNK").astype(str)

for c in num_cols:
    train_df[c] = pd.to_numeric(train_df[c], errors="coerce")
    test_df[c]  = pd.to_numeric(test_df[c],  errors="coerce")

# fyld numeric NaN med train-median (samme median bruges på test)
train_medians = train_df[num_cols].median(numeric_only=True)
train_df[num_cols] = train_df[num_cols].fillna(train_medians)
test_df[num_cols]  = test_df[num_cols].fillna(train_medians)

# ---------- One-hot ----------
train_cat = pd.get_dummies(train_df[cat_cols], prefix=cat_cols)
test_cat  = pd.get_dummies(test_df[cat_cols],  prefix=cat_cols)

# align test til train-kolonner
test_cat = test_cat.reindex(columns=train_cat.columns, fill_value=0)


# ---------- Standardisering (fit på train) ----------
mu = train_df[num_cols].mean()
sd = train_df[num_cols].std().replace(0, 1.0)

train_num = (train_df[num_cols] - mu) / sd
test_num  = (test_df[num_cols]  - mu) / sd

# ---------- X/y ----------
X_train = pd.concat([train_num, train_cat], axis=1).astype(np.float32)
X_test  = pd.concat([test_num,  test_cat],  axis=1).astype(np.float32)

y_train = train_df["y"].astype(np.float32).values
y_test  = test_df["y"].astype(np.float32).values

X_train.shape, X_test.shape


((143660, 280), (54, 280))

## 6) PyTorch Dataset + DataLoader


In [968]:
class TennisDataset(Dataset):
    def __init__(self, X, y):
        # X kan være DataFrame eller numpy array
        if hasattr(X, "to_numpy"):
            X = X.to_numpy(dtype=np.float32, copy=False)
        else:
            X = np.asarray(X, dtype=np.float32)

        y = np.asarray(y, dtype=np.float32).reshape(-1, 1)

        self.X = torch.from_numpy(X)
        self.y = torch.from_numpy(y)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

batch_size = 4096  # justér efter RAM/GPU
train_ds = TennisDataset(X_train, y_train)
test_ds  = TennisDataset(X_test,  y_test)

train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True,  num_workers=0)
test_loader  = DataLoader(test_ds,  batch_size=batch_size, shuffle=False, num_workers=0)

next(iter(train_loader))[0].shape


torch.Size([4096, 280])

## 7) TwoLayerNet-lignende model (binary classification)

Output er **logits** (ingen sigmoid i forward), og vi bruger `BCEWithLogitsLoss`.


In [969]:
# Ryddet MLP (logits output)
class MLP(nn.Module):
    _ACTS = {
        "silu": nn.SiLU,
        "gelu": nn.GELU,
        "lrelu": lambda: nn.LeakyReLU(0.01),
        "relu": nn.ReLU,
    }

    def __init__(self, D_in: int, H: int = 128, dropout: float = 0.2, act: str = "gelu"):
        super().__init__()
        act_layer = self._ACTS.get(act, nn.ReLU)
        act_layer = act_layer() if callable(act_layer) else nn.ReLU()

        self.net = nn.Sequential(
            nn.Linear(D_in, H),
            nn.BatchNorm1d(H),
            act_layer,
            nn.Dropout(dropout),
            nn.Linear(H, 1),
        )

    def forward(self, x):
        return self.net(x)  # logits


D_in = X_train.shape[1]
model = MLP(D_in, H=256, dropout=0.2, act="gelu").to(device)

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)

model


MLP(
  (net): Sequential(
    (0): Linear(in_features=280, out_features=256, bias=True)
    (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): GELU(approximate='none')
    (3): Dropout(p=0.2, inplace=False)
    (4): Linear(in_features=256, out_features=1, bias=True)
  )
)

## 8) Train + eval loops (accuracy + logloss)

Accuracy er fin som start, men logloss fortæller mere om sandsynlighederne.


In [970]:
@torch.no_grad()
def evaluate(model, loader, threshold=0.5):
    model.eval()
    total_loss, correct, n = 0.0, 0, 0

    for xb, yb in loader:
        xb, yb = xb.to(device), yb.to(device)

        logits = model(xb)
        loss = criterion(logits, yb)

        total_loss += loss.item() * xb.size(0)

        preds = (torch.sigmoid(logits) >= threshold).float()
        correct += (preds == yb).sum().item()
        n += xb.size(0)

    return total_loss / n, correct / n


def train(model, train_loader, test_loader, epochs=10, threshold=0.5):
    for ep in range(1, epochs + 1):
        model.train()
        running, n = 0.0, 0

        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)

            optimizer.zero_grad(set_to_none=True)
            logits = model(xb)
            loss = criterion(logits, yb)
            loss.backward()
            optimizer.step()

            running += loss.item() * xb.size(0)
            n += xb.size(0)

        train_loss = running / n
        test_loss, test_acc = evaluate(model, test_loader, threshold=threshold)
        print(
            f"Epoch {ep:02d} | train_loss={train_loss:.4f} | "
            f"test_loss={test_loss:.4f} | test_acc@{threshold:.2f}={test_acc:.4f}"
        )



pos = float((y_train == 1).sum())
neg = float((y_train == 0).sum())
if pos > 0:
    pos_weight = torch.tensor([neg / pos], device=device)
    criterion = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)

train(model, train_loader, test_loader, epochs=1, threshold=0.50)

torch.save(model.state_dict(), "model.pt")


Epoch 01 | train_loss=0.6036 | test_loss=0.6633 | test_acc@0.50=0.6667


Tager kun data med >80 sikkerhed med i målingen af acc. 

In [971]:
@torch.no_grad()
def evaluate_high_conf(model, loader, conf_thr=0.80, pred_thr=0.50):
    model.eval()
    correct = kept = total = 0

    for xb, yb in loader:
        xb = xb.to(device)
        y = yb.to(device).view(-1).float()

        p = torch.sigmoid(model(xb)).view(-1)
        preds = (p >= pred_thr).float()

        conf = torch.maximum(p, 1.0 - p)          
        mask = conf >= conf_thr

        k = int(mask.sum().item())
        kept += k
        total += y.numel()

        if k:
            correct += int((preds[mask] == y[mask]).sum().item())

    coverage = kept / total if total else 0.0
    acc = correct / kept if kept else float("nan")
    return acc, coverage, kept, total


In [972]:
for thr in [0.5, 0.5236, 0.60, 0.70, 0.80, 0.85, 0.90]:
    acc, cov, kept, total = evaluate_high_conf(model, test_loader, conf_thr=thr)
    print(f"thr={thr:.2f} | acc={acc:.4f} | coverage={cov:.2%} ({kept}/{total})")


thr=0.50 | acc=0.6667 | coverage=100.00% (54/54)
thr=0.52 | acc=0.6905 | coverage=77.78% (42/54)
thr=0.60 | acc=0.7222 | coverage=33.33% (18/54)
thr=0.70 | acc=0.5000 | coverage=7.41% (4/54)
thr=0.80 | acc=nan | coverage=0.00% (0/54)
thr=0.85 | acc=nan | coverage=0.00% (0/54)
thr=0.90 | acc=nan | coverage=0.00% (0/54)


Hurtig konfidens udregning

In [985]:
def CI(acc, n, z=1.96):
    p = acc
    center = (p + (z**2)/(2*n)) / (1 + (z**2)/n)
    half = (z * math.sqrt((p*(1-p)/n) + (z**2)/(4*n**2))) / (1 + (z**2)/n)
    return center - half, center + half

# For vores model med alle gæt
l, h = CI(0.6552, 3077)
print(f"Accuracy:",(0.6552*100),"%")
print((l*100), (h*100))

# For Bet365
l2, h2 = CI(0.6913, 2611)
print(f"Accuracy:", (0.6913*100),"%")
print((l2*100), (h2*100))

# For vores model med kun gæt når sikkerhed >=52.36
l3, h3 = CI(0.6681, 3077)
print("Accuracy:",(0.6681*100),"%")
print((l2*100), (h2*100))

Accuracy: 65.52 %
63.822149619618386 67.17914561087967
Accuracy: 69.13 %
67.33101292133955 70.87277731972134
Accuracy: 66.81 %
67.33101292133955 70.87277731972134


Konfidens interval for ELO

In [974]:
w = 0.45  # manuelt gættet gennem forsøg
score1 = w*test_df["p1_surfelo"].values + (1-w)*test_df["p1_elo"].values
score2 = w*test_df["p2_surfelo"].values + (1-w)*test_df["p2_elo"].values

pred_combo = (score1 >= score2).astype(int)
acc_combo = (pred_combo == test_df["y"].values).mean()
print(f"Baseline acc (combo w={w}):", acc_combo)

print(acc_combo)

Baseline acc (combo w=0.45): 0.6296296296296297
0.6296296296296297


In [988]:
lo, hi = CI(acc_combo, 3077)
print(acc_combo*100,"%")
print((lo*100), (hi*100))

62.96296296296296 %
61.24149588338311 64.65210217569076


BETTINNG  (virker ikke hvis "full2024" er valgt som test)

In [976]:
STAKE = 1000
THRESHOLDS = [0.50, 0.5236, 0.60, 0.70, 0.80, 0.85, 0.90]

# Normalisér eval mode
EVAL_MODE = str(EVAL_MODE).strip().lower()

ODDS_CSV = {
    "wimbeldon": "atp_odds_wimbeldon_2024.csv",
    "wimbledon": "atp_odds_wimbeldon_2024.csv",
    "canadian":  "canadian_open_odds.csv",
    "estoril":   "estoril_open_2024_with_bet365.csv",
    "houston": "houston_open_2024_with_bet365.csv",
    "miami": "Miami_odds.csv",
    "marrakech": "marrakech_open_2024_with_bet365.csv",
}.get(EVAL_MODE)

if ODDS_CSV is None:
    raise ValueError("EVAL_MODE skal være 'wimbeldon/wimbledon' eller 'canadian' eller 'estoril'.")

def make_merge_key(df):
    td = df["tourney_date"].astype(str).str.strip().str.replace(r"\.0$", "", regex=True)
    mn = df["match_num"].astype(str).str.strip().str.replace(r"\.0$", "", regex=True)
    return td + "_" + mn

model.eval()
with torch.no_grad():
    all_probs = np.concatenate([
        torch.sigmoid(model(xb.to(device))).view(-1).cpu().numpy()
        for xb, _ in test_loader
    ])

n_matches = len(test_raw)
if len(all_probs) != 2 * n_matches:
    raise ValueError(f"Forventer 2 rækker pr kamp: probs={len(all_probs)} vs 2*n={2*n_matches}")

# BLOK-ORDER
p_a = all_probs[:n_matches]          # swap=False
p_b = all_probs[n_matches:]          # swap=True
p_win = (p_a + (1.0 - p_b)) / 2.0

matches = test_raw.reset_index(drop=True)
matches["merge_key"] = make_merge_key(matches)
matches["p_winner_beats_loser"] = p_win
matches["confidence"] = np.maximum(p_win, 1.0 - p_win)
matches["pick_side"] = np.where(p_win >= 0.5, "winner", "loser")
matches["correct"] = (matches["pick_side"] == "winner").astype(int)

# ---- ODDS MERGE (minimal fix) ----
odds = pd.read_csv(ODDS_CSV)

# FIX 1: rens kolonnenavne (BOM/whitespace)
odds.columns = odds.columns.str.replace("\ufeff", "", regex=False).str.strip()

odds["merge_key"] = make_merge_key(odds)
odds = odds.drop_duplicates("merge_key")

matches = matches.merge(odds[["merge_key","winner_odds","loser_odds"]], on="merge_key", how="left")

# FIX 2: hvis merge laver _x/_y kolonner, så normalisér tilbage til winner_odds/loser_odds
wcol = "winner_odds" if "winner_odds" in matches.columns else (
       "winner_odds_y" if "winner_odds_y" in matches.columns else
       "winner_odds_x" if "winner_odds_x" in matches.columns else None)

lcol = "loser_odds" if "loser_odds" in matches.columns else (
       "loser_odds_y" if "loser_odds_y" in matches.columns else
       "loser_odds_x" if "loser_odds_x" in matches.columns else None)

if wcol is None or lcol is None:
    raise ValueError(
        "Finder ikke odds-kolonner efter merge. "
        f"Odds-kolonner fundet i matches: {[c for c in matches.columns if 'odds' in c.lower()]}"
    )

matches["winner_odds"] = pd.to_numeric(matches[wcol], errors="coerce")
matches["loser_odds"]  = pd.to_numeric(matches[lcol], errors="coerce")
# ---- slut odds merge ----

has_odds = matches["winner_odds"].notna() & matches["loser_odds"].notna()
matches["profit_if_bet"] = np.where(
    ~has_odds, np.nan,
    np.where(matches["pick_side"].eq("winner"),
             np.where(matches["correct"].eq(1), STAKE*(matches["winner_odds"]-1.0), -STAKE),
             -STAKE)  # loser-picks taber altid i winner/loser-format
)

rows = []
for thr in THRESHOLDS:
    sel = matches[matches["confidence"] >= thr].dropna(subset=["profit_if_bet"])
    if len(sel) == 0:
        rows.append([thr, 0, "0/0", np.nan, 0.0, np.nan, np.nan])
        continue
    c = int(sel["correct"].sum()); t = len(sel)
    prof = float(sel["profit_if_bet"].sum())
    rows.append([thr, t, f"{c}/{t}", c/t, prof, prof/(STAKE*t), float(sel["confidence"].mean())])

print(pd.DataFrame(rows, columns=["threshold","bets","correct/total","winrate","profit_DKK","ROI","avg_conf"]).to_string(index=False))


 threshold  bets correct/total  winrate  profit_DKK       ROI  avg_conf
    0.5000    27         18/27 0.666667       790.0  0.029259  0.584996
    0.5236    21         14/21 0.666667       230.0  0.010952  0.607274
    0.6000    11          8/11 0.727273       390.0  0.035455  0.648671
    0.7000     1           0/1 0.000000     -1000.0 -1.000000  0.754410
    0.8000     0           0/0      NaN         0.0       NaN       NaN
    0.8500     0           0/0      NaN         0.0       NaN       NaN
    0.9000     0           0/0      NaN         0.0       NaN       NaN


Hvis man bettede på de mindste odds igennem hele turneringen. (better ikke på kaampe med lige odds)

In [977]:
STAKE = 1000

# Normalisér eval mode
EVAL_MODE = str(EVAL_MODE).strip().lower()

CSV_PATH = {
    "wimbeldon": "atp_odds_wimbeldon_2024.csv",
    "wimbledon": "atp_odds_wimbeldon_2024.csv",
    "canadian":  "canadian_open_odds.csv",
    "estoril":   "estoril_open_2024_with_bet365.csv",
    "houston": "houston_open_2024_with_bet365.csv",
    "miami": "MIami_odds.csv",
    "marrakech": "marrakech_open_2024_with_bet365.csv",
}.get(EVAL_MODE)

if CSV_PATH is None:
    raise ValueError("EVAL_MODE skal være 'wimbeldon/wimbledon' eller 'canadian' eller 'estoril'.")

df = pd.read_csv(CSV_PATH)

# Minimal fix: rens kolonnenavne (BOM/whitespace)
df.columns = df.columns.str.replace("\ufeff", "", regex=False).str.strip()

df["ow"] = pd.to_numeric(df["winner_odds"], errors="coerce")
df["ol"] = pd.to_numeric(df["loser_odds"],  errors="coerce")
df = df.dropna(subset=["ow","ol"])

df = df[df["ow"] != df["ol"]]                 # ingen bets ved lige odds

fav_is_winner = df["ow"] < df["ol"]            # favorit = laveste odds
fav_odds = np.where(fav_is_winner, df["ow"], df["ol"])
net_profit = np.where(fav_is_winner, STAKE*(fav_odds-1.0), -STAKE)

bets = len(df)
correct = int(fav_is_winner.sum())
acc = correct / bets if bets else np.nan
profit = float(net_profit.sum())
roi = profit / (STAKE * bets) if bets else np.nan

print(f"BETS: {bets}")
print(f"CORRECT: {correct}/{bets}  (acc={acc*100:.2f}%)")
print(f"NET PROFIT: {profit:.2f} DKK")
print(f"ROI: {roi*100:.2f}%")


BETS: 25
CORRECT: 16/25  (acc=64.00%)
NET PROFIT: -1030.00 DKK
ROI: -4.12%


Tested profit for de forskellige turneringer ved høje thresholds for mere sikker betting

In [989]:
Miami_profit_80 = [-120, -120, -330, -260, 50]
Miami_profit_85= [190, 190, 190, 190, 340]
Miami_profit_90 = [40, 40, 40, 40, 40]

wimbledon_profit_80 = [4910, 4910, 5530, 5020, 5780]
wimbledon_profit_85 = [4200, 4200, 4200, 4200, 4310]
wimbledon_profit_90 = [90, 90, -900, 90, -860]

canadian_profit_80 = [220, 220, 220, 290, 220]
canadian_profit_85 = [160, 160, 160, 160, 0]
canadian_profit_90 = [0, 0, 0, 0, 0]

estoril_profit_80 = [250, 0, 0, 0, 250]
estoril_profit_85 = [0, 0, 0, 0, 0]
estoril_profit_90 = [0, 0, 0, 0, 0] 

marrakech_profit_80 = [0, 250, 250, 0, 0]
marrakech_profit_85 = [0, 0, 0, 0, 0]
marrakech_profit_90 = [0, 0, 0, 0, 0]

houston_profit_80 = [0, 0, 0, 0, 0]
houston_profit_85 = [0, 0, 0, 0, 0]
houston_profit_90 = [0, 0, 0, 0, 0]

lowest_profit_80 = 190+4200+0+0+0+0
highest_profit_80 = 340+4310+160+0+0+0
print(lowest_profit_80)
print(highest_profit_80)

4390
4810
