# EPL XGBoost + Bayesian DixonColes + Transformer
> ****:  Elo DC OOF  H2H//  
> 2025-11-25

****
1. Elo baseline
2. Dixon-Coles OOF fold posterior
3. H2H features
4. Time decay: lambda=0.00325/day (tau~308d)
5. Draw handling: balanced weights + draw boost x3 (no SMOTE)
6. Calibration: Isotonic


## 1. Introduction

### Environment Setup

In [1]:
import pytensor
print("PyTensor cxx:", pytensor.config.cxx)
import os

import pickle
import warnings
import random
from collections import defaultdict, deque
from math import exp, factorial
import itertools
from copy import deepcopy
import dill
import json

warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

# Set random seeds for reproducibility
np.random.seed(42)
random.seed(42)

from sklearn.inspection import permutation_importance
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import (
    log_loss, 
    classification_report, 
    f1_score,
    accuracy_score,
    confusion_matrix
)
from sklearn.linear_model import LogisticRegression
from sklearn.isotonic import IsotonicRegression
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import mutual_info_classif
from sklearn.ensemble import RandomForestClassifier

# Imbalanced Learning 
from imblearn.over_sampling import SMOTE

# XGBoost 
import xgboost as xgb

import optuna
# PyTorch & Deep Learning 
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import GradScaler, autocast

# Bayesian Modeling (PyMC) 
import pymc as pm
import pytensor
import pytensor.tensor as at

# Check PyTensor compiler
print("PyTensor cxx:", pytensor.config.cxx)

# Device Setup 
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", DEVICE)
if DEVICE.type == "cuda":
    print("GPU:", torch.cuda.get_device_name(0))
print(f"Using device: {DEVICE}")
print("imports loaded successfully")

PyTensor cxx: /usr/bin/g++
PyTensor cxx: /usr/bin/g++
Using device: cuda
GPU: NVIDIA GeForce RTX 3060 Laptop GPU
Using device: cuda
imports loaded successfully


### 0. Leakage protection

In [2]:
DIRECT_LEAK_COLS = [
    'FTHG', 'FTAG', 'HTHG', 'HTAG', 'FTR', 'HTR',
    'HS', 'AS'
]

DERIVED_LEAK_COLS = [
    'shots_for', 'shots_against'
]

# Some of the features should not be removed as "leak features"
# but they're imported within training after. So we still remove them here.

FEATURES_ADD_AFTER = [
    'HST', 'AST', 'HC', 'AC', 
    'HF', 'AF', 'HY', 'AY', 'HR', 'AR',
    'sot_for', 'sot_against',
    'corners_for', 'corners_against',
    'shot_diff', 'sot_diff', 'corner_diff',
    'foul_diff', 'ycard_diff', 'rcard_diff',
    'shot_accuracy', 'opp_shot_accuracy',
    'shot_acc_home', 'shot_acc_away',
]

ALL_LEAK_COLS = list(set(DIRECT_LEAK_COLS + DERIVED_LEAK_COLS + FEATURES_ADD_AFTER))
len_leak_revise = list(set(DIRECT_LEAK_COLS + DERIVED_LEAK_COLS))

def remove_leak_features(feature_list, verbose=True):
    """Remove leakage features"""
    leaked = [f for f in feature_list if f in ALL_LEAK_COLS]
    if verbose and leaked:
        print(f" remove {len(leaked)} leakage features: {leaked}")
    clean = [f for f in feature_list if f not in ALL_LEAK_COLS]
    if verbose:
        print(f"✅ Features after cleaning: {len(feature_list)} → {len(clean)}")
    return clean

print(f"{len(len_leak_revise)} of leaked features has been defined")

10 of leaked features has been defined


## 2. Data Import

In [3]:
DATA_PATH = "epl-training.csv"
df = pd.read_csv(DATA_PATH)

df["Date"] = pd.to_datetime(df["Date"], dayfirst=True, errors="coerce")
df = df.dropna(subset=["Date", "HomeTeam", "AwayTeam"]).copy()

match_key = ["Date", "HomeTeam", "AwayTeam"]
df = df.sort_values(match_key).drop_duplicates(subset=match_key, keep="last").reset_index(drop=True)

def clean_value(val):
    try:
        return float(val)
    except:
        return 0.0

def safe_ratio(a, b):
    return a / b if (b not in [0, None] and not pd.isna(b)) else 0.0

optional_map = {
    "HS": "shots_for",
    "AS": "shots_against",
    "HST": "sot_for",
    "AST": "sot_against",
    "HC": "corners_for",
    "AC": "corners_against",
}

for raw, newc in optional_map.items():
    if raw in df.columns:
        df[newc] = df[raw].apply(clean_value)

if "sot_for" in df.columns and "shots_for" in df.columns:
    df["shot_accuracy"] = [safe_ratio(hst, hs) for hst, hs in zip(df["sot_for"], df["shots_for"])]
if "sot_against" in df.columns and "shots_against" in df.columns:
    df["opp_shot_accuracy"] = [safe_ratio(ast, ss) for ast, ss in zip(df["sot_against"], df["shots_against"])]

label_map = {"H": 0, "D": 1, "A": 2}
df["y"] = df["FTR"].map(label_map).astype(int)

def season_label(d, season_start_month=8):
    y = d.year
    m = d.month
    start_year = y if m >= season_start_month else y - 1
    return f"{start_year}/{start_year+1}"

df["Season"] = df["Date"].apply(season_label)
df = df.sort_values("Date").reset_index(drop=True)
cleaned_df = df.copy()

# remove leak features
print("cleaned_df shape:", cleaned_df.shape)
print(cleaned_df[["Date","HomeTeam","AwayTeam","FTHG","FTAG","FTR","y","Season"]].head())

cleaned_df shape: (9500, 32)
        Date  HomeTeam       AwayTeam  FTHG  FTAG FTR  y     Season
0 2000-08-19  Charlton       Man City   4.0   0.0   H  0  2000/2001
1 2000-08-19   Chelsea       West Ham   4.0   2.0   H  0  2000/2001
2 2000-08-19  Coventry  Middlesbrough   1.0   3.0   A  2  2000/2001
3 2000-08-19     Derby    Southampton   2.0   2.0   D  1  2000/2001
4 2000-08-19     Leeds        Everton   2.0   0.0   H  0  2000/2001


In [4]:
class FeatureEngineering:
    def __init__(self):
        self.state = None
        self._init_state()
    
    def _init_state(self):
        self.state = {
            'team_results': defaultdict(lambda: deque(maxlen=5)),
            'home_results_5': defaultdict(lambda: deque(maxlen=5)),
            'away_results_5': defaultdict(lambda: deque(maxlen=5)),
            'team_goals_for': defaultdict(lambda: deque(maxlen=5)),
            'team_goals_against': defaultdict(lambda: deque(maxlen=5)),
            'gd_history': defaultdict(lambda: deque(maxlen=5)),
            'season_points': defaultdict(lambda: defaultdict(int)),
            'season_gd': defaultdict(lambda: defaultdict(int)),
            'h2h_history': defaultdict(list),
            'last_match_date': {},
            'referee_stats': defaultdict(lambda: {'H': 0, 'D': 0, 'A': 0, 'yellow': deque(maxlen=20)}),
            'elo': defaultdict(lambda: 1500.0),
            'elo_att': defaultdict(lambda: 1500.0),
            'elo_def': defaultdict(lambda: 1500.0),
            'Current_season': None,
            'team_draw_overall': defaultdict(lambda: deque(maxlen=10)),
        }
    
    def get_state(self):
        return deepcopy(self.state)
    
    def set_state(self, state):
        self.state = deepcopy(state)
    
    def compute_features(self, df, is_train=True):
        df = df.sort_values("Date").copy()
        features = defaultdict(list)
        
        for idx, row in df.iterrows():
            h, a = row["HomeTeam"], row["AwayTeam"]
            season = row["Season"]
            date = row["Date"]
            
            # record
            h_results = list(self.state['team_results'][h])
            a_results = list(self.state['team_results'][a])
            form_h = sum(h_results) / (len(h_results) * 3) if h_results else 0.5
            form_a = sum(a_results) / (len(a_results) * 3) if a_results else 0.5
            features['form_home'].append(form_h)
            features['form_away'].append(form_a)
            features['form_diff'].append(form_h - form_a)
            
            # home/away
            h5_home = list(self.state['home_results_5'][h])
            a5_away = list(self.state['away_results_5'][a])
            features['L5HWR'].append(sum(1 for r in h5_home if r == 'H')/len(h5_home) if h5_home else 0.45)
            features['L5HDR'].append(sum(1 for r in h5_home if r == 'D')/len(h5_home) if h5_home else 0.25)
            features['L5AWR'].append(sum(1 for r in a5_away if r == 'A')/len(a5_away) if a5_away else 0.30)
            features['L5ADR'].append(sum(1 for r in a5_away if r == 'D')/len(a5_away) if a5_away else 0.25)
            
            # goal
            gf_h = list(self.state['team_goals_for'][h])
            ga_h = list(self.state['team_goals_against'][h])
            gf_a = list(self.state['team_goals_for'][a])
            ga_a = list(self.state['team_goals_against'][a])
            features['goals_pm_home'].append(np.mean(gf_h) if gf_h else 1.3)
            features['goals_pm_away'].append(np.mean(gf_a) if gf_a else 1.3)
            features['conceded_pm_home'].append(np.mean(ga_h) if ga_h else 1.3)
            features['conceded_pm_away'].append(np.mean(ga_a) if ga_a else 1.3)
            
            gd_h = list(self.state['gd_history'][h])
            gd_a = list(self.state['gd_history'][a])
            features['gd_pm_home'].append(np.mean(gd_h) if gd_h else 0.0)
            features['gd_pm_away'].append(np.mean(gd_a) if gd_a else 0.0)
            features['gd_pm_diff'].append(features['gd_pm_home'][-1] - features['gd_pm_away'][-1])
            
            # standings
            pts_h = self.state['season_points'][season][h]
            pts_a = self.state['season_points'][season][a]
            gd_season_h = self.state['season_gd'][season][h]
            gd_season_a = self.state['season_gd'][season][a]
            features['points_home'].append(pts_h)
            features['points_away'].append(pts_a)
            features['points_diff'].append(pts_h - pts_a)
            features['season_gd_diff'].append(gd_season_h - gd_season_a)
            
            # position
            all_teams = list(self.state['season_points'][season].keys())
            if all_teams:
                sorted_teams = sorted(all_teams,
                    key=lambda t: (self.state['season_points'][season][t], self.state['season_gd'][season][t]),
                    reverse=True)
                pos_h = sorted_teams.index(h) + 1 if h in sorted_teams else 10
                pos_a = sorted_teams.index(a) + 1 if a in sorted_teams else 10
            else:
                pos_h, pos_a = 10, 10
            features['position_home'].append(pos_h)
            features['position_away'].append(pos_a)
            features['position_diff'].append(pos_a - pos_h)
            
            # H2H
            key = tuple(sorted([h, a]))
            past_h2h = self.state['h2h_history'][key][-10:]
            if past_h2h:
                h_wins = sum(1 for r in past_h2h if (r['home'] == h and r['result'] == 'H') or (r['home'] == a and r['result'] == 'A'))
                draws = sum(1 for r in past_h2h if r['result'] == 'D')
                total = len(past_h2h)
                features['h2h_home_rate'].append(h_wins / total)
                features['h2h_draw_rate'].append(draws / total)
                features['h2h_away_rate'].append((total - h_wins - draws) / total)
            else:
                features['h2h_home_rate'].append(0.45)
                features['h2h_draw_rate'].append(0.25)
                features['h2h_away_rate'].append(0.30)
            
            # rest
            last_h = self.state['last_match_date'].get(h)
            last_a = self.state['last_match_date'].get(a)
            rest_h = (date - last_h).days if last_h else 7
            rest_a = (date - last_a).days if last_a else 7
            features['rest_days_home'].append(np.clip(rest_h, 1, 21))
            features['rest_days_away'].append(np.clip(rest_a, 1, 21))
            features['rest_diff'].append(rest_h - rest_a)
            
            # referee
            ref = row.get("Referee", "Unknown")
            ref_stats = self.state['referee_stats'][ref]
            total_ref = ref_stats['H'] + ref_stats['D'] + ref_stats['A']
            if total_ref > 0:
                features['ref_home_rate'].append(ref_stats['H'] / total_ref)
                features['ref_draw_rate'].append(ref_stats['D'] / total_ref)
                features['ref_away_rate'].append(ref_stats['A'] / total_ref)
            else:
                features['ref_home_rate'].append(0.45)
                features['ref_draw_rate'].append(0.25)
                features['ref_away_rate'].append(0.30)
            features['ref_matches'].append(total_ref)
            features['ref_home_bias'].append(features['ref_home_rate'][-1] - features['ref_away_rate'][-1])
            
            # Elo
            if self.state['Current_season'] is None:
                self.state['Current_season'] = season
            if season != self.state['Current_season'] and is_train:
                for team in list(self.state['elo'].keys()):
                    self.state['elo'][team] = 0.7 * self.state['elo'][team] + 0.3 * 1500.0
                    self.state['elo_att'][team] = 0.7 * self.state['elo_att'][team] + 0.3 * 1500.0
                    self.state['elo_def'][team] = 0.7 * self.state['elo_def'][team] + 0.3 * 1500.0
                self.state['Current_season'] = season
            
            features['elo_home'].append(self.state['elo'][h])
            features['elo_away'].append(self.state['elo'][a])
            features['elo_diff'].append(self.state['elo'][h] - self.state['elo'][a] + 75)
            features['elo_att_home'].append(self.state['elo_att'][h])
            features['elo_def_home'].append(self.state['elo_def'][h])
            features['elo_att_away'].append(self.state['elo_att'][a])
            features['elo_def_away'].append(self.state['elo_def'][a])
            
            # draw tendency
            draw_hist_h = list(self.state['team_draw_overall'][h])
            draw_hist_a = list(self.state['team_draw_overall'][a])
            features['draw_prop_home'].append(np.mean(draw_hist_h) if draw_hist_h else 0.25)
            features['draw_prop_away'].append(np.mean(draw_hist_a) if draw_hist_a else 0.25)
            features['draw_prop_sum'].append(features['draw_prop_home'][-1] + features['draw_prop_away'][-1])
            
            # update state
            if is_train and 'FTR' in row and pd.notna(row['FTR']):
                ftr = row['FTR']
                gh = row.get('FTHG', 0) or 0
                ga = row.get('FTAG', 0) or 0
                
                if ftr == 'H':
                    self.state['team_results'][h].append(3)
                    self.state['team_results'][a].append(0)
                    self.state['season_points'][season][h] += 3
                elif ftr == 'D':
                    self.state['team_results'][h].append(1)
                    self.state['team_results'][a].append(1)
                    self.state['season_points'][season][h] += 1
                    self.state['season_points'][season][a] += 1
                else:
                    self.state['team_results'][h].append(0)
                    self.state['team_results'][a].append(3)
                    self.state['season_points'][season][a] += 3
                
                self.state['home_results_5'][h].append(ftr)
                self.state['away_results_5'][a].append(ftr)
                self.state['team_goals_for'][h].append(gh)
                self.state['team_goals_against'][h].append(ga)
                self.state['team_goals_for'][a].append(ga)
                self.state['team_goals_against'][a].append(gh)
                self.state['gd_history'][h].append(gh - ga)
                self.state['gd_history'][a].append(ga - gh)
                self.state['season_gd'][season][h] += (gh - ga)
                self.state['season_gd'][season][a] += (ga - gh)
                self.state['h2h_history'][key].append({'home': h, 'result': ftr})
                
                ref = row.get("Referee", "Unknown")
                self.state['referee_stats'][ref][ftr] += 1
                
                exp_h = 1 / (1 + 10 ** ((self.state['elo'][a] - self.state['elo'][h] - 75) / 400))
                act_h = 1 if ftr == 'H' else (0.5 if ftr == 'D' else 0)
                K = 20
                self.state['elo'][h] += K * (act_h - exp_h)
                self.state['elo'][a] += K * ((1 - act_h) - (1 - exp_h))
                
                K_att = 20 * (1 + np.log1p(abs(gh - ga)))
                lam_h = np.exp((self.state['elo_att'][h] - self.state['elo_def'][a] + 75) / 400)
                lam_a = np.exp((self.state['elo_att'][a] - self.state['elo_def'][h]) / 400)
                self.state['elo_att'][h] += K_att * (gh - lam_h)
                self.state['elo_def'][a] += K_att * (lam_h - gh)
                self.state['elo_att'][a] += K_att * (ga - lam_a)
                self.state['elo_def'][h] += K_att * (lam_a - ga)
                
                is_draw = 1 if ftr == 'D' else 0
                self.state['team_draw_overall'][h].append(is_draw)
                self.state['team_draw_overall'][a].append(is_draw)
            
            self.state['last_match_date'][h] = date
            self.state['last_match_date'][a] = date
        for col, values in features.items():
            df[col] = values
            
        return df

#  basic form + gd_pm_*
def add_basic_form_features(df):
    df = df.sort_values("Date").copy()
    team_results = defaultdict(lambda: deque(maxlen=5))
    form_home, form_away = [], []
    win_streak_home, win_streak_away = [], []
    unbeaten_home, unbeaten_away = [], []
    gd_history = defaultdict(lambda: deque(maxlen=5))
    gd_pm_home, gd_pm_away = [], []
    gd_pm_diff = []
    for idx, row in df.iterrows():
        h, a = row["HomeTeam"], row["AwayTeam"]
        # form
        h_results = list(team_results[h])
        if h_results:
            form_home.append(sum(h_results) / (len(h_results) * 3))
            win_streak_home.append(sum(1 for r in reversed(h_results) if r == 3))
            unbeaten_home.append(sum(1 for r in reversed(h_results) if r >= 1))
        else:
            form_home.append(0.0)
            win_streak_home.append(0)
            unbeaten_home.append(0)
        a_results = list(team_results[a])
        if a_results:
            form_away.append(sum(a_results) / (len(a_results) * 3))
            win_streak_away.append(sum(1 for r in reversed(a_results) if r == 3))
            unbeaten_away.append(sum(1 for r in reversed(a_results) if r >= 1))
        else:
            form_away.append(0.0)
            win_streak_away.append(0)
            unbeaten_away.append(0)
        # result(home 3, 1, 0)
        res_h = 3 if row["FTR"] == 'H' else (1 if row["FTR"] == 'D' else 0)
        res_a = 3 if row["FTR"] == 'A' else (1 if row["FTR"] == 'D' else 0)
        team_results[h].append(res_h)
        team_results[a].append(res_a)
        if gd_history[h]:
            gd_pm_home.append(np.mean(gd_history[h]))
        else:
            gd_pm_home.append(0.0)
        if gd_history[a]:
            gd_pm_away.append(np.mean(gd_history[a]))
        else:
            gd_pm_away.append(0.0)
        gd_pm_diff.append(gd_pm_home[-1] - gd_pm_away[-1])
        gd_history[h].append(row["FTHG"] - row["FTAG"])
        gd_history[a].append(row["FTAG"] - row["FTHG"])
    df["form_home"] = form_home
    df["form_away"] = form_away
    df["win_streak_home"] = win_streak_home
    df["win_streak_away"] = win_streak_away
    df["unbeaten_home"] = unbeaten_home
    df["unbeaten_away"] = unbeaten_away
    df["gd_pm_home"] = gd_pm_home
    df["gd_pm_away"] = gd_pm_away
    df["gd_pm_diff"] = gd_pm_diff
    return df

#  L5/L10 Feature
def add_venue_specific_form(df):
    df = df.sort_values("Date").copy()
    home_results_5 = defaultdict(lambda: deque(maxlen=5))
    home_results_10 = defaultdict(lambda: deque(maxlen=10))
    away_results_5 = defaultdict(lambda: deque(maxlen=5))
    away_results_10 = defaultdict(lambda: deque(maxlen=10))
    L5HWR, L5HDR, L5HLR = [], [], []
    L5AWR, L5ADR, L5ALR = [], [], []
    L10HWR, L10HDR, L10HLR = [], [], []
    L10AWR, L10ADR, L10ALR = [], [], []
    for idx, row in df.iterrows():
        h, a = row["HomeTeam"], row["AwayTeam"]
        res = row["FTR"]
        # home5/10
        h5 = list(home_results_5[h])
        L5HWR.append(sum(1 for r in h5 if r == 'H')/len(h5) if h5 else 0.0)
        L5HDR.append(sum(1 for r in h5 if r == 'D')/len(h5) if h5 else 0.0)
        L5HLR.append(sum(1 for r in h5 if r == 'A')/len(h5) if h5 else 0.0)
        h10 = list(home_results_10[h])
        L10HWR.append(sum(1 for r in h10 if r == 'H')/len(h10) if h10 else 0.0)
        L10HDR.append(sum(1 for r in h10 if r == 'D')/len(h10) if h10 else 0.0)
        L10HLR.append(sum(1 for r in h10 if r == 'A')/len(h10) if h10 else 0.0)
        # away5/10
        a5 = list(away_results_5[a])
        L5AWR.append(sum(1 for r in a5 if r == 'H')/len(a5) if a5 else 0.0)
        L5ADR.append(sum(1 for r in a5 if r == 'D')/len(a5) if a5 else 0.0)
        L5ALR.append(sum(1 for r in a5 if r == 'A')/len(a5) if a5 else 0.0)
        a10 = list(away_results_10[a])
        L10AWR.append(sum(1 for r in a10 if r == 'H')/len(a10) if a10 else 0.0)
        L10ADR.append(sum(1 for r in a10 if r == 'D')/len(a10) if a10 else 0.0)
        L10ALR.append(sum(1 for r in a10 if r == 'A')/len(a10) if a10 else 0.0)
        # resultrecord
        home_results_5[h].append(res)
        away_results_5[a].append(res)
        home_results_10[h].append(res)
        away_results_10[a].append(res)
    df["L5HWR"] = L5HWR; df["L5HDR"] = L5HDR; df["L5HLR"] = L5HLR
    df["L10HWR"] = L10HWR; df["L10HDR"] = L10HDR; df["L10HLR"] = L10HLR
    df["L5AWR"] = L5AWR; df["L5ADR"] = L5ADR; df["L5ALR"] = L5ALR
    df["L10AWR"] = L10AWR; df["L10ADR"] = L10ADR; df["L10ALR"] = L10ALR
    df["L5_home_adv"] = np.array(L5HWR) - np.array(L5AWR)
    df["L10_home_adv"] = np.array(L10HWR) - np.array(L10AWR)
    return df

def add_prematch_shot_form_v2(df, window=6):
    df = df.sort_values("Date").copy()
    
    if not all(c in df.columns for c in ["HS", "AS", "HST", "AST"]):
        print("skip")
        return df
    
    team_history = defaultdict(lambda: deque(maxlen=window))
    
    # Result
    results = {
        'shots_for_pm_home': [], 'sot_for_pm_home': [],
        'shots_against_pm_home': [], 'sot_against_pm_home': [],
        'shots_for_pm_away': [], 'sot_for_pm_away': [],
        'shots_against_pm_away': [], 'sot_against_pm_away': [],
    }
    
    for idx, row in df.iterrows():
        home_team, away_team = row['HomeTeam'], row['AwayTeam']
        
        h_hist = list(team_history[home_team])
        if h_hist:
            results['shots_for_pm_home'].append(np.mean([h['shots'] for h in h_hist]))
            results['sot_for_pm_home'].append(np.mean([h['sot'] for h in h_hist]))
            results['shots_against_pm_home'].append(np.mean([h['shots_ag'] for h in h_hist]))
            results['sot_against_pm_home'].append(np.mean([h['sot_ag'] for h in h_hist]))
        else:
            for k in ['shots_for_pm_home', 'sot_for_pm_home', 'shots_against_pm_home', 'sot_against_pm_home']:
                results[k].append(np.nan)
        
        a_hist = list(team_history[away_team])
        if a_hist:
            results['shots_for_pm_away'].append(np.mean([h['shots'] for h in a_hist]))
            results['sot_for_pm_away'].append(np.mean([h['sot'] for h in a_hist]))
            results['shots_against_pm_away'].append(np.mean([h['shots_ag'] for h in a_hist]))
            results['sot_against_pm_away'].append(np.mean([h['sot_ag'] for h in a_hist]))
        else:
            for k in ['shots_for_pm_away', 'sot_for_pm_away', 'shots_against_pm_away', 'sot_against_pm_away']:
                results[k].append(np.nan)
        
        team_history[home_team].append({
            'shots': row['HS'], 'sot': row['HST'],
            'shots_ag': row['AS'], 'sot_ag': row['AST']
        })
        team_history[away_team].append({
            'shots': row['AS'], 'sot': row['AST'],
            'shots_ag': row['HS'], 'sot_ag': row['HST']
        })
    
    #  df
    for col, values in results.items():
        df[col] = values
    
    df['shots_pm_diff'] = df['shots_for_pm_home'] - df['shots_for_pm_away']
    df['sot_pm_diff'] = df['sot_for_pm_home'] - df['sot_for_pm_away']
    df['shot_accuracy_pm_home'] = df['sot_for_pm_home'] / df['shots_for_pm_home'].replace(0, np.nan)
    df['shot_accuracy_pm_away'] = df['sot_for_pm_away'] / df['shots_for_pm_away'].replace(0, np.nan)
    df['shot_accuracy_pm_diff'] = df['shot_accuracy_pm_home'] - df['shot_accuracy_pm_away']
    
    # padding NaN
    pm_cols = [c for c in df.columns if '_pm' in c and 'shot' in c.lower()]
    for col in pm_cols:
        df[col] = df[col].fillna(df[col].median() if df[col].notna().any() else 0)
    
    return df

def add_prematch_corners_fouls(df, window=6):
    df = df.sort_values("Date").copy()
    
    has_corners = all(c in df.columns for c in ["HC", "AC"])
    has_fouls = all(c in df.columns for c in ["HF", "AF"])
    has_cards = all(c in df.columns for c in ["HY", "AY"])
    
    if not (has_corners or has_fouls or has_cards):
        print(f"cornar/yellow/red cards data is missing, skip")
        return df
    
    # record
    team_corners = defaultdict(lambda: deque(maxlen=window))
    team_fouls = defaultdict(lambda: deque(maxlen=window))
    team_yellows = defaultdict(lambda: deque(maxlen=window))
    
    results = defaultdict(list)
    
    for idx, row in df.iterrows():
        home_team, away_team = row['HomeTeam'], row['AwayTeam']
        
        # corner
        if has_corners:
            h_corners = list(team_corners[home_team])
            a_corners = list(team_corners[away_team])
            
            results['corners_for_pm_home'].append(np.mean([c[0] for c in h_corners]) if h_corners else np.nan)
            results['corners_against_pm_home'].append(np.mean([c[1] for c in h_corners]) if h_corners else np.nan)
            results['corners_for_pm_away'].append(np.mean([c[0] for c in a_corners]) if a_corners else np.nan)
            results['corners_against_pm_away'].append(np.mean([c[1] for c in a_corners]) if a_corners else np.nan)
        
        # foul
        if has_fouls:
            h_fouls = list(team_fouls[home_team])
            a_fouls = list(team_fouls[away_team])
            
            results['fouls_pm_home'].append(np.mean([f[0] for f in h_fouls]) if h_fouls else np.nan)
            results['fouls_against_pm_home'].append(np.mean([f[1] for f in h_fouls]) if h_fouls else np.nan)
            results['fouls_pm_away'].append(np.mean([f[0] for f in a_fouls]) if a_fouls else np.nan)
            results['fouls_against_pm_away'].append(np.mean([f[1] for f in a_fouls]) if a_fouls else np.nan)
        
        # yellow card
        if has_cards:
            h_yellows = list(team_yellows[home_team])
            a_yellows = list(team_yellows[away_team])
            
            results['yellows_pm_home'].append(np.mean(h_yellows) if h_yellows else np.nan)
            results['yellows_pm_away'].append(np.mean(a_yellows) if a_yellows else np.nan)
        
        if has_corners:
            team_corners[home_team].append((row['HC'], row['AC']))
            team_corners[away_team].append((row['AC'], row['HC']))
        if has_fouls:
            team_fouls[home_team].append((row['HF'], row['AF']))
            team_fouls[away_team].append((row['AF'], row['HF']))
        if has_cards:
            team_yellows[home_team].append(row['HY'])
            team_yellows[away_team].append(row['AY'])
    
    #  df
    for col, values in results.items():
        df[col] = values
    
    if has_corners:
        df['corners_pm_diff'] = df['corners_for_pm_home'] - df['corners_for_pm_away']
    if has_fouls:
        df['fouls_pm_diff'] = df['fouls_pm_home'] - df['fouls_pm_away']
    if has_cards:
        df['yellows_pm_diff'] = df['yellows_pm_home'] - df['yellows_pm_away']
    
    # padding NaN
    new_cols = list(results.keys())
    for col in new_cols:
        if col in df.columns:
            df[col] = df[col].fillna(df[col].median() if df[col].notna().any() else 0)
    
    print(f"corner, foul, yellow/red cards history features are added")
    return df

def add_advanced_h2h_features(df, max_matches=10, half_life_seasons=2.0):
    df = df.sort_values("Date").copy()
    lambda_decay = np.log(2) / half_life_seasons

    h2h_history = defaultdict(list)
    h2h_home_rate_td = []
    h2h_draw_rate_td = []
    h2h_away_rate_td = []
    h2h_matches_td = []
    h2h_goal_diff_avg = []

    for idx, row in df.iterrows():
        h, a = row["HomeTeam"], row["AwayTeam"]
        d = row["Date"]
        # key(,homeaway)
        key = tuple(sorted([h, a]))

        past = h2h_history[key]
        # max_matches 
        past = past[-max_matches:]

        if past:
            weights = []
            home_win_w, draw_w, away_win_w = 0.0, 0.0, 0.0
            gd_weighted_sum = 0.0
            total_w = 0.0

            for rec in past:
                days_diff = (d - rec["date"]).days
                year_diff = max(days_diff / 365.25, 0.0)
                w = np.exp(-lambda_decay * year_diff)
                weights.append(w)
                total_w += w

                past_home = rec["home"]
                past_result = rec["result"]
                past_gd = rec["goal_diff"]  # past_home 

                if past_home == h:
                    gd_view_h = past_gd
                    if past_result == "H":
                        home_win_w += w
                    elif past_result == "D":
                        draw_w += w
                    else:  # 'A'
                        away_win_w += w
                else:
                    # home yes a,home h away
                    gd_view_h = -past_gd
                    if past_result == "H":
                    # home a, a → home h lose
                        away_win_w += w
                    elif past_result == "D":
                        draw_w += w
                    else:  # 'A',away h, h
                        home_win_w += w

                gd_weighted_sum += gd_view_h * w

            if total_w > 0:
                h2h_home_rate_td.append(home_win_w / total_w)
                h2h_draw_rate_td.append(draw_w / total_w)
                h2h_away_rate_td.append(away_win_w / total_w)
                h2h_matches_td.append(total_w)
                h2h_goal_diff_avg.append(gd_weighted_sum / total_w)
            else:
                h2h_home_rate_td.append(0.45)
                h2h_draw_rate_td.append(0.25)
                h2h_away_rate_td.append(0.30)
                h2h_matches_td.append(0.0)
                h2h_goal_diff_avg.append(0.0)
        else:
            h2h_home_rate_td.append(0.45)
            h2h_draw_rate_td.append(0.25)
            h2h_away_rate_td.append(0.30)
            h2h_matches_td.append(0.0)
            h2h_goal_diff_avg.append(0.0)

        gh, ga = row["FTHG"], row["FTAG"]
        if gh > ga:
            res = "H"
        elif gh == ga:
            res = "D"
        else:
            res = "A"
        h2h_history[key].append({
            "date": d,
            "home": h,
            "result": res,
            "goal_diff": gh - ga
        })

    df["h2h_home_rate_td"] = h2h_home_rate_td
    df["h2h_draw_rate_td"] = h2h_draw_rate_td
    df["h2h_away_rate_td"] = h2h_away_rate_td
    df["h2h_matches_td"] = h2h_matches_td
    df["h2h_goal_diff_avg"] = h2h_goal_diff_avg

    print("Added advanced H2H features (time-decayed & directional)")
    return df

def add_referee_features(df):
    df = df.sort_values("Date").copy()
    if "Referee" not in df.columns:
        return df
    referee_stats = defaultdict(lambda: {'H':0, 'D':0, 'A':0, 'yellow':[], 'red':[], 'fouls':[]})
    ref_home_rate, ref_draw_rate, ref_away_rate = [], [], []
    ref_avg_yellow, ref_avg_red, ref_avg_fouls = [], [], []
    ref_matches = []
    for idx, row in df.iterrows():
        ref = row["Referee"]
        stats = referee_stats[ref]
        total = stats['H'] + stats['D'] + stats['A']
        if total > 0:
            ref_home_rate.append(stats['H']/total)
            ref_draw_rate.append(stats['D']/total)
            ref_away_rate.append(stats['A']/total)
        else:
            ref_home_rate.append(0.0); ref_draw_rate.append(0.0); ref_away_rate.append(0.0)
        ref_avg_yellow.append(np.mean(stats['yellow']) if stats['yellow'] else 0.0)
        ref_avg_red.append(np.mean(stats['red']) if stats['red'] else 0.0)
        ref_avg_fouls.append(np.mean(stats['fouls']) if stats['fouls'] else 0.0)
        ref_matches.append(total)
        stats['yellow'].append(row["HY"] + row["AY"])
        stats['red'].append(row["HR"] + row["AR"])
        stats['fouls'].append(row["HF"] + row["AF"])
        stats[row["FTR"]] += 1
    df["ref_home_rate_v2"] = ref_home_rate
    df["ref_draw_rate_v2"] = ref_draw_rate
    df["ref_away_rate_v2"] = ref_away_rate
    df["ref_avg_yellow"] = ref_avg_yellow
    df["ref_avg_red"] = ref_avg_red
    df["ref_avg_fouls"] = ref_avg_fouls
    df["ref_matches_v2"] = ref_matches
    df["ref_home_bias_v2"] = np.array(ref_home_rate) - np.array(ref_away_rate)
    return df

def add_all_advanced_features(df):
    df = df.sort_values("Date").copy()
    
    team_results = defaultdict(lambda: deque(maxlen=5))
    
    form_home, form_away = [], []
    win_streak_home, win_streak_away = [], []
    unbeaten_home, unbeaten_away = [], []
    
    for idx, row in df.iterrows():
        h, a = row["HomeTeam"], row["AwayTeam"]
        
        # home team
        h_results = list(team_results[h])
        if h_results:
            form_home.append(sum(h_results) / (len(h_results) * 3))
            streak = sum(1 for r in reversed(h_results) if r == 3)
            win_streak_home.append(min(streak, len([r for r in reversed(h_results) if r == 3])))
            unbeaten = sum(1 for i, r in enumerate(reversed(h_results)) if r >= 1 and all(x >= 1 for x in list(reversed(h_results))[:i+1]))
            unbeaten_home.append(unbeaten)
        else:
            form_home.append(0.5)
            win_streak_home.append(0)
            unbeaten_home.append(0)
        
        # away team
        a_results = list(team_results[a])
        if a_results:
            form_away.append(sum(a_results) / (len(a_results) * 3))
            streak = sum(1 for r in reversed(a_results) if r == 3)
            win_streak_away.append(min(streak, len([r for r in reversed(a_results) if r == 3])))
            unbeaten = sum(1 for i, r in enumerate(reversed(a_results)) if r >= 1 and all(x >= 1 for x in list(reversed(a_results))[:i+1]))
            unbeaten_away.append(unbeaten)
        else:
            form_away.append(0.5)
            win_streak_away.append(0)
            unbeaten_away.append(0)
        
        # result
        ftr = row["FTR"]
        if ftr == "H":
            team_results[h].append(3)
            team_results[a].append(0)
        elif ftr == "D":
            team_results[h].append(1)
            team_results[a].append(1)
        else:
            team_results[h].append(0)
            team_results[a].append(3)
    
    df["form_home_v2"] = form_home
    df["form_away_v2"] = form_away
    df["form_diff_v2"] = np.array(form_home) - np.array(form_away)
    df["win_streak_home"] = win_streak_home
    df["win_streak_away"] = win_streak_away
    df["unbeaten_home"] = unbeaten_home
    df["unbeaten_away"] = unbeaten_away
    
    home_home_results = defaultdict(lambda: deque(maxlen=5))
    away_away_results = defaultdict(lambda: deque(maxlen=5))
    
    home_home_form, away_away_form = [], []
    
    for idx, row in df.iterrows():
        h, a = row["HomeTeam"], row["AwayTeam"]
        
        hh = list(home_home_results[h])
        home_home_form.append(sum(hh) / (len(hh) * 3) if hh else 0.5)
        
        aa = list(away_away_results[a])
        away_away_form.append(sum(aa) / (len(aa) * 3) if aa else 0.5)
        
        ftr = row["FTR"]
        if ftr == "H":
            home_home_results[h].append(3)
            away_away_results[a].append(0)
        elif ftr == "D":
            home_home_results[h].append(1)
            away_away_results[a].append(1)
        else:
            home_home_results[h].append(0)
            away_away_results[a].append(3)
    
    df["home_home_form"] = home_home_form
    df["away_away_form"] = away_away_form
    
    team_goals_for = defaultdict(lambda: deque(maxlen=5))
    team_goals_against = defaultdict(lambda: deque(maxlen=5))
    
    attack_mom_home, attack_mom_away = [], []
    defense_mom_home, defense_mom_away = [], []
    
    for idx, row in df.iterrows():
        h, a = row["HomeTeam"], row["AwayTeam"]
        
        gf_h = list(team_goals_for[h])
        ga_h = list(team_goals_against[h])
        attack_mom_home.append(np.mean(gf_h) if gf_h else 1.3)
        defense_mom_home.append(np.mean(ga_h) if ga_h else 1.3)
        
        gf_a = list(team_goals_for[a])
        ga_a = list(team_goals_against[a])
        attack_mom_away.append(np.mean(gf_a) if gf_a else 1.3)
        defense_mom_away.append(np.mean(ga_a) if ga_a else 1.3)
        
        team_goals_for[h].append(row["FTHG"])
        team_goals_against[h].append(row["FTAG"])
        team_goals_for[a].append(row["FTAG"])
        team_goals_against[a].append(row["FTHG"])
    
    df["attack_momentum_home"] = attack_mom_home
    df["attack_momentum_away"] = attack_mom_away
    df["defense_momentum_home"] = defense_mom_home
    df["defense_momentum_away"] = defense_mom_away
    df["attack_vs_defense_home"] = np.array(attack_mom_home) - np.array(defense_mom_away)
    df["attack_vs_defense_away"] = np.array(attack_mom_away) - np.array(defense_mom_home)
    
    season_points = defaultdict(lambda: defaultdict(int))
    season_gd = defaultdict(lambda: defaultdict(int))
    
    position_home, position_away = [], []
    points_home, points_away = [], []
    gd_home, gd_away = [], []
    
    for idx, row in df.iterrows():
        h, a = row["HomeTeam"], row["AwayTeam"]
        season = row["Season"]
        
        pts_h = season_points[season][h]
        pts_a = season_points[season][a]
        gd_h_val = season_gd[season][h]
        gd_a_val = season_gd[season][a]
        
        points_home.append(pts_h)
        points_away.append(pts_a)
        gd_home.append(gd_h_val)
        gd_away.append(gd_a_val)
        
        all_teams = list(season_points[season].keys())
        if all_teams:
            sorted_teams = sorted(all_teams, 
                                  key=lambda t: (season_points[season][t], season_gd[season][t]), 
                                  reverse=True)
            pos_h = sorted_teams.index(h) + 1 if h in sorted_teams else 10
            pos_a = sorted_teams.index(a) + 1 if a in sorted_teams else 10
        else:
            pos_h, pos_a = 10, 10
        
        position_home.append(pos_h)
        position_away.append(pos_a)
        
        ftr = row["FTR"]
        gh, ga = row["FTHG"], row["FTAG"]
        if ftr == "H":
            season_points[season][h] += 3
        elif ftr == "D":
            season_points[season][h] += 1
            season_points[season][a] += 1
        else:
            season_points[season][a] += 3
        
        season_gd[season][h] += (gh - ga)
        season_gd[season][a] += (ga - gh)
    
    df["position_home_v2"] = position_home
    df["position_away_v2"] = position_away
    df["position_diff_v2"] = np.array(position_away) - np.array(position_home)
    df["points_home_v2"] = points_home
    df["points_away_v2"] = points_away
    df["points_diff_v2"] = np.array(points_home) - np.array(points_away)
    df["gd_home"] = gd_home
    df["gd_away"] = gd_away
    df["gd_diff"] = np.array(gd_home) - np.array(gd_away)
    
    h2h_history = defaultdict(list)
    
    h2h_home_rate, h2h_draw_rate, h2h_away_rate = [], [], []
    h2h_matches = []
    
    for idx, row in df.iterrows():
        h, a = row["HomeTeam"], row["AwayTeam"]
        key = tuple(sorted([h, a]))
        
        past = h2h_history[key][-10:]
        
        if past:
            h_wins = sum(1 for r in past if 
                        (r["home"] == h and r["result"] == "H") or 
                        (r["home"] == a and r["result"] == "A"))
            draws = sum(1 for r in past if r["result"] == "D")
            total = len(past)
            
            h2h_home_rate.append(h_wins / total)
            h2h_draw_rate.append(draws / total)
            h2h_away_rate.append((total - h_wins - draws) / total)
            h2h_matches.append(total)
        else:
            h2h_home_rate.append(0.45)
            h2h_draw_rate.append(0.25)
            h2h_away_rate.append(0.30)
            h2h_matches.append(0)
        
        h2h_history[key].append({"home": h, "result": row["FTR"]})
    
    df["h2h_home_rate_v2"] = h2h_home_rate
    df["h2h_draw_rate_v2"] = h2h_draw_rate
    df["h2h_away_rate_v2"] = h2h_away_rate
    df["h2h_matches_v2"] = h2h_matches
    
    last_match = {}
    rest_home, rest_away = [], []
    
    for idx, row in df.iterrows():
        h, a = row["HomeTeam"], row["AwayTeam"]
        d = row["Date"]
        
        rest_h = (d - last_match[h]).days if h in last_match else 7
        rest_a = (d - last_match[a]).days if a in last_match else 7
        
        rest_home.append(np.clip(rest_h, 1, 21))
        rest_away.append(np.clip(rest_a, 1, 21))
        
        last_match[h] = d
        last_match[a] = d
    
    df["rest_days_home_v2"] = rest_home
    df["rest_days_away_v2"] = rest_away
    df["rest_diff_v2"] = np.array(rest_home) - np.array(rest_away)
    
    goals_history = defaultdict(lambda: deque(maxlen=5))
    
    gs_pm_home, gc_pm_home = [], []
    gs_pm_away, gc_pm_away = [], []
    
    for idx, row in df.iterrows():
        h, a = row["HomeTeam"], row["AwayTeam"]
        
        h_hist = list(goals_history[h])
        a_hist = list(goals_history[a])
        
        if h_hist:
            gs_pm_home.append(np.mean([x[0] for x in h_hist]))
            gc_pm_home.append(np.mean([x[1] for x in h_hist]))
        else:
            gs_pm_home.append(1.3)
            gc_pm_home.append(1.3)
        
        if a_hist:
            gs_pm_away.append(np.mean([x[0] for x in a_hist]))
            gc_pm_away.append(np.mean([x[1] for x in a_hist]))
        else:
            gs_pm_away.append(1.3)
            gc_pm_away.append(1.3)
        
        goals_history[h].append((row["FTHG"], row["FTAG"]))
        goals_history[a].append((row["FTAG"], row["FTHG"]))
    
    df["goals_scored_pm_home"] = gs_pm_home
    df["goals_conceded_pm_home"] = gc_pm_home
    df["goals_scored_pm_away"] = gs_pm_away
    df["goals_conceded_pm_away"] = gc_pm_away
    
    league_avg_home = df["FTHG"].mean()
    league_avg_away = df["FTAG"].mean()
    
    team_attack = defaultdict(lambda: deque(maxlen=10))
    team_defense = defaultdict(lambda: deque(maxlen=10))
    
    xg_home, xg_away = [], []
    
    for idx, row in df.iterrows():
        h, a = row["HomeTeam"], row["AwayTeam"]
        
        h_att = list(team_attack[h])
        a_def = list(team_defense[a])
        
        if h_att and a_def:
            h_attack_str = np.mean(h_att) / league_avg_home if league_avg_home > 0 else 1
            a_defense_str = np.mean(a_def) / league_avg_away if league_avg_away > 0 else 1
            xg_h = league_avg_home * h_attack_str * a_defense_str
        else:
            xg_h = league_avg_home
        
        a_att = list(team_attack[a])
        h_def = list(team_defense[h])
        
        if a_att and h_def:
            a_attack_str = np.mean(a_att) / league_avg_away if league_avg_away > 0 else 1
            h_defense_str = np.mean(h_def) / league_avg_home if league_avg_home > 0 else 1
            xg_a = league_avg_away * a_attack_str * h_defense_str
        else:
            xg_a = league_avg_away
        
        xg_home.append(xg_h)
        xg_away.append(xg_a)
        
        team_attack[h].append(row["FTHG"])
        team_defense[h].append(row["FTAG"])
        team_attack[a].append(row["FTAG"])
        team_defense[a].append(row["FTHG"])
    
    df["xg_home"] = xg_home
    df["xg_away"] = xg_away
    df["xg_diff"] = np.array(xg_home) - np.array(xg_away)
    df["xg_total"] = np.array(xg_home) + np.array(xg_away)
    
    season_match_count = defaultdict(int)
    match_week = []
    
    for idx, row in df.iterrows():
        season = row["Season"]
        season_match_count[season] += 1
        week = (season_match_count[season] - 1) // 10 + 1
        match_week.append(min(week, 38))
    
    df["match_week"] = match_week
    df["is_late_season"] = (np.array(match_week) >= 30).astype(int)
    
    print(f"Added {len([c for c in df.columns if c not in ['Date','HomeTeam','AwayTeam','FTHG','FTAG','FTR','y','Season']])} features")
    
    return df

def add_draw_specific_features(df):
    """
    """
    df = df.sort_values("Date").copy()

    if "form_diff" in df.columns:
        df["abs_form_diff"] = df["form_diff"].abs()
    if "points_diff" in df.columns:
        df["abs_points_diff"] = df["points_diff"].abs()
    if "gd_diff" in df.columns:
        df["abs_gd_diff"] = df["gd_diff"].abs()
    if "elo_sum_diff" in df.columns:
        df["abs_elo_sum_diff"] = df["elo_sum_diff"].abs()
    if "attack_vs_defense_home" in df.columns and "attack_vs_defense_away" in df.columns:
        df["att_vs_def_diff_abs"] = (
            (df["attack_vs_defense_home"] - df["attack_vs_defense_away"]).abs()
        )
    if "position_diff" in df.columns:
        df["abs_position_diff"] = df["position_diff"].abs()
    if "h2h_goal_diff_avg" in df.columns:
        df["abs_h2h_gd_avg"] = df["h2h_goal_diff_avg"].abs()

 # xG & xG
    if "xg_total" in df.columns:
        df["low_xg_flag"] = (df["xg_total"] <= df["xg_total"].median()).astype(int)

    if "attack_momentum_home" in df.columns and "attack_momentum_away" in df.columns:
        df["attack_mom_sum"] = df["attack_momentum_home"] + df["attack_momentum_away"]
    if "defense_momentum_home" in df.columns and "defense_momentum_away" in df.columns:
        df["defense_mom_sum"] = df["defense_momentum_home"] + df["defense_momentum_away"]

    if "shots_pm_diff" in df.columns:
        df["abs_shots_pm_diff"] = df["shots_pm_diff"].abs()
    if "sot_pm_diff" in df.columns:
        df["abs_sot_pm_diff"] = df["sot_pm_diff"].abs()
    if "corners_pm_diff" in df.columns:
        df["abs_corners_pm_diff"] = df["corners_pm_diff"].abs()
    if "fouls_pm_diff" in df.columns:
        df["abs_fouls_pm_diff"] = df["fouls_pm_diff"].abs()
    if "yellows_pm_diff" in df.columns:
        df["abs_yellows_pm_diff"] = df["yellows_pm_diff"].abs()

    # last N draw rate + home/away draw rate
    N = 8
    team_draw_overall = defaultdict(lambda: deque(maxlen=N))
    team_draw_home    = defaultdict(lambda: deque(maxlen=N))
    team_draw_away    = defaultdict(lambda: deque(maxlen=N))

    draw_prop_home_list = []
    draw_prop_away_list = []

    for _, row in df.iterrows():
        h, a = row["HomeTeam"], row["AwayTeam"]

        hist_overall_h = list(team_draw_overall[h])
        hist_overall_a = list(team_draw_overall[a])
        hist_home_h    = list(team_draw_home[h])
        hist_away_a    = list(team_draw_away[a])

        # league baseline ~ 0.25
        def rate_or_baseline(hist, baseline=0.25):
            return (sum(hist) / len(hist)) if hist else baseline

        draw_prop_home = (
            0.5 * rate_or_baseline(hist_overall_h) +
            0.5 * rate_or_baseline(hist_home_h)
        )
        draw_prop_away = (
            0.5 * rate_or_baseline(hist_overall_a) +
            0.5 * rate_or_baseline(hist_away_a)
        )

        draw_prop_home_list.append(draw_prop_home)
        draw_prop_away_list.append(draw_prop_away)

        # result
        is_draw = 1 if row["FTR"] == "D" else 0
        team_draw_overall[h].append(is_draw)
        team_draw_overall[a].append(is_draw)

        team_draw_home[h].append(is_draw)
        team_draw_away[a].append(is_draw)

    df["draw_prop_home_v2"] = draw_prop_home_list
    df["draw_prop_away_v2"] = draw_prop_away_list
    df["draw_prop_sum_v2"]  = df["draw_prop_home_v2"] + df["draw_prop_away_v2"]
    df["draw_prop_diff_v2"] = df["draw_prop_home_v2"] - df["draw_prop_away_v2"]

    if "match_week" in df.columns:
        df["early_season"] = (df["match_week"] <= 5).astype(int)
        df["mid_season"]   = ((df["match_week"] >= 6) & (df["match_week"] <= 28)).astype(int)
        df["late_season"]  = (df["match_week"] >= 29).astype(int)

    if "position_home" in df.columns and "position_away" in df.columns:
        mid_home = ((df["position_home"] >= 7) & (df["position_home"] <= 14)).astype(int)
        mid_away = ((df["position_away"] >= 7) & (df["position_away"] <= 14)).astype(int)
        df["both_mid_table"] = (mid_home & mid_away).astype(int)

    if "rest_diff" in df.columns:
        df["abs_rest_diff"] = df["rest_diff"].abs()

    if "h2h_draw_rate" in df.columns and "h2h_draw_rate_td" in df.columns:
        df["h2h_draw_rate_mean"] = 0.5 * (df["h2h_draw_rate"] + df["h2h_draw_rate_td"])

    if "ref_draw_rate" in df.columns:
        df["high_draw_ref_flag"] = (df["ref_draw_rate"] >= df["ref_draw_rate"].median()).astype(int)

    print(" Added extended draw-specific features")
    return df

def compute_all_features(df, fe=None, is_train=True,
                         use_state_features=True,
                         use_all_adv_block=True,
                         use_shot_corners=True,
                         use_td_h2h=True,
                         use_draw_block=True):
    """

        fe = FeatureEngineering()
        cleaned_df = cleaned_df.sort_values("Date").copy()
        full_df = compute_all_features(cleaned_df, fe=fe, is_train=True)[0]

    return:(df_with_features, fe)
    """
    df = df.sort_values("Date").copy()

    if fe is None:
        fe = FeatureEngineering()

    # based on state ( class)
    if use_state_features:
        df = fe.compute_features(df, is_train=is_train)

    #  (win_streak / xg / match_week etc)
    if use_all_adv_block:
        df = add_all_advanced_features(df)

    # shot / corner / foul / yellow card
    if use_shot_corners:
        df = add_prematch_shot_form_v2(df)
        df = add_prematch_corners_fouls(df)

    # time decay H2H
    if use_td_h2h:
        df = add_advanced_h2h_features(df)

    # draw 
    if use_draw_block:
        df = add_draw_specific_features(df)

    
    # df = add_referee_features(df)

    return df, fe


In [5]:
fe = FeatureEngineering()

print("Feature engineering on cleaned_df(rolling by time state)...")
cleaned_df = cleaned_df.sort_values("Date").copy()
cleaned_df, fe = compute_all_features(
    cleaned_df,
    fe=fe,
    is_train=True,           # rolling through entire timeline
    use_state_features=True,
    use_all_adv_block=True,
    use_shot_corners=True,
    use_td_h2h=True,
    use_draw_block=True,
)

season_counts = cleaned_df.groupby("Season").size()
valid_seasons = season_counts[season_counts == 380].index.tolist()
print("invalid seasons:")
print(season_counts[season_counts != 380])

def season_start_year(s): 
    return int(str(s).split("/")[0])

train_seasons = [f"{y}/{y+1}" for y in range(2000, 2021)]
val_seasons   = ["2021/2022"]
test_seasons  = ["2022/2023", "2023/2024", "2024/2025"]

train_df = cleaned_df[
    cleaned_df["Season"].isin(train_seasons) & 
    cleaned_df["Season"].isin(valid_seasons)
].copy()

val_df   = cleaned_df[
    cleaned_df["Season"].isin(val_seasons) & 
    cleaned_df["Season"].isin(valid_seasons)
].copy()

test_df  = cleaned_df[
    cleaned_df["Season"].isin(test_seasons) & 
    cleaned_df["Season"].isin(valid_seasons)
].copy()

all_train_seasons = sorted(train_df["Season"].unique(), key=season_start_year)
recent_seasons = all_train_seasons[-8:]
train_recent = train_df[train_df["Season"].isin(recent_seasons)].copy()

print("recent_seasons:", recent_seasons)
print("sizes:", len(train_recent), len(val_df), len(test_df))

# This section can be kept,for DC / other models
DECAY_LAMBDA = 0.00325
TIME_DECAY_TAU = 1.0 / DECAY_LAMBDA
DRAW_CLASS_MULT = 1.1

y_tr = train_recent["y"].astype(int).values
base_class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.array([0, 1, 2]),
    y=y_tr
)
class_weights = {
    i: float(w) * (DRAW_CLASS_MULT if i == 1 else 1.0)
    for i, w in enumerate(base_class_weights)
}

def time_decay_weights(df, tau=TIME_DECAY_TAU):
    max_date = df["Date"].max()
    dt = (max_date - df["Date"]).dt.days.values
    return np.exp(-dt / tau)

def make_sample_weight(df, tau=TIME_DECAY_TAU):
    w_time = time_decay_weights(df, tau=tau)
    w_class = np.array([class_weights[int(y)] for y in df["y"].values])
    return w_time * w_class

print("class_weights (draw boosted):", class_weights)
print("time_decay_tau_days:", TIME_DECAY_TAU)


Feature engineering on cleaned_df(rolling by time state)...
Added 107 features
corner, foul, yellow/red cards history features are added
Added advanced H2H features (time-decayed & directional)
 Added extended draw-specific features
invalid seasons:
Series([], dtype: int64)
recent_seasons: ['2013/2014', '2014/2015', '2015/2016', '2016/2017', '2017/2018', '2018/2019', '2019/2020', '2020/2021']
sizes: 3040 380 1140
class_weights (draw boosted): {0: 0.7423687423687424, 1: 1.5766148043375767, 2: 1.046831955922865}
time_decay_tau_days: 307.6923076923077


In [6]:

# Verify data split correctness
print("=" * 50)
print("Dataset size verification")
print("=" * 50)
print(f"train_df: {len(train_df)} row")
print(f"val_df: {len(val_df)} row")
print(f"test_df: {len(test_df)} row")

# Verify no data leakage:check date ranges
print("\nDate range verification(ensure correct time order):")
print(f"train: {train_df['Date'].min()} ~ {train_df['Date'].max()}")
print(f"val:   {val_df['Date'].min()} ~ {val_df['Date'].max()}")
print(f"test:  {test_df['Date'].min()} ~ {test_df['Date'].max()}")

# Verify feature statistics(key checkpoints)
print(f"features statistical validation, make sure the distribution make sense")
key_features = ['form_home', 'elo_diff', 'h2h_draw_rate', 'position_diff']
for feat in key_features:
    if feat in train_df.columns:
        train_mean = train_df[feat].mean()
        val_mean = val_df[feat].mean()
        test_mean = test_df[feat].mean()
        print(f"{feat:20s} train={train_mean:7.3f}  val={val_mean:7.3f}  test={test_mean:7.3f}")

print(f"NaN features ratio validation")
for name, df in [("train", train_df), ("val", val_df), ("test", test_df)]:
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    nan_ratio = df[numeric_cols].isna().mean().mean()
    print(f"{name}: average NaN ratio = {nan_ratio:.4f}")

print(f"feature engineering has been completed. data has been splited correctly")


Dataset size verification
train_df: 7980 row
val_df: 380 row
test_df: 1140 row

Date range verification(ensure correct time order):
train: 2000-08-19 00:00:00 ~ 2021-05-23 00:00:00
val:   2021-08-13 00:00:00 ~ 2022-05-22 00:00:00
test:  2022-08-05 00:00:00 ~ 2025-05-25 00:00:00
features statistical validation, make sure the distribution make sense
form_home            train=  0.450  val=  0.459  test=  0.456
elo_diff             train= 75.285  val= 75.693  test= 75.922
h2h_draw_rate        train=  0.256  val=  0.232  test=  0.234
position_diff        train= -0.196  val= -0.161  test= -0.164
NaN features ratio validation
train: average NaN ratio = 0.0003
val: average NaN ratio = 0.0002
test: average NaN ratio = 0.0001
feature engineering has been completed. data has been splited correctly


## 3. Data Transformation & Exploration

In [7]:

# if DC model uses only recent seasons,e.g. dc_train_df,can replace base_df with that one;
# otherwise,just use train_df as base.
base_df = train_df   # or dc_train_df,depending on what you feed to DC 

print("Current base_df columns:", len(base_df.columns))

# Select only numeric columns
numeric_cols = base_df.select_dtypes(include=[np.number]).columns.tolist()
print(f"check the column number of base_df, len(numeric_cols)")

# y( id/meta column)
meta_cols = ["y"]  # if match_id,
candidate_features = [c for c in numeric_cols if c not in meta_cols]

print(f"total number of column, len(candidate_features)")

# Remove leakage features
candidate_features = remove_leak_features(candidate_features)

print(f"features after removed leak features, len(candidate_features)")

# safety check
if len(candidate_features) == 0:
    print(f"candidate_features is empty, the first 50th column：")
    print(numeric_cols[:50])
    raise ValueError(
        "candidate_features has been deleted as empty by leak features"
    )


Current base_df columns: 171
check the column number of base_df, len(numeric_cols)
total number of column, len(candidate_features)
 remove 24 leakage features: ['FTHG', 'FTAG', 'HTHG', 'HTAG', 'HS', 'AS', 'HST', 'AST', 'HC', 'AC', 'HF', 'AF', 'HY', 'AY', 'HR', 'AR', 'shots_for', 'shots_against', 'sot_for', 'sot_against', 'corners_for', 'corners_against', 'shot_accuracy', 'opp_shot_accuracy']
✅ Features after cleaning: 163 → 139
features after removed leak features, len(candidate_features)


In [8]:
def compute_feature_scores_once_full(
    df,
    candidate_features,
    target_col="y",
    methods=('permutation', 'mutual_info', 'rf_importance'),
    n_estimators_rf=200,
    n_repeats_perm=10,
    random_state_base=42,
    verbose=True
):
    """
      - RandomForest + permutation_importance
      - mutual_info_classif
      - RandomForest feature_importances_
    return:
      - combined_scores
      - method_scores
    """
    available_features = [f for f in candidate_features if f in df.columns]
    if verbose:
        print(f"  number of avaliable features: {len(available_features)}")

    if not available_features:
        if verbose:
            print(f"no feature avaliable, return empty")
        return pd.Series(dtype=float), {}

    X = df[available_features].copy().fillna(0.0)
    y = df[target_col].astype(int)

    method_scores = {}

    # ---- 1) Permutation Importance ----
    if 'permutation' in methods:
        if verbose:
            print("  [1/3] calculate Permutation...")
        rf_perm = RandomForestClassifier(
            n_estimators=n_estimators_rf,
            random_state=random_state_base,
            n_jobs=-1
        )
        rf_perm.fit(X, y)
        perm = permutation_importance(
            rf_perm, X, y,
            n_repeats=n_repeats_perm,
            random_state=random_state_base
        )
        perm_scores = pd.Series(perm.importances_mean, index=X.columns)
        method_scores['permutation'] = perm_scores

    # ---- 2) Mutual Information ----
    if 'mutual_info' in methods:
        if verbose:
            print("  [2/3] calculate Mutual Information...")
        mi_vals = mutual_info_classif(X, y, random_state=random_state_base)
        mi_scores = pd.Series(mi_vals, index=X.columns)
        method_scores['mutual_info'] = mi_scores

    # ---- 3) RandomForest feature_importances_ ----
    if 'rf_importance' in methods:
        if verbose:
            print("  [3/3] calculate RandomForest...")
        rf_imp = RandomForestClassifier(
            n_estimators=n_estimators_rf,
            random_state=random_state_base + 1,
            n_jobs=-1
        )
        rf_imp.fit(X, y)
        rf_scores = pd.Series(rf_imp.feature_importances_, index=X.columns)
        method_scores['rf_importance'] = rf_scores

    if not method_scores:
        if verbose:
            print(f"None of the rating mathoed is applied.")
        return pd.Series(dtype=float), {}

    if verbose:
        print(f"Combine multiple methods' score, find average.")

    # Construct DataFrame: line=Feature, column=method
    score_df = pd.DataFrame(method_scores)

    # z-score standardization, dominate
    score_df_z = (score_df - score_df.mean()) / (score_df.std(ddof=0) + 1e-9)

    # compute average as combined score
    combined_scores = score_df_z.mean(axis=1)
    combined_scores = combined_scores.sort_values(ascending=False)

    if verbose:
        print(f"rating process has been complete once, below are top 10 combined features:")
        print(combined_scores.head(10).to_dict())

    return combined_scores, method_scores


In [9]:
def clean_tf_features(df, cols):
    good = []
    for c in cols:
        if not isinstance(c, str) or c not in df.columns:
            continue
        s = df[c]
        if isinstance(s, pd.DataFrame):
            if all(pd.api.types.is_numeric_dtype(s[col]) for col in s.columns):
                if (s.nunique(axis=1) == 1).all():
                    s = s.iloc[:, 0]
                else:
                    continue
            else:
                continue
        if not pd.api.types.is_numeric_dtype(s.dtype):
            continue
        good.append(c)
    return good

def select_unified_features(df, verbose=True):
    """
    return:(all_features, xgb_features, dc_features, tf_features)
    """
    # leaked features
    LEAK_COLS = {
        'FTHG', 'FTAG', 'HTHG', 'HTAG', 'FTR', 'HTR',
        'HS', 'AS', 'HST', 'AST', 'HC', 'AC', 
        'HF', 'AF', 'HY', 'AY', 'HR', 'AR',
        'shots_for', 'shots_against', 'sot_for', 'sot_against',
        'corners_for', 'corners_against', 'shot_accuracy', 'opp_shot_accuracy',
    }
    
    # core features
    manual_core = [
        # form/form
        "form_home", "form_away", "form_diff",
        "form_home_v2", "form_away_v2", "form_diff_v2",
        "home_home_form", "away_away_form",
        "win_streak_home", "win_streak_away", "unbeaten_home", "unbeaten_away",
        # L5/L10
        "L5HWR", "L5HDR", "L5AWR", "L5ADR",
        "L10HWR", "L10HDR", "L10AWR", "L10ADR",
        "L5_home_adv", "L10_home_adv",
        # integration/position
        "points_home", "points_away", "points_diff",
        "points_home_v2", "points_away_v2", "points_diff_v2",
        "position_home", "position_away", "position_diff",
        "position_home_v2", "position_away_v2", "position_diff_v2",
        "gd_home", "gd_away", "gd_diff", "season_gd_diff",
        # Elo
        "elo_home", "elo_away", "elo_diff",
        "elo_att_home", "elo_def_home", "elo_att_away", "elo_def_away",
        # momentum
        "attack_momentum_home", "attack_momentum_away",
        "defense_momentum_home", "defense_momentum_away",
        "attack_vs_defense_home", "attack_vs_defense_away",
        # goal
        "goals_pm_home", "goals_pm_away", "conceded_pm_home", "conceded_pm_away",
        "goals_scored_pm_home", "goals_scored_pm_away",
        "goals_conceded_pm_home", "goals_conceded_pm_away",
        "gd_pm_home", "gd_pm_away", "gd_pm_diff",
        # xG
        "xg_home", "xg_away", "xg_diff", "xg_total",
        # H2H
        "h2h_home_rate", "h2h_draw_rate", "h2h_away_rate",
        "h2h_home_rate_v2", "h2h_draw_rate_v2", "h2h_away_rate_v2",
        "h2h_home_rate_td", "h2h_draw_rate_td", "h2h_away_rate_td",
        "h2h_goal_diff_avg", "h2h_matches", "h2h_matches_v2", "h2h_matches_td",
        # rest
        "rest_days_home", "rest_days_away", "rest_diff",
        "rest_days_home_v2", "rest_days_away_v2", "rest_diff_v2",
        # draw
        "draw_prop_home", "draw_prop_away", "draw_prop_sum",
        "draw_prop_home_v2", "draw_prop_away_v2", "draw_prop_sum_v2", "draw_prop_diff_v2",
        # 
        "abs_form_diff", "abs_points_diff", "abs_gd_diff", "abs_position_diff", "abs_h2h_gd_avg",
        # pm
        "shots_for_pm_home", "shots_for_pm_away", "sot_for_pm_home", "sot_for_pm_away",
        "shots_against_pm_home", "shots_against_pm_away", "sot_against_pm_home", "sot_against_pm_away",
        "shots_pm_diff", "sot_pm_diff",
        # corners/fouls/pm
        "corners_for_pm_home", "corners_for_pm_away", "corners_pm_diff",
        "fouls_pm_home", "fouls_pm_away", "fouls_pm_diff",
        "yellows_pm_home", "yellows_pm_away", "yellows_pm_diff",
        # referee
        "ref_home_rate", "ref_draw_rate", "ref_away_rate",
        "ref_home_rate_v2", "ref_draw_rate_v2", "ref_away_rate_v2",
        "ref_matches", "ref_matches_v2", "ref_home_bias", "ref_home_bias_v2",
        "ref_avg_yellow", "ref_avg_red", "ref_avg_fouls",
        # 
        "match_week", "is_late_season", "early_season", "mid_season", "late_season", "both_mid_table",
    ]
    
    manual_core = [c for c in manual_core if c in df.columns]
    
    # 
    extra_cols = []
    meta_cols = {"date", "season", "hometeam", "awayteam", "ftr", "y", "referee"}
    for c in df.columns:
        name = str(c).lower()
        if c in manual_core or name in meta_cols or c in LEAK_COLS:
            continue
        if any(kw in name for kw in ["momentum", "adv", "streak", "prop"]):
            extra_cols.append(c)
    
    candidate = list(dict.fromkeys(manual_core + extra_cols))
    candidate = [c for c in candidate if c not in LEAK_COLS]
    all_features = clean_tf_features(df, candidate)
    
    # 
    xgb_features = all_features.copy()
    
    dc_priority = [
        "elo_diff", "form_diff", "form_diff_v2", "points_diff", "position_diff", "gd_diff",
        "h2h_home_rate", "h2h_draw_rate", "rest_diff", "xg_diff", "L5_home_adv", "draw_prop_sum_v2",
    ]
    dc_features = [f for f in dc_priority if f in all_features]
    for f in all_features:
        if f not in dc_features and len(dc_features) < 12:
            dc_features.append(f)
    dc_features = dc_features[:12]
    
    tf_features = all_features.copy()
    
    if verbose:
        print(f"total features are choosed: all = {len(all_features)}, XGB={len(xgb_features)}, DC={len(dc_features)}, TF={len(tf_features)}")
        print(f"DC features: {dc_features}")
    
    return all_features, xgb_features, dc_features, tf_features

In [10]:
RESELECT_FEATURES = False  #  True 

if (not RESELECT_FEATURES) and os.path.exists("unified_features.pkl"):
    # 
    with open("unified_features.pkl", "rb") as f:
        saved = pickle.load(f)
    feature_cols_xgb = saved["feature_cols_xgb"]
    dc_feature_cols = saved["dc_feature_cols"]
    tf_token_features = saved["tf_token_features"]
    
    LEAK_CHECK = {'FTHG','FTAG','HS','AS','HST','AST','HC','AC','HF','AF','HY','AY','HR','AR',
                  'shots_for','shots_against','sot_for','sot_against','corners_for','corners_against'}
    feature_cols_xgb = [f for f in feature_cols_xgb if f not in LEAK_CHECK]
    dc_feature_cols = [f for f in dc_feature_cols if f not in LEAK_CHECK]
    tf_token_features = [f for f in tf_token_features if f not in LEAK_CHECK]
    
    print(f"we have load features from unified_features.pkl")

else:
 # ( train_df)
    _, feature_cols_xgb, dc_feature_cols, tf_token_features = select_unified_features(train_df, verbose=True)
    
    # Save
    with open("unified_features.json", "w") as f:
        json.dump({"xgb": feature_cols_xgb, "dc": dc_feature_cols, "tf": tf_token_features}, f, indent=2)
    with open("unified_features.pkl", "wb") as f:
        pickle.dump({"feature_cols_xgb": feature_cols_xgb, "dc_feature_cols": dc_feature_cols, 
                     "tf_token_features": tf_token_features}, f)
    print("✅ unified_features.pkl")

print(f"\nfinal number of features: XGB={len(feature_cols_xgb)}, DC={len(dc_feature_cols)}, TF={len(tf_token_features)}")
print(f"DC features: {dc_feature_cols}")

we have load features from unified_features.pkl

final number of features: XGB=120, DC=12, TF=120
DC features: ['elo_diff', 'form_diff', 'form_diff_v2', 'points_diff', 'position_diff', 'gd_diff', 'h2h_home_rate', 'h2h_draw_rate', 'rest_diff', 'xg_diff', 'draw_prop_sum_v2', 'form_home']


## 4. Methodology Overview

## 5. Model Training & Validation

### Bayesian Dixon–Coles

In [11]:
from sklearn.preprocessing import StandardScaler

# DC ( train_recent DC)
dc_train_df = train_recent.copy()

# scaler( dc_train_df)
dc_scaler = StandardScaler()
X_dc_train = dc_scaler.fit_transform(
    dc_train_df[dc_feature_cols].copy().fillna(0.0)
)

# val/test ( scaler)
X_dc_train_full = dc_scaler.transform(
    train_df[dc_feature_cols].copy().fillna(0.0)
)
X_dc_val = dc_scaler.transform(
    val_df[dc_feature_cols].copy().fillna(0.0)
)
X_dc_test = dc_scaler.transform(
    test_df[dc_feature_cols].copy().fillna(0.0)
)

print("shape of DC features matrix:" , X_dc_train.shape,
      "train:", X_dc_train_full.shape,
      "val:", X_dc_val.shape,
      "test:", X_dc_test.shape)


shape of DC features matrix: (3040, 12) train: (7980, 12) val: (380, 12) test: (1140, 12)


In [12]:
dc_train_df = train_recent.copy()

teams = sorted(set(dc_train_df["HomeTeam"]).union(dc_train_df["AwayTeam"]))
tmap = {t: i for i, t in enumerate(teams)}
n_teams = len(teams)

home_idx = dc_train_df["HomeTeam"].map(tmap).values.astype("int64")
away_idx = dc_train_df["AwayTeam"].map(tmap).values.astype("int64")
gh = dc_train_df["FTHG"].astype(int).values
ga = dc_train_df["FTAG"].astype(int).values

xi = DECAY_LAMBDA
last_date = dc_train_df["Date"].max()
delta_days = (last_date - dc_train_df["Date"]).dt.days.values
w = np.exp(-xi * delta_days)

X_dc_train = dc_scaler.transform(
    dc_train_df[dc_feature_cols].copy().fillna(0.0)
)
n_features_dc = X_dc_train.shape[1]

X_dc_shared = at.as_tensor_variable(X_dc_train, dtype="float64")
hi = at.as_tensor_variable(home_idx, dtype="int64")
ai = at.as_tensor_variable(away_idx, dtype="int64")
gH = at.as_tensor_variable(gh, dtype="int64")
gA = at.as_tensor_variable(ga, dtype="int64")
weights = at.as_tensor_variable(w, dtype="float64")

with pm.Model() as dc_model:
    sigma_att = pm.HalfNormal("sigma_att", sigma=0.7)
    sigma_def = pm.HalfNormal("sigma_def", sigma=0.7)

    att_offset = pm.Normal("att_offset", 0.0, 1.0, shape=n_teams)
    def_offset = pm.Normal("def_offset", 0.0, 1.0, shape=n_teams)

    attack_raw  = att_offset * sigma_att
    defense_raw = def_offset * sigma_def
    attack  = pm.Deterministic("attack",  attack_raw  - attack_raw[-1])
    defense = pm.Deterministic("defense", defense_raw - defense_raw[-1])

    home_adv = pm.Normal("home_adv", 0.0, 0.5)
    rho_raw  = pm.Normal("rho_raw", 0.0, 0.7)
    rho = pm.Deterministic("rho", 0.6 * pm.math.tanh(rho_raw))

    beta_h = pm.Normal("beta_h", 0.0, 0.3, shape=n_features_dc)
    beta_a = pm.Normal("beta_a", 0.0, 0.3, shape=n_features_dc)

    lin_h = home_adv + attack[hi] - defense[ai] + at.dot(X_dc_shared, beta_h)
    lin_a = attack[ai] - defense[hi] + at.dot(X_dc_shared, beta_a)

    lam_h = pm.math.exp(lin_h)
    lam_a = pm.math.exp(lin_a)

    corr = at.ones_like(gH, dtype="float64")

    m00 = at.and_(at.eq(gH, 0), at.eq(gA, 0))
    m01 = at.and_(at.eq(gH, 0), at.eq(gA, 1))
    m10 = at.and_(at.eq(gH, 1), at.eq(gA, 0))
    m11 = at.and_(at.eq(gH, 1), at.eq(gA, 1))

    corr = at.switch(m00, 1 - lam_h * lam_a * rho, corr)
    corr = at.switch(m01, 1 + lam_h * rho, corr)
    corr = at.switch(m10, 1 + lam_a * rho, corr)
    corr = at.switch(m11, 1 - rho, corr)
    corr = at.clip(corr, 1e-6, np.inf)

    logp_home = pm.logp(pm.Poisson.dist(mu=lam_h), gH)
    logp_away = pm.logp(pm.Poisson.dist(mu=lam_a), gA)
    logp_corr = at.log(corr)

    pm.Potential("weighted_like",
                 at.sum(weights * (logp_home + logp_away + logp_corr)))

    # sampling
    trace_dc = pm.sample(
        draws=1000,
        tune=500,
        chains=4,
        cores=2,             
        target_accept=0.95,    # Less conservative → faster
        random_seed=42
    )

post = trace_dc.posterior.stack(sample=("chain", "draw"))

attack_s   = post["attack"].values               # (n_samples, n_teams)
defense_s  = post["defense"].values             # (n_samples, n_teams)
home_adv_s = post["home_adv"].values            # (n_samples,)
rho_s      = post["rho"].values                 # (n_samples,)
beta_h_s   = post["beta_h"].transpose("sample", "beta_h_dim_0").values
beta_a_s   = post["beta_a"].transpose("sample", "beta_a_dim_0").values

n_samples = attack_s.shape[0]
print("n_samples:", n_samples, "n_features_dc:", n_features_dc)

import pickle

dc_bayes_ckpt = {
    "tmap": tmap,
    "attack_s": attack_s,
    "defense_s": defense_s,
    "home_adv_s": home_adv_s,
    "rho_s": rho_s,
    "beta_h_s": beta_h_s,
    "beta_a_s": beta_a_s,
    "dc_feature_cols": dc_feature_cols,
    "DECAY_LAMBDA": DECAY_LAMBDA,
    "dc_scaler": dc_scaler,
}

with open("bayes_dc_global.ckpt", "wb") as f:
    pickle.dump(dc_bayes_ckpt, f)

print(f"Bayes DC posterior checkpoint has been save as bayes_dc_global.ckpt")


Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 2 jobs)
NUTS: [sigma_att, sigma_def, att_offset, def_offset, home_adv, rho_raw, beta_h, beta_a]


Output()

Sampling 4 chains for 500 tune and 1_000 draw iterations (2_000 + 4_000 draws total) took 139 seconds.


n_samples: 31 n_features_dc: 12
Bayes DC posterior checkpoint has been save as bayes_dc_global.ckpt


In [13]:
from math import exp, factorial
from sklearn.metrics import log_loss

def poisson_pmf(lam, k):
    return exp(-lam) * lam**k / factorial(k)

def dc_joint_matrix(lam_h, lam_a, rho, max_goals=5):
    goals = np.arange(max_goals + 1)
    p_h = np.array([poisson_pmf(lam_h, g) for g in goals])
    p_a = np.array([poisson_pmf(lam_a, g) for g in goals])
    M = np.outer(p_h, p_a)
    if abs(rho) > 1e-9:
        M = M.copy()
        M[0,0] *= max(1e-12, 1 - lam_h*lam_a*rho)
        if max_goals >= 1:
            M[0,1] *= max(1e-12, 1 + lam_h*rho)
            M[1,0] *= max(1e-12, 1 + lam_a*rho)
            M[1,1] *= max(1e-12, 1 - rho)
    return M / M.sum()

def bayes_dc_predict_full(df_subset,
                          X_dc_subset,
                          attack_s, defense_s,
                          home_adv_s, rho_s,
                          beta_h_s, beta_a_s,
                          tmap,
                          max_goals=5):
    
    n_matches = len(df_subset)
    n_samples = attack_s.shape[0]
    n_features_dc = X_dc_subset.shape[1]

    out_probs = np.zeros((n_matches, 3))
    out_scorelines = []

    for i, (_, row) in enumerate(df_subset.iterrows()):
        home, away = row["HomeTeam"], row["AwayTeam"]
        ih = tmap.get(home)
        ia = tmap.get(away)

        # 
        if ih is None or ia is None:
            out_probs[i] = [1/3, 1/3, 1/3]
            out_scorelines.append(
                np.full((max_goals+1, max_goals+1),
                        1/((max_goals+1)**2))
            )
            continue

        x_vec = X_dc_subset[i]  # shape: (n_features_dc,)

        M_avg = np.zeros((max_goals+1, max_goals+1))

        for k in range(n_samples):
            base_h = home_adv_s[k] + attack_s[k, ih] - defense_s[k, ia]
            base_a = attack_s[k, ia] - defense_s[k, ih]

            adj_h = np.dot(beta_h_s[k], x_vec)
            adj_a = np.dot(beta_a_s[k], x_vec)

            lam_h_k = np.exp(base_h + adj_h)
            lam_a_k = np.exp(base_a + adj_a)
            rho_k = rho_s[k]

            M_avg += dc_joint_matrix(lam_h_k, lam_a_k, rho_k, max_goals=max_goals)

        M_avg /= n_samples
        out_scorelines.append(M_avg)

        p_home = np.tril(M_avg, k=-1).sum()   # gH > gA
        p_away = np.triu(M_avg, k=1).sum()    # gH < gA
        p_draw = np.trace(M_avg)

        tot = p_home + p_draw + p_away
        out_probs[i] = [p_home/tot, p_draw/tot, p_away/tot]  #  = [H, D, A]

    return out_probs, out_scorelines


In [14]:
# X_dc_*( scaler standardization)

beta_h_s   = post["beta_h"].transpose("sample", "beta_h_dim_0").values
beta_a_s   = post["beta_a"].transpose("sample", "beta_a_dim_0").values

proba_train_dc, scorelines_train = bayes_dc_predict_full(
    train_df,
    X_dc_train_full,
    attack_s, defense_s,
    home_adv_s, rho_s,
    beta_h_s, beta_a_s,
    tmap
)
proba_val_dc, scorelines_val = bayes_dc_predict_full(
    val_df,
    X_dc_val,
    attack_s, defense_s,
    home_adv_s, rho_s,
    beta_h_s, beta_a_s,
    tmap
)
proba_test_dc, scorelines_test = bayes_dc_predict_full(
    test_df,
    X_dc_test,
    attack_s, defense_s,
    home_adv_s, rho_s,
    beta_h_s, beta_a_s,
    tmap
)

print("Bayes DC val logloss:",
      round(log_loss(val_df["y"], proba_val_dc, labels=[0,1,2]), 4))
print("Bayes DC test logloss:",
      round(log_loss(test_df["y"], proba_test_dc, labels=[0,1,2]), 4))


Bayes DC val logloss: 0.9729
Bayes DC test logloss: 1.001


### XGBoost

In [15]:

import os
import pickle
import json

if (not RESELECT_FEATURES) and os.path.exists("unified_features.pkl"):
    with open("unified_features.pkl", "rb") as f:
        saved = pickle.load(f)

    feature_cols_xgb = saved["feature_cols_xgb"]
    dc_feature_cols  = saved.get("dc_feature_cols", [])
    stability_score_dict = saved.get("stability_score", {})
    feature_cols_xgb = remove_leak_features(feature_cols_xgb)
    
    print(f"XGB feature count: {len(feature_cols_xgb)}")
    print(f"features has been load from unified_features.pkl")
    print(f"number of XGB features", len(feature_cols_xgb))
    print(f"number of DC features", len(dc_feature_cols))

else:
    # 
    feature_cols_xgb = list(selected_features_60)
    feature_cols_xgb = [c for c in feature_cols_xgb if c in train_df.columns]
    
    # Remove leakage features
    feature_cols_xgb = remove_leak_features(feature_cols_xgb)
    
    print(f"XGB feature count: {len(feature_cols_xgb)}")

✅ Features after cleaning: 120 → 120
XGB feature count: 120
features has been load from unified_features.pkl
number of XGB features 120
number of DC features 12


In [16]:
for name in ["train_df", "val_df", "test_df"]:
    df = locals()[name]
    df = df.loc[:, ~df.columns.duplicated()].copy()
    locals()[name] = df


In [17]:
feature_cols_xgb = list(dict.fromkeys(feature_cols_xgb))
print("XGB features (unique):", len(feature_cols_xgb))


XGB features (unique): 120


In [18]:
train_means = train_df[feature_cols_xgb].mean()

for dfx in [train_df, val_df, test_df]:
 # fillna, train_means
    dfx[feature_cols_xgb] = dfx[feature_cols_xgb].fillna(train_means)


In [19]:
print(f"repeated feature name", [c for c in feature_cols_xgb if feature_cols_xgb.count(c) > 1])
train_means = train_df[feature_cols_xgb].mean()
print("train_means index duplicated?:", train_means.index.duplicated().any())


repeated feature name []
train_means index duplicated?: False


In [20]:
feature_cols_xgb = [c for c in feature_cols_xgb if c in train_df.columns]
print(f"XGB feature count: {len(feature_cols_xgb)}")

# filter for features not in train_df (prevent column name mismatch)
train_means = train_df[feature_cols_xgb].mean()
for dfx in [train_df, val_df, test_df]:
    for col in feature_cols_xgb:
        dfx[col] = dfx[col].fillna(train_means[col])

all_train_seasons = sorted(train_df["Season"].unique(), key=season_start_year)
recent_train_seasons = all_train_seasons[-10:]      #  10 
fold_seasons = recent_train_seasons[-6:]            #  6 "CV"

print("All train seasons:", all_train_seasons)
print("CV fold seasons (target seasons):", fold_seasons)

def make_time_class_weight(df, tau=TIME_DECAY_TAU, draw_boost=DRAW_CLASS_MULT):
    y = df["y"].astype(int).values

    base_w = compute_class_weight(
        class_weight="balanced",
        classes=np.array([0, 1, 2]),
        y=y
    )
    class_w = {
        i: float(w) * (draw_boost if i == 1 else 1.0)
        for i, w in enumerate(base_w)
    }

    max_date = df["Date"].max()
    dt = (max_date - df["Date"]).dt.days.values
    w_time = np.exp(-dt / tau)
    w_class = np.array([class_w[int(v)] for v in y])

    return w_time * w_class

#  GPU
try:
    import torch
    device = "cuda" if torch.cuda.is_available() else "cpu"
except Exception:
    device = "cpu"

print("XGB device:", device)


def objective(trial):
    max_depth = trial.suggest_int("max_depth", 3, 4)
    min_child_weight = trial.suggest_int("min_child_weight", 5, 7)
    reg_alpha = trial.suggest_float("reg_alpha", 0.8, 2.0)
    reg_lambda = trial.suggest_float("reg_lambda", 2.0, 4.0)
    gamma = trial.suggest_float("gamma", 0.0, 0.5)
    subsample = trial.suggest_float("subsample", 0.7, 0.9)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.7, 0.9)

    fold_losses = []
    fold_best_iters = []

    for fs in fold_seasons:
        fs_idx = all_train_seasons.index(fs)

        hist_seasons = all_train_seasons[:fs_idx][-8:]
        hist_df = train_df[train_df["Season"].isin(hist_seasons)].copy()
        fold_df = train_df[train_df["Season"] == fs].copy()

        if len(hist_df) == 0 or len(fold_df) == 0:
            continue

        w_hist = make_time_class_weight(
            hist_df,
            tau=TIME_DECAY_TAU,
            draw_boost=DRAW_CLASS_MULT
        )

        model = xgb.XGBClassifier(
            n_estimators=800,
            learning_rate=0.05,
            max_depth=max_depth,
            min_child_weight=min_child_weight,
            gamma=gamma,
            reg_alpha=reg_alpha,
            reg_lambda=reg_lambda,
            subsample=subsample,
            colsample_bytree=colsample_bytree,
            objective="multi:softprob",
            num_class=3,
            eval_metric="mlogloss",
            tree_method="hist",
            device=device,          # if xgboost ,
            early_stopping_rounds=15,
            random_state=42,
        )

        model.fit(
            hist_df[feature_cols_xgb],
            hist_df["y"].astype(int),
            sample_weight=w_hist,
            eval_set=[(fold_df[feature_cols_xgb], fold_df["y"].astype(int))],
            verbose=False,
        )

        best_iter = (
            int(model.best_iteration)
            if hasattr(model, "best_iteration") and model.best_iteration is not None
            else model.get_params().get("n_estimators", 800)
        )

        proba_fold = model.predict_proba(fold_df[feature_cols_xgb])
        ll = log_loss(fold_df["y"], proba_fold, labels=[0, 1, 2])

        fold_losses.append(ll)
        fold_best_iters.append(best_iter)

 # fold()
    if not fold_losses:
        return 10.0

    mean_ll = float(np.mean(fold_losses))
    median_iter = float(np.median(fold_best_iters))

    trial.set_user_attr("median_best_iter", median_iter)
    return mean_ll

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=25)

print("Best trial:", study.best_trial.number)
print("Best value (cv mean logloss):", study.best_value)
print("Best params:", study.best_params)
print("Median best_iter of best trial:", study.best_trial.user_attrs["median_best_iter"])

best_overall_params = study.best_params
best_overall = study.best_value
best_overall_iters = int(study.best_trial.user_attrs["median_best_iter"])
best_n = int(best_overall_iters + 5)

print("\n== Optuna result ==")
print("best_overall_params:", best_overall_params)
print("best_overall (cv mean logloss):", best_overall)
print("best_overall_iters:", best_overall_iters)
print("==> best_n =", best_n)

[I 2025-12-15 15:13:25,568] A new study created in memory with name: no-name-2adeb226-3d84-4b89-83a7-8efb7e50ab55


XGB feature count: 120
All train seasons: ['2000/2001', '2001/2002', '2002/2003', '2003/2004', '2004/2005', '2005/2006', '2006/2007', '2007/2008', '2008/2009', '2009/2010', '2010/2011', '2011/2012', '2012/2013', '2013/2014', '2014/2015', '2015/2016', '2016/2017', '2017/2018', '2018/2019', '2019/2020', '2020/2021']
CV fold seasons (target seasons): ['2015/2016', '2016/2017', '2017/2018', '2018/2019', '2019/2020', '2020/2021']
XGB device: cuda


[I 2025-12-15 15:13:36,955] Trial 0 finished with value: 1.0182854349688006 and parameters: {'max_depth': 4, 'min_child_weight': 6, 'reg_alpha': 0.8395497738645505, 'reg_lambda': 2.223055163432705, 'gamma': 0.040125418880110675, 'subsample': 0.721755258343871, 'colsample_bytree': 0.8818579903579938}. Best is trial 0 with value: 1.0182854349688006.
[I 2025-12-15 15:13:45,327] Trial 1 finished with value: 1.020187063563738 and parameters: {'max_depth': 3, 'min_child_weight': 5, 'reg_alpha': 1.5778157199926826, 'reg_lambda': 2.5188666949365617, 'gamma': 0.2070593747444298, 'subsample': 0.7837454461059394, 'colsample_bytree': 0.7116894452295639}. Best is trial 0 with value: 1.0182854349688006.
[I 2025-12-15 15:13:54,331] Trial 2 finished with value: 1.0221782721126897 and parameters: {'max_depth': 4, 'min_child_weight': 5, 'reg_alpha': 0.957883410569316, 'reg_lambda': 2.47039659382137, 'gamma': 0.40739884221360456, 'subsample': 0.8657310377457114, 'colsample_bytree': 0.8844913955239555}. B

Best trial: 4
Best value (cv mean logloss): 1.016986437074465
Best params: {'max_depth': 4, 'min_child_weight': 7, 'reg_alpha': 0.9171164596785282, 'reg_lambda': 3.9593101843582716, 'gamma': 0.2537808233885571, 'subsample': 0.7349785646680412, 'colsample_bytree': 0.7784179577242174}
Median best_iter of best trial: 62.5

== Optuna result ==
best_overall_params: {'max_depth': 4, 'min_child_weight': 7, 'reg_alpha': 0.9171164596785282, 'reg_lambda': 3.9593101843582716, 'gamma': 0.2537808233885571, 'subsample': 0.7349785646680412, 'colsample_bytree': 0.7784179577242174}
best_overall (cv mean logloss): 1.016986437074465
best_overall_iters: 62
==> best_n = 67


In [21]:
recent_seasons = all_train_seasons[-8:]
train_recent = train_df[train_df["Season"].isin(recent_seasons)].copy()

print("Final train_recent seasons:", recent_seasons)
print("train_recent size:", len(train_recent))
print("val size:", len(val_df), "test size:", len(test_df))

# + sample_weight
w_train_recent = make_time_class_weight(
    train_recent,
    tau=TIME_DECAY_TAU,
    draw_boost=DRAW_CLASS_MULT
)

# from Optuna result
md  = best_overall_params["max_depth"]
mcw = best_overall_params["min_child_weight"]
ra  = best_overall_params["reg_alpha"]
rl  = best_overall_params["reg_lambda"]
gm  = best_overall_params["gamma"]
sub = best_overall_params["subsample"]
col = best_overall_params["colsample_bytree"]

# final XGB Model
xgb_final = xgb.XGBClassifier(
    n_estimators=best_n,
    learning_rate=0.05,
    max_depth=md,
    min_child_weight=mcw,
    gamma=gm,
    reg_alpha=ra,
    reg_lambda=rl,
    subsample=sub,
    colsample_bytree=col,
    objective="multi:softprob",
    num_class=3,
    eval_metric="mlogloss",
    tree_method="hist",
    device=device,      # if, device=...
    random_state=42,
)

xgb_final.fit(
    train_recent[feature_cols_xgb],
    train_recent["y"].astype(int),
    sample_weight=w_train_recent,
    verbose=False,
)

# full train/val/test 
proba_train_xgb = xgb_final.predict_proba(train_df[feature_cols_xgb])
proba_val_xgb   = xgb_final.predict_proba(val_df[feature_cols_xgb])
proba_test_xgb  = xgb_final.predict_proba(test_df[feature_cols_xgb])

# Calculate logloss + 
train_ll = log_loss(train_df["y"], proba_train_xgb, labels=[0, 1, 2])
val_ll   = log_loss(val_df["y"],   proba_val_xgb,   labels=[0, 1, 2])
test_ll  = log_loss(test_df["y"],  proba_test_xgb,  labels=[0, 1, 2])

gap_val  = val_ll  - train_ll
gap_test = test_ll - train_ll

print("\n=== XGB Performance ===")
print(f"Train LL: {train_ll:.4f}")
print(f"Val LL:   {val_ll:.4f} (gap={gap_val:+.4f})")
print(f"Test LL:  {test_ll:.4f} (gap={gap_test:+.4f})")

if gap_val > 0.10 or gap_test > 0.10:
    print("significant overfitting")
elif gap_val > 0.05 or gap_test > 0.05:
    print("midium overfitting")
else:
    print("low or no overfitting")

Final train_recent seasons: ['2013/2014', '2014/2015', '2015/2016', '2016/2017', '2017/2018', '2018/2019', '2019/2020', '2020/2021']
train_recent size: 3040
val size: 380 test size: 1140

=== XGB Performance ===
Train LL: 1.0321
Val LL:   1.0261 (gap=-0.0059)
Test LL:  1.0295 (gap=-0.0025)
low or no overfitting


In [22]:
# Final training on train_recent + val
train_recent = train_df[train_df["Season"].isin(recent_seasons)].copy()
w_train_recent = make_sample_weight(train_recent)

try:
    import torch
    device = "cuda" if torch.cuda.is_available() else "cpu"
except:
    device = "cpu"

xgb_tmp = xgb.XGBClassifier(
    n_estimators=best_n, learning_rate=0.03,
    max_depth=4, min_child_weight=5, gamma=0.5,
    reg_alpha=0.5, reg_lambda=2.0,
    subsample=0.8, colsample_bytree=0.8,
    objective="multi:softprob", num_class=3,
    eval_metric="mlogloss", tree_method="hist",
    device=device,
    random_state=42
)
xgb_tmp.fit(train_recent[feature_cols_xgb], train_recent["y"].astype(int),
            sample_weight=w_train_recent, verbose=False)

proba_val_xgb  = xgb_tmp.predict_proba(val_df[feature_cols_xgb])
proba_test_xgb = xgb_tmp.predict_proba(test_df[feature_cols_xgb])

print("XGB val logloss:", round(log_loss(val_df["y"], proba_val_xgb, labels=[0,1,2]), 4))
print("XGB test logloss:", round(log_loss(test_df["y"], proba_test_xgb, labels=[0,1,2]), 4))

# Final model on (train_recent + val)
trainval_df = pd.concat([train_recent, val_df], axis=0, ignore_index=True)
w_trainval = np.concatenate([w_train_recent, make_sample_weight(val_df)])

xgb_final = xgb.XGBClassifier(
    n_estimators=best_n, learning_rate=0.03,
    max_depth=4, min_child_weight=5, gamma=0.5,
    reg_alpha=0.5, reg_lambda=2.0,
    subsample=0.8, colsample_bytree=0.8,
    objective="multi:softprob", num_class=3,
    eval_metric="mlogloss", tree_method="hist",
    device=device,
    random_state=42
)
xgb_final.fit(trainval_df[feature_cols_xgb], trainval_df["y"].astype(int),
              sample_weight=w_trainval, verbose=False)

proba_train_xgb = xgb_final.predict_proba(train_df[feature_cols_xgb])
proba_val_xgb  = xgb_final.predict_proba(val_df[feature_cols_xgb])
proba_test_xgb = xgb_final.predict_proba(test_df[feature_cols_xgb])

print(f"\nXGB Final model trained on train_recent + val")

XGB val logloss: 1.027
XGB test logloss: 1.0323

XGB Final model trained on train_recent + val


In [23]:
# save xgb oof
xgb_base_params = xgb_final.get_params()

xgb_oof_cfg = {
    "xgb_base_params": xgb_base_params,
    "best_n": best_n,
}

with open("xgb_oof_config.pkl", "wb") as f:
    pickle.dump(xgb_oof_cfg, f)

print(f"XGB OOF has been save as xgb_oof_config.pkl")


XGB OOF has been save as xgb_oof_config.pkl


### Transformer

In [24]:
# Transformer v3
warnings.filterwarnings('ignore')
try:
    import torch
    import torch.nn as nn
    import torch.optim as optim
    from torch.utils.data import Dataset, DataLoader
    from torch.cuda.amp import GradScaler, autocast
    TORCH_OK = True
except Exception as e:
    print(" torch not available:", e)
    TORCH_OK = True

if TORCH_OK:
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Transformer device:", DEVICE)

class EnhancedMatchTransformer(nn.Module):
    """
    simplified version
    """
    def __init__(self, seq_len=5, feat_dim=20, match_feat_dim=3, 
                 d_model=64, nhead=4, num_layers=2, 
                 ff_dim=128, num_classes=3, dropout=0.3):
        super().__init__()
        
        self.d_model = d_model
        
        # input projection

        self.input_proj = nn.Sequential(
            nn.Linear(feat_dim, d_model),
            nn.LayerNorm(d_model),
            nn.ReLU(),  # using ReLU, more stable
            nn.Dropout(dropout),
        )
        
        # position c
        self.pos_emb = nn.Embedding(seq_len, d_model)
        
        # simple encoder
        enc_layer = nn.TransformerEncoderLayer(
            d_model=d_model, 
            nhead=nhead,
            dim_feedforward=ff_dim, 
            dropout=dropout,
            batch_first=True, 
            activation="relu"  # ReLU
        )
        self.encoder = nn.TransformerEncoder(enc_layer, num_layers=num_layers)
        
        # using attention pool instead of using last time step
        self.attn_pool = nn.Sequential(
            nn.Linear(d_model, 1),
            nn.Softmax(dim=1)
        )
        
        # match projection
        self.match_proj = nn.Sequential(
            nn.Linear(match_feat_dim, d_model // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
        )
        
        # home_summary + away_summary + match_feat
        final_dim = d_model * 2 + d_model // 2
        
        self.classifier = nn.Sequential(
            nn.LayerNorm(final_dim),
            nn.Linear(final_dim, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, num_classes)
        )
        
        # Initialize
        self._init_weights()
    
    def _init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight, gain=0.5)  # gain
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
            elif isinstance(m, nn.Embedding):
                nn.init.normal_(m.weight, std=0.01)
    
    def encode_team(self, seq):
        """
        encoder for single team
        """
        B, L, _ = seq.shape
        
        # projection + position code
        x = self.input_proj(seq)
        pos_idx = torch.arange(L, device=seq.device).unsqueeze(0).expand(B, -1)
        x = x + self.pos_emb(pos_idx)
        
        # Transformer Encoding
        z = self.encoder(x)
        
        attn_weights = self.attn_pool(z)  # [B, L, 1]
        summary = (z * attn_weights).sum(dim=1)  # [B, d_model]
        
        return summary
    
    def forward(self, home_seq, away_seq, match_feat):
        # homeaway
        home_summary = self.encode_team(home_seq)
        away_summary = self.encode_team(away_seq)
        
        match_emb = self.match_proj(match_feat)
        
        # combine all features
        combined = torch.cat([home_summary, away_summary, match_emb], dim=1)
        
        return self.classifier(combined)

def build_team_sequences_fixed(df, feature_cols, seq_len=5):
    """
    
    """
    df = df.sort_values("Date").copy().reset_index(drop=True)
    
    safe_features = [f for f in feature_cols if f in df.columns]
    FEAT_DIM = len(safe_features)
    
    print(f"Sequence feature number: {FEAT_DIM}")
    print(f"Sequence feature : {safe_features[:10]}...")  # only print first 10th
    
    zero_token = np.zeros(FEAT_DIM, dtype=np.float32)
    
    # maintain past sequence for each team
    # structure: {team: deque([token1, token2, ...])}
    team_history = defaultdict(lambda: deque([zero_token.copy() for _ in range(seq_len)], maxlen=seq_len))
    
    home_seqs = []
    away_seqs = []
    match_features = []
    
    for idx, match in df.iterrows():
        h_team = match["HomeTeam"]
        a_team = match["AwayTeam"]
        
        home_seq = np.array(list(team_history[h_team]), dtype=np.float32)
        away_seq = np.array(list(team_history[a_team]), dtype=np.float32)
        
        home_seqs.append(home_seq)
        away_seqs.append(away_seq)
        
        # 
        h2h_home = match.get("h2h_home_rate", 0.45)
        h2h_draw = match.get("h2h_draw_rate", 0.25)
 # rest_diff → clip → [-2,2] around
        rest_val = match.get("rest_diff", 0.0)
        rest_val = 0.0 if pd.isna(rest_val) else float(rest_val)
        rest_val = np.clip(rest_val, -14.0, 14.0) / 7.0   # [-2,2] interval

        match_feat = np.array(
            [
                float(h2h_home) if not pd.isna(h2h_home) else 0.45,
                float(h2h_draw) if not pd.isna(h2h_draw) else 0.25,
                rest_val,
            ],
            dtype=np.float32,
        )
        match_features.append(match_feat)

        # 
        home_token = []
        away_token = []
        
        for f in safe_features:
            v = match.get(f, 0.0)
            v = 0.0 if pd.isna(v) else float(v)
        
            #  xxx_home / xxx_away
            if f.endswith("_home"):
                f_away = f.replace("_home", "_away")
                v_home = v
                v_away = match.get(f_away, v)  #  v
                v_away = 0.0 if pd.isna(v_away) else float(v_away)
        
            elif f.endswith("_away"):
                f_home = f.replace("_away", "_home")
                v_away = v
                v_home = match.get(f_home, v)
                v_home = 0.0 if pd.isna(v_home) else float(v_home)
        
 # xxx_diff(home)
            elif f.endswith("_diff"):
                v_home = v
                v_away = -v  # away
        
            else:
                v_home = v
                v_away = v
        
            home_token.append(float(v_home))
            away_token.append(float(v_away))

        home_token = np.array(home_token, dtype=np.float32)
        away_token = np.array(away_token, dtype=np.float32)
        
        team_history[h_team].append(home_token)
        team_history[a_team].append(away_token)
    
    df["home_form_seq"] = home_seqs
    df["away_form_seq"] = away_seqs
    df["match_features"] = match_features
    
    return df, FEAT_DIM

# 3: Dataset -> EnhancedMatchDataset

class EnhancedMatchDataset(Dataset):
    """
    """
    def __init__(self, df, class_weights=None):
 # columns
        home_np = np.stack(df["home_form_seq"].values).astype(np.float32)
        away_np = np.stack(df["away_form_seq"].values).astype(np.float32)
        match_np = np.stack(df["match_features"].values).astype(np.float32)
        
        # NaN
        home_np = np.nan_to_num(home_np, nan=0.0, posinf=0.0, neginf=0.0)
        away_np = np.nan_to_num(away_np, nan=0.0, posinf=0.0, neginf=0.0)
        match_np = np.nan_to_num(match_np, nan=0.0, posinf=0.0, neginf=0.0)
        
        # 
        print(f"  home_seq range: [{home_np.min():.2f}, {home_np.max():.2f}]")
        print(f"  away_seq range: [{away_np.min():.2f}, {away_np.max():.2f}]")
        print(f"  match_feat range: [{match_np.min():.2f}, {match_np.max():.2f}]")
        
        self.home_seq = torch.tensor(home_np, dtype=torch.float32)
        self.away_seq = torch.tensor(away_np, dtype=torch.float32)
        self.match_feat = torch.tensor(match_np, dtype=torch.float32)
        self.y = torch.tensor(df["y"].astype(int).values, dtype=torch.long)
        
        # 
        if class_weights is not None:
            self.sample_weights = torch.tensor(
                [class_weights[yi] for yi in self.y.numpy()], 
                dtype=torch.float32
            )
        else:
            self.sample_weights = torch.ones(len(self.y), dtype=torch.float32)

    def __len__(self): 
        return len(self.y)
    
    def __getitem__(self, idx):
        return (self.home_seq[idx], self.away_seq[idx], 
                self.match_feat[idx], self.y[idx], self.sample_weights[idx])

# 4: -> train_enhanced_tf

def train_enhanced_tf(model, train_loader, val_loader, 
                      class_weights, epochs=50, patience=8):
    """

    2. use FocalLoss
    """

 # , draw class(1)
    weight_tensor = torch.tensor(
        [class_weights.get(i, 1.0) for i in range(3)], 
        dtype=torch.float32
    ).to(DEVICE)

 # use FocalLoss CrossEntropy
    criterion = nn.CrossEntropyLoss(weight=weight_tensor)

    #  & Learning rate
    optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-3)

    # 
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.5, patience=3, min_lr=1e-5
    )

    scaler = GradScaler(enabled=(DEVICE.type == "cuda"))

    best_ll = float("inf")
    best_state = None
    bad = 0

    for ep in range(1, epochs + 1):
        model.train()
        tr_loss = 0.0
        tr_correct = 0
        tr_total = 0
        all_preds = []

        for home_seq, away_seq, match_feat, yb, sw in train_loader:
            home_seq = home_seq.to(DEVICE)
            away_seq = away_seq.to(DEVICE)
            match_feat = match_feat.to(DEVICE)
            yb = yb.to(DEVICE)
            sw = sw.to(DEVICE)

            home_seq = torch.nan_to_num(home_seq, nan=0.0)
            away_seq = torch.nan_to_num(away_seq, nan=0.0)
            match_feat = torch.nan_to_num(match_feat, nan=0.0)

            optimizer.zero_grad(set_to_none=True)

            with autocast(enabled=(DEVICE.type == "cuda")):
                logits = model(home_seq, away_seq, match_feat)

                if torch.isnan(logits).any() or torch.isinf(logits).any():
                    print(f" Epoch {ep}: NaN/Inf in logits, skipping batch")
                    continue

                loss = criterion(logits, yb)

            if torch.isnan(loss):
                print(f" Epoch {ep}: NaN loss, skipping batch")
                continue

            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(optimizer)
            scaler.update()

            tr_loss += loss.item() * yb.size(0)
            preds = logits.argmax(dim=1)
            tr_correct += (preds == yb).sum().item()
            tr_total += yb.size(0)
            all_preds.extend(preds.cpu().numpy())

        tr_loss /= len(train_loader.dataset)
        tr_acc = tr_correct / tr_total
        train_pred_dist = np.bincount(all_preds, minlength=3) / len(all_preds)

        model.eval()
        probs, ys, val_preds = [], [], []
        with torch.no_grad():
            for home_seq, away_seq, match_feat, yb, _ in val_loader:
                home_seq = home_seq.to(DEVICE)
                away_seq = away_seq.to(DEVICE)
                match_feat = match_feat.to(DEVICE)
                home_seq = torch.nan_to_num(home_seq, nan=0.0)
                away_seq = torch.nan_to_num(away_seq, nan=0.0)
                match_feat = torch.nan_to_num(match_feat, nan=0.0)

                logits = model(home_seq, away_seq, match_feat)
                prob = torch.softmax(logits, dim=1)
                prob = torch.clamp(prob, 1e-7, 1 - 1e-7)
                probs.append(prob.cpu().numpy())
                ys.append(yb.numpy())
                val_preds.extend(logits.argmax(dim=1).cpu().numpy())

        proba_val = np.vstack(probs)
        y_val = np.concatenate(ys)
        proba_val = proba_val / proba_val.sum(axis=1, keepdims=True)
        va_ll = log_loss(y_val, proba_val, labels=[0, 1, 2])
        val_pred_dist = np.bincount(val_preds, minlength=3) / len(val_preds)

        scheduler.step(va_ll)

        if ep == 1 or ep % 3 == 0:
            lr = optimizer.param_groups[0]['lr']
            print(f"Epoch {ep:02d} | lr={lr:.6f} | train_loss={tr_loss:.4f} | train_acc={tr_acc:.3f}")
            print(f"         | val_logloss={va_ll:.4f}")
            print(f"         | train_pred_dist: H={train_pred_dist[0]:.2f} D={train_pred_dist[1]:.2f} A={train_pred_dist[2]:.2f}")
            print(f"         | val_pred_dist:   H={val_pred_dist[0]:.2f} D={val_pred_dist[1]:.2f} A={val_pred_dist[2]:.2f}")

            if max(train_pred_dist) > 0.8:
                print(f"Warning: Training set predictions are too focused on categories {np.argmax(train_pred_dist)}")

        if va_ll < best_ll - 1e-4:
            best_ll = va_ll
            best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
            bad = 0
        else:
            bad += 1
            if bad >= patience:
                print(f"Early stopping at epoch {ep}")
                break

    if best_state:
        model.load_state_dict(best_state)

    print(f"\n Best val logloss: {best_ll:.4f}")
    return model

def predict_enhanced_tf(model, loader):
    """
    prediction function
    """
    model.eval()
    probs = []
    
    with torch.no_grad():
        for home_seq, away_seq, match_feat, _, _ in loader:
            home_seq = home_seq.to(DEVICE)
            away_seq = away_seq.to(DEVICE)
            match_feat = match_feat.to(DEVICE)
            
            home_seq = torch.nan_to_num(home_seq, nan=0.0)
            away_seq = torch.nan_to_num(away_seq, nan=0.0)
            match_feat = torch.nan_to_num(match_feat, nan=0.0)
            
            logits = model(home_seq, away_seq, match_feat)
            prob = torch.softmax(logits, dim=1)
            prob = torch.clamp(prob, 1e-7, 1 - 1e-7)
            probs.append(prob.cpu().numpy())
    
    proba = np.vstack(probs)
    proba = proba / proba.sum(axis=1, keepdims=True)
    return proba

def standardize_sequences(train_df, val_df, test_df=None):
    """
    only use statistics from training set
    """
    # 
    flat_train_home = np.vstack(train_df["home_form_seq"].values).astype(np.float32)
    flat_train_home = np.nan_to_num(flat_train_home, nan=0.0)
    
    mu_seq = flat_train_home.mean(axis=0)
    sd_seq = flat_train_home.std(axis=0)
    
    mask_seq = sd_seq > 1e-3
    sd_seq_safe = sd_seq.copy()
    sd_seq_safe[~mask_seq] = 1.0  # 
    
    flat_match = np.vstack(train_df["match_features"].values).astype(np.float32)
    flat_match = np.nan_to_num(flat_match, nan=0.0)
    mu_match = flat_match.mean(axis=0)
    sd_match = flat_match.std(axis=0)
    mask_match = sd_match > 1e-3
    sd_match_safe = sd_match.copy()
    sd_match_safe[~mask_match] = 1.0
    
    def norm_seq(seq):
        seq = np.nan_to_num(seq.astype(np.float32), nan=0.0)
        return (seq - mu_seq) / sd_seq_safe
    
    def norm_match(feat):
        feat = np.nan_to_num(feat.astype(np.float32), nan=0.0)
        return (feat - mu_match) / sd_match_safe

    for dfx in [train_df, val_df] + ([test_df] if test_df is not None else []):
        dfx["home_form_seq"] = dfx["home_form_seq"].apply(norm_seq)
        dfx["away_form_seq"] = dfx["away_form_seq"].apply(norm_seq)
        dfx["match_features"] = dfx["match_features"].apply(norm_match)
    
    return train_df, val_df, test_df, mu_seq, sd_seq, mu_match, sd_match

# home: -> run_enhanced_tf

# home: -> run_enhanced_tf

def run_enhanced_tf(cleaned_df, train_df, val_df, test_df, 
                    tf_token_features, seq_len=5, epochs=50):
    
    print(f" Running EnhancedMatchTransformer ")
    
    # filt from tf_token_features instead of using feature_cols directly
    name_map = {f: str(f).lower() for f in tf_token_features}

    safe_features = []
    for f in tf_token_features:
        name = name_map[f]

        # skip useless features
        if f not in cleaned_df.columns:
            continue

        # features prior to save :form / elo / position / points / streak / pm
        if (
            ("pm" in name) or
            ("form" in name) or
            ("elo" in name) or
            ("position" in name) or
            ("points" in name) or
            ("l10" in name) or
            ("win_streak" in name) or
            ("unbeaten" in name) or
            ("draw" in name) or
            ("xg" in name) or
            ("h2h" in name) or
            ("rest" in name) or
            ("momentum" in name) or
            ("attack" in name) or
            ("defense" in name)
        ):
            safe_features.append(f)

 # if the number is not enough, use tf_token_features
    if len(safe_features) < 10:
        print(f"lack of secure features, all using tf_token_features")
        safe_features = [f for f in tf_token_features if f in cleaned_df.columns]

    print(f"using {len(safe_features)} of sequence features")

    # Constructing the sequence: 
    # using the subset of lines from train/val/test 
    # that correspond to cleaned_df
    print(f"build team past sequence")
    
    # using rows from cleaned_df , keep same as train/val/test 
    all_idx = pd.Index(train_df.index).union(val_df.index).union(test_df.index)
    all_df = cleaned_df.loc[all_idx].sort_values("Date")

    # build past sequence
    all_df_with_seq, feat_dim = build_team_sequences_fixed(
        all_df, safe_features, seq_len=seq_len
    )
    
    # sequence data are feed back to their sets
    train_df = train_df.copy()
    val_df = val_df.copy()
    test_df = test_df.copy()
    
    seq_cols = ["home_form_seq", "away_form_seq", "match_features"]
    for col in seq_cols:
        if col in train_df.columns:
            train_df.drop(columns=[col], inplace=True)
        if col in val_df.columns:
            val_df.drop(columns=[col], inplace=True)
        if col in test_df.columns:
            test_df.drop(columns=[col], inplace=True)
    
    seq_data = all_df_with_seq[seq_cols]
    train_df = train_df.join(seq_data, how="left")
    val_df   = val_df.join(seq_data, how="left")
    test_df  = test_df.join(seq_data, how="left")
    
    zero_seq = np.zeros((seq_len, feat_dim), dtype=np.float32)
    zero_match = np.zeros(3, dtype=np.float32)
    
    for dfx in [train_df, val_df, test_df]:
        dfx["home_form_seq"] = dfx["home_form_seq"].apply(
            lambda x: x if isinstance(x, np.ndarray) else zero_seq.copy()
        )
        dfx["away_form_seq"] = dfx["away_form_seq"].apply(
            lambda x: x if isinstance(x, np.ndarray) else zero_seq.copy()
        )
        dfx["match_features"] = dfx["match_features"].apply(
            lambda x: x if isinstance(x, np.ndarray) else zero_match.copy()
        )
    
    # standardization
    print(f"standardization...")
    train_df, val_df, test_df, mu_seq, sd_seq, mu_match, sd_match = \
        standardize_sequences(train_df, val_df, test_df)
    
    # 
    print(f"calculated class weights")
    y_train = train_df["y"].astype(int).values
    class_counts = np.bincount(y_train, minlength=3)
    total = class_counts.sum()
    
    class_weights = {
        0: total / (3 * class_counts[0])*1,           # Home
        1: total / (3 * class_counts[1]) * 1.1,     # Draw ()
        2: total / (3 * class_counts[2]),           # Away
    }
    
    print(f"  class distribution: H={class_counts[0]}, D={class_counts[1]}, A={class_counts[2]}")
    print(f"  class weight: H={class_weights[0]:.2f}, D={class_weights[1]:.2f}, A={class_weights[2]:.2f}")

    # 5 Oversampling Enhancement draw class(1)
    try:
        X_sample = np.vstack(train_df["home_form_seq"].values)[:, :5]  # 5
        y_sample = train_df["y"].values

        sm = SMOTE(sampling_strategy={1: int(class_counts[1] * 1.3)}, random_state=42)
        X_res, y_res = sm.fit_resample(X_sample, y_sample)

    # random sampling train_df row, SMOTE 
        selected_indices = np.random.choice(train_df.index, size=len(X_res), replace=True)
        train_df = train_df.loc[selected_indices].copy()
        train_df["y"] = y_res
        print(f"  Oversampled train size: {len(train_df)}")
    except Exception as e:
        print(" SMOTE oversampling failure:", str(e))
    
    # DataLoader
    print("create dataloader")
    train_ds = EnhancedMatchDataset(train_df, class_weights)
    val_ds   = EnhancedMatchDataset(val_df,   class_weights)
    test_ds  = EnhancedMatchDataset(test_df,  class_weights)
    
    train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
    val_loader   = DataLoader(val_ds,   batch_size=64, shuffle=False)
    test_loader  = DataLoader(test_ds,  batch_size=64, shuffle=False)
    
    # create model

    model = EnhancedMatchTransformer(
        seq_len=seq_len,
        feat_dim=feat_dim,
        match_feat_dim=3,
        d_model=64,
        nhead=4,
        num_layers=2,
        ff_dim=128,
        dropout=0.3
    ).to(DEVICE)
    
    n_params = sum(p.numel() for p in model.parameters())
    print(f" number of model features {n_params:,}")
    
    # Train
    print(f" Training start ")
    model = train_enhanced_tf(
        model, train_loader, val_loader, 
        class_weights, epochs=epochs, patience=8
    )
    
    # forecast
    proba_train_tf = predict_enhanced_tf(model, train_loader)
    proba_val_tf   = predict_enhanced_tf(model, val_loader)
    proba_test_tf  = predict_enhanced_tf(model, test_loader)
    
    # Evaluation
    val_ll = log_loss(val_df["y"].values, proba_val_tf, labels=[0, 1, 2])
    test_ll = log_loss(test_df["y"].values, proba_test_tf, labels=[0, 1, 2])
    
    val_pred  = proba_val_tf.argmax(axis=1)
    test_pred = proba_test_tf.argmax(axis=1)
    
    val_dist  = np.bincount(val_pred,  minlength=3) / len(val_pred)
    test_dist = np.bincount(test_pred, minlength=3) / len(test_pred)
    
    print(f"Result of transformer")
    print(f"Val  logloss: {val_ll:.4f}")
    print(f"Test logloss: {test_ll:.4f}")
    print(f"Val  prediction distribution: H={val_dist[0]:.3f} D={val_dist[1]:.3f} A={val_dist[2]:.3f}")
    print(f"Test prediction distribution: H={test_dist[0]:.3f} D={test_dist[1]:.3f} A={test_dist[2]:.3f}")
    # Save Transformer v3 ckpt
    TF_CKPT_PATH = "enhanced_tf_v3.ckpt"   #  v2,
    
    tf_ckpt = {
        # model weights
        "model_state": model.state_dict(),
        "seq_len": seq_len,
        "feat_dim": feat_dim,
        "match_feat_dim": 3,
        "d_model": 64,
        "nhead": 4,
        "num_layers": 2,
        "ff_dim": 128,
        "dropout": 0.3,
        "token_features": safe_features,
        "mu_seq":   mu_seq,
        "sd_seq":   sd_seq,
        "mu_match": mu_match,
        "sd_match": sd_match,
    }

    torch.save(tf_ckpt, TF_CKPT_PATH)
    print(f"Transformer v3 has been save to {TF_CKPT_PATH}")
    return model, proba_train_tf, proba_val_tf, proba_test_tf


Transformer device: cuda


In [25]:
class FocalLoss(nn.Module):
    def __init__(self, alpha=None, gamma=2.0, reduction='mean'):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction
        self.ce = nn.CrossEntropyLoss(weight=alpha, reduction='none')

    def forward(self, logits, target):
        ce_loss = self.ce(logits, target)  # [B]
        pt = torch.exp(-ce_loss)
        focal = (1 - pt) ** self.gamma * ce_loss
        return focal.mean() if self.reduction == 'mean' else focal
    
if TORCH_OK:
 # tf_token_features transformer feature columns
    model_tf, proba_train_tf, proba_val_tf, proba_test_tf = run_enhanced_tf(
        cleaned_df=cleaned_df,
        train_df=train_df,
        val_df=val_df,
        test_df=test_df,
        tf_token_features=tf_token_features,
        seq_len=10,      # or 8 / 10 
        epochs=100       #  50 
    )
else:
 # no torch fallback, meta 
    proba_train_tf = np.full((len(train_df), 3), 1/3, dtype=float)
    proba_val_tf   = np.full((len(val_df),   3), 1/3, dtype=float)
    proba_test_tf  = np.full((len(test_df),  3), 1/3, dtype=float)

 Running EnhancedMatchTransformer 
using 101 of sequence features
build team past sequence
Sequence feature number: 101
Sequence feature : ['form_home', 'form_away', 'form_diff', 'form_home_v2', 'form_away_v2', 'form_diff_v2', 'home_home_form', 'away_away_form', 'win_streak_home', 'win_streak_away']...
standardization...
calculated class weights
  class distribution: H=3673, D=1997, A=2310
  class weight: H=0.72, D=1.47, A=1.15
 SMOTE oversampling failure: Found input variables with inconsistent numbers of samples: [79800, 7980]
create dataloader
  home_seq range: [-45.94, 45.94]
  away_seq range: [-45.94, 45.94]
  match_feat range: [-5.73, 5.75]
  home_seq range: [-7.65, 7.17]
  away_seq range: [-7.65, 7.17]
  match_feat range: [-5.73, 5.75]
  home_seq range: [-62.47, 62.47]
  away_seq range: [-62.47, 62.47]
  match_feat range: [-5.73, 5.75]
 number of model features 95,748
 Training start 
Epoch 01 | lr=0.001000 | train_loss=1.0890 | train_acc=0.394
         | val_logloss=1.0487
    

### Collecting features from base models

In [26]:
# XGB (base_params + best_n)
with open("xgb_oof_config.pkl", "rb") as f:
    xgb_cfg = pickle.load(f)

XGB_BASE_PARAMS = xgb_cfg["xgb_base_params"]
BEST_N = xgb_cfg.get("best_n", XGB_BASE_PARAMS.get("n_estimators", 800))

print(f" load xgb oof config from xgb_oof_config.pkl")
print("   BEST_N =", BEST_N)


 load xgb oof config from xgb_oof_config.pkl
   BEST_N = 67


In [27]:
params = XGB_BASE_PARAMS.copy()

# 
params.pop("n_estimators", None)
lr = params.pop("learning_rate", 0.03)

xgb_final = xgb.XGBClassifier(
    **params,
    n_estimators=BEST_N,
    learning_rate=lr,
)

# (if make_sample_weight)
sw_train =make_sample_weight(train_df) if "make_sample_weight" in globals() else None

X_tr = train_df[feature_cols_xgb].to_numpy(dtype=np.float32)
y_tr = train_df["y"].astype(int).to_numpy()

xgb_final.fit(
    X_tr,
    y_tr,
    sample_weight=sw_train,
    verbose=False,
)

if "proba_train_xgb" not in globals():
    proba_train_xgb = xgb_final.predict_proba(
        train_df[feature_cols_xgb].to_numpy(dtype=np.float32)
    )

if "proba_val_xgb" not in globals():
    proba_val_xgb = xgb_final.predict_proba(
        val_df[feature_cols_xgb].to_numpy(dtype=np.float32)
    )

if "proba_test_xgb" not in globals():
    proba_test_xgb = xgb_final.predict_proba(
        test_df[feature_cols_xgb].to_numpy(dtype=np.float32)
    )

print("XGB proba shapes:",
      "train", proba_train_xgb.shape,
      "val", proba_val_xgb.shape,
      "test", proba_test_xgb.shape)


XGB proba shapes: train (7980, 3) val (380, 3) test (1140, 3)


In [28]:
import pickle
from sklearn.metrics import log_loss

# Bayes-DC posterior 
if "attack_s" not in globals():
    with open("bayes_dc_global.ckpt", "rb") as f:
        dc_bayes = pickle.load(f)

    tmap        = dc_bayes["tmap"]
    attack_s    = dc_bayes["attack_s"]
    defense_s   = dc_bayes["defense_s"]
    home_adv_s  = dc_bayes["home_adv_s"]
    rho_s       = dc_bayes["rho_s"]
    beta_h_s    = dc_bayes["beta_h_s"]
    beta_a_s    = dc_bayes["beta_a_s"]
    dc_feature_cols = dc_bayes["dc_feature_cols"]
    DECAY_LAMBDA    = dc_bayes["DECAY_LAMBDA"]
    dc_scaler       = dc_bayes["dc_scaler"]

    print("✅ bayes_dc_global.ckpt recovery Bayes-DC posterior")

X_dc_train_full = dc_scaler.transform(
    train_df[dc_feature_cols].copy().fillna(0.0)
)
X_dc_val = dc_scaler.transform(
    val_df[dc_feature_cols].copy().fillna(0.0)
)
X_dc_test = dc_scaler.transform(
    test_df[dc_feature_cols].copy().fillna(0.0)
)

# 
if "proba_train_dc" not in globals():
    proba_train_dc, scorelines_train = bayes_dc_predict_full(
        train_df,
        X_dc_train_full,
        attack_s, defense_s,
        home_adv_s, rho_s,
        beta_h_s, beta_a_s,
        tmap
    )

if "proba_val_dc" not in globals():
    proba_val_dc, scorelines_val = bayes_dc_predict_full(
        val_df,
        X_dc_val,
        attack_s, defense_s,
        home_adv_s, rho_s,
        beta_h_s, beta_a_s,
        tmap
    )

if "proba_test_dc" not in globals():
    proba_test_dc, scorelines_test = bayes_dc_predict_full(
        test_df,
        X_dc_test,
        attack_s, defense_s,
        home_adv_s, rho_s,
        beta_h_s, beta_a_s,
        tmap
    )

print("Bayes DC val logloss:",
      round(log_loss(val_df["y"], proba_val_dc, labels=[0,1,2]), 4))
print("Bayes DC test logloss:",
      round(log_loss(test_df["y"], proba_test_dc, labels=[0,1,2]), 4))


Bayes DC val logloss: 0.9729
Bayes DC test logloss: 1.001


In [29]:
from torch.utils.data import Dataset, DataLoader
if "proba_train_xgb" not in globals():
    proba_train_xgb = xgb_final.predict_proba(train_df[feature_cols_xgb])

if "proba_train_dc" not in globals():
    proba_train_dc, _ = bayes_dc_predict_full(train_df, attack_s, defense_s, home_adv_s, rho_s, tmap)

if TORCH_OK:
    if "proba_train_tf" not in globals():
        train_loader_full = DataLoader(EnhancedMatchDataset(train_df), batch_size=64, shuffle=False, pin_memory=True)
        proba_train_tf = predict_tf(model_tf, train_loader_full)
else:
    proba_train_tf = np.full((len(train_df), 3), 1/3)
    proba_val_tf = np.full((len(val_df), 3), 1/3)
    proba_test_tf = np.full((len(test_df), 3), 1/3)

print("Prob shapes:")
print(f"  XGB:  train={proba_train_xgb.shape}, val={proba_val_xgb.shape}, test={proba_test_xgb.shape}")
print(f"  DC:   train={proba_train_dc.shape}, val={proba_val_dc.shape}, test={proba_test_dc.shape}")
print(f"  TF:   train={proba_train_tf.shape}, val={proba_val_tf.shape}, test={proba_test_tf.shape}")

Prob shapes:
  XGB:  train=(7980, 3), val=(380, 3), test=(1140, 3)
  DC:   train=(7980, 3), val=(380, 3), test=(1140, 3)
  TF:   train=(7980, 3), val=(380, 3), test=(1140, 3)


### OOF stacking

In [30]:
def predict_tf(model, loader):
    model.eval()
    probs = []
    with torch.no_grad():
        for hs, asq, mfeat, _ in loader:
            hs = hs.to(DEVICE)
            asq = asq.to(DEVICE)
            mfeat = mfeat.to(DEVICE)
            logits = model(hs, asq, mfeat)
            prob = torch.softmax(logits, dim=1)
            prob = torch.clamp(prob, 1e-7, 1 - 1e-7)
            probs.append(prob.cpu().numpy())
    proba = np.vstack(probs)
    proba = proba / proba.sum(axis=1, keepdims=True)
    return proba


In [31]:
import torch

def load_enhanced_tf_checkpoint(path="enhanced_tf_v3.ckpt", device=None):
    """
    """
    if device is None:
        device = DEVICE

    ckpt = torch.load(path, map_location=device, weights_only=False)

 # from ckpt 
    seq_len         = ckpt.get("seq_len", 5)
    feat_dim        = ckpt.get("feat_dim")
    match_feat_dim  = ckpt.get("match_feat_dim", 3)
    d_model         = ckpt.get("d_model", 64)
    nhead           = ckpt.get("nhead", 4)
    num_layers      = ckpt.get("num_layers", 2)
    ff_dim          = ckpt.get("ff_dim", 128)
    dropout         = ckpt.get("dropout", 0.3)

    # 
    model = EnhancedMatchTransformer(
        seq_len=seq_len,
        feat_dim=feat_dim,
        match_feat_dim=match_feat_dim,
        d_model=d_model,
        nhead=nhead,
        num_layers=num_layers,
        ff_dim=ff_dim,
        dropout=dropout,
    ).to(device)

    model.load_state_dict(ckpt["model_state"])
    model.eval()

 # ✅: token_features, all_token_features
    token_features = ckpt.get("token_features", ckpt.get("all_token_features", []))

    mu_seq   = ckpt["mu_seq"]
    sd_seq   = ckpt["sd_seq"]
    mu_match = ckpt["mu_match"]
    sd_match = ckpt["sd_match"]

    return model, token_features, mu_seq, sd_seq, mu_match, sd_match


In [32]:
# TF sequence( OOF + final TF )

SEQ_LEN_TF = 5   # sum run_enhanced_tf  seq_len 

# TF token ,if tf_token_features ;
safe_features_tf = [f for f in tf_token_features if f in cleaned_df.columns]

print("TF safe features:", len(safe_features_tf))

# cleaned_df 
cleaned_with_seq, FORM_FEATURE_DIM_TF = build_team_sequences_fixed(
    cleaned_df, safe_features_tf, seq_len=SEQ_LEN_TF
)

seq_cols_tf = ["home_form_seq", "away_form_seq", "match_features"]
tf_seq_store = cleaned_with_seq[seq_cols_tf]

#  join round train_df / val_df / test_df
for name in ["train_df", "val_df", "test_df"]:
    df = globals()[name]
    df = df.drop(columns=seq_cols_tf, errors="ignore")
 # by index connection( df cleaned_df)
    df = df.join(tf_seq_store, how="left")
    globals()[name] = df

# train_recent, OOF 
train_recent = train_df[train_df["Season"].isin(recent_seasons)].copy()

print("TF sequence construction complete: FORM_FEATURE_DIM_TF =", FORM_FEATURE_DIM_TF)
print("Does train_recent have home_form_seq?", "home_form_seq" in train_recent.columns)

TF safe features: 120
Sequence feature number: 120
Sequence feature : ['form_home', 'form_away', 'form_diff', 'form_home_v2', 'form_away_v2', 'form_diff_v2', 'home_home_form', 'away_away_form', 'win_streak_home', 'win_streak_away']...
TF sequence construction complete: FORM_FEATURE_DIM_TF = 120
Does train_recent have home_form_seq? True


In [33]:
# Draw specialist

def fit_draw_xgb_fold(train_fold, feature_cols_draw):
    """
    - target: 1 = Draw, 0 = Not-Draw
    """
    if "XGB_BASE_PARAMS" not in globals() or "BEST_N" not in globals():
        raise ValueError(" XGB_BASE_PARAMS and BEST_N(home XGB consistent) should be defined before.")

    params = XGB_BASE_PARAMS.copy()

    params.pop("n_estimators", None)
    params.pop("num_class",   None)

 # ;eval_metric logloss
    params["objective"] = "binary:logistic"
    params["eval_metric"] = "logloss"

 # learning_rate 
    lr = params.pop("learning_rate", 0.03)

    model = xgb.XGBClassifier(
        **params,
        n_estimators=BEST_N,
        learning_rate=lr,
    )

    # Feature & Label
    X_train = train_fold[feature_cols_draw].to_numpy(dtype=np.float32)
    y_train = (train_fold["y"].astype(int).to_numpy() == 1).astype(int)  # 1 = Draw

 # sample_weight( TIME_DECAY_LAMBDA + DRAW_CLASS_MULT)
    sw = make_sample_weight(train_fold) if "make_sample_weight" in globals() else None

    model.fit(
        X_train,
        y_train,
        sample_weight=sw,
        verbose=False,
    )
    return model

def get_draw_specialist_oof(
    meta_df,
    seasons_sorted,
    draw_feature_cols,
    max_train_seasons=8,
    target_last_k_seasons=6,
):
    """
    - input:
                 (contain 'Season', 'Date', 'y' and draw_feature_cols)
    - output:
    """
    df = meta_df.sort_values("Date").copy()

    # 
    feature_cols_draw = [c for c in draw_feature_cols if c in df.columns]
    if len(feature_cols_draw) == 0:
        raise ValueError("draw_feature_cols  meta_df ,.")

    n = len(df)
    oof_pD = np.full(n, np.nan, dtype=float)

 # K OOF target
    if target_last_k_seasons is None or target_last_k_seasons <= 0:
        target_seasons = seasons_sorted
    else:
        target_seasons = seasons_sorted[-min(target_last_k_seasons, len(seasons_sorted)):]

    print("🎯 Draw-specialist OOF target seasons:", target_seasons)

    for s in target_seasons:
        # current val s  = season s
        val_mask = (df["Season"] == s)
        # training set = max number of season of max_train_seasons 
        train_seasons = [ss for ss in seasons_sorted if ss < s][-max_train_seasons:]
        train_mask = df["Season"].isin(train_seasons)

        if train_mask.sum() == 0:
            print(f"no earlier season could beused for training, skip.")
            continue

        train_fold = df.loc[train_mask].copy()
        val_fold   = df.loc[val_mask].copy()

        print(f"  [Draw OOF] Season {s}: train seasons {sorted(set(train_fold['Season']))}, "
              f"n_train={len(train_fold)}, n_val={len(val_fold)}")

    # XGB “draw”
        draw_model = fit_draw_xgb_fold(train_fold, feature_cols_draw)

    # season p(D)
        X_val = val_fold[feature_cols_draw].to_numpy(dtype=np.float32)
        pD_val = draw_model.predict_proba(X_val)[:, 1]

    # OOF ( index alignment)
        oof_pD[val_mask.values] = pD_val

    missing_mask = np.isnan(oof_pD)
    if missing_mask.any():
        print(f"There are {missing_mask.sum()} samples that are not covered by OOF, which will be filled in using the full model.")

    # draw
    draw_model_full = fit_draw_xgb_fold(df, feature_cols_draw)

    if missing_mask.any():
        X_missing = df.loc[missing_mask, feature_cols_draw].to_numpy(dtype=np.float32)
        oof_pD[missing_mask] = draw_model_full.predict_proba(X_missing)[:, 1]

    print("Draw-specialist OOF completed, shape: ", oof_pD.shape)
    return oof_pD, draw_model_full

# RUN_OOF = True
DO_TF_OOF = True
DO_DC_OOF_PROPER = True 
TORCH_OK = True

# checkpoint for df + OOF 
OOF_CKPT_PATH = "oof_full_checkpoint.pkl"
model_tf_loaded, all_token_features_loaded, mu_seq_loaded, sd_seq_loaded, mu_match_loaded, sd_match_loaded = \
    load_enhanced_tf_checkpoint()
if RUN_OOF:

    if os.path.exists(OOF_CKPT_PATH):
        try:
            with open(OOF_CKPT_PATH, "rb") as f:
                ckpt = pickle.load(f)

            train_df        = ckpt["train_df"]
            val_df          = ckpt["val_df"]
            meta_df         = ckpt["meta_df"]
            oof_xgb         = ckpt["oof_xgb"]
            oof_dc          = ckpt["oof_dc"]
            oof_tf          = ckpt["oof_tf"]
            seasons_sorted  = ckpt["seasons_sorted"]
            done_seasons    = set(ckpt.get("done_seasons", []))

            print(" Recovering data and OOF progress from checkpoint")
            print(" train_df shape:", train_df.shape)
            print(" val_df shape:", val_df.shape)
            print(" meta_df shape:", meta_df.shape)
            print(" Season completed:", sorted(done_seasons))

        except Exception as e:
            print(f" checkpoint fail to read: {e}")
            print(" The old checkpoint will be cleared, and OOF will be reconstructed from scratch.")
            os.remove(OOF_CKPT_PATH)
            ckpt = None
            done_seasons = set()

            # Reconstruct the meta_df/oof array
            meta_train_seasons = sorted(list(set(train_df["Season"]).union(set(val_df["Season"]))))
            meta_df = pd.concat([train_df, val_df], axis=0).copy()
            if "_orig_idx" not in meta_df.columns:
                meta_df["_orig_idx"] = meta_df.index
            meta_df = meta_df[meta_df["Season"].isin(meta_train_seasons)].copy()
            meta_df = meta_df.sort_values("Date").reset_index(drop=True)

            seasons_sorted = sorted(meta_df["Season"].unique(), key=season_start_year)

            oof_xgb = np.full((len(meta_df), 3), np.nan, dtype=float)
            oof_dc  = np.full((len(meta_df), 3), np.nan, dtype=float)
            oof_tf  = np.full((len(meta_df), 3), np.nan, dtype=float)

    else:
        # if there is no checkpoint, it is the first run
        done_seasons = set()

        meta_train_seasons = sorted(list(set(train_df["Season"]).union(set(val_df["Season"]))))
        meta_df = pd.concat([train_df, val_df], axis=0).copy()
        if "_orig_idx" not in meta_df.columns:
            meta_df["_orig_idx"] = meta_df.index
        meta_df = meta_df[meta_df["Season"].isin(meta_train_seasons)].copy()
        meta_df = meta_df.sort_values("Date").reset_index(drop=True)

        seasons_sorted = sorted(meta_df["Season"].unique(), key=season_start_year)

        oof_xgb = np.full((len(meta_df), 3), np.nan, dtype=float)
        oof_dc  = np.full((len(meta_df), 3), np.nan, dtype=float)
        oof_tf  = np.full((len(meta_df), 3), np.nan, dtype=float)

    # ckpt ,meta_df / seasons_sorted / oof_* / done_seasons 
    orig_to_pos = {oid: i for i, oid in enumerate(meta_df["_orig_idx"].values)}

    draw_feature_cols = [
        "abs_form_diff", "abs_points_diff", "abs_gd_diff",
        "abs_elo_sum_diff", "abs_position_diff", "abs_h2h_gd_avg",
        "xg_total", "low_xg_flag", "attack_mom_sum", "defense_mom_sum",
        "abs_shots_pm_diff", "abs_sot_pm_diff",
        "abs_corners_pm_diff", "abs_fouls_pm_diff",
        "draw_prop_home", "draw_prop_away", "draw_prop_sum",
        "h2h_draw_rate", "h2h_draw_rate_td", "h2h_draw_rate_mean",
        "ref_draw_rate", "high_draw_ref_flag",
        "both_mid_table", "mid_season", "late_season",
    ]
    # meta_df , KeyError
    draw_feature_cols = [c for c in draw_feature_cols if c in meta_df.columns]
    print("number of draw features:", len(draw_feature_cols))

    oof_pD_draw, draw_model_full = get_draw_specialist_oof(
        meta_df=meta_df,
        seasons_sorted=seasons_sorted,
        draw_feature_cols=draw_feature_cols,
        max_train_seasons=8,
        target_last_k_seasons=6,
    )
    meta_df["pD_special"] = oof_pD_draw

    def fit_xgb_fold(train_fold):
 # / xgb_final base Parameters
        params = XGB_BASE_PARAMS.copy()
    
        # 
        params.pop("n_estimators", None)
 # learning_rate from params 
        lr = params.pop("learning_rate", 0.03)
    
        model = xgb.XGBClassifier(
            **params,
            n_estimators=BEST_N,
            learning_rate=lr,
        )
    
        sw = make_sample_weight(train_fold) if "make_sample_weight" in globals() else None
    
 #  numpy XGB, pandas 
        X_train = train_fold[feature_cols_xgb].to_numpy(dtype=np.float32)
        y_train = train_fold["y"].astype(int).to_numpy()
    
        model.fit(
            X_train,
            y_train,
            sample_weight=sw,
            verbose=False,
        )
        return model
    
    def fit_dc_fold_proper(train_fold, draws=300, tune=300):
        teams_fold = sorted(set(train_fold["HomeTeam"]).union(train_fold["AwayTeam"]))
        tmap_fold = {t: i for i, t in enumerate(teams_fold)}
        n_teams_fold = len(teams_fold)
        
        home_idx_fold = train_fold["HomeTeam"].map(tmap_fold).values.astype("int64")
        away_idx_fold = train_fold["AwayTeam"].map(tmap_fold).values.astype("int64")
        gh_fold = train_fold["FTHG"].astype(int).values
        ga_fold = train_fold["FTAG"].astype(int).values
        
        xi = DECAY_LAMBDA
        last_date_fold = train_fold["Date"].max()
        delta_days_fold = (last_date_fold - train_fold["Date"]).dt.days.values
        w_fold = np.exp(-xi * delta_days_fold)
        
        with pm.Model() as dc_fold_model:
            sigma_att = pm.HalfNormal("sigma_att", sigma=0.7)
            sigma_def = pm.HalfNormal("sigma_def", sigma=0.7)
            
            att_offset = pm.Normal("att_offset", 0.0, 1.0, shape=n_teams_fold)
            def_offset = pm.Normal("def_offset", 0.0, 1.0, shape=n_teams_fold)
            
            attack_raw = att_offset * sigma_att
            defense_raw = def_offset * sigma_def
            attack = pm.Deterministic("attack", attack_raw - attack_raw[-1])
            defense = pm.Deterministic("defense", defense_raw - defense_raw[-1])
            
            home_adv = pm.Normal("home_adv", 0.0, 0.5)
            rho_raw = pm.Normal("rho_raw", 0.0, 0.7)
            rho = pm.Deterministic("rho", 0.6 * pm.math.tanh(rho_raw))
            
            hi = at.as_tensor_variable(home_idx_fold, dtype="int64")
            ai = at.as_tensor_variable(away_idx_fold, dtype="int64")
            
            eta_h = home_adv + attack[hi] - defense[ai]
            eta_a = attack[ai] - defense[hi]
            lam_h = pm.math.exp(eta_h)
            lam_a = pm.math.exp(eta_a)
            
            gH = at.as_tensor_variable(gh_fold, dtype="int64")
            gA = at.as_tensor_variable(ga_fold, dtype="int64")
            corr = at.ones_like(gH, dtype="float64")
            
            m00 = at.and_(at.eq(gH, 0), at.eq(gA, 0))
            m01 = at.and_(at.eq(gH, 0), at.eq(gA, 1))
            m10 = at.and_(at.eq(gH, 1), at.eq(gA, 0))
            m11 = at.and_(at.eq(gH, 1), at.eq(gA, 1))
            
            corr = at.switch(m00, 1 - lam_h * lam_a * rho, corr)
            corr = at.switch(m01, 1 + lam_h * rho, corr)
            corr = at.switch(m10, 1 + lam_a * rho, corr)
            corr = at.switch(m11, 1 - rho, corr)
            corr = at.clip(corr, 1e-6, np.inf)
            
            logp_home = pm.logp(pm.Poisson.dist(mu=lam_h), gh_fold)
            logp_away = pm.logp(pm.Poisson.dist(mu=lam_a), ga_fold)
            logp_corr = at.log(corr)
            weights = at.as_tensor_variable(w_fold, dtype="float64")
            pm.Potential("weighted_like", at.sum(weights * (logp_home + logp_away + logp_corr)))
            
            trace = pm.sample(
                draws=draws, tune=tune,
                chains=4, cores=2,
                target_accept=0.95,
                random_seed=42,
                progressbar=False
            )
        
        post = trace.posterior.stack(sample=("chain", "draw"))
        return {
            "attack": post["attack"].values,
            "defense": post["defense"].values,
            "home_adv": post["home_adv"].values,
            "rho": post["rho"].values,
            "tmap": tmap_fold,
            "teams": teams_fold
        }

    def predict_dc_fold_proper(state, df_fold, max_goals=5):
        if state is None:
            return np.full((len(df_fold), 3), 1/3)
        attack_s_fold = state["attack"]
        defense_s_fold = state["defense"]
        home_adv_s_fold = state["home_adv"]
        rho_s_fold = state["rho"]
        tmap_fold = state["tmap"]
        n_samples_fold = attack_s_fold.shape[0]
        n_matches = len(df_fold)
        out_probs = np.zeros((n_matches, 3))
        for i, (_, row) in enumerate(df_fold.iterrows()):
            home, away = row["HomeTeam"], row["AwayTeam"]
            ih = tmap_fold.get(home)
            ia = tmap_fold.get(away)
            if ih is None or ia is None:
                out_probs[i] = [1/3, 1/3, 1/3]
                continue
            M_avg = np.zeros((max_goals + 1, max_goals + 1))
            for k in range(n_samples_fold):
                lam_h_k = np.exp(home_adv_s_fold[k] + attack_s_fold[k, ih] - defense_s_fold[k, ia])
                lam_a_k = np.exp(attack_s_fold[k, ia] - defense_s_fold[k, ih])
                rho_k = rho_s_fold[k]
                M_avg += dc_joint_matrix(lam_h_k, lam_a_k, rho_k, max_goals)
            M_avg /= n_samples_fold
            p_home = np.triu(M_avg, k=1).sum()
            p_away = np.tril(M_avg, k=-1).sum()
            p_draw = np.trace(M_avg)
            tot = p_home + p_draw + p_away
            out_probs[i] = [p_home/tot, p_draw/tot, p_away/tot]
        return out_probs

### Class weight adjust

In [35]:
if TORCH_OK and DO_TF_OOF:

    def fit_tf_fold(train_fold, val_fold, epochs=25, patience=6):

        y_tr = train_fold["y"].astype(int).values
        cc = np.bincount(y_tr, minlength=3)
        T  = cc.sum()
        class_weights = {
            0: T / (3 * cc[0]),        # H
            1: T / (3 * cc[1]) * 1,  # D 
            2: T / (3 * cc[2]),        # A
        }
        print(f"[TF fold] class_counts={cc}, class_weights={class_weights}")

        train_ds = EnhancedMatchDataset(train_fold, class_weights=class_weights)
        val_ds   = EnhancedMatchDataset(val_fold,   class_weights=class_weights)

        train_loader = DataLoader(train_ds, batch_size=32, shuffle=True, pin_memory=True)
        val_loader   = DataLoader(val_ds,   batch_size=64, shuffle=False, pin_memory=True)

        seq_len_tf   = train_ds.home_seq.shape[1]
        feat_dim_tf  = train_ds.home_seq.shape[2]
        match_dim_tf = train_ds.match_feat.shape[1]

        print(f"[TF fold] seq_len={seq_len_tf}, feat_dim={feat_dim_tf}, match_feat_dim={match_dim_tf}")

        model = EnhancedMatchTransformer(
            seq_len=seq_len_tf,
            feat_dim=feat_dim_tf,
            match_feat_dim=match_dim_tf,
            d_model=64,
            nhead=4,
            num_layers=2,
            ff_dim=128,
            dropout=0.3
        ).to(DEVICE)

        train_loader = DataLoader(
            train_ds,
            batch_size=32,
            shuffle=True,
            pin_memory=True
        )
        val_loader = DataLoader(
            val_ds,
            batch_size=64,
            shuffle=False,
            pin_memory=True
        )

 # ⑤ ( sample_weights etc)
        model = train_enhanced_tf(
            model,
            train_loader,
            val_loader,
            class_weights=class_weights,
            epochs=epochs,
            patience=patience
        )

        return model

    def predict_tf_fold(model, df_fold):
        ds = EnhancedMatchDataset(df_fold, class_weights=None)  # 
        loader = DataLoader(
            ds,
            batch_size=64,
            shuffle=False,
            pin_memory=True
        )

        model.eval()
        probs = []
        with torch.no_grad():
            for hs, asq, mfeat, yb, sw in loader:
                hs    = hs.to(DEVICE)
                asq   = asq.to(DEVICE)
                mfeat = mfeat.to(DEVICE)

                hs    = torch.nan_to_num(hs,    nan=0.0)
                asq   = torch.nan_to_num(asq,   nan=0.0)
                mfeat = torch.nan_to_num(mfeat, nan=0.0)

                logits = model(hs, asq, mfeat)
                prob = torch.softmax(logits, dim=1)
                prob = torch.clamp(prob, 1e-7, 1 - 1e-7)
                probs.append(prob.cpu().numpy())

        proba = np.vstack(probs)
        proba = proba / proba.sum(axis=1, keepdims=True)
        return proba

 # run_enhanced_tf 
    safe_features_tf = [
        f for f in feature_cols_xgb
        if 'pm' in f.lower()
        or 'form' in f.lower()
        or 'elo' in f.lower()
        or 'position' in f.lower()
        or 'points' in f.lower()
        or 'l10' in f.lower()
        or 'win_streak' in f.lower()
        or 'unbeaten' in f.lower()
    ]
    
    if len(safe_features_tf) < 10:
        print(f" lack of tf oof secure features, using all xgb features.")
        safe_features_tf = feature_cols_xgb

    print(f" [tf oof] constructing meta_df past sequence")
    meta_df_with_seq, tf_feat_dim = build_team_sequences_fixed(
        meta_df, safe_features_tf, seq_len=5
    )
    
    #  meta_df
    for col in ["home_form_seq", "away_form_seq", "match_features"]:
        if col in meta_df.columns:
            meta_df.drop(columns=[col], inplace=True)
        meta_df[col] = meta_df_with_seq[col]
    print(f" standardize past sequence")

 # standardize_sequences , val/test,
 # train=val=meta_df ,.
    meta_tmp_train = meta_df.copy()
    meta_tmp_val   = meta_df.copy()

    meta_tmp_train, meta_tmp_val, _, mu_seq_tf, sd_seq_tf, mu_match_tf, sd_match_tf = \
        standardize_sequences(meta_tmp_train, meta_tmp_val, None)

 # result meta_df
    meta_df["home_form_seq"]  = meta_tmp_train["home_form_seq"]
    meta_df["away_form_seq"]  = meta_tmp_train["away_form_seq"]
    meta_df["match_features"] = meta_tmp_train["match_features"]

    for s in seasons_sorted:
        if s in done_seasons:
            print(f"Season {s}: has been completed, skip")
            continue

        val_fold = meta_df[meta_df["Season"] == s].copy()
        train_fold = meta_df[meta_df["Season"].isin([ss for ss in seasons_sorted if ss < s])].copy()
        if len(train_fold) == 0:
            print(f"Season {s}: skip (no prior seasons)")
            done_seasons.add(s)
            continue

        print(f"\nOOF season {s}: train<= {train_fold['Season'].max()}, val={s}, n_val={len(val_fold)}")

        # XGB
        xgb_fold = fit_xgb_fold(train_fold)
        X_val = val_fold[feature_cols_xgb].to_numpy(dtype=np.float32)
        proba_val_xgb_fold = xgb_fold.predict_proba(X_val)
        
        # DC
        if DO_DC_OOF_PROPER:
            try:
                dc_state = fit_dc_fold_proper(train_fold, draws=600, tune=300)
                proba_val_dc_fold = predict_dc_fold_proper(dc_state, val_fold)
                print("  DC fold trained independently")
            except Exception as e:
                print(f"  DC fold failed: {e}, using global posterior")
                
 # ✅ val_fold DC 
                X_dc_val_fold = dc_scaler.transform(
                    val_fold[dc_feature_cols].copy().fillna(0.0)
                )
                
                proba_val_dc_fold, _ = bayes_dc_predict_full(
                    val_fold,
                    X_dc_val_fold,      # ✅
                    attack_s,
                    defense_s,
                    home_adv_s,
                    rho_s,
                    beta_h_s,
                    beta_a_s,
                    tmap,
                )

        else:
            print(f"  DC fold failed: {e}, using global posterior")
            
 # ✅ val_fold DC 
            X_dc_val_fold = dc_scaler.transform(
                val_fold[dc_feature_cols].copy().fillna(0.0)
            )
            
            proba_val_dc_fold, _ = bayes_dc_predict_full(
                val_fold,
                X_dc_val_fold,      # ✅
                attack_s,
                defense_s,
                home_adv_s,
                rho_s,
                beta_h_s,
                beta_a_s,
                tmap,
            )

        # TF
        if TORCH_OK and DO_TF_OOF:
            tf_fold = fit_tf_fold(train_fold, val_fold)
            proba_val_tf_fold = predict_tf_fold(tf_fold, val_fold)
        else:
            proba_val_tf_fold = np.full_like(proba_val_xgb_fold, 1/3)

        #  OOF
        for oid, px, pd_, pt_ in zip(
            val_fold["_orig_idx"].values,
            proba_val_xgb_fold,
            proba_val_dc_fold,
            proba_val_tf_fold
        ):
            pos = orig_to_pos[oid]
            oof_xgb[pos] = px
            oof_dc[pos]  = pd_
            oof_tf[pos]  = pt_

 # ✅, checkpoint once
        done_seasons.add(s)
        ckpt_to_save = {
            "train_df":       train_df,
            "val_df":         val_df,
            "meta_df":        meta_df,
            "oof_xgb":        oof_xgb,
            "oof_dc":         oof_dc,
            "oof_tf":         oof_tf,
            "seasons_sorted": seasons_sorted,
            "done_seasons":   sorted(done_seasons),
        }
        with open(OOF_CKPT_PATH, "wb") as f:
            pickle.dump(ckpt_to_save, f)
        print(f"checkpoint updated，completed season: {sorted(done_seasons)}")

 # ✅draw OOF (optional,)
    mask_ok = ~(
        np.isnan(oof_xgb).any(1) |
        np.isnan(oof_dc).any(1)  |
        np.isnan(oof_tf).any(1)  |
        np.isnan(oof_pD_draw)    # 
    )
    
    meta_oof_df = meta_df.loc[mask_ok].copy().reset_index(drop=True)
    
    #  OOF
    oof_xgb     = oof_xgb[mask_ok]
    oof_dc      = oof_dc[mask_ok]
    oof_tf      = oof_tf[mask_ok]
    oof_pD_draw = oof_pD_draw[mask_ok]
    
    # ✅ meta_oof_df["pD_special"]
    meta_oof_df["pD_special"] = oof_pD_draw

    print(" OOF built (with proper DC fold training):", meta_oof_df.shape)
    print(" ready to start stacking.")

 [tf oof] constructing meta_df past sequence
Sequence feature number: 64
Sequence feature : ['form_home', 'form_away', 'form_diff', 'form_home_v2', 'form_away_v2', 'form_diff_v2', 'home_home_form', 'away_away_form', 'win_streak_home', 'win_streak_away']...
 standardize past sequence
Season 2000/2001: skip (no prior seasons)

OOF season 2001/2002: train<= 2000/2001, val=2001/2002, n_val=380


Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 2 jobs)
NUTS: [sigma_att, sigma_def, att_offset, def_offset, home_adv, rho_raw]
Sampling 4 chains for 300 tune and 600 draw iterations (1_200 + 2_400 draws total) took 15 seconds.


  DC fold trained independently
[TF fold] class_counts=[184 101  95], class_weights={0: np.float64(0.6884057971014492), 1: np.float64(1.2541254125412542), 2: np.float64(1.3333333333333333)}
  home_seq range: [-9.97, 6.18]
  away_seq range: [-9.97, 6.18]
  match_feat range: [-5.58, 4.40]
  home_seq range: [-9.97, 5.66]
  away_seq range: [-9.97, 5.66]
  match_feat range: [-5.58, 5.60]
[TF fold] seq_len=5, feat_dim=64, match_feat_dim=3
Epoch 01 | lr=0.001000 | train_loss=1.1060 | train_acc=0.368
         | val_logloss=1.1006
         | train_pred_dist: H=0.39 D=0.18 A=0.43
         | val_pred_dist:   H=0.36 D=0.13 A=0.51
Epoch 03 | lr=0.001000 | train_loss=1.0552 | train_acc=0.487
         | val_logloss=1.1213
         | train_pred_dist: H=0.38 D=0.38 A=0.24
         | val_pred_dist:   H=0.41 D=0.13 A=0.46
Epoch 06 | lr=0.000500 | train_loss=0.9430 | train_acc=0.566
         | val_logloss=1.1779
         | train_pred_dist: H=0.41 D=0.33 A=0.26
         | val_pred_dist:   H=0.51 D=0.24 A=0

Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 2 jobs)
NUTS: [sigma_att, sigma_def, att_offset, def_offset, home_adv, rho_raw]
Sampling 4 chains for 300 tune and 600 draw iterations (1_200 + 2_400 draws total) took 18 seconds.


  DC fold trained independently
[TF fold] class_counts=[349 202 209], class_weights={0: np.float64(0.725883476599809), 1: np.float64(1.2541254125412542), 2: np.float64(1.2121212121212122)}
  home_seq range: [-9.97, 6.18]
  away_seq range: [-9.97, 6.18]
  match_feat range: [-5.58, 5.60]
  home_seq range: [-9.97, 4.08]
  away_seq range: [-9.97, 4.08]
  match_feat range: [-5.58, 5.60]
[TF fold] seq_len=5, feat_dim=64, match_feat_dim=3
Epoch 01 | lr=0.001000 | train_loss=1.1262 | train_acc=0.355
         | val_logloss=1.0675
         | train_pred_dist: H=0.37 D=0.35 A=0.28
         | val_pred_dist:   H=0.53 D=0.05 A=0.42
Epoch 03 | lr=0.001000 | train_loss=1.0535 | train_acc=0.476
         | val_logloss=1.1141
         | train_pred_dist: H=0.47 D=0.21 A=0.32
         | val_pred_dist:   H=0.42 D=0.28 A=0.29
Epoch 06 | lr=0.000500 | train_loss=1.0145 | train_acc=0.482
         | val_logloss=1.1081
         | train_pred_dist: H=0.38 D=0.26 A=0.36
         | val_pred_dist:   H=0.42 D=0.34 A=0.

Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 2 jobs)
NUTS: [sigma_att, sigma_def, att_offset, def_offset, home_adv, rho_raw]
Sampling 4 chains for 300 tune and 600 draw iterations (1_200 + 2_400 draws total) took 24 seconds.


  DC fold trained independently
[TF fold] class_counts=[536 292 312], class_weights={0: np.float64(0.7089552238805971), 1: np.float64(1.3013698630136987), 2: np.float64(1.2179487179487178)}
  home_seq range: [-9.97, 6.18]
  away_seq range: [-9.97, 6.18]
  match_feat range: [-5.58, 5.60]
  home_seq range: [-9.97, 6.01]
  away_seq range: [-9.97, 6.01]
  match_feat range: [-5.58, 5.60]
[TF fold] seq_len=5, feat_dim=64, match_feat_dim=3
Epoch 01 | lr=0.001000 | train_loss=1.1297 | train_acc=0.383
         | val_logloss=1.1067
         | train_pred_dist: H=0.38 D=0.26 A=0.37
         | val_pred_dist:   H=0.49 D=0.19 A=0.32
Epoch 03 | lr=0.001000 | train_loss=1.0632 | train_acc=0.444
         | val_logloss=1.1244
         | train_pred_dist: H=0.40 D=0.29 A=0.31
         | val_pred_dist:   H=0.37 D=0.19 A=0.44
Epoch 06 | lr=0.000500 | train_loss=1.0211 | train_acc=0.468
         | val_logloss=1.1326
         | train_pred_dist: H=0.36 D=0.29 A=0.35
         | val_pred_dist:   H=0.40 D=0.35 A=0

Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 2 jobs)
NUTS: [sigma_att, sigma_def, att_offset, def_offset, home_adv, rho_raw]
Sampling 4 chains for 300 tune and 600 draw iterations (1_200 + 2_400 draws total) took 30 seconds.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details


  DC fold trained independently
[TF fold] class_counts=[703 400 417], class_weights={0: np.float64(0.7207207207207207), 1: np.float64(1.2666666666666666), 2: np.float64(1.2150279776179056)}
  home_seq range: [-9.97, 6.18]
  away_seq range: [-9.97, 6.18]
  match_feat range: [-5.58, 5.60]
  home_seq range: [-9.97, 3.85]
  away_seq range: [-9.97, 3.85]
  match_feat range: [-5.58, 3.41]
[TF fold] seq_len=5, feat_dim=64, match_feat_dim=3
Epoch 01 | lr=0.001000 | train_loss=1.1126 | train_acc=0.364
         | val_logloss=1.1037
         | train_pred_dist: H=0.41 D=0.28 A=0.32
         | val_pred_dist:   H=0.39 D=0.04 A=0.57
Epoch 03 | lr=0.001000 | train_loss=1.0834 | train_acc=0.418
         | val_logloss=1.1005
         | train_pred_dist: H=0.42 D=0.29 A=0.29
         | val_pred_dist:   H=0.59 D=0.25 A=0.16
Epoch 06 | lr=0.001000 | train_loss=1.0555 | train_acc=0.434
         | val_logloss=1.1229
         | train_pred_dist: H=0.37 D=0.33 A=0.30
         | val_pred_dist:   H=0.52 D=0.13 A=0

Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 2 jobs)
NUTS: [sigma_att, sigma_def, att_offset, def_offset, home_adv, rho_raw]
Sampling 4 chains for 300 tune and 600 draw iterations (1_200 + 2_400 draws total) took 30 seconds.


  DC fold trained independently
[TF fold] class_counts=[876 510 514], class_weights={0: np.float64(0.7229832572298326), 1: np.float64(1.2418300653594772), 2: np.float64(1.2321660181582361)}
  home_seq range: [-9.97, 6.18]
  away_seq range: [-9.97, 6.18]
  match_feat range: [-5.58, 5.60]
  home_seq range: [-9.97, 4.55]
  away_seq range: [-9.97, 4.55]
  match_feat range: [-5.58, 5.60]
[TF fold] seq_len=5, feat_dim=64, match_feat_dim=3
Epoch 01 | lr=0.001000 | train_loss=1.1166 | train_acc=0.349
         | val_logloss=1.0839
         | train_pred_dist: H=0.37 D=0.28 A=0.35
         | val_pred_dist:   H=0.43 D=0.29 A=0.28
Epoch 03 | lr=0.001000 | train_loss=1.0866 | train_acc=0.399
         | val_logloss=1.0675
         | train_pred_dist: H=0.38 D=0.27 A=0.35
         | val_pred_dist:   H=0.40 D=0.14 A=0.46
Epoch 06 | lr=0.001000 | train_loss=1.0585 | train_acc=0.456
         | val_logloss=1.0980
         | train_pred_dist: H=0.40 D=0.30 A=0.30
         | val_pred_dist:   H=0.32 D=0.18 A=0

Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 2 jobs)
NUTS: [sigma_att, sigma_def, att_offset, def_offset, home_adv, rho_raw]
Sampling 4 chains for 300 tune and 600 draw iterations (1_200 + 2_400 draws total) took 35 seconds.


  DC fold trained independently
[TF fold] class_counts=[1068  587  625], class_weights={0: np.float64(0.7116104868913857), 1: np.float64(1.2947189097103917), 2: np.float64(1.216)}
  home_seq range: [-9.97, 6.18]
  away_seq range: [-9.97, 6.18]
  match_feat range: [-5.58, 5.60]
  home_seq range: [-9.97, 4.55]
  away_seq range: [-9.97, 3.63]
  match_feat range: [-5.58, 5.60]
[TF fold] seq_len=5, feat_dim=64, match_feat_dim=3
Epoch 01 | lr=0.001000 | train_loss=1.1145 | train_acc=0.361
         | val_logloss=1.0901
         | train_pred_dist: H=0.38 D=0.31 A=0.31
         | val_pred_dist:   H=0.36 D=0.52 A=0.13
Epoch 03 | lr=0.001000 | train_loss=1.0819 | train_acc=0.432
         | val_logloss=1.1250
         | train_pred_dist: H=0.40 D=0.24 A=0.37
         | val_pred_dist:   H=0.20 D=0.47 A=0.33
Epoch 06 | lr=0.001000 | train_loss=1.0566 | train_acc=0.422
         | val_logloss=1.0864
         | train_pred_dist: H=0.35 D=0.30 A=0.36
         | val_pred_dist:   H=0.52 D=0.31 A=0.18
Epoch 

Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 2 jobs)
NUTS: [sigma_att, sigma_def, att_offset, def_offset, home_adv, rho_raw]
Sampling 4 chains for 300 tune and 600 draw iterations (1_200 + 2_400 draws total) took 39 seconds.


  DC fold trained independently
[TF fold] class_counts=[1250  685  725], class_weights={0: np.float64(0.7093333333333334), 1: np.float64(1.294403892944039), 2: np.float64(1.2229885057471264)}
  home_seq range: [-9.97, 6.18]
  away_seq range: [-9.97, 6.18]
  match_feat range: [-5.58, 5.60]
  home_seq range: [-4.42, 4.48]
  away_seq range: [-4.42, 4.48]
  match_feat range: [-5.58, 5.60]
[TF fold] seq_len=5, feat_dim=64, match_feat_dim=3
Epoch 01 | lr=0.001000 | train_loss=1.1198 | train_acc=0.342
         | val_logloss=1.0613
         | train_pred_dist: H=0.31 D=0.32 A=0.36
         | val_pred_dist:   H=0.38 D=0.05 A=0.57
Epoch 03 | lr=0.001000 | train_loss=1.0848 | train_acc=0.418
         | val_logloss=1.0796
         | train_pred_dist: H=0.38 D=0.20 A=0.42
         | val_pred_dist:   H=0.13 D=0.29 A=0.58
Epoch 06 | lr=0.001000 | train_loss=1.0651 | train_acc=0.414
         | val_logloss=1.0602
         | train_pred_dist: H=0.29 D=0.34 A=0.37
         | val_pred_dist:   H=0.49 D=0.27 A

Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 2 jobs)
NUTS: [sigma_att, sigma_def, att_offset, def_offset, home_adv, rho_raw]
Sampling 4 chains for 300 tune and 600 draw iterations (1_200 + 2_400 draws total) took 50 seconds.


  DC fold trained independently
[TF fold] class_counts=[1426  785  829], class_weights={0: np.float64(0.7106124357176251), 1: np.float64(1.2908704883227176), 2: np.float64(1.222356252513068)}
  home_seq range: [-9.97, 6.18]
  away_seq range: [-9.97, 6.18]
  match_feat range: [-5.58, 5.60]
  home_seq range: [-9.97, 4.44]
  away_seq range: [-9.97, 4.44]
  match_feat range: [-5.58, 5.60]
[TF fold] seq_len=5, feat_dim=64, match_feat_dim=3
Epoch 01 | lr=0.001000 | train_loss=1.1032 | train_acc=0.388
         | val_logloss=1.0692
         | train_pred_dist: H=0.35 D=0.31 A=0.34
         | val_pred_dist:   H=0.68 D=0.02 A=0.30
Epoch 03 | lr=0.001000 | train_loss=1.0814 | train_acc=0.412
         | val_logloss=1.0724
         | train_pred_dist: H=0.35 D=0.24 A=0.42
         | val_pred_dist:   H=0.41 D=0.46 A=0.13
Epoch 06 | lr=0.000500 | train_loss=1.0575 | train_acc=0.441
         | val_logloss=1.0949
         | train_pred_dist: H=0.38 D=0.29 A=0.34
         | val_pred_dist:   H=0.44 D=0.16 A

Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 2 jobs)
NUTS: [sigma_att, sigma_def, att_offset, def_offset, home_adv, rho_raw]
Sampling 4 chains for 300 tune and 600 draw iterations (1_200 + 2_400 draws total) took 48 seconds.


  DC fold trained independently
[TF fold] class_counts=[1599  882  939], class_weights={0: np.float64(0.7129455909943715), 1: np.float64(1.2925170068027212), 2: np.float64(1.2140575079872205)}
  home_seq range: [-9.97, 6.18]
  away_seq range: [-9.97, 6.18]
  match_feat range: [-5.58, 5.60]
  home_seq range: [-9.97, 4.44]
  away_seq range: [-9.97, 4.44]
  match_feat range: [-5.58, 5.60]
[TF fold] seq_len=5, feat_dim=64, match_feat_dim=3
Epoch 01 | lr=0.001000 | train_loss=1.1062 | train_acc=0.389
         | val_logloss=1.0536
         | train_pred_dist: H=0.39 D=0.28 A=0.33
         | val_pred_dist:   H=0.51 D=0.19 A=0.30
Epoch 03 | lr=0.001000 | train_loss=1.0821 | train_acc=0.423
         | val_logloss=1.0689
         | train_pred_dist: H=0.38 D=0.24 A=0.38
         | val_pred_dist:   H=0.33 D=0.33 A=0.34
Epoch 06 | lr=0.000500 | train_loss=1.0644 | train_acc=0.439
         | val_logloss=1.0855
         | train_pred_dist: H=0.37 D=0.25 A=0.38
         | val_pred_dist:   H=0.29 D=0.49 

Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 2 jobs)
NUTS: [sigma_att, sigma_def, att_offset, def_offset, home_adv, rho_raw]
Sampling 4 chains for 300 tune and 600 draw iterations (1_200 + 2_400 draws total) took 48 seconds.


  DC fold trained independently
[TF fold] class_counts=[1792  978 1030], class_weights={0: np.float64(0.7068452380952381), 1: np.float64(1.2951601908657124), 2: np.float64(1.2297734627831716)}
  home_seq range: [-9.97, 6.18]
  away_seq range: [-9.97, 6.18]
  match_feat range: [-5.58, 5.60]
  home_seq range: [-9.97, 6.90]
  away_seq range: [-9.97, 6.90]
  match_feat range: [-5.58, 5.60]
[TF fold] seq_len=5, feat_dim=64, match_feat_dim=3
Epoch 01 | lr=0.001000 | train_loss=1.1038 | train_acc=0.394
         | val_logloss=1.1202
         | train_pred_dist: H=0.35 D=0.31 A=0.34
         | val_pred_dist:   H=0.44 D=0.03 A=0.53
Epoch 03 | lr=0.001000 | train_loss=1.0842 | train_acc=0.393
         | val_logloss=1.0978
         | train_pred_dist: H=0.34 D=0.28 A=0.38
         | val_pred_dist:   H=0.62 D=0.05 A=0.33
Epoch 06 | lr=0.001000 | train_loss=1.0734 | train_acc=0.429
         | val_logloss=1.1373
         | train_pred_dist: H=0.42 D=0.22 A=0.35
         | val_pred_dist:   H=0.26 D=0.51 

Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 2 jobs)
NUTS: [sigma_att, sigma_def, att_offset, def_offset, home_adv, rho_raw]
Sampling 4 chains for 300 tune and 600 draw iterations (1_200 + 2_400 draws total) took 53 seconds.


  DC fold trained independently
[TF fold] class_counts=[1971 1089 1120], class_weights={0: np.float64(0.7069169626247251), 1: np.float64(1.2794612794612794), 2: np.float64(1.244047619047619)}
  home_seq range: [-9.97, 6.90]
  away_seq range: [-9.97, 6.90]
  match_feat range: [-5.58, 5.60]
  home_seq range: [-9.97, 7.30]
  away_seq range: [-9.97, 7.30]
  match_feat range: [-5.58, 5.60]
[TF fold] seq_len=5, feat_dim=64, match_feat_dim=3
Epoch 01 | lr=0.001000 | train_loss=1.1047 | train_acc=0.380
         | val_logloss=1.0695
         | train_pred_dist: H=0.38 D=0.25 A=0.37
         | val_pred_dist:   H=0.33 D=0.36 A=0.31
Epoch 03 | lr=0.001000 | train_loss=1.0846 | train_acc=0.416
         | val_logloss=1.0661
         | train_pred_dist: H=0.37 D=0.28 A=0.35
         | val_pred_dist:   H=0.50 D=0.16 A=0.34
Epoch 06 | lr=0.001000 | train_loss=1.0724 | train_acc=0.419
         | val_logloss=1.1006
         | train_pred_dist: H=0.38 D=0.24 A=0.39
         | val_pred_dist:   H=0.33 D=0.46 A

Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 2 jobs)
NUTS: [sigma_att, sigma_def, att_offset, def_offset, home_adv, rho_raw]
Sampling 4 chains for 300 tune and 600 draw iterations (1_200 + 2_400 draws total) took 59 seconds.


  DC fold trained independently
[TF fold] class_counts=[2142 1182 1236], class_weights={0: np.float64(0.7096171802054155), 1: np.float64(1.2859560067681894), 2: np.float64(1.2297734627831716)}
  home_seq range: [-9.97, 7.30]
  away_seq range: [-9.97, 7.30]
  match_feat range: [-5.58, 5.60]
  home_seq range: [-4.91, 4.96]
  away_seq range: [-4.91, 4.96]
  match_feat range: [-5.58, 5.60]
[TF fold] seq_len=5, feat_dim=64, match_feat_dim=3
Epoch 01 | lr=0.001000 | train_loss=1.1045 | train_acc=0.375
         | val_logloss=1.0701
         | train_pred_dist: H=0.37 D=0.28 A=0.35
         | val_pred_dist:   H=0.52 D=0.24 A=0.23
Epoch 03 | lr=0.001000 | train_loss=1.0865 | train_acc=0.395
         | val_logloss=1.0656
         | train_pred_dist: H=0.35 D=0.24 A=0.41
         | val_pred_dist:   H=0.53 D=0.29 A=0.18
Epoch 06 | lr=0.001000 | train_loss=1.0751 | train_acc=0.416
         | val_logloss=1.0803
         | train_pred_dist: H=0.36 D=0.26 A=0.38
         | val_pred_dist:   H=0.43 D=0.14 

Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 2 jobs)
NUTS: [sigma_att, sigma_def, att_offset, def_offset, home_adv, rho_raw]
Sampling 4 chains for 300 tune and 600 draw iterations (1_200 + 2_400 draws total) took 64 seconds.


  DC fold trained independently
[TF fold] class_counts=[2308 1290 1342], class_weights={0: np.float64(0.7134604274985558), 1: np.float64(1.276485788113695), 2: np.float64(1.22702434177844)}
  home_seq range: [-9.97, 7.30]
  away_seq range: [-9.97, 7.30]
  match_feat range: [-5.58, 5.60]
  home_seq range: [-9.97, 4.75]
  away_seq range: [-9.97, 4.75]
  match_feat range: [-5.58, 5.60]
[TF fold] seq_len=5, feat_dim=64, match_feat_dim=3
Epoch 01 | lr=0.001000 | train_loss=1.1006 | train_acc=0.382
         | val_logloss=1.0735
         | train_pred_dist: H=0.39 D=0.25 A=0.36
         | val_pred_dist:   H=0.61 D=0.11 A=0.28
Epoch 03 | lr=0.001000 | train_loss=1.0835 | train_acc=0.408
         | val_logloss=1.0898
         | train_pred_dist: H=0.38 D=0.31 A=0.31
         | val_pred_dist:   H=0.43 D=0.27 A=0.30
Epoch 06 | lr=0.000500 | train_loss=1.0699 | train_acc=0.438
         | val_logloss=1.0784
         | train_pred_dist: H=0.43 D=0.19 A=0.38
         | val_pred_dist:   H=0.44 D=0.09 A=0

Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 2 jobs)
NUTS: [sigma_att, sigma_def, att_offset, def_offset, home_adv, rho_raw]
Sampling 4 chains for 300 tune and 600 draw iterations (1_200 + 2_400 draws total) took 92 seconds.
There was 1 divergence after tuning. Increase `target_accept` or reparameterize.


  DC fold trained independently
[TF fold] class_counts=[2487 1368 1465], class_weights={0: np.float64(0.7130411472992897), 1: np.float64(1.2962962962962963), 2: np.float64(1.2104664391353812)}
  home_seq range: [-9.97, 7.30]
  away_seq range: [-9.97, 7.30]
  match_feat range: [-5.58, 5.60]
  home_seq range: [-3.79, 3.80]
  away_seq range: [-3.79, 3.80]
  match_feat range: [-3.19, 5.60]
[TF fold] seq_len=5, feat_dim=64, match_feat_dim=3
Epoch 01 | lr=0.001000 | train_loss=1.1020 | train_acc=0.386
         | val_logloss=1.0808
         | train_pred_dist: H=0.41 D=0.23 A=0.36
         | val_pred_dist:   H=0.36 D=0.34 A=0.31
Epoch 03 | lr=0.001000 | train_loss=1.0849 | train_acc=0.404
         | val_logloss=1.1108
         | train_pred_dist: H=0.41 D=0.24 A=0.36
         | val_pred_dist:   H=0.39 D=0.03 A=0.58
Epoch 06 | lr=0.000500 | train_loss=1.0731 | train_acc=0.421
         | val_logloss=1.0832
         | train_pred_dist: H=0.37 D=0.26 A=0.37
         | val_pred_dist:   H=0.32 D=0.20 

Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 2 jobs)
NUTS: [sigma_att, sigma_def, att_offset, def_offset, home_adv, rho_raw]
Sampling 4 chains for 300 tune and 600 draw iterations (1_200 + 2_400 draws total) took 73 seconds.


  DC fold trained independently
[TF fold] class_counts=[2659 1461 1580], class_weights={0: np.float64(0.7145543437382474), 1: np.float64(1.3004791238877482), 2: np.float64(1.2025316455696202)}
  home_seq range: [-9.97, 7.30]
  away_seq range: [-9.97, 7.30]
  match_feat range: [-5.58, 5.60]
  home_seq range: [-9.97, 3.55]
  away_seq range: [-9.97, 3.55]
  match_feat range: [-5.58, 5.60]
[TF fold] seq_len=5, feat_dim=64, match_feat_dim=3
Epoch 01 | lr=0.001000 | train_loss=1.1019 | train_acc=0.386
         | val_logloss=1.0852
         | train_pred_dist: H=0.37 D=0.32 A=0.31
         | val_pred_dist:   H=0.74 D=0.01 A=0.25
Epoch 03 | lr=0.001000 | train_loss=1.0879 | train_acc=0.412
         | val_logloss=1.0978
         | train_pred_dist: H=0.43 D=0.20 A=0.37
         | val_pred_dist:   H=0.38 D=0.16 A=0.46
Epoch 06 | lr=0.001000 | train_loss=1.0791 | train_acc=0.425
         | val_logloss=1.0842
         | train_pred_dist: H=0.44 D=0.22 A=0.34
         | val_pred_dist:   H=0.54 D=0.15 

Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 2 jobs)
NUTS: [sigma_att, sigma_def, att_offset, def_offset, home_adv, rho_raw]
Sampling 4 chains for 300 tune and 600 draw iterations (1_200 + 2_400 draws total) took 74 seconds.


  DC fold trained independently
[TF fold] class_counts=[2816 1568 1696], class_weights={0: np.float64(0.7196969696969697), 1: np.float64(1.2925170068027212), 2: np.float64(1.1949685534591195)}
  home_seq range: [-9.97, 7.30]
  away_seq range: [-9.97, 7.30]
  match_feat range: [-5.58, 5.60]
  home_seq range: [-3.72, 4.05]
  away_seq range: [-3.72, 4.05]
  match_feat range: [-5.58, 5.60]
[TF fold] seq_len=5, feat_dim=64, match_feat_dim=3
Epoch 01 | lr=0.001000 | train_loss=1.1023 | train_acc=0.386
         | val_logloss=1.0933
         | train_pred_dist: H=0.40 D=0.26 A=0.34
         | val_pred_dist:   H=0.36 D=0.30 A=0.33
Epoch 03 | lr=0.001000 | train_loss=1.0859 | train_acc=0.414
         | val_logloss=1.0852
         | train_pred_dist: H=0.43 D=0.24 A=0.34
         | val_pred_dist:   H=0.41 D=0.11 A=0.48
Epoch 06 | lr=0.001000 | train_loss=1.0795 | train_acc=0.417
         | val_logloss=1.0820
         | train_pred_dist: H=0.39 D=0.28 A=0.33
         | val_pred_dist:   H=0.34 D=0.32 

Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 2 jobs)
NUTS: [sigma_att, sigma_def, att_offset, def_offset, home_adv, rho_raw]
Sampling 4 chains for 300 tune and 600 draw iterations (1_200 + 2_400 draws total) took 85 seconds.


  DC fold trained independently
[TF fold] class_counts=[3003 1652 1805], class_weights={0: np.float64(0.7170607170607171), 1: np.float64(1.3034705407586764), 2: np.float64(1.1929824561403508)}
  home_seq range: [-9.97, 7.30]
  away_seq range: [-9.97, 7.30]
  match_feat range: [-5.58, 5.60]
  home_seq range: [-9.97, 5.35]
  away_seq range: [-9.97, 5.35]
  match_feat range: [-5.58, 5.60]
[TF fold] seq_len=5, feat_dim=64, match_feat_dim=3
Epoch 01 | lr=0.001000 | train_loss=1.1018 | train_acc=0.385
         | val_logloss=1.0800
         | train_pred_dist: H=0.40 D=0.29 A=0.31
         | val_pred_dist:   H=0.44 D=0.10 A=0.47
Epoch 03 | lr=0.001000 | train_loss=1.0852 | train_acc=0.406
         | val_logloss=1.0693
         | train_pred_dist: H=0.40 D=0.23 A=0.37
         | val_pred_dist:   H=0.72 D=0.03 A=0.25
Epoch 06 | lr=0.001000 | train_loss=1.0810 | train_acc=0.422
         | val_logloss=1.0842
         | train_pred_dist: H=0.42 D=0.21 A=0.36
         | val_pred_dist:   H=0.44 D=0.02 

Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 2 jobs)
NUTS: [sigma_att, sigma_def, att_offset, def_offset, home_adv, rho_raw]
Sampling 4 chains for 300 tune and 600 draw iterations (1_200 + 2_400 draws total) took 94 seconds.


  DC fold trained independently
[TF fold] class_counts=[3176 1751 1913], class_weights={0: np.float64(0.7178841309823678), 1: np.float64(1.302113078241005), 2: np.float64(1.1918452692106638)}
  home_seq range: [-9.97, 7.30]
  away_seq range: [-9.97, 7.30]
  match_feat range: [-5.58, 5.60]
  home_seq range: [-5.19, 5.26]
  away_seq range: [-5.19, 5.26]
  match_feat range: [-5.58, 5.60]
[TF fold] seq_len=5, feat_dim=64, match_feat_dim=3
Epoch 01 | lr=0.001000 | train_loss=1.0982 | train_acc=0.385
         | val_logloss=1.1008
         | train_pred_dist: H=0.39 D=0.27 A=0.34
         | val_pred_dist:   H=0.39 D=0.53 A=0.08
Epoch 03 | lr=0.001000 | train_loss=1.0848 | train_acc=0.419
         | val_logloss=1.0810
         | train_pred_dist: H=0.43 D=0.18 A=0.38
         | val_pred_dist:   H=0.36 D=0.01 A=0.63
Epoch 06 | lr=0.000500 | train_loss=1.0803 | train_acc=0.417
         | val_logloss=1.0711
         | train_pred_dist: H=0.41 D=0.22 A=0.38
         | val_pred_dist:   H=0.49 D=0.02 A

Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 2 jobs)
NUTS: [sigma_att, sigma_def, att_offset, def_offset, home_adv, rho_raw]
Sampling 4 chains for 300 tune and 600 draw iterations (1_200 + 2_400 draws total) took 78 seconds.


  DC fold trained independently
[TF fold] class_counts=[3357 1822 2041], class_weights={0: np.float64(0.7169099394300467), 1: np.float64(1.3208927918038784), 2: np.float64(1.179160542217867)}
  home_seq range: [-9.97, 7.30]
  away_seq range: [-9.97, 7.30]
  match_feat range: [-5.58, 5.60]
  home_seq range: [-4.14, 5.26]
  away_seq range: [-4.14, 5.26]
  match_feat range: [-5.58, 5.60]
[TF fold] seq_len=5, feat_dim=64, match_feat_dim=3
Epoch 01 | lr=0.001000 | train_loss=1.1004 | train_acc=0.395
         | val_logloss=1.0799
         | train_pred_dist: H=0.41 D=0.27 A=0.32
         | val_pred_dist:   H=0.71 D=0.01 A=0.28
Epoch 03 | lr=0.001000 | train_loss=1.0868 | train_acc=0.406
         | val_logloss=1.0842
         | train_pred_dist: H=0.40 D=0.21 A=0.38
         | val_pred_dist:   H=0.32 D=0.36 A=0.32
Epoch 06 | lr=0.001000 | train_loss=1.0816 | train_acc=0.413
         | val_logloss=1.0700
         | train_pred_dist: H=0.40 D=0.21 A=0.39
         | val_pred_dist:   H=0.61 D=0.01 A

Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 2 jobs)
NUTS: [sigma_att, sigma_def, att_offset, def_offset, home_adv, rho_raw]
Sampling 4 chains for 300 tune and 600 draw iterations (1_200 + 2_400 draws total) took 80 seconds.


  DC fold trained independently
[TF fold] class_counts=[3529 1914 2157], class_weights={0: np.float64(0.7178615282894115), 1: np.float64(1.3235806339254614), 2: np.float64(1.1744707154999228)}
  home_seq range: [-9.97, 7.30]
  away_seq range: [-9.97, 7.30]
  match_feat range: [-5.58, 5.60]
  home_seq range: [-3.72, 4.43]
  away_seq range: [-3.65, 4.43]
  match_feat range: [-5.58, 5.60]
[TF fold] seq_len=5, feat_dim=64, match_feat_dim=3
Epoch 01 | lr=0.001000 | train_loss=1.1020 | train_acc=0.385
         | val_logloss=1.0766
         | train_pred_dist: H=0.37 D=0.27 A=0.37
         | val_pred_dist:   H=0.67 D=0.01 A=0.32
Epoch 03 | lr=0.001000 | train_loss=1.0866 | train_acc=0.404
         | val_logloss=1.0870
         | train_pred_dist: H=0.38 D=0.26 A=0.35
         | val_pred_dist:   H=0.49 D=0.23 A=0.28
Epoch 06 | lr=0.000500 | train_loss=1.0778 | train_acc=0.442
         | val_logloss=1.0791
         | train_pred_dist: H=0.47 D=0.21 A=0.32
         | val_pred_dist:   H=0.51 D=0.14 

Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 2 jobs)
NUTS: [sigma_att, sigma_def, att_offset, def_offset, home_adv, rho_raw]
Sampling 4 chains for 300 tune and 600 draw iterations (1_200 + 2_400 draws total) took 86 seconds.


  DC fold trained independently
[TF fold] class_counts=[3673 1997 2310], class_weights={0: np.float64(0.7242036482439422), 1: np.float64(1.3319979969954931), 2: np.float64(1.1515151515151516)}
  home_seq range: [-9.97, 7.30]
  away_seq range: [-9.97, 7.30]
  match_feat range: [-5.58, 5.60]
  home_seq range: [-9.97, 4.74]
  away_seq range: [-9.97, 4.43]
  match_feat range: [-5.58, 5.60]
[TF fold] seq_len=5, feat_dim=64, match_feat_dim=3
Epoch 01 | lr=0.001000 | train_loss=1.0990 | train_acc=0.381
         | val_logloss=1.0738
         | train_pred_dist: H=0.37 D=0.27 A=0.36
         | val_pred_dist:   H=0.30 D=0.28 A=0.42
Epoch 03 | lr=0.001000 | train_loss=1.0858 | train_acc=0.409
         | val_logloss=1.0876
         | train_pred_dist: H=0.39 D=0.23 A=0.38
         | val_pred_dist:   H=0.38 D=0.46 A=0.16
Epoch 06 | lr=0.001000 | train_loss=1.0819 | train_acc=0.426
         | val_logloss=1.0710
         | train_pred_dist: H=0.42 D=0.20 A=0.38
         | val_pred_dist:   H=0.41 D=0.00 

In [36]:

# features we already have:

# meta_oof_df : OOF DataFrame( mask_ok filter)
#   oof_xgb, oof_dc, oof_tf: Shape (N_oof, 3)
# Draw specialist: Shape (N_oof,)

# meta_oof_df["pD_special"] --- oof_pD_draw

print("meta_oof_df shape:", meta_oof_df.shape)
print("oof_xgb shape:", oof_xgb.shape)
print("oof_dc shape:", oof_dc.shape)
print("oof_tf shape:", oof_tf.shape)
print("oof_pD_draw shape:", oof_pD_draw.shape)

base_meta_oof = np.hstack([oof_xgb, oof_dc, oof_tf])     # (N_oof, 9)

# Draw specialist
pD_special_oof = oof_pD_draw.reshape(-1, 1)              # (N_oof, 1)

# 👉 final OOF meta features: ,10 
X_oof = np.hstack([base_meta_oof, pD_special_oof])       # (N_oof, 10)

y_oof      = meta_oof_df["y"].astype(int).values
season_oof = meta_oof_df["Season"].values

print("X_oof shape:", X_oof.shape)   # (N_oof, 10)
print("y_oof shape:", y_oof.shape)
print("season_oof len:", len(season_oof))


meta_oof_df shape: (7980, 176)
oof_xgb shape: (7980, 3)
oof_dc shape: (7980, 3)
oof_tf shape: (7980, 3)
oof_pD_draw shape: (7980,)
X_oof shape: (7980, 10)
y_oof shape: (7980,)
season_oof len: 7980


In [37]:
from sklearn.linear_model import LogisticRegression

def build_draw_special_oof(X, y, seasons, cw=2.5, C=1.0):
    n = len(X); pD_oof = np.zeros(n)
    uniq = sorted(np.unique(seasons), key=season_start_year)
    for s in uniq:
        tr, va = seasons != s, seasons == s
        if tr.sum() == 0 or va.sum() == 0: continue
        clf = LogisticRegression(solver="lbfgs", C=C, class_weight={0:1.0,1:cw}, max_iter=500)
        clf.fit(X[tr], (y[tr] == 1).astype(int))
        pD_oof[va] = clf.predict_proba(X[va])[:,1]
    clf_final = LogisticRegression(solver="lbfgs", C=C, class_weight={0:1.0,1:cw}, max_iter=500)
    clf_final.fit(X, (y == 1).astype(int))
    return pD_oof, clf_final

pD_special_oof, draw_clf_final = build_draw_special_oof(X_oof, y_oof, season_oof, cw=2.5, C=1.0)

### Meta model calibration

In [38]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

def fit_two_stage(X_tr, y_tr,
                  cw_draw=2.0,
                  C1=0.5,
                  C2=0.5):
    """
      stage1: Draw vs Non-Draw
    parameter:
    """
    y_bin = (y_tr == 1).astype(int)

    stage1 = LogisticRegression(
        solver="lbfgs",
        C=C1,
        class_weight={0: 1.0, 1: cw_draw},
        max_iter=500,
    ).fit(X_tr, y_bin)

    mask_nd = (y_tr != 1)
    X2 = X_tr[mask_nd]
    y2 = y_tr[mask_nd]
    y2_bin = (y2 == 2).astype(int)  # 2=Away ->1, 0=Home->0

    stage2 = LogisticRegression(
        solver="lbfgs",
        C=C2,
        class_weight="balanced",   #  balanced
        max_iter=500,
    ).fit(X2, y2_bin)

    return stage1, stage2

# Two-stage
#     """
# + alpha draw:
#       pD = alpha * p_draw
# 1-pD Then by p_away H/A
#     """
#     p_draw = np.clip(np.nan_to_num(p_draw, nan=0.0), 0, 1)
#     p_away = np.clip(np.nan_to_num(p_away, nan=0.5), 0, 1)

#     pD = np.clip(alpha * p_draw, 0, 1)
#     pA = (1 - pD) * p_away
#     pH = (1 - pD) * (1 - p_away)

#     proba = np.stack([pH, pD, pA], axis=1)
#     proba = proba / proba.sum(axis=1, keepdims=True)
#     proba = np.clip(proba, 1e-7, 1-1e-7)
#     return proba

def make_proba_two_stage_alpha(p_draw, p_away, alpha):
    """
        pD = alpha * p_draw
    """
    p_draw = np.clip(np.nan_to_num(p_draw, nan=0.0), 0, 1)
    p_away = np.clip(np.nan_to_num(p_away, nan=0.5), 0, 1)

    pD = np.clip(alpha * p_draw, 0, 1)  # ← use alpha scaling
    pA = (1 - pD) * p_away
    pH = (1 - pD) * (1 - p_away)

    proba = np.stack([pH, pD, pA], axis=1)
    proba = proba / proba.sum(axis=1, keepdims=True)
    return proba

# draw
TARGET_D = 0.26
TOL      = 0.04
target_low, target_high = TARGET_D - TOL, TARGET_D + TOL

alpha_grid = np.linspace(0.5, 1.0, 11)
seasons    = sorted(pd.unique(season_oof), key=season_start_year)

fold_alphas = []

for s in seasons:
    tr_mask = (season_oof != s)
    va_mask = (season_oof == s)

    X_tr, y_tr = X_oof[tr_mask], y_oof[tr_mask]
    X_va, y_va = X_oof[va_mask], y_oof[va_mask]

    if len(X_tr) == 0 or len(X_va) == 0:
        continue

    stage1, stage2 = fit_two_stage(X_tr, y_tr, cw_draw=DRAW_CLASS_MULT)
    p_draw_va = stage1.predict_proba(X_va)[:, 1]
    p_away_va = stage2.predict_proba(X_va)[:, 1]

    best_feasible = None
    best_any      = None

    for a in alpha_grid:
        proba_va = make_proba_two_stage_alpha(p_draw_va, p_away_va, a)
        ll = log_loss(y_va, proba_va, labels=[0,1,2])
        mean_pD = proba_va[:, 1].mean()

        if best_any is None or ll < best_any[0]:
            best_any = (ll, float(a), mean_pD)

        if target_low <= mean_pD <= target_high:
            if best_feasible is None or ll < best_feasible[0]:
                best_feasible = (ll, float(a), mean_pD)

    best = best_feasible if best_feasible is not None else best_any
    fold_alphas.append(best[1])
    print(f"OOF Season {s}: best alpha={best[1]:.3f}, val logloss={best[0]:.4f}, mean_pD={best[2]:.3f}")

import numpy as np
alpha_final = np.mean(fold_alphas)

# cw_draw , DRAW_CLASS_MULT
# if DRAW_CLASS_MULT ,,for example 3.0
cw_draw_best = DRAW_CLASS_MULT 

print("using CV average parameters: cw_draw=%.3f, alpha=%.3f" % (cw_draw_best, alpha_final))

# 
stage1_all, stage2_all = fit_two_stage(
    X_oof, y_oof,
    cw_draw=cw_draw_best,
    C1=0.5,
    C2=0.5
)


OOF Season 2001/2002: best alpha=1.000, val logloss=1.0290, mean_pD=0.269
OOF Season 2002/2003: best alpha=0.900, val logloss=1.0468, mean_pD=0.237
OOF Season 2003/2004: best alpha=1.000, val logloss=1.0512, mean_pD=0.260
OOF Season 2004/2005: best alpha=1.000, val logloss=1.0187, mean_pD=0.271
OOF Season 2005/2006: best alpha=0.800, val logloss=0.9940, mean_pD=0.228
OOF Season 2006/2007: best alpha=1.000, val logloss=1.0212, mean_pD=0.256
OOF Season 2007/2008: best alpha=1.000, val logloss=0.9742, mean_pD=0.266
OOF Season 2008/2009: best alpha=0.950, val logloss=0.9915, mean_pD=0.257
OOF Season 2009/2010: best alpha=0.950, val logloss=0.9917, mean_pD=0.255
OOF Season 2010/2011: best alpha=1.000, val logloss=1.0518, mean_pD=0.266
OOF Season 2011/2012: best alpha=0.900, val logloss=0.9973, mean_pD=0.245
OOF Season 2012/2013: best alpha=1.000, val logloss=1.0044, mean_pD=0.266
OOF Season 2013/2014: best alpha=0.800, val logloss=0.9675, mean_pD=0.223
OOF Season 2014/2015: best alpha=0.950

In [39]:
def predict_with_threshold(proba, tau=0.27, draw_bias=0.0):
    n = len(proba)
    preds = np.zeros(n, dtype=int)
    
    for i in range(n):
        p_h, p_d, p_a = proba[i]
        p_d_adj = p_d + draw_bias
        
        if abs(p_h - p_a) < tau and p_d_adj > 0.22:
            preds[i] = 1
        else:
            preds[i] = np.argmax([p_h, p_d_adj, p_a])
    
    return preds

In [40]:
from sklearn.metrics import log_loss, classification_report

cw_grid     = [1.5, 2.0, 2.5, 3.0]
alpha_grid  = np.linspace(0.6, 1.1, 11)   # 
C1, C2      = 0.5, 0.5

TAU       = 0.27
DRAW_BIAS = 0.05

seasons = sorted(pd.unique(season_oof), key=season_start_year)
true_draw_ratio_global = (y_oof == 1).mean()
print("Global true draw ratio:", true_draw_ratio_global)

best_combo = None
cv_records = []

for cw in cw_grid:
    for alpha in alpha_grid:
        fold_losses = []
        fold_macroF1 = []
        fold_f1D = []
        fold_draw_ratio = []

        for s in seasons:
            va_mask = (season_oof == s)
            tr_mask = ~va_mask

            X_tr, y_tr = X_oof[tr_mask], y_oof[tr_mask]
            X_va, y_va = X_oof[va_mask], y_oof[va_mask]

            if len(X_tr) == 0 or len(X_va) == 0:
                continue

            st1, st2 = fit_two_stage(X_tr, y_tr, cw_draw=cw, C1=C1, C2=C2)

 # fold p_draw, p_away
            p_draw_va = st1.predict_proba(X_va)[:, 1]
            p_away_va = st2.predict_proba(X_va)[:, 1]

            proba_va = make_proba_two_stage_alpha(p_draw_va, p_away_va, alpha)

 # logloss( proba_va)
            loss = log_loss(y_va, proba_va, labels=[0, 1, 2])

 # proba_va (, proba_val_meta)
            y_pred_va = predict_with_threshold(
                proba_va,
                tau=TAU,
                draw_bias=DRAW_BIAS
            )

            rep = classification_report(
                y_va, y_pred_va,
                digits=3, zero_division=0, output_dict=True
            )

            macro_f1 = rep["macro avg"]["f1-score"]
            f1_D     = rep["1"]["f1-score"]

            dist = np.bincount(y_pred_va, minlength=3) / len(y_pred_va)
            pred_draw_ratio = dist[1]

            fold_losses.append(loss)
            fold_macroF1.append(macro_f1)
            fold_f1D.append(f1_D)
            fold_draw_ratio.append(pred_draw_ratio)

        if not fold_losses:
            continue

        mean_loss        = np.mean(fold_losses)
        mean_macroF1     = np.mean(fold_macroF1)
        mean_f1D         = np.mean(fold_f1D)
        mean_draw_ratio  = np.mean(fold_draw_ratio)

        ratio_penalty = abs(mean_draw_ratio - true_draw_ratio_global)

        score = (
            - mean_loss       * 1.0   #  logloss small
            + mean_macroF1    * 0.4   # 
            + mean_f1D        * 0.6   # draw F1 high
            - ratio_penalty   * 0.8   # draw
        )

        cv_records.append((cw, alpha, mean_loss, mean_macroF1, mean_f1D,
                           mean_draw_ratio, ratio_penalty, score))

        if (best_combo is None) or (score > best_combo[-1]):
            best_combo = (cw, alpha, mean_loss, mean_macroF1, mean_f1D,
                          mean_draw_ratio, ratio_penalty, score)

print("Season CV search results (top 10 by score)")
cv_df = pd.DataFrame(cv_records, columns=[
    "cw_draw", "alpha", "mean_loss", "mean_macroF1", "mean_f1D",
    "mean_pred_draw_ratio", "ratio_penalty", "score"
])
display(cv_df.sort_values("score", ascending=False).head(10))

print(" best combination")
print("cw_draw=%.3f, alpha=%.3f" % (best_combo[0], best_combo[1]))
print("mean_loss=%.4f, macroF1=%.3f, f1D=%.3f" %
      (best_combo[2], best_combo[3], best_combo[4]))
print("mean_pred_draw_ratio=%.3f, penalty=%.3f, score=%.4f" %
      (best_combo[5], best_combo[6], best_combo[7]))


Global true draw ratio: 0.24862155388471177
Season CV search results (top 10 by score)


Unnamed: 0,cw_draw,alpha,mean_loss,mean_macroF1,mean_f1D,mean_pred_draw_ratio,ratio_penalty,score
0,1.5,0.6,1.018364,0.432084,0.385784,0.54411,0.295489,-0.850451
1,1.5,0.65,1.014155,0.428811,0.387779,0.55589,0.307268,-0.855778
2,1.5,0.7,1.011762,0.425389,0.388933,0.565915,0.317293,-0.862081
11,2.0,0.6,1.011249,0.424016,0.389104,0.569298,0.320677,-0.864721
3,1.5,0.75,1.010996,0.421857,0.390087,0.575564,0.326942,-0.869755
12,2.0,0.65,1.011237,0.421452,0.391484,0.580702,0.33208,-0.87343
4,1.5,0.8,1.011709,0.420951,0.392042,0.583208,0.334586,-0.875773
22,2.5,0.6,1.012271,0.419651,0.392509,0.587845,0.339223,-0.880284
13,2.0,0.7,1.01325,0.417967,0.392638,0.592732,0.34411,-0.885769
5,1.5,0.85,1.013785,0.416775,0.392342,0.594612,0.34599,-0.888462


 best combination
cw_draw=1.500, alpha=0.600
mean_loss=1.0184, macroF1=0.432, f1D=0.386
mean_pred_draw_ratio=0.544, penalty=0.295, score=-0.8505


In [41]:
from sklearn.metrics import log_loss

# val/test ( logloss / )
y_val  = val_df["y"].astype(int).values
y_test = test_df["y"].astype(int).values

# OOF two-stage model(=10)
stage1_all, stage2_all = fit_two_stage(X_oof, y_oof, cw_draw=DRAW_CLASS_MULT)

# Construct val/test 9 
X_base_val  = np.hstack([proba_val_xgb,  proba_val_dc,  proba_val_tf])   # (N_val, 9)
X_base_test = np.hstack([proba_test_xgb, proba_test_dc, proba_test_tf])  # (N_test, 9)

# match-level draw draw_model_full in val/test pD_special
X_draw_val  = val_df[draw_feature_cols].to_numpy(dtype=np.float32)
X_draw_test = test_df[draw_feature_cols].to_numpy(dtype=np.float32)

pD_special_val  = draw_model_full.predict_proba(X_draw_val)[:, 1]   # (N_val,)
pD_special_test = draw_model_full.predict_proba(X_draw_test)[:, 1]  # (N_test,)

# 10 meta features:9() + 1(pD_special)
X_meta_val  = np.hstack([X_base_val,  pD_special_val.reshape(-1, 1)])   # (N_val, 10)
X_meta_test = np.hstack([X_base_test, pD_special_test.reshape(-1, 1)])  # (N_test,10)

print("X_meta_val shape:",  X_meta_val.shape)
print("X_meta_test shape:", X_meta_test.shape)

# + alpha generate val/test 
p_draw_val  = stage1_all.predict_proba(X_meta_val)[:, 1]
p_away_val  = stage2_all.predict_proba(X_meta_val)[:, 1]
p_draw_test = stage1_all.predict_proba(X_meta_test)[:, 1]
p_away_test = stage2_all.predict_proba(X_meta_test)[:, 1]

proba_val_base  = make_proba_two_stage_alpha(p_draw_val,  p_away_val,  alpha_final)
proba_test_base = make_proba_two_stage_alpha(p_draw_test, p_away_test, alpha_final)

print("val logloss (two-stage+alpha base):",  log_loss(y_val,  proba_val_base,  labels=[0,1,2]))
print("test logloss (two-stage+alpha base):", log_loss(y_test, proba_test_base, labels=[0,1,2]))
print("test mean pD (two-stage+alpha base):", proba_test_base[:,1].mean())


X_meta_val shape: (380, 10)
X_meta_test shape: (1140, 10)
val logloss (two-stage+alpha base): 0.8598354826297515
test logloss (two-stage+alpha base): 1.0044843060753992
test mean pD (two-stage+alpha base): 0.24216534668492923


In [42]:
from sklearn.isotonic import IsotonicRegression

def fuse_draw_proba(proba_base, pD_special, lam=0.5):
    """
      pD_new = lam * pD_base + (1-lam) * pD_special
    """
    pH_base, pD_base, pA_base = proba_base[:,0], proba_base[:,1], proba_base[:,2]
    pD_special = np.clip(pD_special, 1e-7, 1-1e-7)

    pD_new = lam * pD_base + (1 - lam) * pD_special
    pD_new = np.clip(pD_new, 1e-7, 1-1e-7)

    scale = (1.0 - pD_new) / (1.0 - pD_base + 1e-8)
    pH_new = pH_base * scale
    pA_new = pA_base * scale

    proba_new = np.stack([pH_new, pD_new, pA_new], axis=1)
    proba_new = proba_new / proba_new.sum(axis=1, keepdims=True)
    proba_new = np.clip(proba_new, 1e-7, 1-1e-7)
    return proba_new

LAM = 0.5

# in val/test fuse 
proba_val_fused  = fuse_draw_proba(proba_val_base,  pD_special_val,  lam=LAM)
proba_test_fused = fuse_draw_proba(proba_test_base, pD_special_test, lam=LAM)

print("val logloss (fused):",  log_loss(y_val,  proba_val_fused,  labels=[0,1,2]))
print("test logloss (fused):", log_loss(y_test, proba_test_fused, labels=[0,1,2]))
print("test mean pD (fused):", proba_test_fused[:,1].mean())

# Isotonic 
USE_ISOTONIC = True

if USE_ISOTONIC:
    print("\n--- Isotonic Calibration (on fused OOF probabilities) ---")
    
    n_samples_oof = X_oof.shape[0]
    p_draw_oof = np.zeros(n_samples_oof, dtype=float)
    p_away_oof = np.zeros(n_samples_oof, dtype=float)

    seasons = sorted(pd.unique(season_oof), key=season_start_year)

    for s in seasons:
        va_mask = (season_oof == s)
        tr_mask = ~va_mask

        X_tr, y_tr = X_oof[tr_mask], y_oof[tr_mask]
        X_va, y_va = X_oof[va_mask], y_oof[va_mask]

        if len(X_tr) == 0 or len(X_va) == 0:
            continue

        stage1_cv, stage2_cv = fit_two_stage(X_tr, y_tr, cw_draw=DRAW_CLASS_MULT)
        p_draw_va = stage1_cv.predict_proba(X_va)[:, 1]
        p_away_va = stage2_cv.predict_proba(X_va)[:, 1]

        p_draw_oof[va_mask] = p_draw_va
        p_away_oof[va_mask] = p_away_va

    proba_oof_base = make_proba_two_stage_alpha(
        p_draw_oof,
        p_away_oof,
        alpha_final
    )

 # 2) in OOF draw-fuse(pD_special_oof oof_pD_draw)
    pD_special_oof = oof_pD_draw               # (N_oof,)
    proba_oof_fused = fuse_draw_proba(proba_oof_base, pD_special_oof, lam=LAM)

 # Isotonic 
    iso_calibrators = []
    for c in range(3):
        iso = IsotonicRegression(out_of_bounds="clip")
        iso.fit(proba_oof_fused[:, c], (y_oof == c).astype(float))
        iso_calibrators.append(iso)

 # pair fused val/test 
    proba_val_iso  = np.zeros_like(proba_val_fused)
    proba_test_iso = np.zeros_like(proba_test_fused)

    for c in range(3):
        proba_val_iso[:,  c] = iso_calibrators[c].transform(proba_val_fused[:,  c])
        proba_test_iso[:, c] = iso_calibrators[c].transform(proba_test_fused[:, c])

    proba_val_iso  = np.clip(proba_val_iso,  1e-7, 1 - 1e-7)
    proba_test_iso = np.clip(proba_test_iso, 1e-7, 1 - 1e-7)
    proba_val_iso  = proba_val_iso  / proba_val_iso.sum(axis=1,  keepdims=True)
    proba_test_iso = proba_test_iso / proba_test_iso.sum(axis=1, keepdims=True)

    print("val logloss (fused+isotonic):",  log_loss(y_val,  proba_val_iso,  labels=[0, 1, 2]))
    print("test logloss (fused+isotonic):", log_loss(y_test, proba_test_iso, labels=[0, 1, 2]))
    print("test mean pD (fused+isotonic):", proba_test_iso[:, 1].mean())

    proba_val_final  = proba_val_iso
    proba_test_final = proba_test_iso

else:
    proba_val_final  = proba_val_fused
    proba_test_final = proba_test_fused


val logloss (fused): 0.8436318796016304
test logloss (fused): 1.0106493996290882
test mean pD (fused): 0.2823256971683488

--- Isotonic Calibration (on fused OOF probabilities) ---
val logloss (fused+isotonic): 0.8218106745642301
test logloss (fused+isotonic): 0.9997378061973027
test mean pD (fused+isotonic): 0.24243344157105828


### Final test result

In [43]:
from sklearn.metrics import f1_score, classification_report, accuracy_score

def apply_draw_threshold(proba, tau):
    pH, pD, pA = proba[:,0], proba[:,1], proba[:,2]
    return np.where(pD > tau, 1, np.where(pA > pH, 2, 0))

taus = np.linspace(0.20, 0.45, 81)

best = None
for tau in taus:
    pred_val = apply_draw_threshold(proba_val_final, tau)
    rep_val  = classification_report(y_val, pred_val, output_dict=True, zero_division=0)

    macro_f1 = rep_val["macro avg"]["f1-score"]
    prec_D   = rep_val["1"]["precision"]
    rec_D    = rep_val["1"]["recall"]
    f1_D     = rep_val["1"]["f1-score"]
    dist_val = np.bincount(pred_val, minlength=3)/len(pred_val)

 # draw
    pred_draw_ratio = dist_val[1]
    if not (0.10 <= pred_draw_ratio <= 0.30):
        continue

    score = 0.7 * f1_D + 0.3 * macro_f1

    if (best is None) or (score > best[0]):
        best = (score, float(tau), dist_val, macro_f1, prec_D, rec_D, f1_D)

if best is None:
    best = (-1, float(taus[0]), np.array([0,0,0]), 0, 0, 0, 0)

tau_final = best[1]
print("\nFINAL tau:", tau_final, "(score=%.4f)" % best[0])
print("val pred dist:", best[2])
print("val macro-F1:", best[3])
print("val D precision=%.3f recall=%.3f F1=%.3f" % (best[4], best[5], best[6]))

pred_test = apply_draw_threshold(proba_test_final, tau_final)
print("\n" + "="*50)
print("FINAL TEST RESULTS")
print("="*50)
print("Accuracy:", accuracy_score(y_test, pred_test))
print(classification_report(y_test, pred_test, digits=3, zero_division=0))
print("Prediction distribution:", np.bincount(pred_test, minlength=3)/len(pred_test))
print("FINAL test logloss:", log_loss(y_test, proba_test_final, labels=[0,1,2]))



FINAL tau: 0.26875000000000004 (score=0.6789)
val pred dist: [0.40789474 0.24210526 0.35      ]
val macro-F1: 0.7075871781330516
val D precision=0.652 recall=0.682 F1=0.667

FINAL TEST RESULTS
Accuracy: 0.5114035087719299
              precision    recall  f1-score   support

           0      0.624     0.603     0.613       514
           1      0.248     0.198     0.220       262
           2      0.510     0.607     0.555       364

    accuracy                          0.511      1140
   macro avg      0.461     0.470     0.463      1140
weighted avg      0.501     0.511     0.504      1140

Prediction distribution: [0.43596491 0.18421053 0.37982456]
FINAL test logloss: 0.9997378061973027


In [44]:
from sklearn.metrics import log_loss, classification_report

# Two-stage
# draw-fuse isotonic τ hyperparameter tuning
# Direct argmax(proba) 

#  val/test Label
y_val_baseline  = val_df["y"].astype(int).values
y_test_baseline = test_df["y"].astype(int).values

# OOF Logistic model(input = 10 X_oof)
stage1_pure, stage2_pure = fit_two_stage(X_oof, y_oof, cw_draw=DRAW_CLASS_MULT)

# Construct val/test 10 meta Feature
# 1 9 (XGB/DC/TF)
X_base_val_pure  = np.hstack([proba_val_xgb,  proba_val_dc,  proba_val_tf])   # (N_val, 9)
X_base_test_pure = np.hstack([proba_test_xgb, proba_test_dc, proba_test_tf])  # (N_test, 9)

# 2 1 match-level draw pD_special( X_oof 10 )
X_draw_val_pure  = val_df[draw_feature_cols].to_numpy(dtype=np.float32)
X_draw_test_pure = test_df[draw_feature_cols].to_numpy(dtype=np.float32)

pD_special_val_pure  = draw_model_full.predict_proba(X_draw_val_pure)[:, 1]
pD_special_test_pure = draw_model_full.predict_proba(X_draw_test_pure)[:, 1]

X_meta_val_pure  = np.hstack([X_base_val_pure,  pD_special_val_pure.reshape(-1, 1)])   # (N_val, 10)
X_meta_test_pure = np.hstack([X_base_test_pure, pD_special_test_pure.reshape(-1, 1)])  # (N_test,10)

print("X_meta_val_pure shape:",  X_meta_val_pure.shape)
print("X_meta_test_pure shape:", X_meta_test_pure.shape)

# two-stage + alpha ( baseline)
p_draw_val_pure  = stage1_pure.predict_proba(X_meta_val_pure)[:, 1]
p_away_val_pure  = stage2_pure.predict_proba(X_meta_val_pure)[:, 1]
p_draw_test_pure = stage1_pure.predict_proba(X_meta_test_pure)[:, 1]
p_away_test_pure = stage2_pure.predict_proba(X_meta_test_pure)[:, 1]

proba_val_pure  = make_proba_two_stage_alpha(p_draw_val_pure,  p_away_val_pure,  alpha_final)
proba_test_pure = make_proba_two_stage_alpha(p_draw_test_pure, p_away_test_pure, alpha_final)

# argmax ( τ / draw-fuse / isotonic)
y_pred_val_pure  = proba_val_pure.argmax(axis=1)
y_pred_test_pure = proba_test_pure.argmax(axis=1)

# Evaluation:logloss + macro-F1 + distribution
print("\n===== PURE TWO-STAGE + ALPHA (NO FUSE / NO ISO / NO TAU) =====\n")

print("val logloss (pure):",  log_loss(y_val_baseline,  proba_val_pure,  labels=[0,1,2]))
print("test logloss (pure):", log_loss(y_test_baseline, proba_test_pure, labels=[0,1,2]))

rep_val_pure  = classification_report(y_val_baseline,  y_pred_val_pure,  digits=3, zero_division=0, output_dict=True)
rep_test_pure = classification_report(y_test_baseline, y_pred_test_pure, digits=3, zero_division=0, output_dict=True)

print("\nVAL classification report (pure):")
print(classification_report(y_val_baseline, y_pred_val_pure, digits=3, zero_division=0))

print("VAL macro-F1 (pure):", rep_val_pure["macro avg"]["f1-score"])
print("VAL pred distribution:", np.bincount(y_pred_val_pure, minlength=3)/len(y_pred_val_pure))

print("\nTEST classification report (pure):")
print(classification_report(y_test_baseline, y_pred_test_pure, digits=3, zero_division=0))

print("TEST macro-F1 (pure):", rep_test_pure["macro avg"]["f1-score"])
print("TEST pred distribution:", np.bincount(y_pred_test_pure, minlength=3)/len(y_pred_test_pure))


X_meta_val_pure shape: (380, 10)
X_meta_test_pure shape: (1140, 10)

===== PURE TWO-STAGE + ALPHA (NO FUSE / NO ISO / NO TAU) =====

val logloss (pure): 0.8598354826297515
test logloss (pure): 1.0044843060753992

VAL classification report (pure):
              precision    recall  f1-score   support

           0      0.741     0.755     0.748       163
           1      0.000     0.000     0.000        88
           2      0.537     0.891     0.671       129

    accuracy                          0.626       380
   macro avg      0.426     0.549     0.473       380
weighted avg      0.500     0.626     0.548       380

VAL macro-F1 (pure): 0.4727581002005666
VAL pred distribution: [0.43684211 0.         0.56315789]

TEST classification report (pure):
              precision    recall  f1-score   support

           0      0.653     0.607     0.629       514
           1      0.000     0.000     0.000       262
           2      0.447     0.813     0.577       364

    accuracy        

In [45]:
def print_label_dist(name, y):
    counts = np.bincount(y, minlength=3)
    dist = counts / counts.sum()
    print(f"\n{name} label distribution:")
    print("  counts:", counts.tolist())
    print("  ratio : [H=%.3f, D=%.3f, A=%.3f]" % (dist[0], dist[1], dist[2]))

print_label_dist("OOF",  y_oof)
print_label_dist("VAL",  val_df["y"].astype(int).values)
print_label_dist("TEST", test_df["y"].astype(int).values)



OOF label distribution:
  counts: [3652, 1984, 2344]
  ratio : [H=0.458, D=0.249, A=0.294]

VAL label distribution:
  counts: [163, 88, 129]
  ratio : [H=0.429, D=0.232, A=0.339]

TEST label distribution:
  counts: [514, 262, 364]
  ratio : [H=0.451, D=0.230, A=0.319]


In [46]:
from sklearn.metrics import classification_report

y_val_arr  = val_df["y"].astype(int).values
y_test_arr = test_df["y"].astype(int).values

def eval_base(name, proba_val, proba_test):
    print(f"\n===== {name} =====")
    ypv = proba_val.argmax(axis=1)
    ypt = proba_test.argmax(axis=1)

    dist_val  = np.bincount(ypv, minlength=3) / len(ypv)
    dist_test = np.bincount(ypt, minlength=3) / len(ypt)

    print("VAL pred dist :", dist_val)
    print("TEST pred dist:", dist_test)

    print("\nVAL report:")
    print(classification_report(y_val_arr, ypv, digits=3, zero_division=0))
    print("\nTEST report:")
    print(classification_report(y_test_arr, ypt, digits=3, zero_division=0))

eval_base("XGB",        proba_val_xgb,  proba_test_xgb)
eval_base("Dixon-Coles",proba_val_dc,   proba_test_dc)
eval_base("Transformer",proba_val_tf,   proba_test_tf)



===== XGB =====
VAL pred dist : [0.36842105 0.25263158 0.37894737]
TEST pred dist: [0.35       0.21140351 0.43859649]

VAL report:
              precision    recall  f1-score   support

           0      0.843     0.724     0.779       163
           1      0.729     0.795     0.761        88
           2      0.757     0.845     0.799       129

    accuracy                          0.782       380
   macro avg      0.776     0.788     0.779       380
weighted avg      0.787     0.782     0.781       380


TEST report:
              precision    recall  f1-score   support

           0      0.647     0.502     0.565       514
           1      0.220     0.202     0.211       262
           2      0.468     0.643     0.542       364

    accuracy                          0.478      1140
   macro avg      0.445     0.449     0.439      1140
weighted avg      0.492     0.478     0.476      1140


===== Dixon-Coles =====
VAL pred dist : [0.58947368 0.         0.41052632]
TEST pred dist: 

In [47]:
# if DC model uses only recent seasons,e.g. dc_train_df,can replace base_df with that one;
# otherwise,just use train_df as base.
base_df = train_df   # or dc_train_df,depending on what you feed to DC 

print("Current base_df columns:", len(base_df.columns))

# Select only numeric columns
numeric_cols = base_df.select_dtypes(include=[np.number]).columns.tolist()
print("Total number of numeric columns:", len(numeric_cols))

meta_cols = ["y"]  # if match_id,
candidate_features = [c for c in numeric_cols if c not in meta_cols]

print("Number of candidate features after removing 'y':", len(candidate_features))

# Remove leakage features
candidate_features = remove_leak_features(candidate_features)

print("Number of candidate features after removing leaks:", len(candidate_features))

if len(candidate_features) == 0:
    print(" candidate_features is empty, preview of the first 50 values:")
    print(numeric_cols[:50])
    raise ValueError(
        "candidate_features :"
        " base_df ,score,"
        " base_df  compute_all_features  df."
    )


Current base_df columns: 174
Total number of numeric columns: 164
Number of candidate features after removing 'y': 163
 remove 24 leakage features: ['FTHG', 'FTAG', 'HTHG', 'HTAG', 'HS', 'AS', 'HST', 'AST', 'HC', 'AC', 'HF', 'AF', 'HY', 'AY', 'HR', 'AR', 'shots_for', 'shots_against', 'sot_for', 'sot_against', 'corners_for', 'corners_against', 'shot_accuracy', 'opp_shot_accuracy']
✅ Features after cleaning: 163 → 139
Number of candidate features after removing leaks: 139


In [48]:
def compute_feature_scores_once_full(
    df,
    candidate_features,
    target_col="y",
    methods=('permutation', 'mutual_info', 'rf_importance'),
    n_estimators_rf=200,
    n_repeats_perm=10,
    random_state_base=42,
    verbose=True
):
    """
      - RandomForest + permutation_importance
      - mutual_info_classif
      - RandomForest feature_importances_
    return:
    """
    available_features = [f for f in candidate_features if f in df.columns]
    if verbose:
        print(f" avaliable number of features : {len(available_features)}")

    if not available_features:
        if verbose:
            print("No features available, return empty result.")
        return pd.Series(dtype=float), {}

    X = df[available_features].copy().fillna(0.0)
    y = df[target_col].astype(int)

    method_scores = {}

    if 'permutation' in methods:
        if verbose:
            print(" [1/3] calculate Permutation...")
        rf_perm = RandomForestClassifier(
            n_estimators=n_estimators_rf,
            random_state=random_state_base,
            n_jobs=-1
        )
        rf_perm.fit(X, y)
        perm = permutation_importance(
            rf_perm, X, y,
            n_repeats=n_repeats_perm,
            random_state=random_state_base
        )
        perm_scores = pd.Series(perm.importances_mean, index=X.columns)
        method_scores['permutation'] = perm_scores

    if 'mutual_info' in methods:
        if verbose:
            print("  [2/3] calculate Mutual Information...")
        mi_vals = mutual_info_classif(X, y, random_state=random_state_base)
        mi_scores = pd.Series(mi_vals, index=X.columns)
        method_scores['mutual_info'] = mi_scores

    if 'rf_importance' in methods:
        if verbose:
            print(" [3/3] calculate RandomForest...")
        rf_imp = RandomForestClassifier(
            n_estimators=n_estimators_rf,
            random_state=random_state_base + 1,
            n_jobs=-1
        )
        rf_imp.fit(X, y)
        rf_scores = pd.Series(rf_imp.feature_importances_, index=X.columns)
        method_scores['rf_importance'] = rf_scores

    if not method_scores:
        if verbose:
            print(" No scoring method was enabled; returned empty.")
        return pd.Series(dtype=float), {}

    if verbose:
        print(" Combining scores from multiple methods (averaging after z-score standardization)...")

    # Construct DataFrame: line=Feature, column=method
    score_df = pd.DataFrame(method_scores)

 # z-score standardization, dominate
    score_df_z = (score_df - score_df.mean()) / (score_df.std(ddof=0) + 1e-9)

    # 
    combined_scores = score_df_z.mean(axis=1)
    combined_scores = combined_scores.sort_values(ascending=False)

    if verbose:
        print(" Single scoring completed, Top 10 comprehensive characteristics:")
        print(combined_scores.head(10).to_dict())

    return combined_scores, method_scores


In [49]:
def clean_tf_features(df, cols):

    good = []
    for c in cols:
        if not isinstance(c, str) or c not in df.columns:
            continue
        s = df[c]
        if isinstance(s, pd.DataFrame):
            if all(pd.api.types.is_numeric_dtype(s[col]) for col in s.columns):
                if (s.nunique(axis=1) == 1).all():
                    s = s.iloc[:, 0]
                else:
                    continue
            else:
                continue
        if not pd.api.types.is_numeric_dtype(s.dtype):
            continue
        good.append(c)
    return good

def select_unified_features(df, verbose=True):
    """
    return:(all_features, xgb_features, dc_features, tf_features)
    """
    # 
    LEAK_COLS = {
        'FTHG', 'FTAG', 'HTHG', 'HTAG', 'FTR', 'HTR',
        'HS', 'AS', 'HST', 'AST', 'HC', 'AC', 
        'HF', 'AF', 'HY', 'AY', 'HR', 'AR',
        'shots_for', 'shots_against', 'sot_for', 'sot_against',
        'corners_for', 'corners_against', 'shot_accuracy', 'opp_shot_accuracy',
    }
    
    # 
    manual_core = [
        # form/form
        "form_home", "form_away", "form_diff",
        "form_home_v2", "form_away_v2", "form_diff_v2",
        "home_home_form", "away_away_form",
        "win_streak_home", "win_streak_away", "unbeaten_home", "unbeaten_away",
        # L5/L10
        "L5HWR", "L5HDR", "L5AWR", "L5ADR",
        "L10HWR", "L10HDR", "L10AWR", "L10ADR",
        "L5_home_adv", "L10_home_adv",
        # integration/position
        "points_home", "points_away", "points_diff",
        "points_home_v2", "points_away_v2", "points_diff_v2",
        "position_home", "position_away", "position_diff",
        "position_home_v2", "position_away_v2", "position_diff_v2",
        "gd_home", "gd_away", "gd_diff", "season_gd_diff",
        # Elo
        "elo_home", "elo_away", "elo_diff",
        "elo_att_home", "elo_def_home", "elo_att_away", "elo_def_away",
        # momentum
        "attack_momentum_home", "attack_momentum_away",
        "defense_momentum_home", "defense_momentum_away",
        "attack_vs_defense_home", "attack_vs_defense_away",
        # goal
        "goals_pm_home", "goals_pm_away", "conceded_pm_home", "conceded_pm_away",
        "goals_scored_pm_home", "goals_scored_pm_away",
        "goals_conceded_pm_home", "goals_conceded_pm_away",
        "gd_pm_home", "gd_pm_away", "gd_pm_diff",
        # xG
        "xg_home", "xg_away", "xg_diff", "xg_total",
        # H2H
        "h2h_home_rate", "h2h_draw_rate", "h2h_away_rate",
        "h2h_home_rate_v2", "h2h_draw_rate_v2", "h2h_away_rate_v2",
        "h2h_home_rate_td", "h2h_draw_rate_td", "h2h_away_rate_td",
        "h2h_goal_diff_avg", "h2h_matches", "h2h_matches_v2", "h2h_matches_td",
        # rest
        "rest_days_home", "rest_days_away", "rest_diff",
        "rest_days_home_v2", "rest_days_away_v2", "rest_diff_v2",
        # draw
        "draw_prop_home", "draw_prop_away", "draw_prop_sum",
        "draw_prop_home_v2", "draw_prop_away_v2", "draw_prop_sum_v2", "draw_prop_diff_v2",
        # 
        "abs_form_diff", "abs_points_diff", "abs_gd_diff", "abs_position_diff", "abs_h2h_gd_avg",
        # pm
        "shots_for_pm_home", "shots_for_pm_away", "sot_for_pm_home", "sot_for_pm_away",
        "shots_against_pm_home", "shots_against_pm_away", "sot_against_pm_home", "sot_against_pm_away",
        "shots_pm_diff", "sot_pm_diff",
        # corners/fouls/pm
        "corners_for_pm_home", "corners_for_pm_away", "corners_pm_diff",
        "fouls_pm_home", "fouls_pm_away", "fouls_pm_diff",
        "yellows_pm_home", "yellows_pm_away", "yellows_pm_diff",
        # referee
        "ref_home_rate", "ref_draw_rate", "ref_away_rate",
        "ref_home_rate_v2", "ref_draw_rate_v2", "ref_away_rate_v2",
        "ref_matches", "ref_matches_v2", "ref_home_bias", "ref_home_bias_v2",
        "ref_avg_yellow", "ref_avg_red", "ref_avg_fouls",
        # 
        "match_week", "is_late_season", "early_season", "mid_season", "late_season", "both_mid_table",
    ]
    
    manual_core = [c for c in manual_core if c in df.columns]
    

    extra_cols = []
    meta_cols = {"date", "season", "hometeam", "awayteam", "ftr", "y", "referee"}
    for c in df.columns:
        name = str(c).lower()
        if c in manual_core or name in meta_cols or c in LEAK_COLS:
            continue
        if any(kw in name for kw in ["momentum", "adv", "streak", "prop"]):
            extra_cols.append(c)
    
    candidate = list(dict.fromkeys(manual_core + extra_cols))
    candidate = [c for c in candidate if c not in LEAK_COLS]
    all_features = clean_tf_features(df, candidate)
    

    xgb_features = all_features.copy()
    
    dc_priority = [
        "elo_diff", "form_diff", "form_diff_v2", "points_diff", "position_diff", "gd_diff",
        "h2h_home_rate", "h2h_draw_rate", "rest_diff", "xg_diff", "L5_home_adv", "draw_prop_sum_v2",
    ]
    dc_features = [f for f in dc_priority if f in all_features]
    for f in all_features:
        if f not in dc_features and len(dc_features) < 12:
            dc_features.append(f)
    dc_features = dc_features[:12]
    
    tf_features = all_features.copy()
    
    if verbose:
        print(f" Unified feature selection completed: total{len(all_features)}, XGB={len(xgb_features)}, DC={len(dc_features)}, TF={len(tf_features)}")
        print(f" DC features: {dc_features}")
    
    return all_features, xgb_features, dc_features, tf_features

print("Feature selection function definition complete")

Feature selection function definition complete


In [50]:
RESELECT_FEATURES = False  #  Change to True to force reselection 

if (not RESELECT_FEATURES) and os.path.exists("unified_features.pkl"):
    with open("unified_features.pkl", "rb") as f:
        saved = pickle.load(f)
    feature_cols_xgb = saved["feature_cols_xgb"]
    dc_feature_cols = saved["dc_feature_cols"]
    tf_token_features = saved["tf_token_features"]
    
    LEAK_CHECK = {'FTHG','FTAG','HS','AS','HST','AST','HC','AC','HF','AF','HY','AY','HR','AR',
                  'shots_for','shots_against','sot_for','sot_against','corners_for','corners_against'}
    feature_cols_xgb = [f for f in feature_cols_xgb if f not in LEAK_CHECK]
    dc_feature_cols = [f for f in dc_feature_cols if f not in LEAK_CHECK]
    tf_token_features = [f for f in tf_token_features if f not in LEAK_CHECK]
    
    print("Features have been loaded from unified_features.pkl")

else:
 # ( train_df)
    _, feature_cols_xgb, dc_feature_cols, tf_token_features = select_unified_features(train_df, verbose=True)
    
    # Save
    with open("unified_features.json", "w") as f:
        json.dump({"xgb": feature_cols_xgb, "dc": dc_feature_cols, "tf": tf_token_features}, f, indent=2)
    with open("unified_features.pkl", "wb") as f:
        pickle.dump({"feature_cols_xgb": feature_cols_xgb, "dc_feature_cols": dc_feature_cols, 
                     "tf_token_features": tf_token_features}, f)
    print(" unified_features.pkl")

print(f"Final feature count: XGB={len(feature_cols_xgb)}, DC={len(dc_feature_cols)}, TF={len(tf_token_features)}")
print(f"DC features: {dc_feature_cols}")

Features have been loaded from unified_features.pkl
Final feature count: XGB=120, DC=12, TF=120
DC features: ['elo_diff', 'form_diff', 'form_diff_v2', 'points_diff', 'position_diff', 'gd_diff', 'h2h_home_rate', 'h2h_draw_rate', 'rest_diff', 'xg_diff', 'draw_prop_sum_v2', 'form_home']


## 6. Results & save model

In [51]:
draw_feature_cols_to_save = [
    "abs_form_diff", "abs_points_diff", "abs_gd_diff",
    "abs_elo_sum_diff", "abs_position_diff", "abs_h2h_gd_avg",
    "xg_total", "low_xg_flag", "attack_mom_sum", "defense_mom_sum",
    "abs_shots_pm_diff", "abs_sot_pm_diff",
    "abs_corners_pm_diff", "abs_fouls_pm_diff",
    "draw_prop_home", "draw_prop_away", "draw_prop_sum",
    "h2h_draw_rate", "h2h_draw_rate_td", "h2h_draw_rate_mean",
    "ref_draw_rate", "high_draw_ref_flag",
    "both_mid_table", "mid_season", "late_season",
]
draw_feature_cols_to_save = [c for c in draw_feature_cols_to_save if c in train_df.columns]

if "tf_token_features" not in globals() or len(tf_token_features) == 0:
    tf_token_features_to_save = select_tf_token_features(cleaned_df)
else:
    tf_token_features_to_save = tf_token_features

print(f"draw_feature_cols_to_save: {len(draw_feature_cols_to_save)}")
print(f"tf_token_features_to_save: {len(tf_token_features_to_save)}")

model_state = {
    "xgb_model": xgb_final,
    "stage1": stage1_all,
    "stage2": stage2_all,
    "alpha": alpha_final,
    "tau": tau_final,
    "feature_cols": feature_cols_xgb,

    # ✅ TF model + TF related assets
    "tf_model": model_tf_loaded,
    "tf_token_features": tf_token_features_to_save,

    # ✅ (strongly recommended) TF normalization params + feature list used by checkpoint
    "tf_all_token_features": all_token_features_loaded,
    "tf_mu_seq": mu_seq_loaded,
    "tf_sd_seq": sd_seq_loaded,
    "tf_mu_match": mu_match_loaded,
    "tf_sd_match": sd_match_loaded,

    "dc_posterior": {
        "attack": attack_s,
        "defense": defense_s,
        "home_adv": home_adv_s,
        "rho": rho_s,
        "tmap": tmap,
        "teams": teams
    },

    "draw_model_full": draw_model_full,
    "draw_feature_cols": draw_feature_cols_to_save,

    "dc_feature_cols": dc_feature_cols,
    "dc_scaler": dc_scaler,
    "beta_h_s": beta_h_s,
    "beta_a_s": beta_a_s,

    "fe": fe,
}


if "iso_calibrators" in globals() and iso_calibrators is not None:
    model_state["iso_calibrators"] = iso_calibrators

with open("epl_model_state.pkl", "wb") as f:
    dill.dump(model_state, f)

print("✅ Model state saved (using dill)")
print(f"   XGB features: {len(feature_cols_xgb)}")
print(f"   DC features: {len(dc_feature_cols)}")
print(f"   Draw features: {len(draw_feature_cols_to_save)}")
print(f"   TF features: {len(tf_token_features_to_save)}")
print(f"   FeatureEngineering: ✅")

draw_feature_cols_to_save: 24
tf_token_features_to_save: 120
✅ Model state saved (using dill)
   XGB features: 120
   DC features: 12
   Draw features: 24
   TF features: 120
   FeatureEngineering: ✅


In [52]:
import os
import numpy as np
import pandas as pd
import pickle, dill
import torch

from collections import defaultdict, deque
from sklearn.metrics import log_loss, f1_score
import xgboost as xgb

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("[ENV] DEVICE:", DEVICE)

OOF_CKPT_PATH = "oof_full_checkpoint.pkl"
TRAINING_CSV  = "epl-training.csv"
MODEL_STATE_PATH = "epl_model_state.pkl"
STACK_V2_PATH = "epl_model_state_stack_v2.pkl"
REFIT_OUT     = "epl_model_state_refit.pkl"


[ENV] DEVICE: cuda


In [53]:
with open(MODEL_STATE_PATH, "rb") as f:
    ms = dill.load(f)

# ---- fitted base models (IMPORTANT) ----
xgb_model_old = ms.get("xgb_model", None)
tf_model_old  = ms.get("tf_model", None)
draw_model_full = ms.get("draw_model_full", None)

# ---- feature lists ----
feature_cols_xgb   = ms.get("feature_cols", [])
dc_feature_cols    = ms.get("dc_feature_cols", [])
draw_feature_cols  = ms.get("draw_feature_cols", [])
tf_token_features  = ms.get("tf_token_features", [])

# ---- DC stuff ----
dc_scaler = ms.get("dc_scaler", None)
beta_h_s  = ms.get("beta_h_s", None)
beta_a_s  = ms.get("beta_a_s", None)

dc_post = ms.get("dc_posterior", {}) if isinstance(ms.get("dc_posterior", None), dict) else {}
attack_s   = dc_post.get("attack", None)
defense_s  = dc_post.get("defense", None)
home_adv_s = dc_post.get("home_adv", None)
rho_s      = dc_post.get("rho", None)
tmap       = dc_post.get("tmap", None)
teams      = dc_post.get("teams", None)

# ---- FE state ----
fe = ms.get("fe", None)

# ---- optional ----
iso_calibrators = ms.get("iso_calibrators", None)

print("[LOAD] xgb_model_old:", xgb_model_old is not None)
print("[LOAD] tf_model_old :", tf_model_old is not None)
print("[LOAD] draw_model   :", draw_model_full is not None)
print("[LOAD] feature_cols_xgb:", len(feature_cols_xgb))
print("[LOAD] dc_feature_cols :", len(dc_feature_cols))
print("[LOAD] draw_feature_cols:", len(draw_feature_cols))
print("[LOAD] tf_token_features:", len(tf_token_features))
print("[LOAD] fe:", fe is not None)


[LOAD] xgb_model_old: True
[LOAD] tf_model_old : True
[LOAD] draw_model   : True
[LOAD] feature_cols_xgb: 120
[LOAD] dc_feature_cols : 12
[LOAD] draw_feature_cols: 24
[LOAD] tf_token_features: 120
[LOAD] fe: True


In [54]:
training_all = pd.read_csv(TRAINING_CSV)
training_all["Date"] = pd.to_datetime(training_all["Date"], dayfirst=True, errors="coerce")
training_all = training_all.dropna(subset=["Date"]).sort_values("Date").reset_index(drop=True)

if "Season" not in training_all.columns:
    training_all["Season"] = training_all["Date"].apply(season_label)

with open(OOF_CKPT_PATH, "rb") as f:
    ckpt = pickle.load(f)

meta_df_old = ckpt["meta_df"].copy()
meta_df_old["Date"] = pd.to_datetime(meta_df_old["Date"], errors="coerce")
meta_df_old = meta_df_old.dropna(subset=["Date"]).sort_values("Date").reset_index(drop=True)
last_old_date = meta_df_old["Date"].max()
print("[NEW] last_old_date:", last_old_date)

if "FTR" not in training_all.columns:
    raise ValueError("epl-training.csv missing required column: FTR")

new_results = training_all[(training_all["Date"] > last_old_date) & (training_all["FTR"].notna())].copy()
print("[NEW] new_results:", new_results.shape,
      "seasons tail:", sorted(new_results["Season"].unique())[-5:])

if len(new_results) == 0:
    raise ValueError("No new completed matches found after last_old_date. Check csv Date/FTR.")

new_results_fe, fe = compute_all_features(
    new_results,
    fe=fe,
    is_train=True,
    use_state_features=True,
    use_all_adv_block=True,
    use_shot_corners=True,
    use_td_h2h=True,
    use_draw_block=True,
)

new_results_fe["Date"] = pd.to_datetime(new_results_fe["Date"], errors="coerce")
new_results_fe = new_results_fe.dropna(subset=["Date"]).sort_values("Date").reset_index(drop=True)

print("[NEW] new_results_fe:", new_results_fe.shape,
      "seasons tail:", sorted(new_results_fe["Season"].unique())[-5:])


[NEW] last_old_date: 2022-05-22 00:00:00
[NEW] new_results: (1240, 23) seasons tail: ['2022/2023', '2023/2024', '2024/2025']
Added 99 features
corner, foul, yellow/red cards history features are added
Added advanced H2H features (time-decayed & directional)
 Added extended draw-specific features
[NEW] new_results_fe: (1240, 162) seasons tail: ['2022/2023', '2023/2024', '2024/2025']


In [55]:
def build_tf_sequences_from_cleaned(
    *,
    cleaned_df,
    target_dfs,              # dict: {"train_df": df, ...}
    tf_token_features,
    seq_len=5,
):
    # 1) 白名单 TF 特征
    safe_features_tf = [f for f in tf_token_features if f in cleaned_df.columns]
    if len(safe_features_tf) == 0:
        raise ValueError("No TF safe features found in cleaned_df")
    print("TF safe features:", len(safe_features_tf))

    # 2) 做稳定 key（避免 reset_index 后 join 错位）
    def _make_rid(df):
        d = pd.to_datetime(df["Date"], errors="coerce")
        return (
            d.dt.strftime("%Y-%m-%d").astype(str) + "||" +
            df["HomeTeam"].astype(str) + "||" +
            df["AwayTeam"].astype(str)
        )

    base = cleaned_df.copy()
    base["Date"] = pd.to_datetime(base["Date"], errors="coerce")
    base = base.sort_values("Date").reset_index(drop=True)
    base["_rid"] = _make_rid(base)

    # 3) 在 base 上一次性 build seq（你原来的 build_team_sequences_fixed）
    base_with_seq, feat_dim_tf = build_team_sequences_fixed(base, safe_features_tf, seq_len=seq_len)
    print(f"[TF seq] seq_len={seq_len}, feat_dim={feat_dim_tf}")

    seq_cols = ["home_form_seq", "away_form_seq", "match_features"]
    seq_store = base_with_seq[["_rid"] + seq_cols].copy()

    # 4) merge 回各 df（用 _rid，不用 index）
    out = {}
    for name, df in target_dfs.items():
        if df is None:
            continue
        dfx = df.copy()
        dfx["Date"] = pd.to_datetime(dfx["Date"], errors="coerce")
        dfx["_rid"] = _make_rid(dfx)

        dfx = dfx.drop(columns=seq_cols, errors="ignore")
        dfx = dfx.merge(seq_store, on="_rid", how="left")

        # 清理
        dfx = dfx.drop(columns=["_rid"], errors="ignore")
        out[name] = dfx

    return out, safe_features_tf, feat_dim_tf


In [56]:
def _season_start_year(season_str: str) -> int:
    try:
        return int(str(season_str).split("/")[0])
    except Exception:
        return 0

def _ensure_y(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    if "y" in df.columns:
        df["y"] = pd.to_numeric(df["y"], errors="coerce")
    elif "FTR" in df.columns:
        ftr = df["FTR"].astype(str).str.strip().str.upper()
        m = {"H":0,"D":1,"A":2,"HOME":0,"DRAW":1,"AWAY":2}
        df["y"] = ftr.map(m)
    else:
        raise ValueError("Need y or FTR.")
    df = df.dropna(subset=["y"]).copy()
    df["y"] = df["y"].astype(int)
    return df

def _ensure_cols(df: pd.DataFrame, cols, fill=0.0) -> pd.DataFrame:
    df = df.copy()
    for c in cols:
        if c not in df.columns:
            df[c] = fill
    return df

def _sanitize_proba(P):
    P = np.asarray(P, dtype=np.float32)
    P = np.nan_to_num(P, nan=1/3, posinf=1/3, neginf=1/3)
    P = np.clip(P, 1e-7, 1.0)
    s = P.sum(axis=1, keepdims=True)
    s = np.where(s <= 0, 1.0, s)
    return P / s

def _sanitize_p(p, default=0.25):
    p = np.asarray(p, dtype=np.float32).reshape(-1)
    p = np.nan_to_num(p, nan=default, posinf=default, neginf=default)
    return np.clip(p, 1e-6, 1-1e-6)


In [57]:
# ==========================================================
# Ensure XGB fitted for stacking-expand (refit if needed)
#   - Use ONLY past data (Date <= last_old_date) to avoid leakage
#   - Train a temp model solely for meta_new proba construction
# ==========================================================

import xgboost as xgb
from sklearn.exceptions import NotFittedError

def _xgb_is_fitted(m):
    try:
        _ = m.get_booster()
        return True
    except Exception:
        return False

def ensure_xgb_for_expand(
    *,
    xgb_model_candidate,
    feature_cols_xgb,
    # data sources for refit
    training_all_df,     # full epl-training.csv (raw)
    last_old_date,       # from OOF ckpt meta_df_old max date
    fe,                  # your feature engine state (already exists)
):
    """
    Return a fitted XGB model for stacking-expand.
    If candidate is not fitted, refit on historical finished matches up to last_old_date.
    """
    if (xgb_model_candidate is not None) and _xgb_is_fitted(xgb_model_candidate):
        print("✅ xgb_model_old is already fitted")
        return xgb_model_candidate

    print("⚠️ xgb_model_old NOT fitted -> refitting a temporary XGB on past data (<= last_old_date)")

    # 1) take only finished matches up to last_old_date
    hist = training_all_df.copy()
    hist["Date"] = pd.to_datetime(hist["Date"], dayfirst=True, errors="coerce")
    hist = hist.dropna(subset=["Date"]).sort_values("Date").reset_index(drop=True)

    if "Season" not in hist.columns:
        hist["Season"] = hist["Date"].apply(season_label)

    hist = hist[(hist["Date"] <= last_old_date) & (hist["FTR"].notna())].copy()
    if len(hist) == 0:
        raise ValueError("No historical finished matches found for XGB refit (Date<=last_old_date).")

    # 2) compute features (IMPORTANT: use a COPY of fe to not pollute your global fe state)
    #    如果你 compute_all_features 会更新 fe 状态，这里用浅拷贝避免影响后续流程
    import copy
    fe_tmp = copy.deepcopy(fe)

    hist_fe, _ = compute_all_features(
        hist,
        fe=fe_tmp,
        is_train=True,
        use_state_features=True,
        use_all_adv_block=True,
        use_shot_corners=True,
        use_td_h2h=True,
        use_draw_block=True,
    )

    # ensure y exists
    if "y" not in hist_fe.columns:
        ftr = hist_fe["FTR"].astype(str).str.strip().str.upper()
        mp = {"H":0,"D":1,"A":2,"HOME":0,"DRAW":1,"AWAY":2}
        hist_fe["y"] = ftr.map(mp)

    hist_fe = hist_fe.dropna(subset=["y"]).copy()
    hist_fe["y"] = hist_fe["y"].astype(int)

    # 3) ensure columns
    for c in feature_cols_xgb:
        if c not in hist_fe.columns:
            hist_fe[c] = 0.0

    X = hist_fe[feature_cols_xgb].fillna(0.0).to_numpy(np.float32)
    y = hist_fe["y"].to_numpy()

    # 4) fit a stable temp model (use hist CPU/GPU same as your environment)
    xgb_tmp = xgb.XGBClassifier(
        n_estimators=800,
        learning_rate=0.03,
        max_depth=6,
        min_child_weight=4,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=2.0,
        objective="multi:softprob",
        num_class=3,
        eval_metric="mlogloss",
        tree_method="hist",
        device="cuda" if torch.cuda.is_available() else "cpu",
        random_state=42,
        n_jobs=-1,
    )

    xgb_tmp.fit(X, y, verbose=False)
    print("✅ temporary XGB refit done:", X.shape)

    return xgb_tmp


# ---------- RUN THIS ONCE ----------
# 你必须有 training_all（就是你从 epl-training.csv 读的那个全量 df）
# last_old_date 你前面已经算过
xgb_model_old = ensure_xgb_for_expand(
    xgb_model_candidate=(xgb_model_old if "xgb_model_old" in globals() else None),
    feature_cols_xgb=feature_cols_xgb,
    training_all_df=training_all,      # ✅ 用你已经读入的 training_all
    last_old_date=last_old_date,       # ✅ 用你已经算出的 last_old_date
    fe=fe,                             # ✅ 用当前 fe（内部会 deepcopy）
)


✅ xgb_model_old is already fitted


In [58]:
def _softmax_np(logits):
    x = logits - logits.max(axis=1, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=1, keepdims=True)
def apply_tf_standardization(df_seq, mu_seq, sd_seq, mu_match, sd_match):
    mu_seq = np.asarray(mu_seq, dtype=np.float32)
    sd_seq = np.asarray(sd_seq, dtype=np.float32)
    mu_match = np.asarray(mu_match, dtype=np.float32)
    sd_match = np.asarray(sd_match, dtype=np.float32)

    sd_seq_safe = sd_seq.copy()
    sd_seq_safe[sd_seq_safe < 1e-6] = 1.0
    sd_match_safe = sd_match.copy()
    sd_match_safe[sd_match_safe < 1e-6] = 1.0

    def _norm_seq(x):
        x = np.nan_to_num(np.asarray(x, dtype=np.float32), nan=0.0, posinf=0.0, neginf=0.0)
        return (x - mu_seq) / sd_seq_safe

    def _norm_match(x):
        x = np.nan_to_num(np.asarray(x, dtype=np.float32), nan=0.0, posinf=0.0, neginf=0.0)
        return (x - mu_match) / sd_match_safe

    df_seq["home_form_seq"] = df_seq["home_form_seq"].apply(_norm_seq)
    df_seq["away_form_seq"] = df_seq["away_form_seq"].apply(_norm_seq)
    df_seq["match_features"] = df_seq["match_features"].apply(_norm_match)
    return df_seq

def predict_tf_on_df(model, df_seq):
    # 不用 EnhancedMatchDataset（它要 y）
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    home_seq = np.stack(df_seq["home_form_seq"].values).astype(np.float32)
    away_seq = np.stack(df_seq["away_form_seq"].values).astype(np.float32)
    match_feat = np.stack(df_seq["match_features"].values).astype(np.float32)

    home_t = torch.tensor(home_seq, dtype=torch.float32).to(DEVICE)
    away_t = torch.tensor(away_seq, dtype=torch.float32).to(DEVICE)
    mf_t   = torch.tensor(match_feat, dtype=torch.float32).to(DEVICE)

    model = model.to(DEVICE)
    model.eval()
    with torch.no_grad():
        logits = model(home_t, away_t, mf_t).detach().cpu().numpy()
    proba = _softmax_np(logits).astype(np.float32)
    return proba

In [59]:
# =====================================================
# Load OOF checkpoint (STRICT alignment)
# =====================================================
with open(OOF_CKPT_PATH, "rb") as f:
    ckpt = pickle.load(f)

meta_df_ckpt = ckpt["meta_df"].copy()
oof_xgb = ckpt["oof_xgb"]
oof_dc  = ckpt["oof_dc"]
oof_tf  = ckpt["oof_tf"]

assert len(meta_df_ckpt) == oof_xgb.shape[0], "❌ meta_df_ckpt / oof mismatch"

# =====================================================
# Load TF checkpoint
# =====================================================
(
    model_tf_loaded,
    all_token_features_loaded,
    mu_seq_loaded,
    sd_seq_loaded,
    mu_match_loaded,
    sd_match_loaded,
) = load_enhanced_tf_checkpoint()

# =====================================================
# OLD OOF PART
# =====================================================
mask_ok_old = ~(
    np.isnan(oof_xgb).any(axis=1) |
    np.isnan(oof_dc).any(axis=1)  |
    np.isnan(oof_tf).any(axis=1)
)

meta_oof_old = meta_df_ckpt.loc[mask_ok_old].reset_index(drop=True)
meta_oof_old = _ensure_y(meta_oof_old)

y_old = meta_oof_old["y"].astype(int).values
season_old = meta_oof_old["Season"].values

oof_xgb_ok = _sanitize_proba(oof_xgb[mask_ok_old])
oof_dc_ok  = _sanitize_proba(oof_dc[mask_ok_old])
oof_tf_ok  = _sanitize_proba(oof_tf[mask_ok_old])

X_base_old = np.hstack([oof_xgb_ok, oof_dc_ok, oof_tf_ok]).astype(np.float32)

# ---- pD_special (OLD)
if draw_model_full is not None and draw_feature_cols:
    meta_oof_old = _ensure_cols(meta_oof_old, draw_feature_cols, fill=0.0)
    X_draw_old = meta_oof_old[draw_feature_cols].fillna(0.0).to_numpy(np.float32)
    pD_special_old = _sanitize_p(
        draw_model_full.predict_proba(X_draw_old)[:, 1], default=0.25
    )
else:
    pD_special_old = _sanitize_p(
        (oof_xgb_ok[:,1] + oof_dc_ok[:,1] + oof_tf_ok[:,1]) / 3.0, default=0.25
    )

X_meta_old = np.hstack([X_base_old, pD_special_old[:, None]]).astype(np.float32)

# =====================================================
# NEW DATA PART
# =====================================================
meta_new = _ensure_y(new_results_fe).sort_values("Date").reset_index(drop=True)

# ---- XGB
meta_new = _ensure_cols(meta_new, feature_cols_xgb, fill=0.0)
X_new_xgb = meta_new[feature_cols_xgb].fillna(0.0).to_numpy(np.float32)
proba_new_xgb = _sanitize_proba(xgb_model_old.predict_proba(X_new_xgb))

# ---- DC
if dc_scaler is not None and dc_feature_cols and beta_h_s is not None:
    meta_new = _ensure_cols(meta_new, dc_feature_cols, fill=0.0)
    X_dc = dc_scaler.transform(meta_new[dc_feature_cols].fillna(0.0))
    try:
        proba_new_dc, _ = bayes_dc_predict_full(
            meta_new, X_dc,
            attack_s, defense_s, home_adv_s, rho_s,
            beta_h_s, beta_a_s, tmap
        )
        proba_new_dc = _sanitize_proba(proba_new_dc)
    except Exception as e:
        print("[EXPAND] ⚠️ DC failed → uniform", e)
        proba_new_dc = np.full_like(proba_new_xgb, 1/3)
else:
    proba_new_dc = np.full_like(proba_new_xgb, 1/3)

# ---- TF (FIXED & SAFE)
def tf_predict_from_ckpt(df_feat):
    df_seq, _ = build_team_sequences_fixed(
        df_feat.copy(), all_token_features_loaded, seq_len=5
    )
    df_seq = apply_tf_standardization(
        df_seq,
        mu_seq_loaded, sd_seq_loaded,
        mu_match_loaded, sd_match_loaded
    )
    return predict_tf_on_df(model_tf_loaded, df_seq)

if TORCH_OK and model_tf_loaded is not None:
    try:
        proba_new_tf = _sanitize_proba(tf_predict_from_ckpt(meta_new))
    except Exception as e:
        print("[EXPAND] ⚠️ TF failed → uniform", e)
        proba_new_tf = np.full_like(proba_new_xgb, 1/3)
else:
    proba_new_tf = np.full_like(proba_new_xgb, 1/3)

# ---- pD_special (NEW)
if draw_model_full is not None and draw_feature_cols:
    meta_new = _ensure_cols(meta_new, draw_feature_cols, fill=0.0)
    X_draw_new = meta_new[draw_feature_cols].fillna(0.0).to_numpy(np.float32)
    pD_special_new = _sanitize_p(
        draw_model_full.predict_proba(X_draw_new)[:, 1], default=0.25
    )
else:
    pD_special_new = _sanitize_p(
        (proba_new_xgb[:,1] + proba_new_dc[:,1] + proba_new_tf[:,1]) / 3.0, default=0.25
    )

X_meta_new = np.hstack([
    proba_new_xgb,
    proba_new_dc,
    proba_new_tf,
    pD_special_new[:, None]
]).astype(np.float32)

y_new = meta_new["y"].astype(int).values
season_new = meta_new["Season"].values

# =====================================================
# MERGE + CLEAN
# =====================================================
X_meta_all = np.vstack([X_meta_old, X_meta_new])
y_all = np.concatenate([y_old, y_new])
season_all = np.concatenate([season_old, season_new])

ok = np.isfinite(X_meta_all).all(axis=1)
X_meta_all = X_meta_all[ok]
y_all = y_all[ok]
season_all = season_all[ok]

print("[EXPAND] all:", X_meta_all.shape, y_all.shape)

# =====================================================
# FINAL two-stage REFIT + alpha/tau
# =====================================================
cw_draw = 2.0
stage1_all, stage2_all = fit_two_stage(
    X_meta_all, y_all, cw_draw=cw_draw, C1=0.5, C2=0.5
)
print("[EXPAND] ✅ stage1/stage2 refit done")

seasons_all = sorted(pd.unique(season_all), key=_season_start_year)
alpha_grid = np.linspace(0.5, 1.0, 11)
tau_grid   = np.linspace(0.22, 0.35, 14)

# ---- alpha
alphas = []
for s in seasons_all:
    tr, va = season_all != s, season_all == s
    if tr.sum() == 0 or va.sum() == 0:
        continue
    st1, st2 = fit_two_stage(X_meta_all[tr], y_all[tr], cw_draw=cw_draw)
    pD = st1.predict_proba(X_meta_all[va])[:,1]
    pA = st2.predict_proba(X_meta_all[va])[:,1]
    best = min(alpha_grid, key=lambda a: log_loss(
        y_all[va], make_proba_two_stage_alpha(pD, pA, a), labels=[0,1,2]
    ))
    alphas.append(best)

alpha_final = float(np.mean(alphas)) if alphas else 0.85
print(f"[EXPAND] alpha = {alpha_final:.3f}")

# ---- tau
best_tau, best_f1 = 0.25, -1
for tau in tau_grid:
    f1s = []
    for s in seasons_all:
        tr, va = season_all != s, season_all == s
        if tr.sum() == 0 or va.sum() == 0:
            continue
        st1, st2 = fit_two_stage(X_meta_all[tr], y_all[tr], cw_draw=cw_draw)
        P = make_proba_two_stage_alpha(
            st1.predict_proba(X_meta_all[va])[:,1],
            st2.predict_proba(X_meta_all[va])[:,1],
            alpha_final
        )
        yhat = np.where(P[:,1] > tau, 1, np.where(P[:,2] > P[:,0], 2, 0))
        f1s.append(f1_score(y_all[va], yhat, average="macro"))
    if f1s and np.mean(f1s) > best_f1:
        best_f1, best_tau = np.mean(f1s), tau

print(f"[EXPAND] tau = {best_tau:.3f} (macro-F1={best_f1:.4f})")

# =====================================================
# SAVE
# =====================================================
ms_stack_v2 = {
    "stage1": stage1_all,
    "stage2": stage2_all,
    "alpha": alpha_final,
    "tau": best_tau,
}
if iso_calibrators is not None:
    ms_stack_v2["iso_calibrators"] = iso_calibrators

with open(STACK_V2_PATH, "wb") as f:
    dill.dump(ms_stack_v2, f)

print(f"[EXPAND] ✅ saved -> {STACK_V2_PATH}")


Sequence feature number: 101
Sequence feature : ['form_home', 'form_away', 'form_diff', 'form_home_v2', 'form_away_v2', 'form_diff_v2', 'home_home_form', 'away_away_form', 'win_streak_home', 'win_streak_away']...
[EXPAND] all: (9220, 10) (9220,)
[EXPAND] ✅ stage1/stage2 refit done
[EXPAND] alpha = 0.629
[EXPAND] tau = 0.270 (macro-F1=0.4421)
[EXPAND] ✅ saved -> epl_model_state_stack_v2.pkl


In [60]:
bad = set(feature_cols_xgb + dc_feature_cols) & {
    "home_form_seq", "away_form_seq", "match_features"
}
print("BAD overlap with TF seq cols:", bad)


BAD overlap with TF seq cols: set()


In [61]:
def check_seq_integrity(df, name):
    cols = ["home_form_seq","away_form_seq","match_features"]
    print(f"\n[{name}] rows={len(df)} seasons={sorted(df['Season'].unique())[-3:]}")
    for c in cols:
        ok = df[c].apply(lambda x: isinstance(x, np.ndarray)).mean() if c in df.columns else 0
        print(f"  {c}: exists={c in df.columns}, ndarray_ratio={ok:.3f}")
        if c in df.columns:
            bad = df[~df[c].apply(lambda x: isinstance(x, np.ndarray))][c].head(5).tolist()
            if bad:
                print("    examples of bad values:", bad)

check_seq_integrity(train_df, "train_df")
check_seq_integrity(val_df,   "val_df")
check_seq_integrity(test_df,  "test_df")



[train_df] rows=7980 seasons=['2018/2019', '2019/2020', '2020/2021']
  home_form_seq: exists=True, ndarray_ratio=1.000
  away_form_seq: exists=True, ndarray_ratio=1.000
  match_features: exists=True, ndarray_ratio=1.000

[val_df] rows=380 seasons=['2021/2022']
  home_form_seq: exists=True, ndarray_ratio=1.000
  away_form_seq: exists=True, ndarray_ratio=1.000
  match_features: exists=True, ndarray_ratio=1.000

[test_df] rows=1140 seasons=['2022/2023', '2023/2024', '2024/2025']
  home_form_seq: exists=True, ndarray_ratio=1.000
  away_form_seq: exists=True, ndarray_ratio=1.000
  match_features: exists=True, ndarray_ratio=1.000


In [62]:
# ==========================================================
# EXPANDING OOF for target seasons (e.g. test seasons)
#   - meta_df_all = train+val+test
#   - for each season s: train = all seasons < s, val = season==s
#   - fit XGB/DC/TF on train_fold, predict on val_fold, write OOF
#   - TF fold uses standardize_sequences(train_fold, val_fold) BEFORE training/infer
# ==========================================================

import numpy as np
import pandas as pd

# -------- you choose which seasons to OOF ----------
target_oof_seasons = ["2022/2023", "2023/2024", "2024/2025"]  # change if needed

# -------- build meta_df_all ----------
meta_df_all = pd.concat([train_df, val_df, test_df], ignore_index=True)
meta_df_all = meta_df_all.sort_values("Date").reset_index(drop=True)

# ensure _orig_idx for OOF write-back
meta_df_all["_orig_idx"] = np.arange(len(meta_df_all))
orig_to_pos = {i:i for i in range(len(meta_df_all))}

# -------- utilities ----------
def _ensure_cols(df: pd.DataFrame, cols, fill=0.0):
    df = df.copy()
    for c in cols:
        if c not in df.columns:
            df[c] = fill
    return df

def _sanitize_seq_cell(x, seq_len, feat_dim):
    # x should be ndarray (seq_len, feat_dim)
    if isinstance(x, np.ndarray):
        arr = x.astype(np.float32, copy=False)
        if arr.ndim == 2 and arr.shape == (seq_len, feat_dim):
            return np.nan_to_num(arr, nan=0.0, posinf=0.0, neginf=0.0)
    # fallback
    return np.zeros((seq_len, feat_dim), dtype=np.float32)

def _sanitize_match_cell(x, match_dim):
    # x should be ndarray (match_dim,)
    if isinstance(x, np.ndarray):
        arr = x.astype(np.float32, copy=False)
        if arr.ndim == 1 and arr.shape[0] == match_dim:
            return np.nan_to_num(arr, nan=0.0, posinf=0.0, neginf=0.0)
    return np.zeros((match_dim,), dtype=np.float32)

def _infer_seq_shapes(df):
    # find first valid row to infer dims
    for v in df["home_form_seq"].values:
        if isinstance(v, np.ndarray) and v.ndim == 2:
            return int(v.shape[0]), int(v.shape[1])
    raise ValueError("Cannot infer TF seq shape: no ndarray found in home_form_seq.")

def _infer_match_dim(df):
    for v in df["match_features"].values:
        if isinstance(v, np.ndarray) and v.ndim == 1:
            return int(v.shape[0])
    raise ValueError("Cannot infer match_features dim: no ndarray found.")

def _safe_predict_tf(model, df_fold):
    # assume df_fold already standardized and has TF seq cols
    return predict_tf_fold(model, df_fold)

# -------- check TF seq cols exist if TF is enabled ----------
TF_SEQ_COLS = ["home_form_seq", "away_form_seq", "match_features"]
if TORCH_OK and DO_TF_OOF:
    missing = [c for c in TF_SEQ_COLS if c not in meta_df_all.columns]
    if missing:
        raise ValueError(
            f"❌ meta_df_all missing TF seq cols {missing}. "
            "Run your build_tf_sequences_from_cleaned(...) join step BEFORE this cell."
        )

# -------- allocate OOF arrays for ALL rows (we'll only fill targets) ----------
N = len(meta_df_all)
oof_xgb = np.full((N, 3), np.nan, dtype=np.float32)
oof_dc  = np.full((N, 3), np.nan, dtype=np.float32)
oof_tf  = np.full((N, 3), np.nan, dtype=np.float32)

# -------- run expanding OOF on selected seasons ----------
for s in target_oof_seasons:
    # expanding: train uses strictly earlier seasons
    s_year = season_start_year(s)
    train_fold = meta_df_all[meta_df_all["Season"].apply(season_start_year) < s_year].copy()
    val_fold   = meta_df_all[meta_df_all["Season"] == s].copy()

    if len(train_fold) == 0 or len(val_fold) == 0:
        print(f"[OOF] skip {s}: train={len(train_fold)} val={len(val_fold)}")
        continue

    print(f"\n[OOF] season={s}  train<= {train_fold['Season'].max()}  n_train={len(train_fold)}  n_val={len(val_fold)}")

    # ---------------- XGB ----------------
    train_fold = _ensure_cols(train_fold, feature_cols_xgb, fill=0.0)
    val_fold   = _ensure_cols(val_fold,   feature_cols_xgb, fill=0.0)

    xgb_fold = fit_xgb_fold(train_fold)   # your existing function
    Xv = val_fold[feature_cols_xgb].fillna(0.0).to_numpy(np.float32)
    proba_xgb = xgb_fold.predict_proba(Xv).astype(np.float32)

    # ---------------- DC ----------------
    # (you can switch to your proper fold training or global posterior fallback)
    try:
        if DO_DC_OOF_PROPER:
            dc_state = fit_dc_fold_proper(train_fold, draws=600, tune=300)
            proba_dc = predict_dc_fold_proper(dc_state, val_fold).astype(np.float32)
        else:
            raise RuntimeError("DO_DC_OOF_PROPER=False")
    except Exception as e:
        print(f"  [OOF] DC fallback to global posterior: {e}")
        val_fold = _ensure_cols(val_fold, dc_feature_cols, fill=0.0)
        X_dc = dc_scaler.transform(val_fold[dc_feature_cols].copy().fillna(0.0))
        proba_dc, _ = bayes_dc_predict_full(
            val_fold, X_dc,
            attack_s, defense_s, home_adv_s, rho_s,
            beta_h_s, beta_a_s, tmap
        )
        proba_dc = np.asarray(proba_dc, dtype=np.float32)

    # ---------------- TF ----------------
    if TORCH_OK and DO_TF_OOF:
        # infer dims & sanitize seq cells (prevents float/None causing astype crash)
        seq_len, feat_dim = _infer_seq_shapes(train_fold)
        match_dim = _infer_match_dim(train_fold)

        for col in ["home_form_seq", "away_form_seq"]:
            train_fold[col] = train_fold[col].apply(lambda x: _sanitize_seq_cell(x, seq_len, feat_dim))
            val_fold[col]   = val_fold[col].apply(lambda x: _sanitize_seq_cell(x, seq_len, feat_dim))
        train_fold["match_features"] = train_fold["match_features"].apply(lambda x: _sanitize_match_cell(x, match_dim))
        val_fold["match_features"]   = val_fold["match_features"].apply(lambda x: _sanitize_match_cell(x, match_dim))

        # IMPORTANT: standardize using TRAIN stats only
        train_fold_std, val_fold_std, _, *_ = standardize_sequences(train_fold.copy(), val_fold.copy(), None)

        tf_fold = fit_tf_fold(train_fold_std, val_fold_std, epochs=25, patience=6)  # your function
        proba_tf = _safe_predict_tf(tf_fold, val_fold_std).astype(np.float32)
    else:
        proba_tf = np.full_like(proba_xgb, 1/3, dtype=np.float32)

    # ---------------- write OOF back ----------------
    for oid, px, pd_, pt_ in zip(val_fold["_orig_idx"].values, proba_xgb, proba_dc, proba_tf):
        pos = orig_to_pos[int(oid)]
        oof_xgb[pos] = px
        oof_dc[pos]  = pd_
        oof_tf[pos]  = pt_

    print(f"[OOF] wrote season {s}: filled rows={len(val_fold)}")

# -------- quick sanity ----------
mask_done = ~np.isnan(oof_xgb).any(1)
print("\n[OOF] filled rows:", int(mask_done.sum()), "/", N)
print("[OOF] filled seasons:", sorted(meta_df_all.loc[mask_done, "Season"].unique(), key=season_start_year))



[OOF] season=2022/2023  train<= 2021/2022  n_train=8360  n_val=380


Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 2 jobs)
NUTS: [sigma_att, sigma_def, att_offset, def_offset, home_adv, rho_raw]
Sampling 4 chains for 300 tune and 600 draw iterations (1_200 + 2_400 draws total) took 92 seconds.


[TF fold] class_counts=[3836 2085 2439], class_weights={0: np.float64(0.7264511644073688), 1: np.float64(1.3365307753796962), 2: np.float64(1.1425447587809212)}
  home_seq range: [-47.39, 47.38]
  away_seq range: [-47.39, 47.38]
  match_feat range: [-5.58, 5.60]
  home_seq range: [-9.97, 7.09]
  away_seq range: [-9.97, 7.09]
  match_feat range: [-5.18, 5.60]
[TF fold] seq_len=5, feat_dim=120, match_feat_dim=3
Epoch 01 | lr=0.001000 | train_loss=1.0964 | train_acc=0.389
         | val_logloss=1.0417
         | train_pred_dist: H=0.39 D=0.26 A=0.34
         | val_pred_dist:   H=0.37 D=0.40 A=0.23
Epoch 03 | lr=0.001000 | train_loss=1.0835 | train_acc=0.414
         | val_logloss=1.0461
         | train_pred_dist: H=0.38 D=0.26 A=0.36
         | val_pred_dist:   H=0.54 D=0.02 A=0.44
Epoch 06 | lr=0.000500 | train_loss=1.0716 | train_acc=0.423
         | val_logloss=1.0633
         | train_pred_dist: H=0.36 D=0.33 A=0.31
         | val_pred_dist:   H=0.38 D=0.12 A=0.50
Early stopping at ep

Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 2 jobs)
NUTS: [sigma_att, sigma_def, att_offset, def_offset, home_adv, rho_raw]
Sampling 4 chains for 300 tune and 600 draw iterations (1_200 + 2_400 draws total) took 97 seconds.
There were 2 divergences after tuning. Increase `target_accept` or reparameterize.


[TF fold] class_counts=[4020 2172 2548], class_weights={0: np.float64(0.724709784411277), 1: np.float64(1.3413136893799877), 2: np.float64(1.1433804290947147)}
  home_seq range: [-48.34, 48.34]
  away_seq range: [-48.34, 48.34]
  match_feat range: [-5.59, 5.61]
  home_seq range: [-10.07, 7.85]
  away_seq range: [-10.07, 7.85]
  match_feat range: [-3.59, 5.61]
[TF fold] seq_len=5, feat_dim=120, match_feat_dim=3
Epoch 01 | lr=0.001000 | train_loss=1.0954 | train_acc=0.399
         | val_logloss=1.0897
         | train_pred_dist: H=0.38 D=0.26 A=0.36
         | val_pred_dist:   H=0.17 D=0.38 A=0.46
Epoch 03 | lr=0.001000 | train_loss=1.0827 | train_acc=0.418
         | val_logloss=1.0525
         | train_pred_dist: H=0.41 D=0.24 A=0.35
         | val_pred_dist:   H=0.44 D=0.24 A=0.32
Epoch 06 | lr=0.001000 | train_loss=1.0783 | train_acc=0.418
         | val_logloss=1.0354
         | train_pred_dist: H=0.37 D=0.30 A=0.33
         | val_pred_dist:   H=0.49 D=0.02 A=0.49
Epoch 09 | lr=0.001

Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 2 jobs)
NUTS: [sigma_att, sigma_def, att_offset, def_offset, home_adv, rho_raw]
Sampling 4 chains for 300 tune and 600 draw iterations (1_200 + 2_400 draws total) took 103 seconds.


[TF fold] class_counts=[4195 2254 2671], class_weights={0: np.float64(0.7246722288438617), 1: np.float64(1.3487133984028394), 2: np.float64(1.1381505054286785)}
  home_seq range: [-49.23, 49.23]
  away_seq range: [-49.23, 49.23]
  match_feat range: [-5.60, 5.61]
  home_seq range: [-66.94, 66.93]
  away_seq range: [-66.94, 66.93]
  match_feat range: [-5.60, 5.61]
[TF fold] seq_len=5, feat_dim=120, match_feat_dim=3
Epoch 01 | lr=0.001000 | train_loss=1.0947 | train_acc=0.397
         | val_logloss=1.1118
         | train_pred_dist: H=0.38 D=0.28 A=0.34
         | val_pred_dist:   H=0.20 D=0.38 A=0.42
Epoch 03 | lr=0.001000 | train_loss=1.0804 | train_acc=0.426
         | val_logloss=1.0908
         | train_pred_dist: H=0.42 D=0.23 A=0.35
         | val_pred_dist:   H=0.49 D=0.16 A=0.35
Epoch 06 | lr=0.001000 | train_loss=1.0761 | train_acc=0.427
         | val_logloss=1.1303
         | train_pred_dist: H=0.39 D=0.26 A=0.35
         | val_pred_dist:   H=0.26 D=0.14 A=0.60
Epoch 09 | lr=0.

In [63]:
# ===========================
# Safe pack refit_state
# ===========================
def _get(name, default=None):
    return globals().get(name, default)

refit_state = {
    # ---- base models (final refit versions, if you have them) ----
    "xgb_model": _get("xgb_final", _get("xgb_model_old", _get("xgb_model", None))),
    "dc_posterior_refit": _get("dc_final_state", None),          # may be None
    "tf_model_refit": _get("model_tf_final", _get("model_tf_loaded", None)),

    # ---- stack/meta ----
    "stage1": _get("stage1_all_new", _get("stage1_all", None)),
    "stage2": _get("stage2_all_new", _get("stage2_all", None)),
    "alpha":  _get("alpha_final_new", _get("alpha_final", None)),
    "tau":    _get("tau_final_new", _get("tau_final", None)),
    "iso_calibrators": _get("iso_calibrators", None),

    # ---- feature/schema ----
    "feature_cols": _get("feature_cols_xgb", []),
    "dc_feature_cols": _get("dc_feature_cols", []),
    "dc_scaler": _get("dc_scaler", None),

    "tf_token_features": _get("tf_token_features", _get("all_token_features_loaded", [])) \
        if isinstance(_get("tf_token_features", _get("all_token_features_loaded", [])), (list, tuple)) else [],

    "draw_feature_cols": _get("draw_feature_cols", []) if isinstance(_get("draw_feature_cols", []), (list, tuple)) else [],
    "draw_model_full": _get("draw_model_full", None),
    "fe": _get("fe", None),

    # ---- TF standardization params if available ----
    "tf_norm": {
        "mu_seq": _get("mu_seq_loaded", None),
        "sd_seq": _get("sd_seq_loaded", None),
        "mu_match": _get("mu_match_loaded", None),
        "sd_match": _get("sd_match_loaded", None),
    },

    # ---- global DC posterior snapshot (if exists) ----
    "dc_posterior_global": {
        "attack": _get("attack_s", None),
        "defense": _get("defense_s", None),
        "home_adv": _get("home_adv_s", None),
        "rho": _get("rho_s", None),
        "beta_h_s": _get("beta_h_s", None),
        "beta_a_s": _get("beta_a_s", None),
        "tmap": _get("tmap", None),
        "teams": _get("teams", None),
    },

    "refit_info": {
        "type": "STACK_V2_PLUS_FINAL_REFIT",
        "time_decay_tau": _get("TIME_DECAY_TAU", None),
        "draw_boost": _get("DRAW_CLASS_MULT", None),
        "target_oof_seasons": _get("target_oof_seasons", None),
    }
}

# choose output path safely
REFIT_OUT_SAFE = _get("REFIT_OUT", "refit_state.pkl")

with open(REFIT_OUT_SAFE, "wb") as f:
    dill.dump(refit_state, f)

print(f"✅ Saved production refit model as {REFIT_OUT_SAFE}")


✅ Saved production refit model as epl_model_state_refit.pkl


In [64]:
print("\n" + "="*60)
print("")
print("="*60)
print(" Elo :  ()")
print(" Dixon-Coles OOF:  fold  posterior")
print(" : H2H ")
print(" : ")
print(" : /")
print(" : lambda=0.00325 (~tau~308d) + draw boost x3")
print(" : draw threshold grid 0.22-0.35 (target~0.25)")
print(" Isotonic : " if USE_ISOTONIC else " Isotonic : ")
print(f"\n: {len(feature_cols_xgb)}")
print(f" alpha: {alpha_final:.3f}")
print(f" tau: {tau_final:.3f}")




 Elo :  ()
 Dixon-Coles OOF:  fold  posterior
 : H2H 
 : 
 : /
 : lambda=0.00325 (~tau~308d) + draw boost x3
 : draw threshold grid 0.22-0.35 (target~0.25)
 Isotonic : 

: 120
 alpha: 0.629
 tau: 0.269


## 7. Final predictions on test set

### Output result

In [73]:
import numpy as np

def predict_with_threshold_v2(
    proba,
    tau=0.269,
    draw_bias=0.0,
    ha_margin=0.06,        # ✅ 调小：让像热刺这种(差0.052)更容易进②从而保持A
    draw_margin=0.03,
    ha_close=0.03,         # ✅ 新增：H/A 很接近(<=0.03)时更偏向给D
    tau_soft=0.21          # ✅ 新增：兜底给D时，pD至少要到这个水平
):
    """
    目标：
    - ① 若 D 接近最大且 >= tau -> D（强D）
    - ② 若 H/A 差距明显 -> 选更大者（强方向）
    - ③ 否则：若 H/A 非常接近 且 pD 不太低 -> D（弱D）
    - ④ 否则：回到 H/A（谁大选谁）
    """
    pH, pD, pA = proba[:, 0], proba[:, 1], proba[:, 2]
    pD = pD + draw_bias

    preds = np.zeros(len(proba), dtype=int)

    for i in range(len(proba)):
        h, d, a = float(pH[i]), float(pD[i]), float(pA[i])
        best = max(h, d, a)

        # ① 平局几乎是最可能（或非常接近）且过硬阈值
        if (d >= best - draw_margin) and (d >= tau):
            preds[i] = 1
            continue

        # ② H/A 有明显优势
        if abs(h - a) >= ha_margin:
            preds[i] = 0 if h > a else 2
            continue

        # ③ 兜底：H/A 很接近 + d 不太低 -> 给 D
        if (abs(h - a) <= ha_close) and (d >= tau_soft):
            preds[i] = 1
        else:
            preds[i] = 0 if h > a else 2

    return preds


In [74]:

print("Load the pre-trained model state")

with open("epl_model_state.pkl", "rb") as f:
    ms = dill.load(f)
    
model_tf = ms.get("tf_model", None)
print(f" Transformer model (from pkl): {model_tf is not None}")
fe = ms["fe"]

new_results = pd.read_csv("new_results.csv")
new_results["Date"] = pd.to_datetime(new_results["Date"], dayfirst=True)

new_results["Season"] = new_results["Date"].apply(season_label)
#  fe form
new_results_fe, fe = compute_all_features(
    new_results,
    fe=fe,
    is_train=True, 
    use_state_features=True,
    use_all_adv_block=True,
    use_shot_corners=True,
    use_td_h2h=True,
    use_draw_block=True,
)

#  fe
ms["fe"] = fe
with open("epl_model_state.pkl", "wb") as f:
    dill.dump(ms, f)

print(f"The FeatureEngineering status has been updated using {len(new_results)} new matches.")

# Base models
xgb_final        = ms["xgb_model"]
stage1_all       = ms["stage1"]
stage2_all       = ms["stage2"]
alpha_final      = ms["alpha"]
tau_final        = ms["tau"]
feature_cols_xgb = ms["feature_cols"]
iso_calibrators  = ms.get("iso_calibrators", None)

# Draw specialist
draw_model_full   = ms.get("draw_model_full", None)
draw_feature_cols = ms.get("draw_feature_cols", [])

# DC Correlated
dc_feature_cols = ms.get("dc_feature_cols", [])
dc_scaler       = ms.get("dc_scaler", None)
beta_h_s        = ms.get("beta_h_s", None)
beta_a_s        = ms.get("beta_a_s", None)

fe = ms.get("fe", None)
if fe is None:
    print("If fe is not in pkl, fe in memory will be used (if it exists).")
    if "fe" not in globals():
        raise ValueError(" The feature is neither in pkl nor in memory. Please run feature engineering first!")
else:
    print(" FeatureEngineering has been loaded from pkl")

tf_token_features_from_pkl = ms.get("tf_token_features", [])
if len(tf_token_features_from_pkl) > 0:
    tf_token_features = tf_token_features_from_pkl
    print(f" tf_token_features loaded from pkl: {len(tf_token_features)} features")
elif "tf_token_features" in globals():
    print(f" Using tf_token_features in memory: {len(tf_token_features)} features")
else:
    tf_token_features = []
    print(" tf_token_features unavailable")
    
# DC posterior
dc_post = ms.get("dc_posterior", None)
if dc_post is not None:
    attack_s   = dc_post["attack"]
    defense_s  = dc_post["defense"]
    home_adv_s = dc_post["home_adv"]
    rho_s      = dc_post["rho"]
    tmap       = dc_post["tmap"]
    teams      = dc_post["teams"]

print(f"\n--- loading status ---")
print(f" XGB: {xgb_final is not None}, number of features: {len(feature_cols_xgb)}")
print(f" Draw specialist: {draw_model_full is not None}, number of features: {len(draw_feature_cols)}")
print(f" DC posterior: {dc_post is not None}, number of features: {len(dc_feature_cols)}")
print(f" DC scaler: {dc_scaler is not None}")
print(f" Beta_h/Beta_a: {beta_h_s is not None}")
print(f" FeatureEngineering: {fe is not None}")
print(f" TF features: {len(tf_token_features)}")
print(f" Transformer model (in memory): {'model_tf' in globals() and model_tf is not None}")


fixtures = pd.read_csv("epl-test.csv")
fixtures["Date"] = pd.to_datetime(fixtures["Date"], dayfirst=True, errors="coerce")
fixtures["Season"] = fixtures["Date"].apply(season_label)

result_cols = [
    "FTR", "FTHG", "FTAG", "HTHG", "HTAG",
    "HS", "AS", "HST", "AST", "HC", "AC", 
    "HF", "AF", "HY", "AY", "HR", "AR", "Referee",
]
for c in result_cols:
    if c not in fixtures.columns:
        fixtures[c] = np.nan

fixtures = fixtures.sort_values(["Date", "HomeTeam", "AwayTeam"]).reset_index(drop=True)
print(f"length of epl-test: {len(fixtures)}")
display(fixtures[["Date", "HomeTeam", "AwayTeam"]].head(10))

###################################################
# feature engineering
###################################################

fixtures_fe, _ = compute_all_features(
    fixtures,
    fe=fe,
    is_train=False,
    use_state_features=True,
    use_all_adv_block=True,
    use_shot_corners=True,
    use_td_h2h=True,
    use_draw_block=True,
)
print(f"Feature engineering done. Shape: {fixtures_fe.shape}")

###################################################
# XGB forecast
###################################################

X_fix_xgb = pd.DataFrame(index=fixtures_fe.index)
for col in feature_cols_xgb:
    if col in fixtures_fe.columns:
        X_fix_xgb[col] = fixtures_fe[col]
    else:
        X_fix_xgb[col] = 0.0

if "train_means" in globals():
    for col in feature_cols_xgb:
        if col in train_means.index:
            X_fix_xgb[col] = X_fix_xgb[col].fillna(train_means[col])
        else:
            X_fix_xgb[col] = X_fix_xgb[col].fillna(0.0)
else:
    X_fix_xgb = X_fix_xgb.fillna(0.0)

proba_fix_xgb = xgb_final.predict_proba(X_fix_xgb.values.astype(np.float32))
print(f" XGB proba shape: {proba_fix_xgb.shape}")

###################################################
# Draw specialist
###################################################

if draw_model_full is not None and len(draw_feature_cols) > 0:
    X_draw_fix = pd.DataFrame(index=fixtures_fe.index)
    for col in draw_feature_cols:
        if col in fixtures_fe.columns:
            X_draw_fix[col] = fixtures_fe[col]
        else:
            X_draw_fix[col] = 0.0
    
    X_draw_fix = X_draw_fix[draw_feature_cols].fillna(0.0)
    pD_fix_special = draw_model_full.predict_proba(X_draw_fix.values.astype(np.float32))[:, 1]
    print(f" Draw specialist pD_special, number of features = {len(draw_feature_cols)}")

elif draw_model_full is not None:
    model_draw_cols = list(getattr(draw_model_full, "feature_names_in_", []))
    if len(model_draw_cols) > 0:
        X_draw_fix = fixtures_fe[model_draw_cols].fillna(0.0)
        pD_fix_special = draw_model_full.predict_proba(X_draw_fix.values.astype(np.float32))[:, 1]
        print(f" Draw specialist (from model.feature_names_in_)")
    else:
        pD_fix_special = proba_fix_xgb[:, 1]
        print(" draw_model_full has no feature name, fallback: pD = XGB pD")
else:
    pD_fix_special = proba_fix_xgb[:, 1]
    print("draw_model_full not loaded, fallback: pD = XGB pD")

###################################################
# Bayes Dixon–Coles
###################################################

if dc_post is not None and dc_scaler is not None and beta_h_s is not None and len(dc_feature_cols) > 0:
    try:
        X_dc_fix_raw = pd.DataFrame(index=fixtures_fe.index)
        for col in dc_feature_cols:
            if col in fixtures_fe.columns:
                X_dc_fix_raw[col] = fixtures_fe[col]
            else:
                X_dc_fix_raw[col] = 0.0
        
        X_dc_fix = dc_scaler.transform(X_dc_fix_raw[dc_feature_cols].fillna(0.0))
        
        proba_fix_dc, _ = bayes_dc_predict_full(
            fixtures_fe,
            X_dc_fix,
            attack_s, defense_s,
            home_adv_s, rho_s,
            beta_h_s, beta_a_s,
            tmap,
        )
        print(f" DC proba shape: {proba_fix_dc.shape}")
    except Exception as e:
        print("DC prediction fail")
        proba_fix_dc = np.full_like(proba_fix_xgb, 1/3)
else:
    print("The DC-related variables are incomplete; use a uniform 1/3.")
    proba_fix_dc = np.full_like(proba_fix_xgb, 1/3)

###################################################
# Transformer forecast
###################################################

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if "model_tf" in globals() and model_tf is not None and len(tf_token_features) > 0:
    try:
        expected_feat_dim = model_tf.input_proj[0].in_features
        print(f" Expected input dimension of the model: {expected_feat_dim}")
        print(f" Current number of tf_token_features: {len(tf_token_features)}")
        
        name_map = {f: str(f).lower() for f in tf_token_features}
        safe_features = []
        for f in tf_token_features:
            name = name_map[f]
            if f not in fixtures_fe.columns:
                continue
            if (
                ("pm" in name) or
                ("form" in name) or
                ("elo" in name) or
                ("position" in name) or
                ("points" in name) or
                ("l10" in name) or
                ("win_streak" in name) or
                ("unbeaten" in name) or
                ("draw" in name) or
                ("xg" in name) or
                ("h2h" in name) or
                ("rest" in name) or
                ("momentum" in name) or
                ("attack" in name) or
                ("defense" in name)
            ):
                safe_features.append(f)
        
        print(f" Number of safe_features after filtering: {len(safe_features)}")
        
        if len(safe_features) != expected_feat_dim:
            print(f" Feature count mismatch ({len(safe_features)} vs {expected_feat_dim}), truncating the first {expected_feat_dim} features")
            safe_features = safe_features[:expected_feat_dim]
        
        fixtures_with_seq, FEAT_DIM = build_team_sequences_fixed(
            fixtures_fe,
            safe_features,
            seq_len=10
        )
        
        print(f"️ Actual FEAT_DIM constructed: {FEAT_DIM}")
        
        home_seq = np.stack(fixtures_with_seq["home_form_seq"].values).astype(np.float32)
        away_seq = np.stack(fixtures_with_seq["away_form_seq"].values).astype(np.float32)
        match_feat = np.stack(fixtures_with_seq["match_features"].values).astype(np.float32)
        
        home_seq = np.nan_to_num(home_seq, nan=0.0, posinf=0.0, neginf=0.0)
        away_seq = np.nan_to_num(away_seq, nan=0.0, posinf=0.0, neginf=0.0)
        match_feat = np.nan_to_num(match_feat, nan=0.0, posinf=0.0, neginf=0.0)
        
        print(f" home_seq shape: {home_seq.shape}")
        
        home_seq_t = torch.tensor(home_seq, dtype=torch.float32).to(DEVICE)
        away_seq_t = torch.tensor(away_seq, dtype=torch.float32).to(DEVICE)
        match_feat_t = torch.tensor(match_feat, dtype=torch.float32).to(DEVICE)
        
        model_tf.eval()
        with torch.no_grad():
            logits = model_tf(home_seq_t, away_seq_t, match_feat_t)
            proba_fix_tf = torch.softmax(logits, dim=1).cpu().numpy()
        
        print(f" Transformer proba shape: {proba_fix_tf.shape}")
        
    except Exception as e:
        print(f"Transformer fail to predict: {e}")
        traceback.print_exc()
        proba_fix_tf = np.full_like(proba_fix_xgb, 1/3)
else:
    missing = []
    if "model_tf" not in globals() or model_tf is None:
        missing.append("model_tf")
    if len(tf_token_features) == 0:
        missing.append("tf_token_features")
    print(f" Transformer missing: {missing}, using even 1/3 placeholders")
    proba_fix_tf = np.full_like(proba_fix_xgb, 1/3)

###################################################
# build meta Feature
###################################################

X_base_fix = np.hstack([proba_fix_xgb, proba_fix_dc, proba_fix_tf])
X_meta_fix = np.hstack([X_base_fix, pD_fix_special.reshape(-1, 1)])
print(f"Meta feature shape: {X_meta_fix.shape}")

###################################################
#  Logistic
###################################################

p_draw_fix = stage1_all.predict_proba(X_meta_fix)[:, 1]
p_away_fix = stage2_all.predict_proba(X_meta_fix)[:, 1]

proba_fix_base = make_proba_two_stage_alpha(p_draw_fix, p_away_fix, alpha_final)

###################################################
# Isotonic Calibration
###################################################

if iso_calibrators is not None:
    proba_fix_cal = proba_fix_base.copy()
    for k, iso in enumerate(iso_calibrators):
        if iso is not None:
            proba_fix_cal[:, k] = iso.predict(proba_fix_base[:, k])
    proba_fix_cal = np.clip(proba_fix_cal, 1e-7, 1)
    proba_fix_cal /= proba_fix_cal.sum(axis=1, keepdims=True)
    print(" using Isotonic calibration")
else:
    proba_fix_cal = proba_fix_base
    print("!!! Isotonic calibration is not applied")

###################################################
# 
###################################################

y_pred_fix = predict_with_threshold_v2(proba_fix_cal, tau=tau_final, draw_bias=0.0)
idx2label = {0: "H", 1: "D", 2: "A"}
pred_label = [idx2label[int(i)] for i in y_pred_fix]

Load the pre-trained model state
 Transformer model (from pkl): True
Added 99 features
corner, foul, yellow/red cards history features are added
Added advanced H2H features (time-decayed & directional)
 Added extended draw-specific features
The FeatureEngineering status has been updated using 159 new matches.
 FeatureEngineering has been loaded from pkl
 tf_token_features loaded from pkl: 120 features

--- loading status ---
 XGB: True, number of features: 120
 Draw specialist: True, number of features: 24
 DC posterior: True, number of features: 12
 DC scaler: True
 Beta_h/Beta_a: True
 FeatureEngineering: True
 TF features: 120
 Transformer model (in memory): True
length of epl-test: 10


Unnamed: 0,Date,HomeTeam,AwayTeam
0,2026-01-31,Aston Villa,Brentford
1,2026-01-31,Brighton,Everton
2,2026-01-31,Chelsea,West Ham
3,2026-01-31,Leeds,Arsenal
4,2026-01-31,Liverpool,Newcastle
5,2026-01-31,Man United,Fulham
6,2026-01-31,Nottingham Forest,Crystal Palace
7,2026-01-31,Sunderland,Burnley
8,2026-01-31,Tottenham,Man City
9,2026-01-31,Wolves,Bournemouth


Added 98 features
corner, foul, yellow/red cards history features are added
Added advanced H2H features (time-decayed & directional)
 Added extended draw-specific features
Feature engineering done. Shape: (10, 161)
 XGB proba shape: (10, 3)
 Draw specialist pD_special, number of features = 24
 DC proba shape: (10, 3)
 Expected input dimension of the model: 101
 Current number of tf_token_features: 120
 Number of safe_features after filtering: 101
Sequence feature number: 101
Sequence feature : ['form_home', 'form_away', 'form_diff', 'form_home_v2', 'form_away_v2', 'form_diff_v2', 'home_home_form', 'away_away_form', 'win_streak_home', 'win_streak_away']...
️ Actual FEAT_DIM constructed: 101
 home_seq shape: (10, 10, 101)
 Transformer proba shape: (10, 3)
Meta feature shape: (10, 10)
 using Isotonic calibration


In [75]:
# ====== FIX: 用 _orig_idx 对齐概率，而不是按行号 ======

# 1️⃣ 给 fixtures 一个永久 id（只基于原始顺序）
fixtures = fixtures.copy()
fixtures["_orig_idx"] = fixtures.index

# 2️⃣ 用生成 proba 时对应的 index 构建概率表
df_proba = pd.DataFrame({
    "_orig_idx": fixtures_fe.index,   # ⚠️ 关键：和 proba_fix_cal 一一对应
    "pH": proba_fix_cal[:, 0],
    "pD": proba_fix_cal[:, 1],
    "pA": proba_fix_cal[:, 2],
    "Pred": pred_label
})

# 3️⃣ merge 回 fixtures（不会错位）
fixtures_fix = fixtures.merge(df_proba, on="_orig_idx", how="left")

# 4️⃣ 构造最终 result（从 merge 后的 fixtures_fix 来）
result = fixtures_fix[['Date', 'HomeTeam', 'AwayTeam', 'pH', 'pD', 'pA', 'Pred']].copy()

# 日期格式（保留你原来的逻辑）
result['Date'] = pd.to_datetime(result['Date']).dt.strftime('%Y-%m-%d')
result['Date'] = " " + result['Date'].astype(str)


print(f"\n==== FINAL PREDICTIONS (tau={tau_final:.3f}, alpha={alpha_final:.3f}) ====")
print(f"Predicted distribution: H={sum(y_pred_fix==0)}, D={sum(y_pred_fix==1)}, A={sum(y_pred_fix==2)}")

# result
display(result[["Date", "HomeTeam", "AwayTeam", "pH", "pD", "pA", "Pred"]])

#  CSV
result.to_csv("epl_predictions.csv", index=False)
print("result has been save to epl_predictions.csv")


==== FINAL PREDICTIONS (tau=0.269, alpha=0.931) ====
Predicted distribution: H=6, D=1, A=3


Unnamed: 0,Date,HomeTeam,AwayTeam,pH,pD,pA,Pred
0,2026-01-31,Aston Villa,Brentford,0.51259,0.215082,0.272328,H
1,2026-01-31,Brighton,Everton,0.418683,0.214227,0.36709,H
2,2026-01-31,Chelsea,West Ham,0.544295,0.212034,0.243672,H
3,2026-01-31,Leeds,Arsenal,0.316292,0.212552,0.471157,A
4,2026-01-31,Liverpool,Newcastle,0.450454,0.230483,0.319063,H
5,2026-01-31,Man United,Fulham,0.471021,0.229174,0.299805,H
6,2026-01-31,Nottingham Forest,Crystal Palace,0.369329,0.215982,0.414688,A
7,2026-01-31,Sunderland,Burnley,0.544295,0.212034,0.243672,H
8,2026-01-31,Tottenham,Man City,0.353012,0.22157,0.425418,A
9,2026-01-31,Wolves,Bournemouth,0.385593,0.210412,0.403994,D


result has been save to epl_predictions.csv
