In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, precision_score
from imblearn.over_sampling import SMOTE
import os
import warnings

# Suppress warnings
warnings.filterwarnings('ignore')

print("--- DeepBallonNet: Ballon d'Or Deep Learning Pipeline ---")

# ==============================================================================
# 1. HELPER FUNCTIONS & SETUP
# ==============================================================================
def engineer_features(df):
    df_featured = df.copy()
    # Handle case sensitivity
    if 'UCL_Progress' in df_featured.columns:
        df_featured.rename(columns={'UCL_Progress': 'UCL_progress'}, inplace=True)
        
    trophy_score = (df_featured.get('Rk_team', 0) == 1).astype(int) * 2
    if 'UCL_progress' in df_featured.columns:
        trophy_score += (df_featured['UCL_progress'] == 'W').astype(int) * 3
        trophy_score += (df_featured['UCL_progress'] == 'F').astype(int) * 1
    df_featured['Trophy_Impact_Score'] = trophy_score

    df_featured['Big_Game_Score'] = (df_featured.get('Gls_league', 0) * 1.0) + \
                                    (df_featured.get('Ast_league', 0) * 0.5) + \
                                    (df_featured.get('Gls_ucl', 0) * 2.0) + \
                                    (df_featured.get('Ast_ucl', 0) * 1.0)
    
    df_featured['Dominance_Ratio'] = df_featured.get('Gls_league', 0) / df_featured.get('GF', 1).replace(0, 1)
    return df_featured

# ==============================================================================
# 2. LOAD & PREPARE HISTORICAL TRAINING DATA
# ==============================================================================
try:
    df = pd.read_csv('../data/master_dataset_2011-2025.csv')
    print("‚úÖ Historical data loaded.")
    # Rename columns to standard names for training
    rename_map = {'xG': 'xG_player', 'xAG': 'xAG_player', 'UCL_Progress': 'UCL_progress'}
    df.rename(columns=rename_map, inplace=True)
except FileNotFoundError:
    print("‚ùå Error: Master dataset not found.")
    raise

# Create Target
ballon_dor_history = {
    '2024-2025': ['Ousmane Dembele', 'Lamine Yamal', 'Vitinha', 'Raphinha', 'Mohammed Salah', 'Kylian Mbappe', 'Achraf Hakimi', 'Desire Doue', 'Kvicha Kvaratskhelia', 'Nuno Mendes'],
    '2023-2024': ['Rodri', 'Vin√≠cius J√∫nior', 'Jude Bellingham', 'Dani Carvajal', 'Lautaro Martinez', 'Toni Kroos', 'Kylian Mbapp√©', 'Harry Kane', 'Phil Foden', 'Lamine Yamal'],
    '2022-2023': ['Lionel Messi', 'Erling Haaland', 'Kylian Mbapp√©', 'Kevin De Bruyne', 'Rodri', 'Vin√≠cius J√∫nior', 'Juli√°n √Ålvarez', 'Victor Osimhen', 'Bernardo Silva', 'Luka Modriƒá'],
    '2021-2022': ['Karim Benzema', 'Sadio Man√©', 'Kevin De Bruyne', 'Robert Lewandowski', 'Mohamed Salah', 'Kylian Mbapp√©', 'Thibaut Courtois', 'Vin√≠cius J√∫nior', 'Luka Modriƒá', 'Erling Haaland'],
    '2020-2021': ['Lionel Messi', 'Robert Lewandowski', 'Jorginho', 'Karim Benzema', 'N\'Golo Kant√©', 'Cristiano Ronaldo', 'Mohamed Salah', 'Kevin De Bruyne', 'Kylian Mbapp√©', 'Gianluigi Donnarumma'],
    '2018-2019': ['Lionel Messi', 'Virgil van Dijk', 'Cristiano Ronaldo', 'Sadio Man√©', 'Mohamed Salah', 'Kylian Mbapp√©', 'Alisson', 'Robert Lewandowski', 'Bernardo Silva', 'Riyad Mahrez'],
    '2017-2018': ['Luka Modriƒá', 'Cristiano Ronaldo', 'Antoine Griezmann', 'Kylian Mbapp√©', 'Lionel Messi', 'Mohamed Salah', 'Rapha√´l Varane', 'Eden Hazard', 'Kevin De Bruyne', 'Harry Kane'],
    '2016-2017': ['Cristiano Ronaldo', 'Lionel Messi', 'Neymar', 'Gianluigi Buffon', 'Luka Modriƒá', 'Sergio Ramos', 'Kylian Mbapp√©', 'N\'Golo Kant√©', 'Robert Lewandowski', 'Harry Kane'],
    '2015-2016': ['Cristiano Ronaldo', 'Lionel Messi', 'Antoine Griezmann', 'Luis Su√°rez', 'Neymar', 'Gareth Bale', 'Riyad Mahrez', 'Jamie Vardy', 'Gianluigi Buffon', 'Pepe'],
    '2014-2015': ['Lionel Messi', 'Cristiano Ronaldo', 'Neymar', 'Robert Lewandowski', 'Luis Su√°rez', 'Thomas M√ºller', 'Manuel Neuer', 'Eden Hazard', 'Andr√©s Iniesta', 'Alexis S√°nchez'],
    '2013-2014': ['Cristiano Ronaldo', 'Lionel Messi', 'Manuel Neuer', 'Arjen Robben', 'Thomas M√ºller', 'Philipp Lahm', 'Neymar', 'James Rodr√≠guez', 'Toni Kroos', '√Ångel Di Mar√≠a'],
    '2012-2013': ['Cristiano Ronaldo', 'Lionel Messi', 'Franck Rib√©ry', 'Zlatan Ibrahimoviƒá', 'Neymar', 'Andr√©s Iniesta', 'Robin van Persie', 'Arjen Robben', 'Gareth Bale', 'Andrea Pirlo'],
    '2011-2012': ['Lionel Messi', 'Cristiano Ronaldo', 'Andr√©s Iniesta', 'Xavi', 'Radamel Falcao', 'Iker Casillas', 'Andrea Pirlo', 'Didier Drogba', 'Robin van Persie', 'Zlatan Ibrahimoviƒá'],
    '2010-2011': ['Lionel Messi', 'Cristiano Ronaldo', 'Xavi', 'Andr√©s Iniesta', 'Wayne Rooney', 'Luis Su√°rez', 'Diego Forl√°n', 'Samuel Eto\'o', 'Iker Casillas', 'Neymar']
}

df['Top_Candidate'] = 0
for season, players in ballon_dor_history.items():
    df.loc[(df['Season'] == season) & (df['Player'].isin(players)), 'Top_Candidate'] = 1

# Feature Engineering
df = engineer_features(df)
progress_mapping = {'W': 1, 'F': 2, 'SF': 3, 'QF': 4, 'R16': 5, 'GR': 6, 'Did Not Qualify': 7}
df['UCL_Progress_Rank'] = df['UCL_progress'].str.strip().map(progress_mapping).fillna(7)

features = ['Age', 'Min_league', 'Gls_league', 'Ast_league', 'xG_player', 'xAG_player', 'Gls_ucl', 'Ast_ucl', 'Min_ucl', 'Rk_team', 'Pts', 'UCL_Progress_Rank', 'Trophy_Impact_Score', 'Big_Game_Score', 'Dominance_Ratio']
X = df[features].fillna(0)
y = df['Top_Candidate']

# Split, Balance (SMOTE), and Scale
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_res)
X_test_scaled = scaler.transform(X_test)

X_train_t = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_t = torch.tensor(y_train_res.values, dtype=torch.float32).unsqueeze(1)
X_test_t = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_t = torch.tensor(y_test.values, dtype=torch.float32).unsqueeze(1) # Needed for evaluation

# ==============================================================================
# 3. TRAIN THE NEURAL NETWORK
# ==============================================================================
class PrecisionNet(nn.Module):
    def __init__(self, input_size):
        super(PrecisionNet, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_size, 32),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(32, 1)
        )
    def forward(self, x): return self.model(x)

model = PrecisionNet(X_train.shape[1])
pos_weight = torch.tensor([100.0]) 
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
optimizer = optim.Adam(model.parameters(), lr=0.0005, weight_decay=0.01)

print("Training Neural Network...")
for epoch in range(150):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train_t)
    loss = criterion(outputs, y_train_t)
    loss.backward()
    optimizer.step()
print("‚úÖ Model Trained.")

# Find Best Threshold
model.eval()
with torch.no_grad():
    test_probs = torch.sigmoid(model(X_test_t)).numpy()
best_prec, best_thresh = 0, 0.5
for thresh in np.arange(0.5, 0.99, 0.01):
    preds = (test_probs > thresh).astype(int)
    prec = precision_score(y_test, preds, zero_division=0)
    if prec > best_prec: best_prec, best_thresh = prec, thresh
print(f"üèÜ Best Threshold: {best_thresh:.2f}")
print("\n--- Deep Learning Model Evaluation ---")
print(classification_report(y_test, (test_probs > best_thresh).astype(int), target_names=['Not Candidate', 'Top Candidate']))


# ==============================================================================
# 4. PREDICT 2026 WINNER (FROM MASTER FILE)
# ==============================================================================
print("\n--- Predicting 2026 Ballon d'Or Candidates... ---")
try:
    # --- LOAD MASTER DATASET DIRECTLY ---
    master_df_2026 = pd.read_csv('../data/master_dataset_2026.csv')
    print("‚úÖ 2026 Master Dataset loaded successfully.")

    bdo_df = master_df_2026.copy()
    
    # Rename columns to match training data exactly
    rename_map_2026 = {'xG': 'xG_player', 'xAG': 'xAG_player', 'Rk': 'Rk_team', 'Pts': 'Pts', 'UCL_Progress': 'UCL_progress'}
    bdo_df.rename(columns=rename_map_2026, inplace=True, errors='ignore')

    # Handle duplicates (safety check)
    bdo_df = bdo_df.loc[:,~bdo_df.columns.duplicated()]

    # Feature Engineering
    bdo_df = engineer_features(bdo_df)
    progress_mapping = {'W': 1, 'F': 2, 'SF': 3, 'QF': 4, 'R16': 5, 'GR': 6, 'League Phase': 6, 'Did Not Qualify': 7}
    if 'UCL_progress' in bdo_df.columns:
        bdo_df['UCL_Progress_Rank'] = bdo_df['UCL_progress'].str.strip().map(progress_mapping).fillna(7)
    else:
        bdo_df['UCL_Progress_Rank'] = 7

    # Select & Scale
    for col in features:
        if col not in bdo_df.columns: bdo_df[col] = 0
        # If duplicate columns exist, take the first one
        if isinstance(bdo_df[col], pd.DataFrame):
            bdo_df[col] = bdo_df[col].iloc[:, 0]
        bdo_df[col] = pd.to_numeric(bdo_df[col], errors='coerce')
    
    X_live = bdo_df[features].fillna(0)
    X_live_scaled = scaler.transform(X_live)
    X_live_tensor = torch.tensor(X_live_scaled, dtype=torch.float32)

    # Predict
    model.eval()
    with torch.no_grad():
        logits = model(X_live_tensor)
        probs = torch.sigmoid(logits).numpy().flatten()
    
    bdo_df['BallonDor_Probability'] = probs
    
    print("Top 10 Candidates:")
    # Sort and drop player duplicates to show unique best candidates
    bdo_display = bdo_df.sort_values(by='BallonDor_Probability', ascending=False).drop_duplicates(subset=['Player'])
    display(bdo_display[['Player', 'Squad', 'Gls_league', 'Gls_ucl', 'BallonDor_Probability']].head(10))

except Exception as e:
    print(f"Error during prediction: {e}")

--- DeepBallonNet: Ballon d'Or Deep Learning Pipeline ---
‚úÖ Historical data loaded.
Training Neural Network...
‚úÖ Model Trained.
üèÜ Best Threshold: 0.86

--- Deep Learning Model Evaluation ---
               precision    recall  f1-score   support

Not Candidate       1.00      1.00      1.00      8297
Top Candidate       1.00      0.08      0.15        12

     accuracy                           1.00      8309
    macro avg       1.00      0.54      0.58      8309
 weighted avg       1.00      1.00      1.00      8309


--- Predicting 2026 Ballon d'Or Candidates... ---
‚úÖ 2026 Master Dataset loaded successfully.
Top 10 Candidates:


Unnamed: 0,Player,Squad,Gls_league,Gls_ucl,BallonDor_Probability
1861,Pierre-Emerick Aubameyang,Marseille,4,1.0,0.999899
1070,√Ålex Grimaldo,Leverkusen,4,1.0,0.999895
746,Kylian Mbapp√©,Real Madrid,13,5.0,0.99989
163,Erling Haaland,Manchester City,14,5.0,0.999866
807,Nicolas P√©p√©,Villarreal,2,0.0,0.999852
1130,Joshua Kimmich,Bayern Munich,1,0.0,0.999838
2261,Vitinha,Paris S-G,1,1.0,0.999838
1149,Konrad Laimer,Bayern Munich,1,0.0,0.999828
799,Aitor Paredes,Athletic Club,1,0.0,0.999817
2117,Nuno Mendes,Paris S-G,2,2.0,0.999816


In [13]:
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import os
import warnings

warnings.filterwarnings('ignore')

print("--- Training Elite High-Precision UCL Model ---")

# ==============================================================================
# 1. LOAD & PREPARE HISTORICAL DATA
# ==============================================================================
try:
    historical_df = pd.read_csv('../data/master_dataset_2011-2025.csv')
    rename_map = {'xG': 'xG_player', 'xAG': 'xAG_player', 'UCL_Progress': 'UCL_progress'}
    historical_df.rename(columns=rename_map, inplace=True)
except FileNotFoundError: raise

# --- Feature Engineering Function (Robust) ---
def engineer_elite_features(df):
    df = df.copy()
    league_weights = {'Premier League': 1.0, 'La Liga': 0.95, 'Bundesliga': 0.85, 'Serie A': 0.85, 'Ligue 1': 0.75}
    
    # Handle missing 'League' column gracefully
    if 'League' in df.columns:
        df['League_Weight'] = df['League'].map(league_weights).fillna(0.7)
        df['Is_Big_5'] = df['League'].isin(league_weights.keys()).astype(int)
    else:
        # Default weight if League is missing (assumes reasonably strong teams)
        df['League_Weight'] = 0.85 
        df['Is_Big_5'] = 1 

    df['MP_team'] = df['MP_team'].replace(0, 1)
    df['Adj_Pts_Per_Game'] = (df['Pts'] / df['MP_team']) * df['League_Weight']
    df['Adj_GD_Per_Game'] = (df['GD'] / df['MP_team']) * df['League_Weight']
    
    # Use aggregated column names directly
    df['Squad_Goals'] = df.get('Agg_Gls_league', 0)
    df['Squad_xG'] = df.get('Agg_xG', 0)
    
    return df

# Prepare Data
ucl_df = historical_df[historical_df['UCL_progress'] != 'Did Not Qualify'].copy()
ucl_df['UCL_Winner'] = np.where(ucl_df['UCL_progress'] == 'W', 1, 0)

# Aggregate player stats
player_agg = historical_df.groupby(['Squad', 'Season'])[['Gls_league', 'xG_player']].sum().reset_index().rename(columns={
    'Gls_league': 'Agg_Gls_league', 
    'xG_player': 'Agg_xG'
})
ucl_df = pd.merge(ucl_df, player_agg, on=['Squad', 'Season'], how='left')
ucl_df = engineer_elite_features(ucl_df)
team_level_df = ucl_df.drop_duplicates(subset=['Squad', 'Season'], keep='first').copy()

# Define Features
features = ['Adj_Pts_Per_Game', 'Adj_GD_Per_Game', 'Is_Big_5', 'Squad_Goals', 'Squad_xG']
for col in features:
    if col not in team_level_df.columns: team_level_df[col] = 0

X = team_level_df[features].fillna(0)
y = team_level_df['UCL_Winner']

# Split & Scale
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
scaler = StandardScaler()
X_train_sc = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns)
X_test_sc = pd.DataFrame(scaler.transform(X_test), columns=X.columns)

# Balance
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_sc, y_train)

# ==============================================================================
# 2. TRAINING (Elite Ensemble)
# ==============================================================================
print("Training Elite Ensemble...")
clf1 = xgb.XGBClassifier(n_estimators=200, learning_rate=0.05, max_depth=5, random_state=42)
clf2 = lgb.LGBMClassifier(n_estimators=200, learning_rate=0.05, num_leaves=31, verbose=-1, random_state=42)
clf3 = CatBoostClassifier(iterations=200, learning_rate=0.05, depth=5, verbose=0, random_seed=42)

ucl_model = VotingClassifier(estimators=[('xgb', clf1), ('lgbm', clf2), ('cat', clf3)], voting='soft')
ucl_model.fit(X_train_res, y_train_res)
print("‚úÖ Model Trained.")

# Optimal Threshold
probs = ucl_model.predict_proba(X_test_sc)[:, 1]
best_thresh, best_f1 = 0.5, 0
for t in np.arange(0.1, 0.9, 0.05):
    preds = (probs >= t).astype(int)
    score = f1_score(y_test, preds)
    if score > best_f1: best_f1, best_thresh = score, t

print(f"üèÜ Optimal Threshold: {best_thresh:.2f}")
print("\n--- Elite Model Report ---")
print(classification_report(y_test, (probs >= best_thresh).astype(int), target_names=['Not Winner', 'Winner']))

# ==============================================================================
# 3. PREDICT 2026
# ==============================================================================
print("\n--- 2026 UCL Winner Prediction ---")
try:
    # Load 2026 data
    d_p = pd.read_csv('../data/combined_player_stats_2026.csv')
    d_l = pd.read_csv('../data/combined_league_standings_2026.csv')
    d_up = pd.read_csv('../data/ucl_team_progress_2026.csv')
    d_us = pd.read_csv('../data/ucl_player_stats_2026.csv')
    
    current_season = '2025-2026'
    for d in [d_p, d_l, d_up, d_us]: 
        d['Season']=current_season; d.columns=d.columns.str.strip()
        if 'Squad' in d.columns: d['Squad']=d['Squad'].str.strip().replace({'Paris S-G':'Paris Saint-Germain','Inter':'Internazionale','Manchester Utd':'Manchester United','Leverkusen':'Bayer Leverkusen'})

    m_k = ['Squad', 'Season']
    if 'League' in d_p.columns and 'League' in d_l.columns: m_k.append('League')
    df_26 = pd.merge(d_p, d_l, on=m_k, how='left', suffixes=('_player', '_team'))
    df_26 = pd.merge(df_26, d_us[['Player','Squad','Season']], on=['Player','Squad','Season'], how='left')
    df_26 = pd.merge(df_26, d_up, on=['Squad','Season'], how='left')
    if 'UCL_Progress' in df_26.columns: df_26.rename(columns={'UCL_Progress':'UCL_progress'}, inplace=True)
    df_26['UCL_progress'].fillna('Did Not Qualify', inplace=True)
    
    ucl_26 = df_26[df_26['UCL_progress'] != 'Did Not Qualify'].copy()
    rename_26 = {'Gls':'Gls_league', 'xG':'xG_player', 'Pts':'Pts', 'MP':'MP_team', 'W':'W', 'GD':'GD', 'Rk':'Rk_team'}
    ucl_26.rename(columns=rename_26, inplace=True, errors='ignore')
    
    # Aggregation
    p_agg = ucl_26.groupby(['Squad', 'Season'])[['Gls_league', 'xG_player']].sum().reset_index().rename(columns={'Gls_league': 'Agg_Gls_league', 'xG_player': 'Agg_xG'})
    ucl_26 = pd.merge(ucl_26, p_agg, on=['Squad', 'Season'], how='left')
    ucl_26 = ucl_26.drop_duplicates(subset=['Squad'])
    
    # Feature Engineering (Now Safe against missing 'League' column)
    ucl_26 = engineer_elite_features(ucl_26)
    
    # Select Best Features & Scale
    for col in features:
        if col not in ucl_26.columns: ucl_26[col] = 0
    
    X_live = ucl_26[features].fillna(0)
    # Use the same scaler from training!
    X_live_sc = pd.DataFrame(scaler.transform(X_live), columns=features)
    
    # Predict
    ucl_26['Win_Prob'] = ucl_model.predict_proba(X_live_sc)[:, 1]
    print("Top 10 Contenders:")
    cols = ['Squad', 'Win_Prob']
    if 'League' in ucl_26.columns: cols.insert(1, 'League')
    display(ucl_26[cols].sort_values(by='Win_Prob', ascending=False).head(10))

except Exception as e: print(f"Prediction Error: {e}")

--- Training Elite High-Precision UCL Model ---
Training Elite Ensemble...
‚úÖ Model Trained.
üèÜ Optimal Threshold: 0.40

--- Elite Model Report ---
              precision    recall  f1-score   support

  Not Winner       0.98      0.89      0.93        45
      Winner       0.29      0.67      0.40         3

    accuracy                           0.88        48
   macro avg       0.63      0.78      0.67        48
weighted avg       0.93      0.88      0.90        48


--- 2026 UCL Winner Prediction ---
Top 10 Contenders:


Unnamed: 0,Squad,Win_Prob
217,Bayern Munich,0.171574
390,Paris Saint-Germain,0.034413
382,Marseille,0.015267
113,Barcelona,0.012749
12,Arsenal,0.010828
4,Liverpool,0.01066
294,Napoli,0.009711
204,Dortmund,0.009545
110,Real Madrid,0.006659
290,Internazionale,0.004316


In [15]:
# --- Cell: Train and Evaluate UCL Ensemble Model ---
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
import warnings

warnings.filterwarnings('ignore')

print("--- Training UCL Ensemble Model ---")

# 1. Load Data
try:
    df = pd.read_csv('../data/master_dataset_2011-2025.csv')
    rename_map = {'xG': 'xG_player', 'xAG': 'xAG_player', 'UCL_Progress': 'UCL_progress'}
    df.rename(columns=rename_map, inplace=True)
except FileNotFoundError:
    raise Exception("Data not found!")

# 2. Feature Engineering (Elite UCL)
def engineer_ucl_features(df):
    df = df.copy()
    df['MP_team'] = df['MP_team'].replace(0, 1)
    df['Pts_Per_Game'] = df['Pts'] / df['MP_team']
    df['Goal_Diff_Per_Game'] = df['GD'] / df['MP_team']
    df['Win_Rate'] = df['W'] / df['MP_team']
    df['Dominance_Score'] = (df['Win_Rate'] * 0.7) + (df['Goal_Diff_Per_Game'] * 0.3)
    df['League_Pedigree'] = 1 / df['Rk_team'].replace(0, 20)
    return df

# 3. Prepare Team-Level Data
ucl_df = df[df['UCL_progress'] != 'Did Not Qualify'].copy()
ucl_df['UCL_Winner'] = np.where(ucl_df['UCL_progress'] == 'W', 1, 0)

player_agg = df.groupby(['Squad', 'Season'])[['Gls_league', 'Ast_league', 'xG_player']].sum().reset_index().rename(columns={'Gls_league': 'Squad_Goals', 'Ast_league': 'Squad_Ast', 'xG_player': 'Squad_xG'})
ucl_df = pd.merge(ucl_df, player_agg, on=['Squad', 'Season'], how='left')

ucl_df = engineer_ucl_features(ucl_df)
team_level_df = ucl_df.drop_duplicates(subset=['Squad', 'Season'], keep='first').copy()

features_ucl = ['Pts_Per_Game', 'Goal_Diff_Per_Game', 'Win_Rate', 'Dominance_Score', 'League_Pedigree', 'Squad_Goals', 'Squad_xG', 'xG_team']
# Handle missing xG_team for old seasons
if 'xG_team' not in team_level_df.columns: team_level_df['xG_team'] = 0

X = team_level_df[features_ucl].fillna(0)
y = team_level_df['UCL_Winner']

# Split & Scale
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
scaler = MinMaxScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X.columns)

# Balance
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)

# 4. Train Ensemble
clf1 = xgb.XGBClassifier(n_estimators=200, learning_rate=0.05, max_depth=5, random_state=42, eval_metric='logloss')
clf2 = lgb.LGBMClassifier(n_estimators=200, learning_rate=0.05, num_leaves=31, verbose=-1, random_state=42)
clf3 = CatBoostClassifier(iterations=200, learning_rate=0.05, depth=5, verbose=0, random_seed=42)

ensemble_model = VotingClassifier(estimators=[('xgb', clf1), ('lgbm', clf2), ('cat', clf3)], voting='soft')
ensemble_model.fit(X_train_res, y_train_res)
print("‚úÖ Ensemble Model Trained.")

# 5. Run Advanced Evaluation (Using your function)
if 'evaluate_model_advanced' in locals():
    print("\n>>> Evaluating UCL Winner Model...")
    evaluate_model_advanced(ensemble_model, X_test_scaled, y_test, "Ensemble")
    calculate_top_k_proxy(ensemble_model, X_test_scaled, y_test, "Ensemble")
else:
    print("‚ö†Ô∏è evaluation functions not found. Please run the previous cell containing 'evaluate_model_advanced'.")

--- Training UCL Ensemble Model ---
‚úÖ Ensemble Model Trained.

>>> Evaluating UCL Winner Model...

--- Advanced Evaluation: Ensemble ---
üèÜ Optimal Threshold: 0.2140
   Max F1-Score: 0.1429
   Precision at Optimal: 0.0909
   Recall at Optimal:    0.3333

üìä Average Rank of True Winners in Test Set (Ensemble): 28.3


In [16]:
import pandas as pd
import numpy as np
import torch
from sklearn.metrics import precision_recall_curve, f1_score

# --- 1. Define the Advanced Evaluation Function (FIXED) ---
def evaluate_model_advanced(model, X_test, y_test, model_type="Deep Learning"):
    print(f"\n--- Advanced Evaluation: {model_type} ---")
    
    # Get Probabilities
    # FIX: Check if "Deep Learning" is IN the string, not just equal to it
    if "Deep Learning" in model_type:
        model.eval()
        with torch.no_grad():
            if not isinstance(X_test, torch.Tensor):
                X_test = torch.tensor(X_test, dtype=torch.float32)
            # Forward pass + Sigmoid
            probs = torch.sigmoid(model(X_test)).numpy().flatten()
            
            if isinstance(y_test, torch.Tensor):
                y_true = y_test.numpy().flatten()
            else:
                y_true = y_test
    else: # Ensemble / XGBoost
        probs = model.predict_proba(X_test)[:, 1]
        y_true = y_test

    # 2. Find Optimal Threshold (Maximize F1)
    # Handle NaNs in y_true (just in case)
    mask = ~np.isnan(y_true)
    y_true = y_true[mask]
    probs = probs[mask]
    
    precisions, recalls, thresholds = precision_recall_curve(y_true, probs)
    
    with np.errstate(divide='ignore', invalid='ignore'):
        f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
    f1_scores = np.nan_to_num(f1_scores)
    
    best_idx = np.argmax(f1_scores)
    # Safety check for index bounds
    if best_idx < len(thresholds):
        best_thresh = thresholds[best_idx]
    else:
        best_thresh = 0.5
        
    best_f1 = f1_scores[best_idx]
    
    print(f"üèÜ Optimal Threshold: {best_thresh:.4f}")
    print(f"   Max F1-Score: {best_f1:.4f}")
    print(f"   Precision at Optimal: {precisions[best_idx]:.4f}")
    print(f"   Recall at Optimal:    {recalls[best_idx]:.4f}")

    return best_thresh

# ==============================================================================
# 2. RUN ADVANCED EVALUATION (with Reconstructed Data)
# ==============================================================================

# Note: We assume the data reconstruction part from the previous cell ran successfully
# and X_test_b_tensor, y_test_b_tensor, X_test_u_scaled, y_test_u are available.

if 'model' in locals():
    # Fix: String matching is now handled inside the function
    best_thresh_bdo = evaluate_model_advanced(model, X_test_b_tensor, y_test_b, "Deep Learning (Ballon d'Or)")
else:
    print("‚ùå Error: 'model' (Deep Learning) not found in memory.")

if 'ensemble_model' in locals():
    best_thresh_ucl = evaluate_model_advanced(ensemble_model, X_test_u_scaled, y_test_u, "Ensemble (UCL)")
else:
    print("‚ùå Error: 'ensemble_model' not found in memory.")


# --- 3. Top-K Accuracy Proxy ---
def calculate_top_k_proxy(model, X, y, model_type="Deep Learning"):
    if model_type == "Deep Learning":
        model.eval()
        with torch.no_grad():
             if not isinstance(X, torch.Tensor): X = torch.tensor(X, dtype=torch.float32)
             probs = torch.sigmoid(model(X)).numpy().flatten()
    else:
        probs = model.predict_proba(X)[:, 1]
        
    results = pd.DataFrame({'Actual': y, 'Prob': probs})
    winners = results[results['Actual'] == 1]
    
    if not winners.empty:
        results['Rank'] = results['Prob'].rank(ascending=False)
        avg_winner_rank = results[results['Actual'] == 1]['Rank'].mean()
        print(f"\nüìä Average Rank of True Winners in Test Set ({model_type}): {avg_winner_rank:.1f}")
    else:
        print(f"\n‚ö†Ô∏è No winners found in test set for {model_type}.")

if 'model' in locals(): calculate_top_k_proxy(model, X_test_b_tensor, y_test_b, "Deep Learning")
if 'ensemble_model' in locals(): calculate_top_k_proxy(ensemble_model, X_test_u_scaled, y_test_u, "Ensemble")


--- Advanced Evaluation: Deep Learning (Ballon d'Or) ---
üèÜ Optimal Threshold: 0.7485
   Max F1-Score: 0.2857
   Precision at Optimal: 0.3333
   Recall at Optimal:    0.2500

--- Advanced Evaluation: Ensemble (UCL) ---
üèÜ Optimal Threshold: 0.0140
   Max F1-Score: 0.1290
   Precision at Optimal: 0.0714
   Recall at Optimal:    0.6667

üìä Average Rank of True Winners in Test Set (Deep Learning): 171.2

üìä Average Rank of True Winners in Test Set (Ensemble): 30.3
