In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, precision_score, recall_score
import os
import warnings

# Suppress warnings
warnings.filterwarnings('ignore')

print("--- DeepBallonNet: Unified Deep Learning Training & Prediction Pipeline ---")

# ==============================================================================
# 1. LOAD & PREPARE HISTORICAL DATA
# ==============================================================================
try:
    df = pd.read_csv('../data/master_dataset_2011-2025.csv')
    print("‚úÖ Historical data loaded.")
    
    # Standardize column names
    rename_map = {'xG': 'xG_player', 'xAG': 'xAG_player', 'UCL_Progress': 'UCL_progress'}
    df.rename(columns=rename_map, inplace=True)

except FileNotFoundError:
    print("‚ùå Error: Master dataset not found.")
    raise

# --- Feature Engineering Function (Used by both models) ---
def engineer_features(df):
    df = df.copy()
    if 'Rk_team' in df.columns: trophy = (df['Rk_team'] == 1).astype(int) * 2
    else: trophy = 0
    if 'UCL_progress' in df.columns:
        trophy += (df['UCL_progress'] == 'W').astype(int) * 3
        trophy += (df['UCL_progress'] == 'F').astype(int) * 1
    df['Trophy_Impact_Score'] = trophy
    df['Big_Game_Score'] = (df.get('Gls_league', 0) * 1.0) + (df.get('Gls_ucl', 0) * 2.5)
    return df

# --- Create Targets ---
ballon_dor_history = {
    '2024-2025': ['Ousmane Dembele', 'Lamine Yamal', 'Vitinha', 'Raphinha', 'Mohammed Salah', 'Kylian Mbappe', 'Achraf Hakimi', 'Desire Doue', 'Kvicha Kvaratskhelia', 'Nuno Mendes'],
    '2023-2024': ['Rodri', 'Vin√≠cius J√∫nior', 'Jude Bellingham', 'Dani Carvajal', 'Lautaro Martinez', 'Toni Kroos', 'Kylian Mbapp√©', 'Harry Kane', 'Phil Foden', 'Lamine Yamal'],
    '2022-2023': ['Lionel Messi', 'Erling Haaland', 'Kylian Mbapp√©', 'Kevin De Bruyne', 'Rodri', 'Vin√≠cius J√∫nior', 'Juli√°n √Ålvarez', 'Victor Osimhen', 'Bernardo Silva', 'Luka Modriƒá'],
    '2021-2022': ['Karim Benzema', 'Sadio Man√©', 'Kevin De Bruyne', 'Robert Lewandowski', 'Mohamed Salah', 'Kylian Mbapp√©', 'Thibaut Courtois', 'Vin√≠cius J√∫nior', 'Luka Modriƒá', 'Erling Haaland'],
    '2020-2021': ['Lionel Messi', 'Robert Lewandowski', 'Jorginho', 'Karim Benzema', 'N\'Golo Kant√©', 'Cristiano Ronaldo', 'Mohamed Salah', 'Kevin De Bruyne', 'Kylian Mbapp√©', 'Gianluigi Donnarumma'],
    '2018-2019': ['Lionel Messi', 'Virgil van Dijk', 'Cristiano Ronaldo', 'Sadio Man√©', 'Mohamed Salah', 'Kylian Mbapp√©', 'Alisson', 'Robert Lewandowski', 'Bernardo Silva', 'Riyad Mahrez'],
    '2017-2018': ['Luka Modriƒá', 'Cristiano Ronaldo', 'Antoine Griezmann', 'Kylian Mbapp√©', 'Lionel Messi', 'Mohamed Salah', 'Rapha√´l Varane', 'Eden Hazard', 'Kevin De Bruyne', 'Harry Kane'],
    '2016-2017': ['Cristiano Ronaldo', 'Lionel Messi', 'Neymar', 'Gianluigi Buffon', 'Luka Modriƒá', 'Sergio Ramos', 'Kylian Mbapp√©', 'N\'Golo Kant√©', 'Robert Lewandowski', 'Harry Kane'],
    '2015-2016': ['Cristiano Ronaldo', 'Lionel Messi', 'Antoine Griezmann', 'Luis Su√°rez', 'Neymar', 'Gareth Bale', 'Riyad Mahrez', 'Jamie Vardy', 'Gianluigi Buffon', 'Pepe'],
    '2014-2015': ['Lionel Messi', 'Cristiano Ronaldo', 'Neymar', 'Robert Lewandowski', 'Luis Su√°rez', 'Thomas M√ºller', 'Manuel Neuer', 'Eden Hazard', 'Andr√©s Iniesta', 'Alexis S√°nchez'],
    '2013-2014': ['Cristiano Ronaldo', 'Lionel Messi', 'Manuel Neuer', 'Arjen Robben', 'Thomas M√ºller', 'Philipp Lahm', 'Neymar', 'James Rodr√≠guez', 'Toni Kroos', '√Ångel Di Mar√≠a'],
    '2012-2013': ['Cristiano Ronaldo', 'Lionel Messi', 'Franck Rib√©ry', 'Zlatan Ibrahimoviƒá', 'Neymar', 'Andr√©s Iniesta', 'Robin van Persie', 'Arjen Robben', 'Gareth Bale', 'Andrea Pirlo'],
    '2011-2012': ['Lionel Messi', 'Cristiano Ronaldo', 'Andr√©s Iniesta', 'Xavi', 'Radamel Falcao', 'Iker Casillas', 'Andrea Pirlo', 'Didier Drogba', 'Robin van Persie', 'Zlatan Ibrahimoviƒá'],
    '2010-2011': ['Lionel Messi', 'Cristiano Ronaldo', 'Xavi', 'Andr√©s Iniesta', 'Wayne Rooney', 'Luis Su√°rez', 'Diego Forl√°n', 'Samuel Eto\'o', 'Iker Casillas', 'Neymar']
}
df['Top_Candidate'] = 0
for season, players in ballon_dor_history.items():
    df.loc[(df['Season'] == season) & (df['Player'].isin(players)), 'Top_Candidate'] = 1

df = engineer_features(df)
progress_mapping = {'W': 1, 'F': 2, 'SF': 3, 'QF': 4, 'R16': 5, 'GR': 6, 'Did Not Qualify': 7}
df['UCL_Progress_Rank'] = df['UCL_progress'].str.strip().map(progress_mapping).fillna(7)

# --- Define Features and Targets ---
features = ['Age', 'Min_league', 'Gls_league', 'Ast_league', 'xG_player', 'xAG_player', 'Gls_ucl', 'Ast_ucl', 'Min_ucl', 'Rk_team', 'Pts', 'UCL_Progress_Rank', 'Trophy_Impact_Score', 'Big_Game_Score']
X = df[features].fillna(0)
y_bdo = df['Top_Candidate']
y_ucl = np.where(df['UCL_progress'] == 'W', 1, 0) # Target 2: UCL Winner

# --- Split and Scale Data ---
X_train_raw, X_test_raw, y_train_bdo, y_test_bdo, y_train_ucl, y_test_ucl = train_test_split(
    X, y_bdo, y_ucl, test_size=0.2, random_state=42, stratify=y_bdo
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

# Tensors for BDO Model
X_train_bdo_t = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_bdo_t = torch.tensor(y_train_bdo.values, dtype=torch.float32).unsqueeze(1)
X_test_bdo_t = torch.tensor(X_test_scaled, dtype=torch.float32)

# Tensors for UCL Model
X_train_ucl_t = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_ucl_t = torch.tensor(y_train_ucl, dtype=torch.float32).unsqueeze(1)
X_test_ucl_t = torch.tensor(X_test_scaled, dtype=torch.float32)

# ==============================================================================
# 2. TRAIN BALLON D'OR MODEL (High Precision)
# ==============================================================================
print("\n--- Training Ballon d'Or Model (High Precision) ---")
class PrecisionNet(nn.Module):
    def __init__(self, input_size):
        super(PrecisionNet, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_size, 32), nn.ReLU(), nn.Dropout(0.5),
            nn.Linear(32, 1)
        )
    def forward(self, x): return self.model(x)

bdo_model = PrecisionNet(X_train_bdo_t.shape[1])
pos_weight_bdo = torch.tensor([(y_train_bdo == 0).sum() / (y_train_bdo == 1).sum()]) 
criterion_bdo = nn.BCEWithLogitsLoss(pos_weight=pos_weight_bdo)
optimizer_bdo = optim.Adam(bdo_model.parameters(), lr=0.0005, weight_decay=0.01)

for epoch in range(150): # 150 epochs for precision
    bdo_model.train()
    optimizer_bdo.zero_grad()
    loss = criterion_bdo(bdo_model(X_train_bdo_t), y_train_bdo_t)
    loss.backward()
    optimizer_bdo.step()
print("‚úÖ Ballon d'Or Model Trained.")

# Find Best Threshold for BDO Model
bdo_model.eval()
with torch.no_grad():
    test_probs_bdo = torch.sigmoid(bdo_model(X_test_bdo_t)).numpy()
best_prec_bdo = 0
best_thresh_bdo = 0.5
for thresh in np.arange(0.5, 0.99, 0.01):
    preds_bdo = (test_probs_bdo > thresh).astype(int)
    prec_bdo = precision_score(y_test_bdo, preds_bdo, zero_division=0)
    if prec_bdo > best_prec_bdo:
        best_prec_bdo = prec_bdo
        best_thresh_bdo = thresh
print(f"üèÜ BDO Best Threshold: {best_thresh_bdo:.2f} (Precision: {best_prec_bdo:.4f})")


# ==============================================================================
# 3. TRAIN UCL WINNER MODEL
# ==============================================================================
print("\n--- Training UCL Winner Model ---")
class UCLNet(nn.Module):
    def __init__(self, input_size):
        super(UCLNet, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_size, 64), nn.ReLU(), nn.Dropout(0.4),
            nn.Linear(64, 32), nn.ReLU(), nn.Dropout(0.4),
            nn.Linear(32, 1)
        )
    def forward(self, x): return self.model(x)

ucl_model = UCLNet(X_train_ucl_t.shape[1])
pos_weight_ucl = torch.tensor([(y_train_ucl == 0).sum() / (y_train_ucl == 1).sum()])
criterion_ucl = nn.BCEWithLogitsLoss(pos_weight=pos_weight_ucl)
optimizer_ucl = optim.Adam(ucl_model.parameters(), lr=0.001, weight_decay=1e-4)

for epoch in range(100): # 100 epochs for UCL
    ucl_model.train()
    optimizer_ucl.zero_grad()
    loss = criterion_ucl(ucl_model(X_train_ucl_t), y_train_ucl_t)
    loss.backward()
    optimizer_ucl.step()
print("‚úÖ UCL Model Trained.")


# ==============================================================================
# 4. PREDICT 2026 WINNERS (Unified Function)
# ==============================================================================
print("\n\n--- Running Live Predictions for 2025-2026 Season ---")

def clean_squad_names(df):
    if 'Squad' in df.columns:
        df['Squad'] = df['Squad'].astype(str).str.strip()
        df['Squad'] = df['Squad'].apply(lambda x: ' '.join(x.split(' ')[1:]) if len(x.split(' ')) > 1 and x.split(' ')[0] in ['eng', 'es', 'de', 'it', 'fr', 'pt', 'nl'] else x)
        replacements = {'Paris S-G': 'Paris Saint-Germain', 'Inter': 'Internazionale', 'Manchester Utd': 'Manchester United', 'Leverkusen': 'Bayer Leverkusen'}
        df['Squad'] = df['Squad'].replace(replacements)
    return df

try:
    # Load 2026 Data
    data_path = '../data/'
    df_standings = pd.read_csv(os.path.join(data_path, 'combined_league_standings_2026.csv'))
    df_players = pd.read_csv(os.path.join(data_path, 'combined_player_stats_2026.csv'))
    df_ucl_p = pd.read_csv(os.path.join(data_path, 'ucl_player_stats_2026.csv'))
    df_ucl_t = pd.read_csv(os.path.join(data_path, 'ucl_team_progress_2026.csv'))

    # Clean & Merge 2026 Data
    current_season = '2025-2026'
    for d in [df_standings, df_players, df_ucl_p, df_ucl_t]:
        d['Season'] = current_season; d.columns = d.columns.str.strip()
        d = clean_squad_names(d) # Apply squad name cleaning
    
    merge_keys = ['Squad', 'Season']
    if 'League' in df_players.columns and 'League' in df_standings.columns: merge_keys.append('League')
    
    df_2026 = pd.merge(df_players, df_standings, on=merge_keys, how='left', suffixes=('_player', '_team'))
    df_2026 = pd.merge(df_2026, df_ucl_p[['Player', 'Squad', 'Season', 'Gls', 'Ast']], on=['Player', 'Squad', 'Season'], how='left', suffixes=('_league', '_ucl'))
    df_2026 = pd.merge(df_2026, df_ucl_t[['Squad', 'Season', 'UCL_progress']], on=['Squad', 'Season'], how='left')
    
    for c in ['Gls_ucl', 'Ast_ucl']: 
        if c in df_2026.columns: df_2026[c] = df_2026[c].fillna(0)
    df_2026['UCL_progress'].fillna('Did Not Qualify', inplace=True)

    # Rename 2026 columns to match training features
    rename_map_2026 = { 'xG': 'xG_player', 'xAG': 'xAG_player', 'Rk': 'Rk_team', 'Pts': 'Pts', 'Min': 'Min_league', 'Gls': 'Gls_league', 'Ast': 'Ast_league', 'UCL_Progress': 'UCL_progress' }
    df_2026.rename(columns=rename_map_2026, inplace=True, errors='ignore')

    # Engineer 2026 Features
    df_2026 = engineer_features(df_2026)
    df_2026['UCL_Progress_Rank'] = df_2026['UCL_progress'].str.strip().map(progress_mapping).fillna(7)
    for col in features:
        if col not in df_2026.columns: df_2026[col] = 0
        df_2026[col] = pd.to_numeric(df_2026[col], errors='coerce')
    
    X_live = df_2026[features].fillna(0)
    
    # --- Ballon d'Or Prediction ---
    X_live_b_sc = scaler.transform(X_live) # Use BDO scaler
    bdo_model.eval()
    with torch.no_grad():
        df_2026['DL_Ballon_Prob'] = torch.sigmoid(bdo_model(torch.tensor(X_live_b_sc, dtype=torch.float32))).numpy()
    
    print(f"\nüèÜ Top 10 Ballon d'Or Predictions (Deep Learning):")
    display(df_2026.sort_values(by='DL_Ballon_Prob', ascending=False)[['Player', 'Squad', 'Gls_league', 'Gls_ucl', 'DL_Ballon_Prob']].head(10))

    # --- UCL Winner Prediction ---
    ucl_2026 = df_2026[df_2026['UCL_progress'] != 'Did Not Qualify'].copy()
    if not ucl_2026.empty:
        X_live_u = ucl_2026[features].fillna(0)
        X_live_u_sc = scaler.transform(X_live_u) # Use same scaler
        
        ucl_model.eval()
        with torch.no_grad():
            ucl_2026['Player_Win_Prob'] = torch.sigmoid(ucl_model(torch.tensor(X_live_u_sc, dtype=torch.float32))).numpy()
        
        team_probs = ucl_2026.groupby('Squad')['Player_Win_Prob'].mean().sort_values(ascending=False)
        print(f"\nüéØ Top 10 UCL Winner Predictions (Deep Learning):")
        display(team_probs.head(10).to_frame(name="Win Probability"))
    else:
        print("\n‚ö†Ô∏è No active UCL teams found in 2026 data.")

except Exception as e:
    print(f"Error during prediction: {e}")

--- DeepBallonNet: Unified Deep Learning Training & Prediction Pipeline ---
‚úÖ Historical data loaded.

--- Training Ballon d'Or Model (High Precision) ---
‚úÖ Ballon d'Or Model Trained.
üèÜ BDO Best Threshold: 0.97 (Precision: 0.5000)

--- Training UCL Winner Model ---
‚úÖ UCL Model Trained.


--- Running Live Predictions for 2025-2026 Season ---

üèÜ Top 10 Ballon d'Or Predictions (Deep Learning):


Unnamed: 0,Player,Squad,Gls_league,Gls_ucl,DL_Ballon_Prob
1080,Harry Kane,Bayern Munich,11,4.0,0.561246
716,Kylian Mbapp√©,Real Madrid,9,5.0,0.553742
158,Erling Haaland,Manchester City,9,3.0,0.476819
990,Luis D√≠az,Bayern Munich,5,0.0,0.432429
656,Vinicius J√∫nior,Real Madrid,5,0.0,0.429119
629,Arda G√ºler,Real Madrid,3,0.0,0.418776
2083,Joaqu√≠n Panichelli,Strasbourg,7,0.0,0.413317
1161,Michael Olise,Bayern Munich,3,1.0,0.39817
2093,Gon√ßalo Ramos,Paris Saint-Germain,1,2.0,0.391118
453,Juli√°n √Ålvarez,Atl√©tico Madrid,6,1.0,0.385241



üéØ Top 10 UCL Winner Predictions (Deep Learning):


Unnamed: 0_level_0,Win Probability
Squad,Unnamed: 1_level_1
Bayern Munich,0.03274
Real Madrid,0.029826
Paris Saint-Germain,0.020366
Arsenal,0.018999
Napoli,0.01618
Barcelona,0.004372
Manchester City,0.003591
Eint Frankfurt,0.003499
Internazionale,0.003478
Dortmund,0.003378
