In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import os
import warnings

# Suppress warnings
warnings.filterwarnings('ignore')

print("--- DeepBallonNet: Journalist-View Prediction Pipeline ---")

# ==============================================================================
# 1. FEATURE ENGINEERING
# ==============================================================================
def engineer_features(df):
    df = df.copy()
    
    # --- 1. Trophy Score ---
    trophy = 0
    
    # Team Rank Points
    if 'Rk_team' in df.columns: 
        rank = pd.to_numeric(df['Rk_team'], errors='coerce').fillna(0)
        trophy += (rank == 1).astype(int) * 2
        
    # UCL Progress Points
    if 'UCL_progress' in df.columns:
        # Force string, strip whitespace
        ucl = df['UCL_progress'].fillna('None').astype(str).str.strip()
        trophy += (ucl == 'W').astype(int) * 3
        trophy += (ucl == 'F').astype(int) * 1
        
    df['Trophy_Impact_Score'] = trophy

    # --- 2. Big Game Score ---
    # Using .get() ensures we default to 0 if a column is somehow missing
    gls_l = df.get('Gls_league', pd.Series(0, index=df.index)).fillna(0)
    ast_l = df.get('Ast_league', pd.Series(0, index=df.index)).fillna(0)
    gls_u = df.get('Gls_ucl', pd.Series(0, index=df.index)).fillna(0)
    ast_u = df.get('Ast_ucl', pd.Series(0, index=df.index)).fillna(0)

    df['Big_Game_Score'] = (gls_l * 1.0) + (ast_l * 0.5) + (gls_u * 2.5) + (ast_u * 1.0)
    
    # --- 3. Dominance Ratio ---
    # Use Gls_league if available, else default to 0
    goals = df.get('Gls_league', 0)
    team_gf = df.get('GF', 1).replace(0, 1) 
    df['Dominance_Ratio'] = goals / team_gf
    
    return df

# ==============================================================================
# 2. TRAIN MODEL (Historical Data)
# ==============================================================================
print("\n--- Training Model on Historical Data ---")
try:
    hist_df = pd.read_csv('../data/master_dataset_2011-2025.csv')
    
    # Safety: Clean any potential history duplicates
    hist_df = hist_df.loc[:, ~hist_df.columns.duplicated()]
    hist_df = hist_df.reset_index(drop=True)
    
    # Standardize history column names
    rename_map = {'xG': 'xG_player', 'xAG': 'xAG_player', 'UCL_Progress': 'UCL_progress'}
    hist_df.rename(columns=rename_map, inplace=True)
    
except FileNotFoundError:
    raise Exception("Historical data not found!")

# Define Targets
ballon_dor_history = { 
    '2023-2024': ['Rodri', 'Vin√≠cius J√∫nior', 'Jude Bellingham', 'Kylian Mbapp√©', 'Harry Kane'], 
    '2022-2023': ['Lionel Messi', 'Erling Haaland', 'Kylian Mbapp√©', 'Kevin De Bruyne', 'Rodri'], 
    '2021-2022': ['Karim Benzema', 'Sadio Man√©', 'Kevin De Bruyne', 'Robert Lewandowski', 'Mohamed Salah'], 
    '2018-2019': ['Lionel Messi', 'Virgil van Dijk', 'Cristiano Ronaldo', 'Sadio Man√©', 'Mohamed Salah'], 
    '2017-2018': ['Luka Modriƒá', 'Cristiano Ronaldo', 'Antoine Griezmann', 'Kylian Mbapp√©', 'Lionel Messi'], 
    '2016-2017': ['Cristiano Ronaldo', 'Lionel Messi', 'Neymar', 'Gianluigi Buffon', 'Luka Modriƒá'], 
    '2015-2016': ['Cristiano Ronaldo', 'Lionel Messi', 'Antoine Griezmann', 'Luis Su√°rez', 'Neymar'], 
    '2014-2015': ['Lionel Messi', 'Cristiano Ronaldo', 'Neymar', 'Robert Lewandowski', 'Luis Su√°rez'], 
    '2013-2014': ['Cristiano Ronaldo', 'Lionel Messi', 'Manuel Neuer', 'Arjen Robben', 'Thomas M√ºller'], 
    '2012-2013': ['Cristiano Ronaldo', 'Lionel Messi', 'Franck Rib√©ry', 'Zlatan Ibrahimoviƒá', 'Neymar'], 
    '2011-2012': ['Lionel Messi', 'Cristiano Ronaldo', 'Andr√©s Iniesta', 'Xavi', 'Radamel Falcao'], 
    '2010-2011': ['Lionel Messi', 'Cristiano Ronaldo', 'Xavi', 'Andr√©s Iniesta', 'Wayne Rooney'] 
}

hist_df['Top_Candidate'] = 0
for season, players in ballon_dor_history.items():
    hist_df.loc[(hist_df['Season'] == season) & (hist_df['Player'].isin(players)), 'Top_Candidate'] = 1

hist_df = engineer_features(hist_df)
progress_mapping = {'W': 1, 'F': 2, 'SF': 3, 'QF': 4, 'R16': 5, 'GR': 6, 'Did Not Qualify': 7}
hist_df['UCL_Progress_Rank'] = hist_df['UCL_progress'].astype(str).map(progress_mapping).fillna(7)

features = ['Age', 'Min_league', 'Gls_league', 'Ast_league', 'xG_player', 'xAG_player', 
            'Gls_ucl', 'Ast_ucl', 'Min_ucl', 'Rk_team', 'Pts', 'UCL_Progress_Rank', 
            'Trophy_Impact_Score', 'Big_Game_Score', 'Dominance_Ratio']

X = hist_df[features].fillna(0)
y = hist_df['Top_Candidate']

# Train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_res)
X_train_t = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_t = torch.tensor(y_train_res.values, dtype=torch.float32).unsqueeze(1)

class PrecisionNet(nn.Module):
    def __init__(self, input_size):
        super(PrecisionNet, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_size, 32), nn.ReLU(), nn.Dropout(0.4),
            nn.Linear(32, 1)
        )
    def forward(self, x): return self.model(x)

model = PrecisionNet(X_train.shape[1])
criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([100.0]))
optimizer = optim.Adam(model.parameters(), lr=0.0005, weight_decay=0.01)

for epoch in range(150):
    model.train(); optimizer.zero_grad()
    loss = criterion(model(X_train_t), y_train_t)
    loss.backward(); optimizer.step()
print("‚úÖ Model Trained.")

# ==============================================================================
# 3. PREDICT 2026 (Safe Run)
# ==============================================================================
print("\n--- Predicting 2026 Candidates... ---")

# Load
df_2026 = pd.read_csv('../data/master_dataset_2026.csv', encoding='latin1')

# --- THE FIX ---
# 1. Wipe the Index clean to prevent "duplicate labels" error
df_2026 = df_2026.reset_index(drop=True)

# 2. Smart Rename
# Since your file ALREADY has 'Gls_league' and NOT 'Gls', the rename below will just skip
# the keys that are missing. This is exactly what we want.
target_map = {
    'xG': 'xG_player', 'xAG': 'xAG_player', 
    'Rk': 'Rk_team', 'Pts': 'Pts',
    'Min': 'Min_league', 'Gls': 'Gls_league', 'Ast': 'Ast_league',
    'UCL_Progress': 'UCL_progress'
}
df_2026.rename(columns=target_map, inplace=True, errors='ignore')

print("‚úÖ Data Loaded. Columns verified.")

# Engineer Features
df_2026 = engineer_features(df_2026)

# Map UCL
progress_mapping_live = {'W': 1, 'F': 2, 'SF': 3, 'QF': 4, 'R16': 5, 'GR': 6, 'League Phase': 6, 'Did Not Qualify': 7}

if 'UCL_progress' in df_2026.columns:
    df_2026['UCL_progress'] = df_2026['UCL_progress'].astype(str)
    df_2026['UCL_Progress_Rank'] = df_2026['UCL_progress'].str.strip().map(progress_mapping_live).fillna(7)
else:
    df_2026['UCL_Progress_Rank'] = 7

# Ensure numeric
for col in features:
    if col not in df_2026.columns: df_2026[col] = 0
    df_2026[col] = pd.to_numeric(df_2026[col], errors='coerce')

# Predict
X_live = df_2026[features].fillna(0)
X_live_scaled = scaler.transform(X_live)

model.eval()
with torch.no_grad():
    probs = torch.sigmoid(model(torch.tensor(X_live_scaled, dtype=torch.float32))).numpy().flatten()

df_2026['Model_Probability'] = probs

# Scoring Logic
max_goals = df_2026['Gls_league'].max() if df_2026['Gls_league'].max() > 0 else 1
max_ucl = df_2026['Gls_ucl'].max() if df_2026['Gls_ucl'].max() > 0 else 1
max_prob = df_2026['Model_Probability'].max() if df_2026['Model_Probability'].max() > 0 else 1

df_2026['Norm_Goals'] = df_2026['Gls_league'] / max_goals
df_2026['Norm_UCL'] = df_2026['Gls_ucl'] / max_ucl
df_2026['Norm_Prob'] = df_2026['Model_Probability'] / max_prob

df_2026['Journalist_Score'] = (df_2026['Norm_Goals'] * 0.40) + \
                              (df_2026['Norm_UCL'] * 0.25) + \
                              (df_2026['Norm_Prob'] * 0.35)
                              
final_ranking = df_2026.sort_values(by='Journalist_Score', ascending=False).drop_duplicates(subset=['Player'])

print("\nüèÜ Top 15 Ballon d'Or Candidates (Journalist View):")
display_cols = ['Player', 'Squad', 'Gls_league', 'Gls_ucl', 'Model_Probability', 'Journalist_Score']

try:
    display(final_ranking[display_cols].head(15))
except NameError:
    print(final_ranking[display_cols].head(15))

--- Total Columns: 55 ---
['Rk', 'Player', 'Nation', 'Pos', 'Squad', 'Age', 'Born', 'MP', 'Starts', 'Min_league', '90s', 'Gls_league', 'Ast_league', 'G+A', 'G-PK', 'PK', 'PKatt', 'CrdY', 'CrdR', 'xG_player', 'npxG', 'xAG_player', 'npxG+xAG', 'PrgC', 'PrgP', 'PrgR', 'Gls.1', 'Ast.1', 'G+A.1', 'G-PK.1', 'G+A-PK', 'xG.1', 'xAG.1', 'xG+xAG', 'npxG.1', 'npxG+xAG.1', 'Matches', 'Season', 'Rk_team', 'MP_team', 'W', 'D', 'L', 'GF', 'GA', 'GD', 'Pts', 'Pts/MP', 'Top Team Scorer', 'Goalkeeper', 'Notes', 'Min_ucl', 'Gls_ucl', 'Ast_ucl', 'UCL_progress']

--- Checking for Duplicates ---
‚úÖ No duplicates in raw file.

--- Checking if Target Names Already Exist ---
‚ö†Ô∏è Found 'Gls_league' - Renaming logic might cause a collision!
‚ö†Ô∏è Found 'Ast_league' - Renaming logic might cause a collision!
‚ö†Ô∏è Found 'Min_league' - Renaming logic might cause a collision!
‚ö†Ô∏è Found 'xG_player' - Renaming logic might cause a collision!
‚ö†Ô∏è Found 'UCL_progress' - Renaming logic might cause a collision!

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE
import os
import warnings

# Suppress warnings
warnings.filterwarnings('ignore')

print("--- DeepBallonNet: Journalist-View Prediction Pipeline ---")

# ==============================================================================
# 1. UTILITIES (Safe Rename & Text Repair)
# ==============================================================================
def safe_rename(df, rename_map):
    """Renames columns only if the TARGET doesn't already exist."""
    clean_map = {}
    for source, target in rename_map.items():
        if source in df.columns:
            if target not in df.columns:
                clean_map[source] = target
    if clean_map:
        df.rename(columns=clean_map, inplace=True)
    return df

def fix_mojibake(text):
    """Fixes encoding errors."""
    if not isinstance(text, str): return text
    
    # 1. Manual Overrides
    replacements = {
        'Du\x9a': 'Du≈°', 'Du≈°': 'Du≈°', 
        'Vlahovi': 'Vlahoviƒá',
        'Gy√É¬∂keres': 'Gy√∂keres',
        'Lewandowski': 'Lewandowski'
    }
    for bad, good in replacements.items():
        if bad in text:
            text = text.replace(bad, good)
            
    # 2. Standard Latin-1 fix
    try:
        return text.encode('latin-1').decode('utf-8')
    except (UnicodeEncodeError, UnicodeDecodeError):
        return text

def engineer_features(df):
    df = df.copy()
    
    # --- Trophy Score ---
    trophy = 0
    if 'Rk_team' in df.columns: 
        rank = pd.to_numeric(df['Rk_team'], errors='coerce').fillna(0)
        trophy += (rank == 1).astype(int) * 2
        
    if 'UCL_progress' in df.columns:
        ucl = df['UCL_progress'].astype(str).str.strip()
        trophy += (ucl == 'W').astype(int) * 3
        trophy += (ucl == 'F').astype(int) * 1
    df['Trophy_Impact_Score'] = trophy

    # --- Big Game Score ---
    default_series = pd.Series(0, index=df.index)
    gls_l = df.get('Gls_league', default_series).fillna(0)
    ast_l = df.get('Ast_league', default_series).fillna(0)
    gls_u = df.get('Gls_ucl', default_series).fillna(0)
    ast_u = df.get('Ast_ucl', default_series).fillna(0)

    df['Big_Game_Score'] = (gls_l * 1.0) + (ast_l * 0.5) + (gls_u * 2.5) + (ast_u * 1.0)
    
    # --- Dominance Ratio ---
    team_gf = df.get('GF', 1).replace(0, 1) 
    df['Dominance_Ratio'] = gls_l / team_gf
    
    return df

# ==============================================================================
# 2. TRAIN MODEL (Historical)
# ==============================================================================
print("\n--- Training Model on Historical Data ---")
try:
    hist_df = pd.read_csv('../data/master_dataset_2011-2025.csv')
    hist_df = hist_df.loc[:, ~hist_df.columns.duplicated()]
    hist_df = hist_df.reset_index(drop=True)
    
    hist_map = {'xG': 'xG_player', 'xAG': 'xAG_player', 'UCL_Progress': 'UCL_progress'}
    hist_df = safe_rename(hist_df, hist_map)
except FileNotFoundError:
    raise Exception("Historical data not found!")

# History Winners
ballon_dor_history = {
    '2024-2025': ['Ousmane Dembele', 'Lamine Yamal', 'Vitinha', 'Raphinha', 'Mohammed Salah', 'Kylian Mbappe', 'Achraf Hakimi', 'Desire Doue', 'Kvicha Kvaratskhelia', 'Nuno Mendes'],
    '2023-2024': ['Rodri', 'Vin√≠cius J√∫nior', 'Jude Bellingham', 'Dani Carvajal', 'Lautaro Martinez', 'Toni Kroos', 'Kylian Mbapp√©', 'Harry Kane', 'Phil Foden', 'Lamine Yamal'],
    '2022-2023': ['Lionel Messi', 'Erling Haaland', 'Kylian Mbapp√©', 'Kevin De Bruyne', 'Rodri', 'Vin√≠cius J√∫nior', 'Juli√°n √Ålvarez', 'Victor Osimhen', 'Bernardo Silva', 'Luka Modriƒá'],
    '2021-2022': ['Karim Benzema', 'Sadio Man√©', 'Kevin De Bruyne', 'Robert Lewandowski', 'Mohamed Salah', 'Kylian Mbapp√©', 'Thibaut Courtois', 'Vin√≠cius J√∫nior', 'Luka Modriƒá', 'Erling Haaland'],
    '2020-2021': ['Lionel Messi', 'Robert Lewandowski', 'Jorginho', 'Karim Benzema', 'N\'Golo Kant√©', 'Cristiano Ronaldo', 'Mohamed Salah', 'Kevin De Bruyne', 'Kylian Mbapp√©', 'Gianluigi Donnarumma'],
    '2018-2019': ['Lionel Messi', 'Virgil van Dijk', 'Cristiano Ronaldo', 'Sadio Man√©', 'Mohamed Salah', 'Kylian Mbapp√©', 'Alisson', 'Robert Lewandowski', 'Bernardo Silva', 'Riyad Mahrez'],
    '2017-2018': ['Luka Modriƒá', 'Cristiano Ronaldo', 'Antoine Griezmann', 'Kylian Mbapp√©', 'Lionel Messi', 'Mohamed Salah', 'Rapha√´l Varane', 'Eden Hazard', 'Kevin De Bruyne', 'Harry Kane'],
    '2016-2017': ['Cristiano Ronaldo', 'Lionel Messi', 'Neymar', 'Gianluigi Buffon', 'Luka Modriƒá', 'Sergio Ramos', 'Kylian Mbapp√©', 'N\'Golo Kant√©', 'Robert Lewandowski', 'Harry Kane'],
    '2015-2016': ['Cristiano Ronaldo', 'Lionel Messi', 'Antoine Griezmann', 'Luis Su√°rez', 'Neymar', 'Gareth Bale', 'Riyad Mahrez', 'Jamie Vardy', 'Gianluigi Buffon', 'Pepe'],
    '2014-2015': ['Lionel Messi', 'Cristiano Ronaldo', 'Neymar', 'Robert Lewandowski', 'Luis Su√°rez', 'Thomas M√ºller', 'Manuel Neuer', 'Eden Hazard', 'Andr√©s Iniesta', 'Alexis S√°nchez'],
    '2013-2014': ['Cristiano Ronaldo', 'Lionel Messi', 'Manuel Neuer', 'Arjen Robben', 'Thomas M√ºller', 'Philipp Lahm', 'Neymar', 'James Rodr√≠guez', 'Toni Kroos', '√Ångel Di Mar√≠a'],
    '2012-2013': ['Cristiano Ronaldo', 'Lionel Messi', 'Franck Rib√©ry', 'Zlatan Ibrahimoviƒá', 'Neymar', 'Andr√©s Iniesta', 'Robin van Persie', 'Arjen Robben', 'Gareth Bale', 'Andrea Pirlo'],
    '2011-2012': ['Lionel Messi', 'Cristiano Ronaldo', 'Andr√©s Iniesta', 'Xavi', 'Radamel Falcao', 'Iker Casillas', 'Andrea Pirlo', 'Didier Drogba', 'Robin van Persie', 'Zlatan Ibrahimoviƒá'],
    '2010-2011': ['Lionel Messi', 'Cristiano Ronaldo', 'Xavi', 'Andr√©s Iniesta', 'Wayne Rooney', 'Luis Su√°rez', 'Diego Forl√°n', 'Samuel Eto\'o', 'Iker Casillas', 'Neymar']
}

hist_df['Top_Candidate'] = 0
for season, players in ballon_dor_history.items():
    hist_df.loc[(hist_df['Season'] == season) & (hist_df['Player'].isin(players)), 'Top_Candidate'] = 1

hist_df = engineer_features(hist_df)
progress_mapping = {'W': 1, 'F': 2, 'SF': 3, 'QF': 4, 'R16': 5, 'GR': 6, 'Did Not Qualify': 7}
hist_df['UCL_Progress_Rank'] = hist_df['UCL_progress'].astype(str).map(progress_mapping).fillna(7)

features = ['Age', 'Min_league', 'Gls_league', 'Ast_league', 'xG_player', 'xAG_player', 
            'Gls_ucl', 'Ast_ucl', 'Min_ucl', 'Rk_team', 'Pts', 'UCL_Progress_Rank', 
            'Trophy_Impact_Score', 'Big_Game_Score', 'Dominance_Ratio']

X = hist_df[features].fillna(0)
y = hist_df['Top_Candidate']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_res)
X_train_t = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_t = torch.tensor(y_train_res.values, dtype=torch.float32).unsqueeze(1)

class PrecisionNet(nn.Module):
    def __init__(self, input_size):
        super(PrecisionNet, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_size, 32), nn.ReLU(), nn.Dropout(0.4),
            nn.Linear(32, 1)
        )
    def forward(self, x): return self.model(x)

model = PrecisionNet(X_train.shape[1])

# --- REALISM FIX: ADJUST WEIGHTS ---
# Lowered from 12.0 to 3.0.
# This calms the model down so it stops predicting "everyone" is a winner.
criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([6.0])) 

optimizer = optim.Adam(model.parameters(), lr=0.0005, weight_decay=0.01)

for epoch in range(150):
    model.train(); optimizer.zero_grad()
    loss = criterion(model(X_train_t), y_train_t)
    loss.backward(); optimizer.step()

# --- Model Evaluation on Test Data ---
print("‚úÖ Model Trained.")
print("\n--- Model Evaluation on Test Data ---")
model.eval()
with torch.no_grad():
    X_test_scaled = scaler.transform(X_test)
    X_test_t = torch.tensor(X_test_scaled, dtype=torch.float32)
    
    y_logits = model(X_test_t)
    y_probs = torch.sigmoid(y_logits).numpy()
    
    # --- THRESHOLD FIX ---
    # We only count it as a positive prediction if confidence is > 80%
    # This significantly improves PRECISION (reduces false alarms)
    y_pred = (y_probs > 0.70).astype(int) 
    
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    
    print(f"üìä Accuracy:  {acc:.2%}")
    print(f"üéØ Precision: {prec:.2%} (Target: >10%)")
    print(f"üîé Recall:    {rec:.2%} (Target: >60%)")
    print(f"‚öñÔ∏è F1 Score:  {f1:.2%}")

# ==============================================================================
# 3. PREDICT 2026 (With Realism Fixes)
# ==============================================================================
print("\n--- Predicting 2026 Candidates... ---")

try:
    df_2026 = pd.read_csv('../data/master_dataset_2026.csv', encoding='latin1')
    df_2026 = df_2026.reset_index(drop=True)
    df_2026 = df_2026.loc[:, ~df_2026.columns.duplicated()]

    # Apply Text Repair
    print("üîß Repairing Player Names...")
    for col in ['Player', 'Squad', 'Nation']:
        if col in df_2026.columns:
            df_2026[col] = df_2026[col].apply(fix_mojibake)
    
    # Safe Rename
    target_map = {
        'xG': 'xG_player', 'xAG': 'xAG_player', 
        'Rk': 'Rk_team', 'Pts': 'Pts',
        'Min': 'Min_league', 'Gls': 'Gls_league', 'Ast': 'Ast_league',
        'UCL_Progress': 'UCL_progress'
    }
    df_2026 = safe_rename(df_2026, target_map)
    
    print(f"‚úÖ Data Ready. Columns: {len(df_2026.columns)}")

    # Engineer Features
    df_2026 = engineer_features(df_2026)
    
    progress_mapping_live = {'W': 1, 'F': 2, 'SF': 3, 'QF': 4, 'R16': 5, 'GR': 6, 'League Phase': 6, 'Did Not Qualify': 7}
    if 'UCL_progress' in df_2026.columns:
        df_2026['UCL_progress'] = df_2026['UCL_progress'].astype(str)
        df_2026['UCL_Progress_Rank'] = df_2026['UCL_progress'].str.strip().map(progress_mapping_live).fillna(7)
    else:
        df_2026['UCL_Progress_Rank'] = 7

    for col in features:
        if col not in df_2026.columns: df_2026[col] = 0
        df_2026[col] = pd.to_numeric(df_2026[col], errors='coerce')

    X_live = df_2026[features].fillna(0)
    X_live_scaled = scaler.transform(X_live)

    # --- REALISM FIX 2: Temperature Scaling ---
    # Instead of sharp 0/1, we soften the probability curve
    model.eval()
    with torch.no_grad():
        logits = model(torch.tensor(X_live_scaled, dtype=torch.float32))
        
        # Temperature > 1.0 makes the probabilities "softer"
        temperature = 2.0 
        probs = torch.sigmoid(logits / temperature).numpy().flatten()

    df_2026['Model_Probability'] = probs

    max_goals = df_2026['Gls_league'].max() if df_2026['Gls_league'].max() > 0 else 1
    max_ucl = df_2026['Gls_ucl'].max() if df_2026['Gls_ucl'].max() > 0 else 1
    max_prob = df_2026['Model_Probability'].max() if df_2026['Model_Probability'].max() > 0 else 1

    df_2026['Norm_Goals'] = df_2026['Gls_league'] / max_goals
    df_2026['Norm_UCL'] = df_2026['Gls_ucl'] / max_ucl
    df_2026['Norm_Prob'] = df_2026['Model_Probability'] / max_prob

    # Weights
    df_2026['Journalist_Score'] = (df_2026['Norm_Goals'] * 0.40) + \
                                  (df_2026['Norm_UCL'] * 0.25) + \
                                  (df_2026['Norm_Prob'] * 0.35)
                                  
    final_ranking = df_2026.sort_values(by='Journalist_Score', ascending=False).drop_duplicates(subset=['Player'])

    print("\nüèÜ Top 15 Ballon d'Or Candidates (Journalist View):")
    display_cols = ['Player', 'Squad', 'Gls_league', 'Gls_ucl', 'Model_Probability', 'Journalist_Score']

    try:
        # Format probabilities to look like percentages
        pd.options.display.float_format = '{:.4f}'.format
        display(final_ranking[display_cols].head(15))
    except NameError:
        print(final_ranking[display_cols].head(15))

except Exception as e:
    print(f"‚ùå An error occurred: {e}")

--- DeepBallonNet: Journalist-View Prediction Pipeline ---

--- Training Model on Historical Data ---
‚úÖ Model Trained.

--- Model Evaluation on Test Data ---
üìä Accuracy:  99.58%
üéØ Precision: 30.00% (Target: >10%)
üîé Recall:    22.22% (Target: >60%)
‚öñÔ∏è F1 Score:  25.53%

--- Predicting 2026 Candidates... ---
üîß Repairing Player Names...
‚úÖ Data Ready. Columns: 55

üèÜ Top 15 Ballon d'Or Candidates (Journalist View):


Unnamed: 0,Player,Squad,Gls_league,Gls_ucl,Model_Probability,Journalist_Score
163,Erling Haaland,Manchester City,14,5.0,0.9907,0.9993
746,Kylian Mbapp√©,Real Madrid,13,5.0,0.9927,0.9714
1121,Harry Kane,Bayern Munich,13,5.0,0.9868,0.9694
1028,Luis D√≠az,Bayern Munich,6,3.0,0.9671,0.6624
476,Juli√°n √Ålvarez,Atl√©tico Madrid,7,2.0,0.9701,0.642
2007,Mason Greenwood,Marseille,8,1.0,0.9837,0.6254
1623,Lautaro Mart√≠nez,Inter,4,4.0,0.8822,0.6253
992,Jonathan Burkardt,Eint Frankfurt,6,2.0,0.973,0.6145
825,Marcus Rashford,Barcelona,2,4.0,0.9868,0.6051
717,Fermin L√≥pez,Barcelona,3,3.0,0.9719,0.5784


In [13]:
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import os
import warnings

warnings.filterwarnings('ignore')

print("--- Training Elite High-Precision UCL Model ---")

# ==============================================================================
# 1. LOAD & PREPARE HISTORICAL DATA
# ==============================================================================
try:
    historical_df = pd.read_csv('../data/master_dataset_2011-2025.csv')
    rename_map = {'xG': 'xG_player', 'xAG': 'xAG_player', 'UCL_Progress': 'UCL_progress'}
    historical_df.rename(columns=rename_map, inplace=True)
except FileNotFoundError: raise

# --- Feature Engineering Function (Robust) ---
def engineer_elite_features(df):
    df = df.copy()
    league_weights = {'Premier League': 1.0, 'La Liga': 0.95, 'Bundesliga': 0.85, 'Serie A': 0.85, 'Ligue 1': 0.75}
    
    # Handle missing 'League' column gracefully
    if 'League' in df.columns:
        df['League_Weight'] = df['League'].map(league_weights).fillna(0.7)
        df['Is_Big_5'] = df['League'].isin(league_weights.keys()).astype(int)
    else:
        # Default weight if League is missing (assumes reasonably strong teams)
        df['League_Weight'] = 0.85 
        df['Is_Big_5'] = 1 

    df['MP_team'] = df['MP_team'].replace(0, 1)
    df['Adj_Pts_Per_Game'] = (df['Pts'] / df['MP_team']) * df['League_Weight']
    df['Adj_GD_Per_Game'] = (df['GD'] / df['MP_team']) * df['League_Weight']
    
    # Use aggregated column names directly
    df['Squad_Goals'] = df.get('Agg_Gls_league', 0)
    df['Squad_xG'] = df.get('Agg_xG', 0)
    
    return df

# Prepare Data
ucl_df = historical_df[historical_df['UCL_progress'] != 'Did Not Qualify'].copy()
ucl_df['UCL_Winner'] = np.where(ucl_df['UCL_progress'] == 'W', 1, 0)

# Aggregate player stats
player_agg = historical_df.groupby(['Squad', 'Season'])[['Gls_league', 'xG_player']].sum().reset_index().rename(columns={
    'Gls_league': 'Agg_Gls_league', 
    'xG_player': 'Agg_xG'
})
ucl_df = pd.merge(ucl_df, player_agg, on=['Squad', 'Season'], how='left')
ucl_df = engineer_elite_features(ucl_df)
team_level_df = ucl_df.drop_duplicates(subset=['Squad', 'Season'], keep='first').copy()

# Define Features
features = ['Adj_Pts_Per_Game', 'Adj_GD_Per_Game', 'Is_Big_5', 'Squad_Goals', 'Squad_xG']
for col in features:
    if col not in team_level_df.columns: team_level_df[col] = 0

X = team_level_df[features].fillna(0)
y = team_level_df['UCL_Winner']

# Split & Scale
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
scaler = StandardScaler()
X_train_sc = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns)
X_test_sc = pd.DataFrame(scaler.transform(X_test), columns=X.columns)

# Balance
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_sc, y_train)

# ==============================================================================
# 2. TRAINING (Elite Ensemble)
# ==============================================================================
print("Training Elite Ensemble...")
clf1 = xgb.XGBClassifier(n_estimators=200, learning_rate=0.05, max_depth=5, random_state=42)
clf2 = lgb.LGBMClassifier(n_estimators=200, learning_rate=0.05, num_leaves=31, verbose=-1, random_state=42)
clf3 = CatBoostClassifier(iterations=200, learning_rate=0.05, depth=5, verbose=0, random_seed=42)

ucl_model = VotingClassifier(estimators=[('xgb', clf1), ('lgbm', clf2), ('cat', clf3)], voting='soft')
ucl_model.fit(X_train_res, y_train_res)
print("‚úÖ Model Trained.")

# Optimal Threshold
probs = ucl_model.predict_proba(X_test_sc)[:, 1]
best_thresh, best_f1 = 0.5, 0
for t in np.arange(0.1, 0.9, 0.05):
    preds = (probs >= t).astype(int)
    score = f1_score(y_test, preds)
    if score > best_f1: best_f1, best_thresh = score, t

print(f"üèÜ Optimal Threshold: {best_thresh:.2f}")
print("\n--- Elite Model Report ---")
print(classification_report(y_test, (probs >= best_thresh).astype(int), target_names=['Not Winner', 'Winner']))

# ==============================================================================
# 3. PREDICT 2026
# ==============================================================================
print("\n--- 2026 UCL Winner Prediction ---")
try:
    # Load 2026 data
    d_p = pd.read_csv('../data/combined_player_stats_2026.csv')
    d_l = pd.read_csv('../data/combined_league_standings_2026.csv')
    d_up = pd.read_csv('../data/ucl_team_progress_2026.csv')
    d_us = pd.read_csv('../data/ucl_player_stats_2026.csv')
    
    current_season = '2025-2026'
    for d in [d_p, d_l, d_up, d_us]: 
        d['Season']=current_season; d.columns=d.columns.str.strip()
        if 'Squad' in d.columns: d['Squad']=d['Squad'].str.strip().replace({'Paris S-G':'Paris Saint-Germain','Inter':'Internazionale','Manchester Utd':'Manchester United','Leverkusen':'Bayer Leverkusen'})

    m_k = ['Squad', 'Season']
    if 'League' in d_p.columns and 'League' in d_l.columns: m_k.append('League')
    df_26 = pd.merge(d_p, d_l, on=m_k, how='left', suffixes=('_player', '_team'))
    df_26 = pd.merge(df_26, d_us[['Player','Squad','Season']], on=['Player','Squad','Season'], how='left')
    df_26 = pd.merge(df_26, d_up, on=['Squad','Season'], how='left')
    if 'UCL_Progress' in df_26.columns: df_26.rename(columns={'UCL_Progress':'UCL_progress'}, inplace=True)
    df_26['UCL_progress'].fillna('Did Not Qualify', inplace=True)
    
    ucl_26 = df_26[df_26['UCL_progress'] != 'Did Not Qualify'].copy()
    rename_26 = {'Gls':'Gls_league', 'xG':'xG_player', 'Pts':'Pts', 'MP':'MP_team', 'W':'W', 'GD':'GD', 'Rk':'Rk_team'}
    ucl_26.rename(columns=rename_26, inplace=True, errors='ignore')
    
    # Aggregation
    p_agg = ucl_26.groupby(['Squad', 'Season'])[['Gls_league', 'xG_player']].sum().reset_index().rename(columns={'Gls_league': 'Agg_Gls_league', 'xG_player': 'Agg_xG'})
    ucl_26 = pd.merge(ucl_26, p_agg, on=['Squad', 'Season'], how='left')
    ucl_26 = ucl_26.drop_duplicates(subset=['Squad'])
    
    # Feature Engineering (Now Safe against missing 'League' column)
    ucl_26 = engineer_elite_features(ucl_26)
    
    # Select Best Features & Scale
    for col in features:
        if col not in ucl_26.columns: ucl_26[col] = 0
    
    X_live = ucl_26[features].fillna(0)
    # Use the same scaler from training!
    X_live_sc = pd.DataFrame(scaler.transform(X_live), columns=features)
    
    # Predict
    ucl_26['Win_Prob'] = ucl_model.predict_proba(X_live_sc)[:, 1]
    print("Top 10 Contenders:")
    cols = ['Squad', 'Win_Prob']
    if 'League' in ucl_26.columns: cols.insert(1, 'League')
    display(ucl_26[cols].sort_values(by='Win_Prob', ascending=False).head(10))

except Exception as e: print(f"Prediction Error: {e}")

--- Training Elite High-Precision UCL Model ---
Training Elite Ensemble...
‚úÖ Model Trained.
üèÜ Optimal Threshold: 0.40

--- Elite Model Report ---
              precision    recall  f1-score   support

  Not Winner       0.98      0.89      0.93        45
      Winner       0.29      0.67      0.40         3

    accuracy                           0.88        48
   macro avg       0.63      0.78      0.67        48
weighted avg       0.93      0.88      0.90        48


--- 2026 UCL Winner Prediction ---
Top 10 Contenders:


Unnamed: 0,Squad,Win_Prob
217,Bayern Munich,0.171574
390,Paris Saint-Germain,0.034413
382,Marseille,0.015267
113,Barcelona,0.012749
12,Arsenal,0.010828
4,Liverpool,0.01066
294,Napoli,0.009711
204,Dortmund,0.009545
110,Real Madrid,0.006659
290,Internazionale,0.004316


In [15]:
# --- Cell: Train and Evaluate UCL Ensemble Model ---
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
import warnings

warnings.filterwarnings('ignore')

print("--- Training UCL Ensemble Model ---")

# 1. Load Data
try:
    df = pd.read_csv('../data/master_dataset_2011-2025.csv')
    rename_map = {'xG': 'xG_player', 'xAG': 'xAG_player', 'UCL_Progress': 'UCL_progress'}
    df.rename(columns=rename_map, inplace=True)
except FileNotFoundError:
    raise Exception("Data not found!")

# 2. Feature Engineering (Elite UCL)
def engineer_ucl_features(df):
    df = df.copy()
    df['MP_team'] = df['MP_team'].replace(0, 1)
    df['Pts_Per_Game'] = df['Pts'] / df['MP_team']
    df['Goal_Diff_Per_Game'] = df['GD'] / df['MP_team']
    df['Win_Rate'] = df['W'] / df['MP_team']
    df['Dominance_Score'] = (df['Win_Rate'] * 0.7) + (df['Goal_Diff_Per_Game'] * 0.3)
    df['League_Pedigree'] = 1 / df['Rk_team'].replace(0, 20)
    return df

# 3. Prepare Team-Level Data
ucl_df = df[df['UCL_progress'] != 'Did Not Qualify'].copy()
ucl_df['UCL_Winner'] = np.where(ucl_df['UCL_progress'] == 'W', 1, 0)

player_agg = df.groupby(['Squad', 'Season'])[['Gls_league', 'Ast_league', 'xG_player']].sum().reset_index().rename(columns={'Gls_league': 'Squad_Goals', 'Ast_league': 'Squad_Ast', 'xG_player': 'Squad_xG'})
ucl_df = pd.merge(ucl_df, player_agg, on=['Squad', 'Season'], how='left')

ucl_df = engineer_ucl_features(ucl_df)
team_level_df = ucl_df.drop_duplicates(subset=['Squad', 'Season'], keep='first').copy()

features_ucl = ['Pts_Per_Game', 'Goal_Diff_Per_Game', 'Win_Rate', 'Dominance_Score', 'League_Pedigree', 'Squad_Goals', 'Squad_xG', 'xG_team']
# Handle missing xG_team for old seasons
if 'xG_team' not in team_level_df.columns: team_level_df['xG_team'] = 0

X = team_level_df[features_ucl].fillna(0)
y = team_level_df['UCL_Winner']

# Split & Scale
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
scaler = MinMaxScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X.columns)

# Balance
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)

# 4. Train Ensemble
clf1 = xgb.XGBClassifier(n_estimators=200, learning_rate=0.05, max_depth=5, random_state=42, eval_metric='logloss')
clf2 = lgb.LGBMClassifier(n_estimators=200, learning_rate=0.05, num_leaves=31, verbose=-1, random_state=42)
clf3 = CatBoostClassifier(iterations=200, learning_rate=0.05, depth=5, verbose=0, random_seed=42)

ensemble_model = VotingClassifier(estimators=[('xgb', clf1), ('lgbm', clf2), ('cat', clf3)], voting='soft')
ensemble_model.fit(X_train_res, y_train_res)
print("‚úÖ Ensemble Model Trained.")

# 5. Run Advanced Evaluation (Using your function)
if 'evaluate_model_advanced' in locals():
    print("\n>>> Evaluating UCL Winner Model...")
    evaluate_model_advanced(ensemble_model, X_test_scaled, y_test, "Ensemble")
    calculate_top_k_proxy(ensemble_model, X_test_scaled, y_test, "Ensemble")
else:
    print("‚ö†Ô∏è evaluation functions not found. Please run the previous cell containing 'evaluate_model_advanced'.")

--- Training UCL Ensemble Model ---
‚úÖ Ensemble Model Trained.

>>> Evaluating UCL Winner Model...

--- Advanced Evaluation: Ensemble ---
üèÜ Optimal Threshold: 0.2140
   Max F1-Score: 0.1429
   Precision at Optimal: 0.0909
   Recall at Optimal:    0.3333

üìä Average Rank of True Winners in Test Set (Ensemble): 28.3


In [16]:
import pandas as pd
import numpy as np
import torch
from sklearn.metrics import precision_recall_curve, f1_score

# --- 1. Define the Advanced Evaluation Function (FIXED) ---
def evaluate_model_advanced(model, X_test, y_test, model_type="Deep Learning"):
    print(f"\n--- Advanced Evaluation: {model_type} ---")
    
    # Get Probabilities
    # FIX: Check if "Deep Learning" is IN the string, not just equal to it
    if "Deep Learning" in model_type:
        model.eval()
        with torch.no_grad():
            if not isinstance(X_test, torch.Tensor):
                X_test = torch.tensor(X_test, dtype=torch.float32)
            # Forward pass + Sigmoid
            probs = torch.sigmoid(model(X_test)).numpy().flatten()
            
            if isinstance(y_test, torch.Tensor):
                y_true = y_test.numpy().flatten()
            else:
                y_true = y_test
    else: # Ensemble / XGBoost
        probs = model.predict_proba(X_test)[:, 1]
        y_true = y_test

    # 2. Find Optimal Threshold (Maximize F1)
    # Handle NaNs in y_true (just in case)
    mask = ~np.isnan(y_true)
    y_true = y_true[mask]
    probs = probs[mask]
    
    precisions, recalls, thresholds = precision_recall_curve(y_true, probs)
    
    with np.errstate(divide='ignore', invalid='ignore'):
        f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
    f1_scores = np.nan_to_num(f1_scores)
    
    best_idx = np.argmax(f1_scores)
    # Safety check for index bounds
    if best_idx < len(thresholds):
        best_thresh = thresholds[best_idx]
    else:
        best_thresh = 0.5
        
    best_f1 = f1_scores[best_idx]
    
    print(f"üèÜ Optimal Threshold: {best_thresh:.4f}")
    print(f"   Max F1-Score: {best_f1:.4f}")
    print(f"   Precision at Optimal: {precisions[best_idx]:.4f}")
    print(f"   Recall at Optimal:    {recalls[best_idx]:.4f}")

    return best_thresh

# ==============================================================================
# 2. RUN ADVANCED EVALUATION (with Reconstructed Data)
# ==============================================================================

# Note: We assume the data reconstruction part from the previous cell ran successfully
# and X_test_b_tensor, y_test_b_tensor, X_test_u_scaled, y_test_u are available.

if 'model' in locals():
    # Fix: String matching is now handled inside the function
    best_thresh_bdo = evaluate_model_advanced(model, X_test_b_tensor, y_test_b, "Deep Learning (Ballon d'Or)")
else:
    print("‚ùå Error: 'model' (Deep Learning) not found in memory.")

if 'ensemble_model' in locals():
    best_thresh_ucl = evaluate_model_advanced(ensemble_model, X_test_u_scaled, y_test_u, "Ensemble (UCL)")
else:
    print("‚ùå Error: 'ensemble_model' not found in memory.")


# --- 3. Top-K Accuracy Proxy ---
def calculate_top_k_proxy(model, X, y, model_type="Deep Learning"):
    if model_type == "Deep Learning":
        model.eval()
        with torch.no_grad():
             if not isinstance(X, torch.Tensor): X = torch.tensor(X, dtype=torch.float32)
             probs = torch.sigmoid(model(X)).numpy().flatten()
    else:
        probs = model.predict_proba(X)[:, 1]
        
    results = pd.DataFrame({'Actual': y, 'Prob': probs})
    winners = results[results['Actual'] == 1]
    
    if not winners.empty:
        results['Rank'] = results['Prob'].rank(ascending=False)
        avg_winner_rank = results[results['Actual'] == 1]['Rank'].mean()
        print(f"\nüìä Average Rank of True Winners in Test Set ({model_type}): {avg_winner_rank:.1f}")
    else:
        print(f"\n‚ö†Ô∏è No winners found in test set for {model_type}.")

if 'model' in locals(): calculate_top_k_proxy(model, X_test_b_tensor, y_test_b, "Deep Learning")
if 'ensemble_model' in locals(): calculate_top_k_proxy(ensemble_model, X_test_u_scaled, y_test_u, "Ensemble")


--- Advanced Evaluation: Deep Learning (Ballon d'Or) ---
üèÜ Optimal Threshold: 0.7485
   Max F1-Score: 0.2857
   Precision at Optimal: 0.3333
   Recall at Optimal:    0.2500

--- Advanced Evaluation: Ensemble (UCL) ---
üèÜ Optimal Threshold: 0.0140
   Max F1-Score: 0.1290
   Precision at Optimal: 0.0714
   Recall at Optimal:    0.6667

üìä Average Rank of True Winners in Test Set (Deep Learning): 171.2

üìä Average Rank of True Winners in Test Set (Ensemble): 30.3
