In [6]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, precision_score, recall_score
import os
import warnings
import unicodedata

# Suppress warnings
warnings.filterwarnings('ignore')

print("--- DeepBallonNet: Unified Deep Learning Training & Prediction Pipeline ---")

# ==============================================================================
# 1. LOAD & PREPARE HISTORICAL DATA
# ==============================================================================
try:
    # Use latin1 to handle accents in history file
    df = pd.read_csv('../data/master_dataset_2011-2025.csv', encoding='latin1')
    print("‚úÖ Historical data loaded.")
    
    # Standardize column names
    rename_map = {'xG': 'xG_player', 'xAG': 'xAG_player', 'UCL_Progress': 'UCL_progress'}
    df.rename(columns=rename_map, inplace=True)

except FileNotFoundError:
    print("‚ùå Error: Master dataset not found.")
    raise

# --- Feature Engineering Function ---
def engineer_features(df):
    df = df.copy()
    if 'Rk_team' in df.columns: trophy = (df['Rk_team'] == 1).astype(int) * 2
    else: trophy = 0
    if 'UCL_progress' in df.columns:
        ucl_col = df['UCL_progress'].astype(str).str.strip()
        trophy += (ucl_col == 'W').astype(int) * 3
        trophy += (ucl_col == 'F').astype(int) * 1
    df['Trophy_Impact_Score'] = trophy
    
    df['Big_Game_Score'] = (df.get('Gls_league', 0) * 1.0) + (df.get('Gls_ucl', 0) * 2.5)
    return df

# --- Create Targets ---
ballon_dor_history = {
    '2024-2025': ['Ousmane Dembele', 'Lamine Yamal', 'Vitinha', 'Raphinha', 'Mohammed Salah', 'Kylian Mbappe', 'Achraf Hakimi', 'Desire Doue', 'Kvicha Kvaratskhelia', 'Nuno Mendes'],
    '2023-2024': ['Rodri', 'Vin√≠cius J√∫nior', 'Jude Bellingham', 'Dani Carvajal', 'Lautaro Martinez', 'Toni Kroos', 'Kylian Mbapp√©', 'Harry Kane', 'Phil Foden', 'Lamine Yamal'],
    '2022-2023': ['Lionel Messi', 'Erling Haaland', 'Kylian Mbapp√©', 'Kevin De Bruyne', 'Rodri', 'Vin√≠cius J√∫nior', 'Juli√°n √Ålvarez', 'Victor Osimhen', 'Bernardo Silva', 'Luka Modriƒá'],
    '2021-2022': ['Karim Benzema', 'Sadio Man√©', 'Kevin De Bruyne', 'Robert Lewandowski', 'Mohamed Salah', 'Kylian Mbapp√©', 'Thibaut Courtois', 'Vin√≠cius J√∫nior', 'Luka Modriƒá', 'Erling Haaland'],
    '2020-2021': ['Lionel Messi', 'Robert Lewandowski', 'Jorginho', 'Karim Benzema', 'N\'Golo Kant√©', 'Cristiano Ronaldo', 'Mohamed Salah', 'Kevin De Bruyne', 'Kylian Mbapp√©', 'Gianluigi Donnarumma'],
    '2018-2019': ['Lionel Messi', 'Virgil van Dijk', 'Cristiano Ronaldo', 'Sadio Man√©', 'Mohamed Salah', 'Kylian Mbapp√©', 'Alisson', 'Robert Lewandowski', 'Bernardo Silva', 'Riyad Mahrez'],
    '2017-2018': ['Luka Modriƒá', 'Cristiano Ronaldo', 'Antoine Griezmann', 'Kylian Mbapp√©', 'Lionel Messi', 'Mohamed Salah', 'Rapha√´l Varane', 'Eden Hazard', 'Kevin De Bruyne', 'Harry Kane'],
    '2016-2017': ['Cristiano Ronaldo', 'Lionel Messi', 'Neymar', 'Gianluigi Buffon', 'Luka Modriƒá', 'Sergio Ramos', 'Kylian Mbapp√©', 'N\'Golo Kant√©', 'Robert Lewandowski', 'Harry Kane'],
    '2015-2016': ['Cristiano Ronaldo', 'Lionel Messi', 'Antoine Griezmann', 'Luis Su√°rez', 'Neymar', 'Gareth Bale', 'Riyad Mahrez', 'Jamie Vardy', 'Gianluigi Buffon', 'Pepe'],
    '2014-2015': ['Lionel Messi', 'Cristiano Ronaldo', 'Neymar', 'Robert Lewandowski', 'Luis Su√°rez', 'Thomas M√ºller', 'Manuel Neuer', 'Eden Hazard', 'Andr√©s Iniesta', 'Alexis S√°nchez'],
    '2013-2014': ['Cristiano Ronaldo', 'Lionel Messi', 'Manuel Neuer', 'Arjen Robben', 'Thomas M√ºller', 'Philipp Lahm', 'Neymar', 'James Rodr√≠guez', 'Toni Kroos', '√Ångel Di Mar√≠a'],
    '2012-2013': ['Cristiano Ronaldo', 'Lionel Messi', 'Franck Rib√©ry', 'Zlatan Ibrahimoviƒá', 'Neymar', 'Andr√©s Iniesta', 'Robin van Persie', 'Arjen Robben', 'Gareth Bale', 'Andrea Pirlo'],
    '2011-2012': ['Lionel Messi', 'Cristiano Ronaldo', 'Andr√©s Iniesta', 'Xavi', 'Radamel Falcao', 'Iker Casillas', 'Andrea Pirlo', 'Didier Drogba', 'Robin van Persie', 'Zlatan Ibrahimoviƒá'],
    '2010-2011': ['Lionel Messi', 'Cristiano Ronaldo', 'Xavi', 'Andr√©s Iniesta', 'Wayne Rooney', 'Luis Su√°rez', 'Diego Forl√°n', 'Samuel Eto\'o', 'Iker Casillas', 'Neymar']
}
df['Top_Candidate'] = 0
for season, players in ballon_dor_history.items():
    df.loc[(df['Season'] == season) & (df['Player'].isin(players)), 'Top_Candidate'] = 1

df = engineer_features(df)
progress_mapping = {'W': 1, 'F': 2, 'SF': 3, 'QF': 4, 'R16': 5, 'GR': 6, 'Did Not Qualify': 7}
df['UCL_Progress_Rank'] = df['UCL_progress'].astype(str).str.strip().map(progress_mapping).fillna(7)

# --- Define Features and Targets ---
features = ['Age', 'Min_league', 'Gls_league', 'Ast_league', 'xG_player', 'xAG_player', 'Gls_ucl', 'Ast_ucl', 'Min_ucl', 'Rk_team', 'Pts', 'UCL_Progress_Rank', 'Trophy_Impact_Score', 'Big_Game_Score']
X = df[features].fillna(0)
y_bdo = df['Top_Candidate']
y_ucl = np.where(df['UCL_progress'] == 'W', 1, 0) 

X_train_raw, X_test_raw, y_train_bdo, y_test_bdo, y_train_ucl, y_test_ucl = train_test_split(
    X, y_bdo, y_ucl, test_size=0.2, random_state=42, stratify=y_bdo
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

# Tensors
X_train_bdo_t = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_bdo_t = torch.tensor(y_train_bdo.values, dtype=torch.float32).unsqueeze(1)
X_test_bdo_t = torch.tensor(X_test_scaled, dtype=torch.float32)

X_train_ucl_t = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_ucl_t = torch.tensor(y_train_ucl, dtype=torch.float32).unsqueeze(1)
X_test_ucl_t = torch.tensor(X_test_scaled, dtype=torch.float32)

# ==============================================================================
# 2. TRAIN MODELS
# ==============================================================================
print("\n--- Training Models ---")
class PrecisionNet(nn.Module):
    def __init__(self, input_size):
        super(PrecisionNet, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_size, 32), nn.ReLU(), nn.Dropout(0.5),
            nn.Linear(32, 1)
        )
    def forward(self, x): return self.model(x)

# Ballon d'Or Model
bdo_model = PrecisionNet(X_train_bdo_t.shape[1])
pos_weight_bdo = torch.tensor([(y_train_bdo == 0).sum() / (y_train_bdo == 1).sum()]) 
criterion_bdo = nn.BCEWithLogitsLoss(pos_weight=pos_weight_bdo)
optimizer_bdo = optim.Adam(bdo_model.parameters(), lr=0.0005, weight_decay=0.01)

for epoch in range(150):
    bdo_model.train(); optimizer_bdo.zero_grad()
    loss = criterion_bdo(bdo_model(X_train_bdo_t), y_train_bdo_t)
    loss.backward(); optimizer_bdo.step()
print("‚úÖ Ballon d'Or Model Trained.")

# UCL Model
class UCLNet(nn.Module):
    def __init__(self, input_size):
        super(UCLNet, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_size, 64), nn.ReLU(), nn.Dropout(0.4),
            nn.Linear(64, 32), nn.ReLU(), nn.Dropout(0.4),
            nn.Linear(32, 1)
        )
    def forward(self, x): return self.model(x)

ucl_model = UCLNet(X_train_ucl_t.shape[1])
pos_weight_ucl = torch.tensor([(y_train_ucl == 0).sum() / (y_train_ucl == 1).sum()])
criterion_ucl = nn.BCEWithLogitsLoss(pos_weight=pos_weight_ucl)
optimizer_ucl = optim.Adam(ucl_model.parameters(), lr=0.001, weight_decay=1e-4)

for epoch in range(100):
    ucl_model.train(); optimizer_ucl.zero_grad()
    loss = criterion_ucl(ucl_model(X_train_ucl_t), y_train_ucl_t)
    loss.backward(); optimizer_ucl.step()
print("‚úÖ UCL Model Trained.")


# ==============================================================================
# 3. PREDICT 2026 WINNERS
# ==============================================================================
print("\n\n--- Running Live Predictions for 2025-2026 Season ---")

def fix_mojibake(text):
    if not isinstance(text, str): return text
    try: return text.encode('latin-1').decode('utf-8')
    except: return text

def normalize_text(text):
    if not isinstance(text, str): return str(text)
    # 1. Fix Mojibake first (critical)
    text = fix_mojibake(text)
    # 2. Normalize accents (√© -> e)
    text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('utf-8')
    # 3. Lowercase and remove spaces/dots
    return text.lower().replace(" ", "").replace("-", "").replace(".", "")

try:
    # 1. Load Data
    data_path = '../data/'
    # Use utf-8 by default, fallback to latin1 if needed
    try:
        df_2026 = pd.read_csv(os.path.join(data_path, 'master_dataset_2026.csv'), encoding='utf-8')
    except UnicodeDecodeError:
        df_2026 = pd.read_csv(os.path.join(data_path, 'master_dataset_2026.csv'), encoding='latin1')

    # 2. Text Repair & Cleaning
    for col in ['Player', 'Squad', 'Nation']:
        if col in df_2026.columns:
            df_2026[col] = df_2026[col].apply(fix_mojibake)

    # 3. Rename columns safely to match training features
    rename_map_2026 = { 
        'xG': 'xG_player', 'xAG': 'xAG_player', 
        'Rk': 'Rk_team', 'Pts': 'Pts', 
        'Min': 'Min_league', 'Gls': 'Gls_league', 'Ast': 'Ast_league', 
        'UCL_Progress': 'UCL_progress' 
    }
    
    for source, target in rename_map_2026.items():
        if source in df_2026.columns and target not in df_2026.columns:
            df_2026.rename(columns={source: target}, inplace=True)

    # 4. Fill Missing Values
    for c in ['Gls_ucl', 'Ast_ucl']: 
        if c in df_2026.columns: df_2026[c] = df_2026[c].fillna(0)
    
    if 'UCL_progress' in df_2026.columns:
        df_2026['UCL_progress'] = df_2026['UCL_progress'].fillna('Did Not Qualify')

    # 5. Engineer Features
    df_2026 = engineer_features(df_2026)
    
    if 'UCL_progress' in df_2026.columns:
        df_2026['UCL_Progress_Rank'] = df_2026['UCL_progress'].str.strip().map(progress_mapping).fillna(7)
    else:
        df_2026['UCL_Progress_Rank'] = 7
    
    for col in features:
        if col not in df_2026.columns: df_2026[col] = 0
        df_2026[col] = pd.to_numeric(df_2026[col], errors='coerce')
    
    X_live = df_2026[features].fillna(0)
    
    # --- Ballon d'Or Prediction ---
    X_live_b_sc = scaler.transform(X_live)
    bdo_model.eval()
    with torch.no_grad():
        df_2026['DL_Ballon_Prob'] = torch.sigmoid(bdo_model(torch.tensor(X_live_b_sc, dtype=torch.float32))).numpy()
    
    print(f"\nüèÜ Top 10 Ballon d'Or Predictions (Deep Learning):")
    # Clean output for display
    display(df_2026.sort_values(by='DL_Ballon_Prob', ascending=False)[['Player', 'Squad', 'Gls_league', 'Gls_ucl', 'DL_Ballon_Prob']].head(10))

    # --- UCL Winner Prediction ---
    if 'UCL_progress' in df_2026.columns:
        ucl_2026 = df_2026[df_2026['UCL_progress'] != 'Did Not Qualify'].copy()
    else:
        ucl_2026 = pd.DataFrame()

    if not ucl_2026.empty:
        X_live_u = ucl_2026[features].fillna(0)
        X_live_u_sc = scaler.transform(X_live_u) 
        
        ucl_model.eval()
        with torch.no_grad():
            ucl_2026['Player_Win_Prob'] = torch.sigmoid(ucl_model(torch.tensor(X_live_u_sc, dtype=torch.float32))).numpy()
        
        team_probs = ucl_2026.groupby('Squad')['Player_Win_Prob'].mean().sort_values(ascending=False)
        print(f"\nüéØ Top 10 UCL Winner Predictions (Deep Learning):")
        display(team_probs.head(10).to_frame(name="Win Probability"))
    else:
        print("\n‚ö†Ô∏è No active UCL teams found in 2026 data.")

except Exception as e:
    print(f"Error during prediction: {e}")

--- DeepBallonNet: Unified Deep Learning Training & Prediction Pipeline ---
‚úÖ Historical data loaded.

--- Training Models ---
‚úÖ Ballon d'Or Model Trained.
‚úÖ UCL Model Trained.


--- Running Live Predictions for 2025-2026 Season ---

üèÜ Top 10 Ballon d'Or Predictions (Deep Learning):


Unnamed: 0,Player,Squad,Gls_league,Gls_ucl,DL_Ballon_Prob
788,Kylian Mbapp√©,Real Madrid,18,9.0,0.918411
1680,Harry Kane,Bayern Munich,19,5.0,0.890247
175,Erling Haaland,Manchester City,19,6.0,0.878959
1771,Michael Olise,Bayern Munich,7,1.0,0.878731
868,Marcus Rashford,Barcelona,2,4.0,0.801916
506,Juli√°n √Ålvarez,Atl√©tico Madrid,7,4.0,0.789034
1949,Pierre-Emerick Aubameyang,Marseille,5,3.0,0.773372
2096,Mason Greenwood,Marseille,11,3.0,0.771654
1580,Luis D√≠az,Bayern Munich,8,3.0,0.759569
1269,Lautaro Mart√≠nez,Inter,9,4.0,0.746094



üéØ Top 10 UCL Winner Predictions (Deep Learning):


Unnamed: 0_level_0,Win Probability
Squad,Unnamed: 1_level_1
Bayern Munich,0.084766
Barcelona,0.082033
Arsenal,0.080971
Inter,0.073158
Paris S-G,0.026088
Liverpool,0.023618
Dortmund,0.021943
Manchester City,0.020153
Real Madrid,0.017614
Marseille,0.01631
