In [3]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

def engineer_advanced_features(df):
    """
    Engineers advanced, narrative-driven features using the correct column names.
    """
    df_featured = df.copy()

    # 1. Trophy Impact Score
    # --- THIS IS THE FIX: Changed 'League_Rk' to 'Rk_team' ---
    trophy_score = (df_featured['Rk_team'] == 1).astype(int) * 2  # 2 points for league title
    trophy_score += (df_featured['UCL_progress'] == 'W').astype(int) * 3 # 3 points for UCL title
    trophy_score += (df_featured['UCL_progress'] == 'F').astype(int) * 1 # 1 point for UCL final
    df_featured['Trophy_Impact_Score'] = trophy_score

    # 2. "Big Game" Performance Score
    df_featured['Big_Game_Score'] = (df_featured['Gls_league'] * 1.0) + \
                                    (df_featured['Ast_league'] * 0.5) + \
                                    (df_featured['Gls_ucl'] * 2.0) + \
                                    (df_featured['Ast_ucl'] * 1.0)

    # 3. Individual Dominance Ratio
    df_featured['Dominance_Ratio'] = df_featured['Gls_league'] / df_featured['GF'].replace(0, 1)

    return df_featured

# ==============================================================================
# --- FINAL TRAINING SCRIPT WITH ADVANCED FEATURES & SMOTE ---
# ==============================================================================

# --- Load and prepare historical data ---
historical_df = pd.read_csv('../data/master_dataset_2011-2025.csv')

# --- Create the Target Variable (Top_Candidate) ---
ballon_dor_history = {
    '2024-2025': ['Ousmane Dembele', 'Lamine Yamal', 'Vitinha', 'Raphinha', 'Mohammed Salah', 'Kylian Mbappe', 'Achraf Hakimi', 'Desire Doue', 'Kvicha Kvaratskhelia', 'Nuno Mendes'],
    '2023-2024': ['Rodri', 'Vinícius Júnior', 'Jude Bellingham', 'Dani Carvajal', 'Lautaro Martinez', 'Toni Kroos', 'Kylian Mbappé', 'Harry Kane', 'Phil Foden', 'Lamine Yamal'],
    '2022-2023': ['Lionel Messi', 'Erling Haaland', 'Kylian Mbappé', 'Kevin De Bruyne', 'Rodri', 'Vinícius Júnior', 'Julián Álvarez', 'Victor Osimhen', 'Bernardo Silva', 'Luka Modrić'],
    '2021-2022': ['Karim Benzema', 'Sadio Mané', 'Kevin De Bruyne', 'Robert Lewandowski', 'Mohamed Salah', 'Kylian Mbappé', 'Thibaut Courtois', 'Vinícius Júnior', 'Luka Modrić', 'Erling Haaland'],
    '2020-2021': ['Lionel Messi', 'Robert Lewandowski', 'Jorginho', 'Karim Benzema', 'N\'Golo Kanté', 'Cristiano Ronaldo', 'Mohamed Salah', 'Kevin De Bruyne', 'Kylian Mbappé', 'Gianluigi Donnarumma'],
    '2018-2019': ['Lionel Messi', 'Virgil van Dijk', 'Cristiano Ronaldo', 'Sadio Mané', 'Mohamed Salah', 'Kylian Mbappé', 'Alisson', 'Robert Lewandowski', 'Bernardo Silva', 'Riyad Mahrez'],
    '2017-2018': ['Luka Modrić', 'Cristiano Ronaldo', 'Antoine Griezmann', 'Kylian Mbappé', 'Lionel Messi', 'Mohamed Salah', 'Raphaël Varane', 'Eden Hazard', 'Kevin De Bruyne', 'Harry Kane'],
    '2016-2017': ['Cristiano Ronaldo', 'Lionel Messi', 'Neymar', 'Gianluigi Buffon', 'Luka Modrić', 'Sergio Ramos', 'Kylian Mbappé', 'N\'Golo Kanté', 'Robert Lewandowski', 'Harry Kane'],
    '2015-2016': ['Cristiano Ronaldo', 'Lionel Messi', 'Antoine Griezmann', 'Luis Suárez', 'Neymar', 'Gareth Bale', 'Riyad Mahrez', 'Jamie Vardy', 'Gianluigi Buffon', 'Pepe'],
    '2014-2015': ['Lionel Messi', 'Cristiano Ronaldo', 'Neymar', 'Robert Lewandowski', 'Luis Suárez', 'Thomas Müller', 'Manuel Neuer', 'Eden Hazard', 'Andrés Iniesta', 'Alexis Sánchez'],
    '2013-2014': ['Cristiano Ronaldo', 'Lionel Messi', 'Manuel Neuer', 'Arjen Robben', 'Thomas Müller', 'Philipp Lahm', 'Neymar', 'James Rodríguez', 'Toni Kroos', 'Ángel Di María'],
    '2012-2013': ['Cristiano Ronaldo', 'Lionel Messi', 'Franck Ribéry', 'Zlatan Ibrahimović', 'Neymar', 'Andrés Iniesta', 'Robin van Persie', 'Arjen Robben', 'Gareth Bale', 'Andrea Pirlo'],
    '2011-2012': ['Lionel Messi', 'Cristiano Ronaldo', 'Andrés Iniesta', 'Xavi', 'Radamel Falcao', 'Iker Casillas', 'Andrea Pirlo', 'Didier Drogba', 'Robin van Persie', 'Zlatan Ibrahimović'],
    '2010-2011': ['Lionel Messi', 'Cristiano Ronaldo', 'Xavi', 'Andrés Iniesta', 'Wayne Rooney', 'Luis Suárez', 'Diego Forlán', 'Samuel Eto\'o', 'Iker Casillas', 'Neymar']
}
historical_df['Top_Candidate'] = 0
for season, players in ballon_dor_history.items():
    historical_df.loc[(historical_df['Season'] == season) & (historical_df['Player'].isin(players)), 'Top_Candidate'] = 1

# --- Apply Advanced Feature Engineering ---
historical_df_advanced = engineer_advanced_features(historical_df)
print("✅ Advanced features engineered successfully.")

# --- Prepare data for the model ---
progress_mapping = {'W': 1, 'F': 2, 'SF': 3, 'QF': 4, 'R16': 5, 'GR': 6, 'Did Not Qualify': 7}
historical_df_advanced['UCL_Progress_Rank'] = historical_df_advanced['UCL_progress'].str.strip().map(progress_mapping).fillna(7)

# Define our new, more powerful feature set (using correct column names)
features_advanced = [
    'Age', 'Min_league', 'Gls_league', 'Ast_league', 'xG', 'xAG', 
    'Gls_ucl', 'Ast_ucl', 'Min_ucl', 'Rk_team', 'Pts', 'UCL_Progress_Rank',
    'Trophy_Impact_Score', 'Big_Game_Score', 'Dominance_Ratio' # Our new features!
]
X = historical_df_advanced[features_advanced].fillna(0)
y = historical_df_advanced['Top_Candidate']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# --- Apply SMOTE for advanced data balancing ---
print("Applying SMOTE to balance the dataset...")
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
print(f"Original training set size: {len(X_train)}")
print(f"New balanced training set size with SMOTE: {len(X_train_smote)}")

# --- Train the final model on the balanced data ---
final_ballon_model = xgb.XGBClassifier(objective='binary:logistic', random_state=42, eval_metric='logloss')
final_ballon_model.fit(X_train_smote, y_train_smote)
print("✅ Final Ballon d'Or model trained successfully on balanced data!")

# --- Evaluate the final model ---
final_predictions = final_ballon_model.predict(X_test)
final_report = classification_report(y_test, final_predictions, target_names=['Not a Candidate (0)', 'Top Candidate (1)'])

print("\n--- Final Model Evaluation Report (with Advanced Features & SMOTE) ---")
print(final_report)

✅ Advanced features engineered successfully.
Applying SMOTE to balance the dataset...
Original training set size: 33234
New balanced training set size with SMOTE: 66256
✅ Final Ballon d'Or model trained successfully on balanced data!

--- Final Model Evaluation Report (with Advanced Features & SMOTE) ---
                     precision    recall  f1-score   support

Not a Candidate (0)       1.00      1.00      1.00      8282
  Top Candidate (1)       0.28      0.26      0.27        27

           accuracy                           1.00      8309
          macro avg       0.64      0.63      0.63      8309
       weighted avg       1.00      1.00      1.00      8309

