In [2]:
import pandas as pd
import numpy as np

# Load the master dataset
file_path = '../data/master_dataset_2024.csv'
master_df = pd.read_csv(file_path)

# --- CORRECTED & MORE ROBUST METHOD ---
# We now use a list of tuples, with (Player, Squad)
top_candidates_tuples = [
    ('Rodri', 'Manchester City'),
    ('Vinicius Júnior', 'Real Madrid'),
    ('Jude Bellingham', 'Real Madrid'),
    ('Dani Carvajal', 'Real Madrid'),
    ('Lautaro Martínez', 'Inter'), # Note: FBref uses 'Inter' not 'Inter Milan'
    ('Toni Kroos', 'Real Madrid'),
    ('Kylian Mbappé', 'Paris S-G'), # Note: FBref uses 'Paris S-G'
    ('Harry Kane', 'Bayern Munich'),
    ('Phil Foden', 'Manchester City'),
    ('Lamine Yamal', 'Barcelona')
]

# Create a boolean series that is 'True' for any row that matches one of our tuples
# This is a more advanced but much more accurate way to find the players
conditions = pd.Series(False, index=master_df.index)
for player, squad in top_candidates_tuples:
    conditions |= ((master_df['Player'] == player) & (master_df['Squad'] == squad))

# Create the target variable column based on these conditions
master_df['Top_Candidate'] = np.where(conditions, 1, 0)

print("Target variable 'Top_Candidate' created using Player and Squad matching.")
print("Number of top candidates found in dataset:", master_df['Top_Candidate'].sum())

# Display the final, correct list of players found
display(master_df[master_df['Top_Candidate'] == 1][[
    'Player', 'Squad', 'Gls_league', 'Gls_ucl', 'UCL_Progress', 'Top_Candidate'
]])

Target variable 'Top_Candidate' created using Player and Squad matching.
Number of top candidates found in dataset: 10


Unnamed: 0,Player,Squad,Gls_league,Gls_ucl,UCL_Progress,Top_Candidate
186,Phil Foden,Manchester City,19,0.0,Quarter Finals,1
458,Rodri,Manchester City,8,0.0,Quarter Finals,1
642,Jude Bellingham,Real Madrid,19,0.0,Winner,1
674,Dani Carvajal,Real Madrid,4,0.0,Winner,1
846,Vinicius Júnior,Real Madrid,15,0.0,Winner,1
857,Toni Kroos,Real Madrid,1,0.0,Winner,1
1182,Lamine Yamal,Barcelona,5,0.0,Did Not Qualify,1
1548,Lautaro Martínez,Inter,24,0.0,Did Not Qualify,1
2023,Harry Kane,Bayern Munich,36,0.0,Semi FInals,1
2654,Kylian Mbappé,Paris S-G,27,0.0,Did Not Qualify,1


In [4]:
# --- Final Debugging Script: Fuzzy Search ---

print("--- Running a fuzzy search for the missing players ---")

# Simple search terms that are guaranteed to be in their names
search_terms = ['Mbapp', 'Harry']

for term in search_terms:
    print(f"\nSearching for any player name containing '{term}'...")
    
    # Use .str.contains() for a fuzzy search (case=False makes it case-insensitive)
    results = master_df[master_df['Player'].str.contains(term, case=False, na=False)]
    
    if not results.empty:
        # Loop through all potential matches found
        for index, row in results.iterrows():
            exact_player_name = row['Player']
            exact_squad_name = row['Squad']
            print(f"  --> FOUND A MATCH!")
            print(f"      EXACT Player Name: '{exact_player_name}'")
            print(f"      EXACT Squad Name:  '{exact_squad_name}'")
    else:
        print(f"  --> No player found containing the term '{term}'.")

--- Running a fuzzy search for the missing players ---

Searching for any player name containing 'Mbapp'...
  --> FOUND A MATCH!
      EXACT Player Name: 'Ethan Mbappé'
      EXACT Squad Name:  'Paris S-G'
  --> FOUND A MATCH!
      EXACT Player Name: 'Kylian Mbappé'
      EXACT Squad Name:  'Paris S-G'

Searching for any player name containing 'Harry'...
  --> FOUND A MATCH!
      EXACT Player Name: 'Harry Maguire'
      EXACT Squad Name:  'Manchester Utd'
  --> FOUND A MATCH!
      EXACT Player Name: 'Harry Toffolo'
      EXACT Squad Name:  'Nott'ham Forest'
  --> FOUND A MATCH!
      EXACT Player Name: 'Harry Wilson'
      EXACT Squad Name:  'Fulham'
  --> FOUND A MATCH!
      EXACT Player Name: 'Harry Kane'
      EXACT Squad Name:  'Bayern Munich'


In [6]:
# Run this cell to see all the column names in your master_df
print(master_df.columns.tolist())

['Rk_x', 'Player', 'Nation', 'Pos', 'Squad', 'Age', 'Born', 'MP_x', 'Starts', 'Min_league', '90s', 'Gls_league', 'Ast_league', 'G+A', 'G-PK', 'PK', 'PKatt', 'CrdY', 'CrdR', 'xG_x', 'npxG', 'xAG_league', 'npxG+xAG', 'PrgC', 'PrgP', 'PrgR', 'Gls.1', 'Ast.1', 'G+A.1', 'G-PK.1', 'G+A-PK', 'xG.1', 'xAG.1', 'xG+xAG', 'npxG.1', 'npxG+xAG.1', 'Matches', 'League', 'Rk_y', 'MP_y', 'W', 'D', 'L', 'GF', 'GA', 'GD', 'Pts', 'Pts/MP', 'xG_y', 'xGA', 'xGD', 'xGD/90', 'Attendance', 'Top Team Scorer', 'Goalkeeper', 'Notes', 'Last 5', 'Gls_ucl', 'Ast_ucl', 'Min_ucl', 'xG', 'xAG_ucl', 'UCL_Progress', 'Top_Candidate', 'UCL_Progress_Rank']


In [9]:
from sklearn.model_selection import train_test_split

# Convert the 'UCL_Progress' column into a numerical rank
progress_mapping = {
    'Winner': 1, 'Runner-up': 2, 'Semifinals': 3, 'Quarterfinals': 4,
    'Round of 16': 5, 'Group Stage': 6, 'Did Not Qualify': 7
}
master_df['UCL_Progress_Rank'] = master_df['UCL_Progress'].map(progress_mapping)

# --- CORRECTED: Using the exact column names from your DataFrame ---
features_to_use = [
    'Age', 
    'Min_league', 
    'Gls_league', 
    'Ast_league', 
    'xG_x',  # Correct name for player's league xG
    'xAG_league',
    'Gls_ucl', 
    'Ast_ucl', 
    'Min_ucl',
    'Rk_y',  # Correct name for team's final league rank
    'Pts',   # Correct name for team's league points
    'UCL_Progress_Rank'
]

# Create our feature matrix (X) and target vector (y)
X = master_df[features_to_use].fillna(0)
y = master_df['Top_Candidate']

# Split the data into a training set and a testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print("Data has been successfully prepared and split for training!")

Data has been successfully prepared and split for training!


In [10]:
import xgboost as xgb
from sklearn.metrics import classification_report

# Initialize the XGBoost Classifier
# The 'scale_pos_weight' parameter helps the model focus on our rare top candidates
scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
model = xgb.XGBClassifier(objective='binary:logistic', scale_pos_weight=scale_pos_weight, random_state=42)

# Train the model on the training data
model.fit(X_train, y_train)
print("Ballon d'Or prediction model has been trained successfully!")

# --- Evaluation ---
# Use the trained model to make predictions on the test data
predictions = model.predict(X_test)

# Generate a performance report
report = classification_report(y_test, predictions, target_names=['Not a Candidate (0)', 'Top Candidate (1)'])

print("\n--- Model Evaluation Report ---")
print(report)

Ballon d'Or prediction model has been trained successfully!

--- Model Evaluation Report ---
                     precision    recall  f1-score   support

Not a Candidate (0)       1.00      0.99      1.00       569
  Top Candidate (1)       0.00      0.00      0.00         2

           accuracy                           0.99       571
          macro avg       0.50      0.50      0.50       571
       weighted avg       0.99      0.99      0.99       571



In [14]:
from sklearn.metrics import classification_report

# Get the raw prediction probabilities instead of just 0s and 1s
# This gives us a score for each player from 0.0 to 1.0
probabilities = model.predict_proba(X_test)[:, 1] # Get probabilities for the "Top Candidate" class

# --- Set a new, lower threshold ---
# Instead of 0.5, let's use 0.1
new_threshold = 0.01
new_predictions = (probabilities >= new_threshold).astype(int)

# --- Generate the new report ---
print(f"--- New Evaluation Report with a {new_threshold} Threshold ---")
new_report = classification_report(y_test, new_predictions, target_names=['Not a Candidate (0)', 'Top Candidate (1)'])
print(new_report)

--- New Evaluation Report with a 0.01 Threshold ---
                     precision    recall  f1-score   support

Not a Candidate (0)       1.00      0.99      0.99       569
  Top Candidate (1)       0.14      0.50      0.22         2

           accuracy                           0.99       571
          macro avg       0.57      0.74      0.61       571
       weighted avg       1.00      0.99      0.99       571

