## XG Model

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
import xgboost as xgb
import lightgbm as lgb
from sklearn.pipeline import Pipeline

# Load data
data_full = pd.read_csv(r'C:\Users\Enter Computer\Desktop\FBREF.csv.xls')  
data_2025 = pd.read_csv(r'C:\Users\Enter Computer\Desktop\test2025.csv.xls') 

# Add position dummies
for df in [data_full, data_2025]:
    df['is_FW'] = df['pos'].str.contains('FW', na=False).astype(int)
    df['is_MF'] = df['pos'].str.contains('MF', na=False).astype(int)
    df['is_DF'] = df['pos'].str.contains('DF', na=False).astype(int)

# Target
data_full['Performance_Gls_next'] = data_full.groupby('player')['Performance_Gls'].shift(-1)

# Feature Engineering
for df in [data_full, data_2025]:
    df['Goal_Conversion_Rate'] = df['Performance_Gls'] / df['Standard_SoT'].replace(0, 1)
    df['xG_Overperformance'] = df['Performance_Gls'] - df['Expected_npxG']
    df['Shot_Accuracy'] = df['Standard_SoT'] / df['Expected_npxG'].replace(0, 1)
    df['Gls_trend'] = df.groupby('player')['Performance_Gls'].pct_change().fillna(0)
    df['Gls_per_90'] = df['Performance_Gls'] / df['90s'].replace(0, 1)
    df['Gls_per_90_trend'] = df.groupby('player')['Gls_per_90'].pct_change().fillna(0)
    df['Recent_xG_Form'] = df.groupby('player')['Expected_npxG'].rolling(2, min_periods=1).mean().reset_index(level=0, drop=True)
    df['Adjusted_Goal_Rate'] = (df['Performance_Gls'] / df['90s'].replace(0, 1)) * (1 + 0.3 * (df['Performance_Gls'] > 20).astype(int))
    df['Minutes_per_90'] = df['Playing Time_Min'] / 90
    df['Minutes_per_90_trend'] = df.groupby('player')['Minutes_per_90'].pct_change().fillna(0)
    df['Breakout_Signal'] = df['Gls_per_90_trend'] * df['Minutes_per_90_trend'] * df['Team Success_onG']
    df.loc[df['season'] == 2024, 'Gls_per_90_trend'] *= 2.0
    df.loc[df['season'] == 2024, 'Breakout_Signal'] *= 10.0
    df['Age'] = 2025 - df['born']
    df['Age_Decline'] = np.where(df['Age'] > 32, (df['Age'] - 32) * -0.1, 0)
    league_avg_goals = df.groupby('league')['Team Success_onG'].mean()
    df['Team_Strength'] = df['Team Success_onG'] / df['league'].map(league_avg_goals)
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.fillna(0, inplace=True)

# Drop features
drop_features = ['Performance_G+A', 'Performance_G-PK', 'Standard_Gls', 'Playing Time_Starts', 'nation', 'Unnamed: 0', 'age']
data_full.drop(columns=[col for col in drop_features if col in data_full.columns], inplace=True)

# Feature set
exclude_columns = ['player', 'season', 'pos', 'alter_position', 'Performance_Gls', 'Performance_Gls_next', 'league', 'team']
X_features = data_full.drop(columns=[col for col in exclude_columns if col in data_full.columns])

# Models
models = {
    'RF': Pipeline([('scaler', StandardScaler()), ('RF', RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42, n_jobs=-1))]),
    'XGBoost': Pipeline([('scaler', StandardScaler()), ('XGBoost', xgb.XGBRegressor(objective='reg:squarederror', n_estimators=150, max_depth=5, learning_rate=0.05))]),
    'LightGBM': Pipeline([('scaler', StandardScaler()), ('LightGBM', lgb.LGBMRegressor(n_estimators=150, max_depth=5, learning_rate=0.05, num_leaves=31, random_state=42))])
}

# Weighted training
train_data = pd.concat([
    data_full[data_full['season'].isin([2020, 2021])],
    data_full[data_full['season'] == 2022] * 2,
    data_full[data_full['season'] == 2023] * 2,
    data_full[data_full['season'] == 2024] * 4
]).dropna(subset=['Performance_Gls_next'])

X_train = train_data[X_features.columns]
y_train = train_data['Performance_Gls_next']

# Cross-validation
kf = KFold(n_splits=3, shuffle=True, random_state=42)
cv_mae = {name: [] for name in models}
for train_idx, val_idx in kf.split(X_train):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    for name, model in models.items():
        model.fit(X_tr, y_tr)
        y_pred = model.predict(X_val)
        cv_mae[name].append(mean_absolute_error(y_val, y_pred))
for name in models:
    print(f"{name} - CV MAE: {np.mean(cv_mae[name]):.2f} ± {np.std(cv_mae[name]):.2f}")

# Train final models
for name, model in models.items():
    model.fit(X_train, y_train)

# Predict 2025
X_2024 = data_full[data_full['season'] == 2024][X_features.columns]
individual_preds = {name: np.maximum(model.predict(X_2024), 0) for name, model in models.items()}

# Assign predictions
for name, preds in individual_preds.items():
    data_full[f'predicted_goals_{name.lower()}_2025'] = np.nan
    data_full.loc[data_full['season'] == 2024, f'predicted_goals_{name.lower()}_2025'] = preds

# Ensemble
weights = {'RF': 0.4, 'LightGBM': 0.3, 'XGBoost': 0.3}
ensemble_pred = sum(weights[name] * individual_preds[name] for name in weights)
data_full['predicted_goals_ensemble_2025'] = np.nan
data_full.loc[data_full['season'] == 2024, 'predicted_goals_ensemble_2025'] = np.maximum(ensemble_pred, 0)

# Breakout boost and scaled floor
actual_2025 = data_2025[['player', 'Performance_Gls']].set_index('player').to_dict()['Performance_Gls']
for idx, row in data_full[data_full['season'] == 2024].iterrows():
    player = row['player']
    partial_goals = actual_2025.get(player, 0)
    if (row['Gls_per_90_trend'] > 0.3 and partial_goals > row['Performance_Gls'] * 0.5):
        data_full.loc[idx, 'predicted_goals_ensemble_2025'] *= 1.5
    if partial_goals > row['Expected_npxG']:
        data_full.loc[idx, 'predicted_goals_ensemble_2025'] = max(data_full.loc[idx, 'predicted_goals_ensemble_2025'], partial_goals * 1.5)
data_full['predicted_goals_ensemble_2025'] = data_full['predicted_goals_ensemble_2025'].clip(upper=40)

# Results
print("\n2025 Full-Season Predictions (All Players):")
results_2025 = data_full[data_full['season'] == 2024][['player', 'team', 'league', 'pos', 'is_FW', 'is_MF', 'is_DF', 
                                                       'Performance_Gls', 'Expected_npxG', 'predicted_goals_ensemble_2025']]
all_players_2025 = results_2025.sort_values('Performance_Gls', ascending=False).drop_duplicates('player', keep='last')
print(all_players_2025)

# Validation (Ensemble only)
pred_2025_df = all_players_2025.set_index('player')
players_to_validate = list(set(actual_2025.keys()) & set(pred_2025_df.index))

ensemble_pred_vals = pred_2025_df.loc[players_to_validate, 'predicted_goals_ensemble_2025'].dropna().values
valid_players_ensemble = pred_2025_df.loc[players_to_validate, 'predicted_goals_ensemble_2025'].dropna().index
actual_vals_ensemble = [actual_2025[p] for p in valid_players_ensemble]
mae_ensemble = mean_absolute_error(actual_vals_ensemble, ensemble_pred_vals)
print(f"\nMAE on Partial 2025 Data (All Players): {mae_ensemble:.2f}")

# MAE by Position
print("\nMAE by Position (Partial 2025 Data):")
for pos in ['FW', 'MF', 'DF']:
    pos_players = pred_2025_df[pred_2025_df[f'is_{pos}'] == 1].index
    pos_valid = [p for p in players_to_validate if p in pos_players]
    if pos_valid:
        actual_vals = [actual_2025[p] for p in pos_valid]
        pred_vals = pred_2025_df.loc[pos_valid, 'predicted_goals_ensemble_2025'].values
        mae_pos = mean_absolute_error(actual_vals, pred_vals)
        print(f"{pos} MAE: {mae_pos:.2f} (n={len(pos_valid)})")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.022853 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 27796
[LightGBM] [Info] Number of data points in the train set: 8783, number of used features: 169
[LightGBM] [Info] Start training from score 1.952522






[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.021388 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 27845
[LightGBM] [Info] Number of data points in the train set: 8783, number of used features: 169
[LightGBM] [Info] Start training from score 1.986907






[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.020404 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 27902
[LightGBM] [Info] Number of data points in the train set: 8784, number of used features: 169
[LightGBM] [Info] Start training from score 1.979508






RF - CV MAE: 1.86 ± 0.05
XGBoost - CV MAE: 1.83 ± 0.03
LightGBM - CV MAE: 1.83 ± 0.04
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.029219 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28605
[LightGBM] [Info] Number of data points in the train set: 13175, number of used features: 169
[LightGBM] [Info] Start training from score 1.972979







2025 Full-Season Predictions (All Players):
                       player             team              league pos  is_FW  \
12171              Harry Kane    Bayern Munich      GER-Bundesliga  FW      1   
12525         Serhou Guirassy        Stuttgart      GER-Bundesliga  FW      1   
10847          Erling Haaland  Manchester City  ENG-Premier League  FW      1   
12010           Kylian Mbappé        Paris S-G         FRA-Ligue 1  FW      1   
11331            Artem Dovbyk           Girona         ESP-La Liga  FW      1   
...                       ...              ...                 ...  ..    ...   
12053       Yaya Kader Fofana            Reims         FRA-Ligue 1  DF      0   
12052  Valentin Atangana Edoa            Reims         FRA-Ligue 1  MF      0   
12051            Thomas Foket            Reims         FRA-Ligue 1  DF      0   
12050        Thibault De Smet            Reims         FRA-Ligue 1  DF      0   
11850          Quentin Merlin        Marseille         FRA-Ligue

In [4]:
# Save results to CSV
output_path = r'C:\Users\Enter Computer\Desktop\xg_predictions_2025.csv'
all_players_2025.to_csv(output_path, index=False)
print(f"\nPredicted results saved to: {output_path}")


Predicted results saved to: C:\Users\Enter Computer\Desktop\predicted_goals_2025.csv
