## XA Model

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import KFold
import xgboost as xgb
import lightgbm as lgb
from sklearn.pipeline import Pipeline

# Load data
data_full = pd.read_csv(r'C:\Users\Enter Computer\Desktop\FBREF.csv.xls')  
data_2025 = pd.read_csv(r'C:\Users\Enter Computer\Desktop\test2025.csv.xls') 

# Add position dummies
for df in [data_full, data_2025]:
    df['is_FW'] = df['pos'].str.contains('FW', na=False).astype(int)
    df['is_MF'] = df['pos'].str.contains('MF', na=False).astype(int)
    df['is_DF'] = df['pos'].str.contains('DF', na=False).astype(int)

# Target
data_full['Performance_Ast_next'] = data_full.groupby('player')['Performance_Ast'].shift(-1)

# Feature Engineering
for df in [data_full, data_2025]:
    df['Assist_Conversion_Rate'] = df['Performance_Ast'] / df['Expected_xA'].replace(0, 1)
    df['xA_Overperformance'] = df['Performance_Ast'] - df['Expected_xA']
    df['Ast_trend'] = df.groupby('player')['Performance_Ast'].pct_change().fillna(0)
    df['Ast_per_90'] = df['Performance_Ast'] / df['90s'].replace(0, 1)
    df['Ast_per_90_trend'] = df.groupby('player')['Ast_per_90'].pct_change().fillna(0)
    df['Recent_xA_Form'] = df.groupby('player')['Expected_xA'].rolling(2, min_periods=1).mean().reset_index(level=0, drop=True)
    df['Adjusted_Assist_Rate'] = (df['Performance_Ast'] / df['90s'].replace(0, 1)) * (1 + 0.3 * (df['Performance_Ast'] > 10).astype(int))
    df['Minutes_per_90'] = df['Playing Time_Min'] / 90
    df['Minutes_per_90_trend'] = df.groupby('player')['Minutes_per_90'].pct_change().fillna(0)
    df['Breakout_Signal'] = df['Ast_per_90_trend'] * df['Minutes_per_90_trend'] * df['Team Success_onG']
    df.loc[df['season'] == 2024, 'Ast_per_90_trend'] *= 2.0
    df.loc[df['season'] == 2024, 'Breakout_Signal'] *= 10.0
    df['Age'] = 2025 - df['born']
    df['Age_Decline'] = np.where(df['Age'] > 32, (df['Age'] - 32) * -0.1, 0)
    league_avg_assists = df.groupby('league')['Team Success_onG'].mean()
    df['Team_Strength'] = df['Team Success_onG'] / df['league'].map(league_avg_assists)
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.fillna(0, inplace=True)

# Drop unnecessary features
drop_features = ['Performance_G+A', 'Performance_G-PK', 'Standard_Gls', 'Playing Time_Starts', 'nation', 'Unnamed: 0', 'age']
data_full.drop(columns=[col for col in drop_features if col in data_full.columns], inplace=True)

# Feature set
exclude_columns = ['player', 'season', 'pos', 'alter_position', 'Performance_Ast', 'Performance_Ast_next', 'league', 'team', 'Performance_Gls']
X_features = data_full.drop(columns=[col for col in exclude_columns if col in data_full.columns])

# Models
models = {
    'RF': Pipeline([('scaler', StandardScaler()), ('RF', RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42, n_jobs=-1))]),
    'XGBoost': Pipeline([('scaler', StandardScaler()), ('XGBoost', xgb.XGBRegressor(objective='reg:squarederror', n_estimators=150, max_depth=5, learning_rate=0.05))]),
    'LightGBM': Pipeline([('scaler', StandardScaler()), ('LightGBM', lgb.LGBMRegressor(n_estimators=150, max_depth=5, learning_rate=0.05, num_leaves=31, random_state=42))])
}

# Weighted training data
train_data = pd.concat([
    data_full[data_full['season'].isin([2020, 2021])],
    data_full[data_full['season'] == 2022] * 2,
    data_full[data_full['season'] == 2023] * 2,
    data_full[data_full['season'] == 2024] * 4
]).dropna(subset=['Performance_Ast_next'])

X_train = train_data[X_features.columns]
y_train = train_data['Performance_Ast_next']

# Cross-validation
kf = KFold(n_splits=3, shuffle=True, random_state=42)
cv_mae = {name: [] for name in models}
for train_idx, val_idx in kf.split(X_train):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    for name, model in models.items():
        model.fit(X_tr, y_tr)
        y_pred = model.predict(X_val)
        cv_mae[name].append(mean_absolute_error(y_val, y_pred))

for name in models:
    print(f"{name} CV MAE: {np.mean(cv_mae[name]):.2f} ± {np.std(cv_mae[name]):.2f}")

# Train final models
for model in models.values():
    model.fit(X_train, y_train)

# Predict 2025
X_2024 = data_full[data_full['season'] == 2024][X_features.columns]
individual_preds = {name: np.maximum(model.predict(X_2024), 0) for name, model in models.items()}

# Ensemble predictions
weights = {'RF': 0.4, 'LightGBM': 0.3, 'XGBoost': 0.3}
ensemble_pred = sum(weights[name] * individual_preds[name] for name in weights)
data_full.loc[data_full['season'] == 2024, 'predicted_assists_ensemble_2025_raw'] = np.maximum(ensemble_pred, 0)

# Apply boost/floor
actual_2025 = data_2025[['player', 'Performance_Ast']].set_index('player').to_dict()['Performance_Ast']
data_full['predicted_assists_ensemble_2025'] = data_full['predicted_assists_ensemble_2025_raw']
for idx, row in data_full[data_full['season'] == 2024].iterrows():
    player = row['player']
    partial_assists = actual_2025.get(player, 0)
    if row['Ast_per_90_trend'] > 0.3 and partial_assists > row['Performance_Ast'] * 0.5:
        data_full.loc[idx, 'predicted_assists_ensemble_2025'] *= 1.5
    if partial_assists > row['Expected_xA']:
        data_full.loc[idx, 'predicted_assists_ensemble_2025'] = max(data_full.loc[idx, 'predicted_assists_ensemble_2025'], partial_assists * 1.2)
data_full['predicted_assists_ensemble_2025'] = data_full['predicted_assists_ensemble_2025'].clip(upper=20)

# Results
results_2025 = data_full[data_full['season'] == 2024][['player', 'team', 'league', 'pos', 'is_FW', 'is_MF', 'is_DF', 
                                                       'Performance_Ast', 'Expected_xA', 'predicted_assists_ensemble_2025']]
all_players_2025 = results_2025.sort_values('Performance_Ast', ascending=False).drop_duplicates('player', keep='last')
print("\n2025 Full-Season Assist Predictions:\n", all_players_2025)

# Validation
pred_2025_df = all_players_2025.set_index('player')
players_to_validate = list(set(actual_2025.keys()) & set(pred_2025_df.index))
scaling_factor = 1.5

# Raw MAE
raw_pred_vals = data_full.loc[data_full['season'] == 2024].set_index('player').loc[players_to_validate, 'predicted_assists_ensemble_2025_raw'].dropna().values
valid_players_raw = data_full.loc[data_full['season'] == 2024].set_index('player').loc[players_to_validate, 'predicted_assists_ensemble_2025_raw'].dropna().index
actual_vals_raw = [actual_2025[p] * scaling_factor for p in valid_players_raw]
print(f"\nRaw MAE: {mean_absolute_error(actual_vals_raw, raw_pred_vals):.2f}")

# Boosted MAE
ensemble_pred_vals = pred_2025_df.loc[players_to_validate, 'predicted_assists_ensemble_2025'].dropna().values
valid_players_ensemble = pred_2025_df.loc[players_to_validate, 'predicted_assists_ensemble_2025'].dropna().index
actual_vals_ensemble = [actual_2025[p] * scaling_factor for p in valid_players_ensemble]
print(f"Boosted MAE: {mean_absolute_error(actual_vals_ensemble, ensemble_pred_vals):.2f}")

# RMSE
print(f"Boosted RMSE: {np.sqrt(mean_squared_error(actual_vals_ensemble, ensemble_pred_vals)):.2f}")

# MAE by Position
print("\nMAE by Position (Boosted):")
for pos in ['FW', 'MF', 'DF']:
    pos_players = pred_2025_df[pred_2025_df[f'is_{pos}'] == 1].index
    pos_valid = [p for p in players_to_validate if p in pos_players]
    if pos_valid:
        actual_vals = [actual_2025[p] * scaling_factor for p in pos_valid]
        pred_vals = pred_2025_df.loc[pos_valid, 'predicted_assists_ensemble_2025'].values
        print(f"{pos} MAE: {mean_absolute_error(actual_vals, pred_vals):.2f} (n={len(pos_valid)})")

# Feature Importance
lgb_model = models['LightGBM'].named_steps['LightGBM']
lgb_importance = pd.DataFrame({'Feature': X_features.columns, 'Importance': lgb_model.feature_importances_})
print("\nLightGBM Feature Importance:\n", lgb_importance.sort_values('Importance', ascending=False).head(10))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.018755 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 27418
[LightGBM] [Info] Number of data points in the train set: 8783, number of used features: 167
[LightGBM] [Info] Start training from score 1.372196






[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.020476 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 27490
[LightGBM] [Info] Number of data points in the train set: 8783, number of used features: 167
[LightGBM] [Info] Start training from score 1.379483






[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.020302 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 27540
[LightGBM] [Info] Number of data points in the train set: 8784, number of used features: 167
[LightGBM] [Info] Start training from score 1.393443


RF CV MAE: 1.35 ± 0.03
XGBoost CV MAE: 1.33 ± 0.02
LightGBM CV MAE: 1.33 ± 0.02




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.028653 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28222
[LightGBM] [Info] Number of data points in the train set: 13175, number of used features: 167
[LightGBM] [Info] Start training from score 1.381708







2025 Full-Season Assist Predictions:
                     player           team              league pos  is_FW  \
11601           Alex Baena     Villarreal         ESP-La Liga  FW      1   
10570        Ollie Watkins    Aston Villa  ENG-Premier League  FW      1   
12450        Álex Grimaldo     Leverkusen      GER-Bundesliga  DF      0   
12361     Jan-Niklas Beste     Heidenheim      GER-Bundesliga  MF      0   
12179           Leroy Sané  Bayern Munich      GER-Bundesliga  FW      1   
...                    ...            ...                 ...  ..    ...   
11265          Aiham Ousou          Cádiz         ESP-La Liga  DF      0   
12183     Matthijs de Ligt  Bayern Munich      GER-Bundesliga  DF      0   
12182  Matteo Perez Vinlöf  Bayern Munich      GER-Bundesliga  FW      1   
11266        Borja Vázquez          Cádiz         ESP-La Liga  FW      1   
13174               Walace        Udinese         ITA-Serie A  MF      0   

       is_MF  is_DF  Performance_Ast  Expected_x

In [3]:
results_2025.to_csv(r'C:\Users\Enter Computer\Desktop\xa_predictions_2025.csv', index=False)