<a href="https://colab.research.google.com/github/AtfastrSlushyMaker/pl-standings-prediction-project/blob/main/notebooks/algorithms/XGBoost/xgboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ⚽ Premier League Team Performance Prediction — XGBoost


In [None]:
# Standard library
from pathlib import Path

# Third-party: numerics, plotting, and the gradient-boosting estimator
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from xgboost import XGBRegressor

# Notebook-wide display configuration: grid-backed seaborn plots and
# up to 50 columns shown when a DataFrame is printed.
sns.set_style("whitegrid")
pd.set_option("display.max_columns", 50)

## Load Data

In [None]:
# Load dataset
# The notebook may be started from the repository root, from a nested
# notebooks/ directory, or from Colab (/content), so probe each candidate
# location in order and use the first file that exists.
candidate_paths = [
    Path('data/processed/team_season_aggregated.csv'),
    Path('../data/processed/team_season_aggregated.csv'),
    Path('../../data/processed/team_season_aggregated.csv'),
    Path('../../../data/processed/team_season_aggregated.csv'),
    Path('/content/team_season_aggregated.csv')
]
agg_path = next((p for p in candidate_paths if p.exists()), None)
if agg_path is None:
    raise FileNotFoundError('team_season_aggregated.csv not found. Run preprocessing first.')
print(f"✅ Loading dataset: {agg_path}")
df = pd.read_csv(agg_path)
print("Shape:", df.shape)
print("Seasons:", sorted(df['Season'].unique()))
print(df.head())

# Feature set (inputs)
feature_cols = [
    'Team_encoded', 'Season_encoded',
    'Wins', 'Draws', 'Losses',
    'Goals_Scored', 'Goals_Conceded', 'Goal_Difference',
    'Avg_Goals_Scored', 'Avg_Goals_Conceded',
    'Total_Shots', 'Total_Shots_On_Target', 'Avg_Shots', 'Avg_Shots_On_Target',
    'Shot_Accuracy', 'Clean_Sheets', 'Clean_Sheet_Rate',
    'Yellow_Cards', 'Red_Cards', 'Fouls', 'Corners',
    'Win_Rate', 'Home_Win_Rate', 'Away_Win_Rate', 'Points_Per_Game'
]

# Targets (performance metrics to predict)
# Adjust this list if your dataset lacks any of these columns
target_cols = [
    'Points', 'Points_Per_Game', 'Wins', 'Goals_Scored', 'Goals_Conceded', 'Goal_Difference', 'Win_Rate'
]

# Validate target availability
missing_targets = [t for t in target_cols if t not in df.columns]
if missing_targets:
    raise ValueError(f"Missing target columns in dataset: {missing_targets}")

# Validate feature availability the same way, so a schema mismatch is reported
# with the full list of missing columns instead of a bare KeyError from indexing.
missing_features = [c for c in feature_cols if c not in df.columns]
if missing_features:
    raise ValueError(f"Missing feature columns in dataset: {missing_features}")

# NOTE(review): every target except 'Points' is also listed in feature_cols, so
# for those targets the model receives the label itself as an input. Surface
# this explicitly so the per-target metrics are not over-interpreted.
leaky_targets = [t for t in target_cols if t in feature_cols]
if leaky_targets:
    print(f"⚠️ Targets that are also input features (label leakage — interpret their metrics with care): {leaky_targets}")

X = df[feature_cols].copy()
print(f"Features: {len(feature_cols)} | Targets: {len(target_cols)}")

## Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out the 2024-25 season as the test set; every other season is training data.
test_mask = df['Season'].eq('2024-25')
train_mask = df['Season'].ne('2024-25')

X_train = X.loc[train_mask]
X_test = X.loc[test_mask]

# Ground-truth arrays keyed by target name, one dict per split.
y_train_dict = {name: df.loc[train_mask, name].values for name in target_cols}
y_test_dict = {name: df.loc[test_mask, name].values for name in target_cols}

print(f"Training samples: {len(X_train)} | Test samples: {len(X_test)}")

# The anchor target drives hyperparameter tuning and early stopping.
# Prefer points-per-game; otherwise fall back to the first listed target.
if 'Points_Per_Game' in target_cols:
    anchor_target = 'Points_Per_Game'
else:
    anchor_target = target_cols[0]

# Carve a 15% validation slice out of the training seasons (seeded random split).
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train,
    y_train_dict[anchor_target],
    test_size=0.15,
    random_state=42,
)
print(f"Internal train: {len(X_tr)} | Validation: {len(X_val)} | Anchor target: {anchor_target}")

## Hyperparameter Tuning (Randomized Search)
We focus on key boosting + regularization parameters to balance performance and overfitting control.

In [None]:
from sklearn.model_selection import RandomizedSearchCV
import scipy.stats as st
import time

# Base estimator cloned for every sampled configuration.
xgb_base = XGBRegressor(
    n_estimators=1000,   # fixed tree budget per CV fit; this search runs without early stopping
    objective='reg:squarederror',
    booster='gbtree',
    tree_method='hist',  # fast on tabular
    verbosity=0,
    random_state=42,
)

# Search space. scipy's uniform(loc, scale) covers [loc, loc + scale];
# randint(low, high) draws integers in [low, high).
param_distributions = dict(
    learning_rate=st.uniform(0.01, 0.25),
    max_depth=st.randint(3, 9),
    min_child_weight=st.randint(1, 8),
    subsample=st.uniform(0.6, 0.4),
    colsample_bytree=st.uniform(0.6, 0.4),
    gamma=st.uniform(0, 0.6),
    reg_lambda=st.uniform(0.5, 2.5),
    reg_alpha=st.uniform(0, 0.4),
)

n_iter = 40
print(f"Starting RandomizedSearchCV (n_iter={n_iter}) on anchor target: {anchor_target}...")

# 5-fold CV scored by (negated) MAE, parallelised over all available cores.
search = RandomizedSearchCV(
    xgb_base,
    param_distributions,
    n_iter=n_iter,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)

start = time.time()
search.fit(X_train, y_train_dict[anchor_target])
elapsed = time.time() - start

# Keep the winning configuration for the per-target training cell.
best_params = search.best_params_

print(f"✅ Search complete in {elapsed/60:.1f} min")
print("Best MAE:", -search.best_score_)  # sklearn maximises, so the score is negated MAE
print("Best Params:")
for param_name, param_value in best_params.items():
    print(f"  {param_name}: {param_value}")

## Train Final Models with Early Stopping (Per Target)

In [None]:
# Train one XGBoost model per target using best_params and early stopping.
# For each target: (1) fit a probe model on the internal train split with early
# stopping against the internal validation split, using THAT target's labels,
# to choose the number of trees; (2) refit on all training seasons with that
# tree count; (3) store predictions for the training and test seasons.
import xgboost
print(f"XGBoost version: {xgboost.__version__}")

MAX_BOOST_ROUNDS = 5000        # upper bound on trees for the early-stopping probe
EARLY_STOPPING_ROUNDS = 75     # patience: rounds without validation-MAE improvement
FALLBACK_N_ESTIMATORS = 800    # used only when the probe reports no best iteration

# XGBoost 1.6 introduced `early_stopping_rounds` as an estimator constructor
# argument and later releases stopped accepting it in fit(). Pick the API by
# version instead of swallowing TypeError, which silently disabled early stopping.
_XGB_VERSION = tuple(int(part) for part in xgboost.__version__.split('.')[:2])

# Keyword arguments shared by the probe and final models. n_estimators is set
# explicitly per model, so drop it from the tuned parameters if present.
shared_kwargs = dict(
    objective='reg:squarederror',
    tree_method='hist',
    booster='gbtree',
    random_state=42,
    verbosity=0,
    eval_metric='mae',
    **{k: v for k, v in best_params.items() if k != 'n_estimators'}
)


def _select_n_estimators(X_fit, y_fit, X_eval, y_eval):
    """Return the tree count chosen by early stopping, or None if none is reported.

    Fits a probe XGBRegressor on (X_fit, y_fit) with up to MAX_BOOST_ROUNDS trees
    and stops once MAE on (X_eval, y_eval) has not improved for
    EARLY_STOPPING_ROUNDS rounds.
    """
    if _XGB_VERSION >= (1, 6):
        probe = XGBRegressor(
            n_estimators=MAX_BOOST_ROUNDS,
            early_stopping_rounds=EARLY_STOPPING_ROUNDS,
            **shared_kwargs
        )
        probe.fit(X_fit, y_fit, eval_set=[(X_eval, y_eval)], verbose=False)
    else:
        probe = XGBRegressor(n_estimators=MAX_BOOST_ROUNDS, **shared_kwargs)
        probe.fit(
            X_fit, y_fit,
            eval_set=[(X_eval, y_eval)],
            early_stopping_rounds=EARLY_STOPPING_ROUNDS,
            verbose=False
        )
    best_iter = getattr(probe, 'best_iteration', None)
    if isinstance(best_iter, (int, np.integer)) and best_iter >= 0:
        # best_iteration is a 0-based index, so the tree count is one more.
        return int(best_iter) + 1
    return None


xgb_models = {}
train_preds = {}
test_preds = {}

for target in target_cols:
    print("\n" + "-"*80)
    print(f"Training target: {target}")

    # X_tr / X_val keep df's row index after the shuffled split, so look the
    # labels up by index; positional slicing would pair rows with wrong labels.
    y_tr_target = df.loc[X_tr.index, target].values
    y_val_target = df.loc[X_val.index, target].values

    n_estimators_final = _select_n_estimators(X_tr, y_tr_target, X_val, y_val_target)
    if n_estimators_final is None:
        n_estimators_final = best_params.get('n_estimators', FALLBACK_N_ESTIMATORS)
        print(f"  No best iteration reported; using {n_estimators_final} trees")
    else:
        print(f"  Early stopping selected {n_estimators_final} trees")

    # Refit on the full training data for this target with the chosen tree count.
    model_final = XGBRegressor(n_estimators=n_estimators_final, **shared_kwargs)
    model_final.fit(X_train, y_train_dict[target])

    xgb_models[target] = model_final
    train_preds[target] = model_final.predict(X_train)
    test_preds[target]  = model_final.predict(X_test)

print("\n✅ Trained models for:", ", ".join(xgb_models.keys()))

## Evaluation Metrics (Per Target)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# --- Recover prerequisites if cells were executed out of order ---------------
# Without target_cols, fall back to the targets that already have predictions.
if 'target_cols' not in globals():
    has_predictions = 'train_preds' in globals() and isinstance(train_preds, dict) and len(train_preds) > 0
    if not has_predictions:
        raise NameError("target_cols is not defined. Please run the data loading and training cells first.")
    target_cols = list(train_preds.keys())
    print(f"Inferred target_cols from trained models: {target_cols}")

# Without the ground-truth dicts, rebuild them from df and the season masks.
truth_ready = 'y_train_dict' in globals() and 'y_test_dict' in globals()
if not truth_ready:
    rebuild_possible = all(name in globals() for name in ('df', 'train_mask', 'test_mask'))
    if not rebuild_possible:
        raise NameError("Ground truth not available. Run 'Load Data' and 'Train-Test Split' cells.")
    known_targets = [t for t in target_cols if t in df.columns]
    y_train_dict = {t: df.loc[train_mask, t].values for t in known_targets}
    y_test_dict = {t: df.loc[test_mask, t].values for t in known_targets}
    missing_truth = [t for t in target_cols if t not in y_train_dict]
    if missing_truth:
        print(f"⚠️ Missing ground truth for targets: {missing_truth}. They will be skipped in metrics.")

# Filter to targets present in both predictions and truth
available_targets = []
for t in target_cols:
    has_preds = t in train_preds and t in test_preds
    has_truth = t in y_train_dict and t in y_test_dict
    if has_preds and has_truth:
        available_targets.append(t)
if not available_targets:
    raise RuntimeError("No common targets available for evaluation. Train models first.")


def _regression_scores(y_true, y_pred):
    """Return (MAE, RMSE, R²) for one set of truths and predictions."""
    return (
        mean_absolute_error(y_true, y_pred),
        np.sqrt(mean_squared_error(y_true, y_pred)),
        r2_score(y_true, y_pred),
    )


# One record per target with train and test scores side by side.
metric_columns = ['Target', 'MAE_Train', 'MAE_Test', 'RMSE_Train', 'RMSE_Test', 'R2_Train', 'R2_Test']
records = []
for t in available_targets:
    mae_train, rmse_train, r2_train = _regression_scores(y_train_dict[t], train_preds[t])
    mae_test, rmse_test, r2_test = _regression_scores(y_test_dict[t], test_preds[t])
    records.append({
        'Target': t,
        'MAE_Train': mae_train, 'MAE_Test': mae_test,
        'RMSE_Train': rmse_train, 'RMSE_Test': rmse_test,
        'R2_Train': r2_train, 'R2_Test': r2_test,
    })

# Lowest test MAE first.
metrics_df = pd.DataFrame(records, columns=metric_columns).sort_values('MAE_Test')

rule = "="*80
print(rule)
print("XGBOOST TEAM PERFORMANCE — METRICS BY TARGET")
print(rule)
print(metrics_df.to_string(index=False, float_format=lambda x: f"{x:.3f}"))
print(rule)

## Feature Importance 

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Use anchor target model for interpretability
anchor_model = xgb_models[anchor_target]
booster = anchor_model.get_booster()

# 'gain' and 'weight' importances as reported by the booster; features absent
# from a score dict are given 0 below.
importance_gain = booster.get_score(importance_type='gain')
importance_weight = booster.get_score(importance_type='weight')

imp_df = pd.DataFrame({
    'Feature': booster.feature_names,
    'Gain': [importance_gain.get(name, 0) for name in booster.feature_names],
    'Weight': [importance_weight.get(name, 0) for name in booster.feature_names],
}).sort_values('Gain', ascending=False)

top15 = imp_df.head(15)

# Text bar chart: each bar is scaled to at most 25 blocks relative to the largest gain.
print(f"Top 15 features by Gain for target: {anchor_target}")
for feature_name, gain_value in zip(top15['Feature'], top15['Gain']):
    bar = '█' * int((gain_value/max(imp_df['Gain'].max(), 1e-9))*25)
    print(f"{feature_name:25s} Gain={gain_value:.2f} {bar}")

# Side-by-side horizontal bar charts: gain (left) and split frequency (right).
fig, axes = plt.subplots(1,2, figsize=(14,6))
panels = [
    ('Gain', 'steelblue', f'Top 15 (Gain) – {anchor_target}', 'Gain'),
    ('Weight', 'darkgreen', f'Top 15 (Split Frequency) – {anchor_target}', 'Weight'),
]
for ax, (column, color, title, xlabel) in zip(axes, panels):
    ax.barh(top15['Feature'], top15[column], color=color)
    ax.invert_yaxis()  # highest-ranked feature at the top
    ax.set_title(title)
    ax.set_xlabel(xlabel)
plt.tight_layout()
plt.show()

## 2024-25 Test Season — Team Performance Predictions

In [None]:
# Build a predicted league table for the 2024-25 test season and compare it
# with the actual final positions.
# Teams are ranked by the model's predicted Points (Points_Per_Game as a
# fallback); test_preds is filled by the per-target training cell.
rank_target = next((t for t in ('Points', 'Points_Per_Game') if t in test_preds), None)
if rank_target is None:
    raise KeyError("No 'Points' or 'Points_Per_Game' predictions in test_preds; train the models first.")

test_df = df[df['Season'] == '2024-25'].copy()
if test_df.empty:
    raise ValueError("No 2024-25 rows found in df; nothing to rank.")
# Predictions were produced for the same 2024-25 rows in the same order, so the
# lengths must agree before they are attached positionally.
if len(test_preds[rank_target]) != len(test_df):
    raise ValueError(
        f"Prediction count ({len(test_preds[rank_target])}) does not match 2024-25 rows ({len(test_df)})."
    )
test_df['Raw_Prediction'] = test_preds[rank_target]

# Higher predicted points => better (closer to 1); stable sort keeps ties in dataset order.
ranked = test_df.sort_values('Raw_Prediction', ascending=False, kind='stable').reset_index(drop=True)
ranked['Predicted_Position'] = range(1, len(ranked)+1)
# Positive error: the team finished lower than predicted; negative: higher.
# NOTE(review): assumes df has 'Final_Position' and 'Team' columns — they are not validated at load time.
ranked['Error'] = ranked['Final_Position'] - ranked['Predicted_Position']

show_cols = ['Final_Position','Predicted_Position','Error','Raw_Prediction','Team','Points','Wins','Goal_Difference']
output = ranked.sort_values('Final_Position')[show_cols].copy()
output.columns = ['Actual','Predicted','Error','RawPred','Team','Pts','W','GD']
print("="*90)
print("XGBOOST PREDICTED STANDINGS 2024-25 (Test Season)")
print("="*90)
print(output.to_string(index=False))

# Position-accuracy summary; denominators use the actual number of teams.
n_teams = len(output)
mae = output['Error'].abs().mean()
perfect = (output['Error']==0).sum()
within1 = (output['Error'].abs()<=1).sum()
within2 = (output['Error'].abs()<=2).sum()
print("\nSummary:")
print(f" MAE: {mae:.2f}")
print(f" Perfect: {perfect}/{n_teams}")
print(f" ±1: {within1}/{n_teams} ({within1/n_teams*100:.0f}%)")
print(f" ±2: {within2}/{n_teams} ({within2/n_teams*100:.0f}%)")
print("="*90)

## 2025-26 Season Forecast — Team Performance

In [None]:
# Forecast team performance for next season using historical average profiles.
# For every team present in 2024-25, average its per-season statistics over all
# of its rows in df, feed that profile to each per-target model, and tabulate
# the predictions.

# Hypothetical code for 2025-26.
# NOTE(review): confirm this matches the Season_encoded scheme used in preprocessing.
NEXT_SEASON_CODE = 26
ENCODING_COLS = ('Team_encoded', 'Season_encoded')

current_teams = df[df['Season'] == '2024-25']['Team'].unique()
if len(current_teams) == 0:
    raise ValueError("No 2024-25 rows found in df; cannot build the 2025-26 forecast.")

# Select the statistic columns by name rather than by position, so reordering
# feature_cols cannot cause an encoding column to be averaged by mistake.
stat_cols = [c for c in feature_cols if c not in ENCODING_COLS]

forecast_rows = []
for team in current_teams:
    hist = df[df['Team'] == team]
    row = {
        'Team': team,
        'Team_encoded': hist['Team_encoded'].iloc[0],
        'Season_encoded': NEXT_SEASON_CODE
    }
    row.update(hist[stat_cols].mean().to_dict())
    row['Seasons_Used'] = len(hist)
    forecast_rows.append(row)
forecast_features = pd.DataFrame(forecast_rows)
X_forecast = forecast_features[feature_cols]

# Predict all targets
forecast_preds = {t: xgb_models[t].predict(X_forecast) for t in target_cols}
forecast_df = pd.DataFrame({'Team': forecast_features['Team'], **forecast_preds})
# Order by projected Points when that target exists; otherwise keep team order
# (sorting on a missing column would raise KeyError).
if 'Points' in forecast_df.columns:
    forecast_df = forecast_df.sort_values('Points', ascending=False)
forecast_df = forecast_df.reset_index(drop=True)

print("PREDICTED TEAM PERFORMANCE — 2025-26")
print("="*100)
cols_to_show = ['Team'] + [t for t in ['Points','Points_Per_Game','Wins','Goals_Scored','Goals_Conceded','Goal_Difference','Win_Rate'] if t in forecast_df.columns]
print(forecast_df[cols_to_show].to_string(index=False, float_format=lambda x: f"{x:.2f}"))
print("="*100)

print("\nInsights:")
if 'Points' in forecast_df.columns:
    print(' Top projected teams (by Points):', ', '.join(forecast_df.head(5)['Team']))
if 'Goals_Scored' in forecast_df.columns:
    top_attack = forecast_df.sort_values('Goals_Scored', ascending=False).head(5)['Team']
    print(' Best projected attacks:', ', '.join(top_attack))
if 'Goals_Conceded' in forecast_df.columns:
    best_def = forecast_df.sort_values('Goals_Conceded').head(5)['Team']
    print(' Best projected defenses (fewest GA):', ', '.join(best_def))

print("\nMethod: historical averages -> model(s) -> per-team metric forecasts.")

---
## Summary – XGBoost for Team Performance

### What we predict
- Points, Points per Game, Wins, Goals Scored/Conceded, Goal Difference, Win Rate

### Workflow
1. Load engineered team-season data  
2. Time-aware split (train: historical, test: 2024-25)  
3. Randomized search on anchor target (default: Points per Game)  
4. Train one model per target with early stopping and refit  
5. Evaluate per-target MAE / RMSE / R²  
6. Interpret (Gain/Weight importance; SHAP analysis is not included in this notebook)  
7. 2024-25 predictions (actual vs predicted)  
8. 2025-26 team performance forecast

### Regularization & Overfitting Control
- Structural: max_depth, min_child_weight, gamma  
- Stochastic: subsample, colsample_bytree  
- Penalty: reg_alpha (L1), reg_lambda (L2)  
- Procedural: early stopping on validation MAE

### Outputs
- Per-target metrics table (train/test)  
- Per-team predicted metrics for 2024-25  
- Next-season (2025-26) forecast table  
- Feature importance for the anchor target

### Next steps
- Calibrate uncertainty (quantile GBR / conformal intervals)  
- Add domain priors (home/away splits, rolling form)  
- Ensembling with RF/LightGBM for incremental lift  
- Promotion team adjustment via similarity to nearest historical peers

Artifacts: `xgb_models` (dict of trained models), `metrics_df`, `output` (2024-25 predicted standings table), `forecast_df`.
---