In [2]:
import pandas as pd

# Load the 2023-2024 player stats CSV (raw per-match columns plus rolling_avg_* columns).
df = pd.read_csv("data/full_player_stats_rollings_2023_2024.csv")

In [3]:
# Preview the first five rows to sanity-check the load.
df.head()

Unnamed: 0,Player,#,Nation,Pos,Age,Min,Performance_Gls,Performance_Ast,Performance_PK,Performance_PKatt,...,rolling_avg_SCA_SCA,rolling_avg_SCA_GCA,rolling_avg_Passes_Cmp,rolling_avg_Passes_Att,rolling_avg_Passes_CmpPct,rolling_avg_Passes_PrgP,rolling_avg_Carries_Carries,rolling_avg_Carries_PrgC,rolling_avg_Take-Ons_Att,rolling_avg_Take-Ons_Succ
0,Callum Hudson-Odoi,14.0,eng ENG,LW,22-315,70,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Callum Hudson-Odoi,14.0,eng ENG,"WB,LW",22-320,35,0,0,0,0,...,3.0,0.0,18.0,25.0,72.0,2.0,21.0,6.0,2.0,2.0
2,Callum Hudson-Odoi,14.0,eng ENG,LW,22-328,59,0,0,0,0,...,2.5,0.0,20.5,25.5,80.25,3.0,22.0,5.5,1.5,1.5
3,Callum Hudson-Odoi,14.0,eng ENG,LW,22-334,65,0,0,0,0,...,2.666667,0.0,17.666667,23.333333,74.566667,2.333333,19.666667,5.333333,2.0,1.666667
4,Callum Hudson-Odoi,14.0,eng ENG,CM,23-018,29,0,0,0,0,...,2.5,0.0,19.0,24.5,76.45,2.0,20.25,4.5,3.0,2.0


In [4]:
# List every column name available for feature selection.
df.columns

Index(['Player', '#', 'Nation', 'Pos', 'Age', 'Min', 'Performance_Gls',
       'Performance_Ast', 'Performance_PK', 'Performance_PKatt',
       'Performance_Sh', 'Performance_SoT', 'Performance_CrdY',
       'Performance_CrdR', 'Performance_Touches', 'Performance_Tkl',
       'Performance_Int', 'Performance_Blocks', 'Expected_xG', 'Expected_npxG',
       'Expected_xAG', 'SCA_SCA', 'SCA_GCA', 'Passes_Cmp', 'Passes_Att',
       'Passes_CmpPct', 'Passes_PrgP', 'Carries_Carries', 'Carries_PrgC',
       'Take-Ons_Att', 'Take-Ons_Succ', 'match_id', 'event_time',
       'team_indicator', 'player_id', 'rolling_avg_Min',
       'rolling_avg_Performance_Gls', 'rolling_avg_Performance_Ast',
       'rolling_avg_Performance_PK', 'rolling_avg_Performance_PKatt',
       'rolling_avg_Performance_Sh', 'rolling_avg_Performance_SoT',
       'rolling_avg_Performance_CrdY', 'rolling_avg_Performance_CrdR',
       'rolling_avg_Performance_Touches', 'rolling_avg_Performance_Tkl',
       'rolling_avg_Performan

In [6]:


# --- 1. Calculate Target Variables (Match Goals) ---
# Goals per (match, team): add up the players' Performance_Gls inside every
# match_id / team_indicator pair and expose the total as 'Goals_Scored'.
match_goals = (
    df.groupby(['match_id', 'team_indicator'])['Performance_Gls']
    .sum()
    .reset_index()
    .rename(columns={'Performance_Gls': 'Goals_Scored'})
)

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import root_mean_squared_error, r2_score

In [None]:
# --- 2. Define Metrics and Positions to Aggregate ---

# Feature inputs: every column of df whose name carries the 'rolling_avg_' prefix
ROLLING_COLS = [name for name in df.columns if name.startswith('rolling_avg_')]

# Position labels that each get their own set of aggregated features.
# NOTE(review): the df.head() preview shows Pos values such as 'LW', 'CM' and
# 'WB,LW' -- confirm that 'FW', 'MF' and 'DF' actually occur in the Pos column.
POSITIONS = ['FW', 'MF', 'DF']

def create_match_features(df, rolling_cols, positions):
    """Aggregate player rolling metrics into one row per (match, team).

    For each label in ``positions``, the rows whose ``Pos`` value equals that
    label exactly are grouped by ``match_id`` and ``team_indicator`` and the
    mean of every column in ``rolling_cols`` is taken. The resulting columns
    are named ``{pos}_Avg_{col}`` and the per-position tables are outer-merged
    on the two key columns, so a team-match that lacks a position gets NaN in
    that position's columns.

    Args:
        df: Player-level DataFrame containing ``match_id``, ``team_indicator``,
            ``Pos`` and every column listed in ``rolling_cols``.
        rolling_cols: Names of the metric columns to average.
        positions: Position labels to aggregate. Repeated labels are used only
            once (first occurrence wins) so no duplicate columns are produced.

    Returns:
        DataFrame with ``match_id``, ``team_indicator`` and one
        ``{pos}_Avg_{col}`` column per (position, metric) pair. If
        ``positions`` is empty, an empty DataFrame holding only the two key
        columns is returned.

    Raises:
        KeyError: If a required column is missing from ``df``.
    """
    key_cols = ['match_id', 'team_indicator']
    rolling_cols = list(rolling_cols)

    # De-duplicate while preserving order; a repeated label would otherwise
    # yield clashing column names in the merge below.
    unique_positions = list(dict.fromkeys(positions))
    if not unique_positions:
        # Nothing to aggregate: hand back the key columns with zero rows
        # instead of failing on an empty list of per-position tables.
        return df[key_cols].iloc[0:0].reset_index(drop=True)

    # NOTE(review): matching is by exact equality, so a combined Pos value
    # (e.g. 'WB,LW') only matches if that exact string is in `positions`.
    team_match_df = None
    for pos in unique_positions:
        pos_agg = (
            df.loc[df['Pos'] == pos]
            .groupby(key_cols)[rolling_cols]
            .mean()
            .reset_index()
            .rename(columns={col: f'{pos}_Avg_{col}' for col in rolling_cols})
        )
        if team_match_df is None:
            team_match_df = pos_agg
        else:
            # Outer merge keeps team-matches that lack one of the positions
            team_match_df = pd.merge(team_match_df, pos_agg, on=key_cols, how='outer')

    return team_match_df

# Build the per-(match, team) positional feature table
team_match_df = create_match_features(df, ROLLING_COLS, POSITIONS)

# Attach each team's goal tally; the left join keeps every feature row
team_match_df = team_match_df.merge(
    match_goals,
    how='left',
    on=['match_id', 'team_indicator'],
)


# --- 3. Create Match-Level Row by Joining Team A and Team B ---

# Within every match, the team whose indicator sorts first becomes 'TeamA' and
# the other one 'TeamB'. The sides are assigned per match rather than from the
# first two indicator values of the whole dataset, so matches are kept no matter
# how many distinct team_indicator values exist.
teams = sorted(team_match_df['team_indicator'].unique())
if len(teams) < 2:
    raise ValueError("Need at least two teams for match-level joining.")

# Kept for any later cell that reads them; they coincide with the TeamA/TeamB
# assignment only when team_indicator has exactly two distinct values.
team_A_name = teams[0]
team_B_name = teams[1]

# Slot of each row inside its match: 0 for the lexicographically smaller
# indicator, 1 for the next one. Re-aligned to the original row order so the
# row order of the resulting frames does not change.
team_slot = (
    team_match_df.sort_values(['match_id', 'team_indicator'])
    .groupby('match_id')
    .cumcount()
    .reindex(team_match_df.index)
)

# Separate the two sides and prefix every column except the join key.
# Renaming by name (not by position) does not rely on 'match_id' being first.
team_A = (
    team_match_df[team_slot == 0]
    .drop(columns='team_indicator')
    .rename(columns=lambda col: col if col == 'match_id' else f'TeamA_{col}')
)
team_B = (
    team_match_df[team_slot == 1]
    .drop(columns='team_indicator')
    .rename(columns=lambda col: col if col == 'match_id' else f'TeamB_{col}')
)

# Merge the two teams into a single match row; a match with only one team row
# has no TeamB counterpart and is dropped by the inner join.
match_df = pd.merge(team_A, team_B, on='match_id', how='inner')

# --- 4. Finalize Target Variable ---
# The regression target is the combined goal count of both sides
match_df['Total_Match_Goals'] = match_df['TeamA_Goals_Scored'].add(match_df['TeamB_Goals_Scored'])

# Quick look at the assembled match-level table and its size
print("--- Aggregated Match Features (First 5 Rows) ---")
print(match_df.head(5))
print(f"Total Matches in Dataset: {match_df.shape[0]}")


# --- 5. Prepare Data for XGBoost ---

# Regression target: combined goals of both teams
Y = match_df['Total_Match_Goals']

# Every column except the match key and the goal columns is a feature
FEATURE_COLS = [
    name for name in match_df.columns
    if name not in ('match_id', 'TeamA_Goals_Scored', 'TeamB_Goals_Scored', 'Total_Match_Goals')
]
X = match_df[FEATURE_COLS]

# Difference features: for each TeamA_* column that has a TeamB_* counterpart,
# Diff_* = TeamA value - TeamB value (Team A's strength relative to Team B's).
# The columns are collected first and turned into a frame in one step.
diff_columns = {}
for team_a_col in X.columns:
    if not team_a_col.startswith('TeamA_'):
        continue
    team_b_col = team_a_col.replace('TeamA_', 'TeamB_')
    if team_b_col not in X.columns:
        continue
    diff_columns[team_a_col.replace('TeamA_', 'Diff_')] = X[team_a_col] - X[team_b_col]
diff_features = pd.DataFrame(diff_columns, index=X.index)

X = pd.concat([X, diff_features], axis=1)

# Random 80/20 split with a fixed seed (kept simple; a time-based split would
# be the better choice for real forecasting)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# --- 6. Train the XGBoost Regressor Model ---

print("\n--- Training XGBoost Model for Total Match Goals ---")
# Goals are counts, hence the Poisson objective; fit() returns the fitted estimator
xgb_model = XGBRegressor(
    n_estimators=100,
    max_depth=3,
    learning_rate=0.1,
    objective='count:poisson',
    random_state=42,
).fit(X_train, Y_train)
print("Training complete.")

# --- 7. Evaluation ---

# Predict on the held-out matches and round to whole goals before scoring
Y_pred = xgb_model.predict(X_test)
Y_pred_int = np.round(Y_pred).astype(int)

# root_mean_squared_error already returns the RMSE and has no `squared`
# parameter (that flag belongs to mean_squared_error); passing it raises
# TypeError, so it is called with the targets and predictions only.
rmse = root_mean_squared_error(Y_test, Y_pred_int)
r2 = r2_score(Y_test, Y_pred_int)

print("\n### Model Evaluation (Total Match Goals) ###")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R-squared (R2) Score: {r2:.4f}")

# Optional: Feature Importance
# Label the model's importance scores with the feature names and show the five largest
print("\nTop 5 Feature Importances:")
feature_importances = pd.Series(data=xgb_model.feature_importances_, index=X.columns)
print(feature_importances.nlargest(n=5))