In [1]:
#import libraries 

import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import (accuracy_score, f1_score, roc_auc_score,
                             mean_squared_error, mean_absolute_error, r2_score)
import xgboost as xgb
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

# Load cleaned data
stats = pd.read_csv('cleaned_stats.csv')
results = pd.read_csv('cleaned_results.csv')


In [4]:
# 3.1 Problem Formulation & Target Creation 

# We need to compute final standings from results for each season
# Create a table of points per team per season from results
def compute_season_points(df):
    # Home points
    home = df.groupby(['season', 'home_team']).agg(
        home_wins=('result', lambda x: (x=='H').sum()),
        home_draws=('result', lambda x: (x=='D').sum()),
        home_goals_for=('home_goals', 'sum'),
        home_goals_against=('away_goals', 'sum')
    ).reset_index().rename(columns={'home_team': 'team'})

    # Away points
    away = df.groupby(['season', 'away_team']).agg(
        away_wins=('result', lambda x: (x=='A').sum()),
        away_draws=('result', lambda x: (x=='D').sum()),
        away_goals_for=('away_goals', 'sum'),
        away_goals_against=('home_goals', 'sum')
    ).reset_index().rename(columns={'away_team': 'team'})

    # Merge and compute totals
    standings = home.merge(away, on=['season', 'team'], how='outer').fillna(0)
    standings['wins'] = standings['home_wins'] + standings['away_wins']
    standings['draws'] = standings['home_draws'] + standings['away_draws']
    standings['losses'] = 38 - standings['wins'] - standings['draws']
    standings['points'] = 3*standings['wins'] + standings['draws']
    standings['goals_for'] = standings['home_goals_for'] + standings['away_goals_for']
    standings['goals_against'] = standings['home_goals_against'] + standings['away_goals_against']
    standings['goal_diff'] = standings['goals_for'] - standings['goals_against']

    # Rank within season
    standings = standings.sort_values(['season', 'points', 'goal_diff', 'goals_for'],
                                       ascending=[True, False, False, False])
    standings['rank'] = standings.groupby('season').cumcount() + 1
    return standings

standings = compute_season_points(results)

# Merge with stats to get features and targets together
# stats already contains wins, draws, losses, goals, goals_conceded, etc.
# We'll use stats for features and also add targets from standings where needed.
# But note: stats has aggregated numbers that match the season totals (should be consistent)
# We'll join on team and season
stats['season'] = stats['season'].astype(str)
standings['season'] = standings['season'].astype(str)

df = stats.merge(standings[['season', 'team', 'rank', 'points', 'goals_for', 'goals_against',
                            'wins', 'draws', 'losses']],
                 on=['season', 'team'], how='left', suffixes=('', '_target'))

# Verify consistency (optional)
# assert (df['wins'] == df['wins_target']).all()  # might fail if stats are from a different source; we'll trust stats.

# Create binary targets
df['top4'] = (df['rank'] <= 4).astype(int)
df['relegated'] = (df['rank'] >= 18).astype(int)  # 20 teams, bottom 3

# Also create targets for the regression tasks (already present in stats or we can use stats)
# Goals scored, goals conceded, clean sheets, shots on target, big chances missed, total passes, touches
# But note: some targets like big chances missed are only in stats (big_chance_missed)
# We'll keep stats columns as targets directly for those tasks.
# However, we must ensure we don't use the same stats as features for the same season (data leakage).
# We will use only features from *previous* seasons. That requires creating lagged features.




In [5]:
# 3.2 Feature Engineering (Lagged Features) 



# Sort by team and season
df = df.sort_values(['team', 'season_start'])

# Create lag features for each numeric stat (excluding targets we want to predict)
feature_cols = [c for c in stats.columns if c not in
                ['team', 'season', 'season_start', 'wins', 'losses', 'draws',
                 'points', 'goal_diff', 'total_matches', 'strength', 'cluster']]
# Also exclude the targets we'll predict separately (to avoid leakage)
# We'll create lags for all numeric features.

# For each team, shift numeric features by 1 season
lagged = df.groupby('team')[feature_cols].shift(1).add_suffix('_lag1')
df = pd.concat([df, lagged], axis=1)

# Also create 2-season moving averages for stability
for col in feature_cols:
    df[col + '_ma2'] = df.groupby('team')[col].transform(lambda x: x.rolling(2, min_periods=1).mean().shift(1))

# Drop rows where no lag features exist (first season for each team)
df = df.dropna(subset=df.filter(like='_lag1').columns, how='all').reset_index(drop=True)


In [6]:
# 3.3 Define Training and Test Sets (Temporal Split)


# Use seasons up to 2014 for training, 2015 for validation, 2016-2017 for test
train = df[df['season_start'] <= 2014]
valid = df[df['season_start'] == 2015]
test = df[df['season_start'] >= 2016]

print(f"Train: {train.shape}, Valid: {valid.shape}, Test: {test.shape}")

# Features: all lag columns + any static team info? We can one-hot encode team.
# But team identity may be important; we'll include as categorical feature.
# We'll use Label Encoding for tree models, or one-hot for linear.

# Prepare feature matrix (X) and target vectors for each task
# We'll define a function to train/evaluate each task

Train: (144, 138), Valid: (19, 138), Test: (38, 138)


In [8]:
# 3.4 Helper Functions for Modeling  

def get_X_y(df, target_col, classification=True):
    """Return features and target for given target column."""
    # Features: all lag1 and ma2 columns, plus team encoded
    feature_pattern = ['_lag1$', '_ma2$']  # columns ending with these
    feat_cols = [c for c in df.columns if any(p in c for p in feature_pattern)]
    # Also add team as categorical (we'll encode later)
    X = df[feat_cols].copy()
    # Add team label encoding (simple)
    le = LabelEncoder()
    X['team_encoded'] = le.fit_transform(df['team'])
    y = df[target_col].copy()
    if classification and y.dtype not in ['int64', 'int32']:
        y = y.astype(int)
    return X, y, feat_cols

def evaluate_regression(y_true, y_pred, model_name):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    print(f"{model_name} - MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.3f}")
    return {'MAE': mae, 'RMSE': rmse, 'R2': r2}

def evaluate_classification(y_true, y_pred, y_prob, model_name):
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='weighted')
    auc = roc_auc_score(y_true, y_prob[:, 1]) if y_prob is not None else np.nan
    print(f"{model_name} - Acc: {acc:.3f}, F1: {f1:.3f}, AUC: {auc:.3f}")
    return {'Accuracy': acc, 'F1': f1, 'AUC': auc}

# Define models
reg_models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(alpha=1.0),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    'XGBoost': xgb.XGBRegressor(n_estimators=100, random_state=42, verbosity=0),
    'LightGBM': lgb.LGBMRegressor(n_estimators=100, random_state=42, verbose=-1)
}

clf_models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    'XGBoost': xgb.XGBClassifier(n_estimators=100, random_state=42, verbosity=0),
    'LightGBM': lgb.LGBMClassifier(n_estimators=100, random_state=42, verbose=-1)
}


In [10]:
# 3.5 Train Models for Each Target


# Define all targets with their type and column name in df
targets = [
    {'name': 'top4', 'type': 'classification', 'col': 'top4'},
    {'name': 'relegated', 'type': 'classification', 'col': 'relegated'},
    {'name': 'goals_scored', 'type': 'regression', 'col': 'goals'},
    {'name': 'goals_conceded', 'type': 'regression', 'col': 'goals_conceded'},
    {'name': 'wins', 'type': 'regression', 'col': 'wins'},          # but wins are already in stats; we'll predict wins from lagged features
    {'name': 'draws', 'type': 'regression', 'col': 'draws'},
    {'name': 'losses', 'type': 'regression', 'col': 'losses'},
    {'name': 'clean_sheets', 'type': 'regression', 'col': 'clean_sheet'},
    {'name': 'shots_on_target', 'type': 'regression', 'col': 'ontarget_scoring_att'},
    {'name': 'big_chances_missed', 'type': 'regression', 'col': 'big_chance_missed'},
    {'name': 'total_passes', 'type': 'regression', 'col': 'total_pass'},
    {'name': 'touches', 'type': 'regression', 'col': 'touches'}
]

# For multi-output (wins, draws, losses) we'll treat as separate regressions,
# but could also use multi-output regressor. We'll keep separate for simplicity.

results_summary = {}

for target in targets:
    print(f"\n--- Target: {target['name']} ---")
    X_train, y_train, feat_list = get_X_y(train, target['col'], classification=(target['type']=='classification'))
    X_val, y_val, _ = get_X_y(valid, target['col'], classification=(target['type']=='classification'))

    # Standardize features for linear models only (trees don't need scaling)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    if target['type'] == 'classification':
        models = clf_models
        best_model = None
        best_auc = 0
        for name, model in models.items():
            model.fit(X_train_scaled if name in ['Logistic Regression'] else X_train, y_train)
            y_pred = model.predict(X_val_scaled if name in ['Logistic Regression'] else X_val)
            y_prob = model.predict_proba(X_val_scaled if name in ['Logistic Regression'] else X_val) if hasattr(model, 'predict_proba') else None
            metrics = evaluate_classification(y_val, y_pred, y_prob, name)
            if metrics['AUC'] > best_auc:
                best_auc = metrics['AUC']
                best_model = name
        results_summary[target['name']] = {'best_model': best_model, 'val_auc': best_auc}
    else:
        models = reg_models
        best_model = None
        best_rmse = np.inf
        for name, model in models.items():
            model.fit(X_train_scaled if name in ['Linear Regression', 'Ridge'] else X_train, y_train)
            y_pred = model.predict(X_val_scaled if name in ['Linear Regression', 'Ridge'] else X_val)
            metrics = evaluate_regression(y_val, y_pred, name)
            if metrics['RMSE'] < best_rmse:
                best_rmse = metrics['RMSE']
                best_model = name
        results_summary[target['name']] = {'best_model': best_model, 'val_rmse': best_rmse}

print("\nSummary of best models on validation set:")
for k, v in results_summary.items():
    print(f"{k}: {v}")

# We'll save the best models and scalers for later evaluation on test set.
# For simplicity, we'll retrain on combined train+valid later.



--- Target: top4 ---
Logistic Regression - Acc: 0.789, F1: 0.697, AUC: 0.650
Random Forest - Acc: 0.684, F1: 0.684, AUC: 0.683
XGBoost - Acc: 0.684, F1: 0.684, AUC: 0.542
LightGBM - Acc: 0.737, F1: 0.747, AUC: 0.583

--- Target: relegated ---
Logistic Regression - Acc: 0.842, F1: 0.770, AUC: 0.354
Random Forest - Acc: 0.789, F1: 0.789, AUC: 0.510
XGBoost - Acc: 0.789, F1: 0.789, AUC: 0.573
LightGBM - Acc: 0.789, F1: 0.743, AUC: 0.615

--- Target: goals_scored ---
Linear Regression - MAE: 11.37, RMSE: 13.42, R2: -0.059
Ridge - MAE: 11.37, RMSE: 13.41, R2: -0.058
Random Forest - MAE: 14.09, RMSE: 17.60, R2: -0.821
XGBoost - MAE: 15.41, RMSE: 19.00, R2: -1.123
LightGBM - MAE: 11.74, RMSE: 15.59, R2: -0.430

--- Target: goals_conceded ---
Linear Regression - MAE: 9.34, RMSE: 11.81, R2: -0.103
Ridge - MAE: 9.33, RMSE: 11.80, R2: -0.102
Random Forest - MAE: 10.94, RMSE: 13.68, R2: -0.480
XGBoost - MAE: 11.35, RMSE: 13.74, R2: -0.492
LightGBM - MAE: 10.25, RMSE: 12.96, R2: -0.328

--- Target