In [1]:
# 04_Model_Building_and_Evaluation.ipynb

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (accuracy_score, f1_score, roc_auc_score,
                             mean_squared_error, mean_absolute_error, r2_score,
                             confusion_matrix, classification_report)
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import lightgbm as lgb
import shap
import joblib
import warnings
warnings.filterwarnings('ignore')

# Load the cleaned inputs and repeat feature engineering (same as in notebook 3).
# Order matters only for naming: the first file is the per-team season stats,
# the second is the match-level results.
stats, results = (
    pd.read_csv(csv_path)
    for csv_path in ('cleaned_stats.csv', 'cleaned_results.csv')
)

# Recreate standings and df (same steps as before)
def compute_season_points(df):
    """Build a per-season league table from match-level results.

    Args:
        df: Match results with columns ``season``, ``home_team``,
            ``away_team``, ``home_goals``, ``away_goals`` and ``result``
            (``'H'`` = home win, ``'D'`` = draw, ``'A'`` = away win).

    Returns:
        DataFrame with one row per (season, team) containing the home/away
        splits, the totals ``wins``, ``draws``, ``losses``, ``points``
        (3 per win, 1 per draw), ``goals_for``, ``goals_against``,
        ``goal_diff`` and ``rank`` (1 = top). Rows are sorted by season,
        then by descending points, goal difference and goals scored.
    """
    # Flag each outcome once so the group-bys below are plain vectorised sums
    # instead of per-group Python lambdas.
    matches = df.assign(
        _home_win=df['result'].eq('H'),
        _draw=df['result'].eq('D'),
        _away_win=df['result'].eq('A'),
    )

    home = (
        matches.groupby(['season', 'home_team'])
        .agg(
            home_wins=('_home_win', 'sum'),
            home_draws=('_draw', 'sum'),
            home_goals_for=('home_goals', 'sum'),
            home_goals_against=('away_goals', 'sum'),
            home_losses=('_away_win', 'sum'),
        )
        .reset_index()
        .rename(columns={'home_team': 'team'})
    )
    away = (
        matches.groupby(['season', 'away_team'])
        .agg(
            away_wins=('_away_win', 'sum'),
            away_draws=('_draw', 'sum'),
            away_goals_for=('away_goals', 'sum'),
            away_goals_against=('home_goals', 'sum'),
            away_losses=('_home_win', 'sum'),
        )
        .reset_index()
        .rename(columns={'away_team': 'team'})
    )

    # Outer merge keeps teams that only appear on one side; their missing
    # half is treated as zero.
    standings = home.merge(away, on=['season', 'team'], how='outer').fillna(0)
    standings['wins'] = standings['home_wins'] + standings['away_wins']
    standings['draws'] = standings['home_draws'] + standings['away_draws']
    # Count losses from the actual results rather than assuming every team
    # played exactly 38 matches; partial seasons or other league sizes would
    # otherwise get wrong (even negative) loss counts.
    standings['losses'] = standings['home_losses'] + standings['away_losses']
    # The per-venue loss counts are helpers only; drop them so the returned
    # columns stay the same as before.
    standings = standings.drop(columns=['home_losses', 'away_losses'])
    standings['points'] = 3 * standings['wins'] + standings['draws']
    standings['goals_for'] = standings['home_goals_for'] + standings['away_goals_for']
    standings['goals_against'] = standings['home_goals_against'] + standings['away_goals_against']
    standings['goal_diff'] = standings['goals_for'] - standings['goals_against']

    standings = standings.sort_values(
        ['season', 'points', 'goal_diff', 'goals_for'],
        ascending=[True, False, False, False],
    )
    # Position within each season after sorting; fully tied teams still get
    # distinct consecutive ranks, in sort order.
    standings['rank'] = standings.groupby('season').cumcount() + 1
    return standings

# Build the league table, then make the 'season' join key a string on both
# sides so the merge below compares like with like.
standings = compute_season_points(results)
for frame in (stats, standings):
    frame['season'] = frame['season'].astype(str)

# Attach the end-of-season outcomes to the per-team stats. Outcome columns
# whose names already exist in `stats` receive the '_target' suffix.
TARGET_COLS = ['rank', 'points', 'goals_for', 'goals_against',
               'wins', 'draws', 'losses']
df = pd.merge(
    stats,
    standings[['season', 'team'] + TARGET_COLS],
    how='left',
    on=['season', 'team'],
    suffixes=('', '_target'),
)
df['top4'] = df['rank'].le(4).astype(int)
df['relegated'] = df['rank'].ge(18).astype(int)

# Sort and create lag features
df = df.sort_values(by=['team', 'season_start'])
NON_FEATURE_COLS = {
    'team', 'season', 'season_start', 'wins', 'losses', 'draws',
    'points', 'goal_diff', 'total_matches', 'strength', 'cluster',
}
feature_cols = [c for c in stats.columns if c not in NON_FEATURE_COLS]

# Previous row's value of every feature, per team.
lagged = df.groupby('team')[feature_cols].shift(1)
lagged = lagged.add_suffix('_lag1')
df = pd.concat([df, lagged], axis='columns')


def _previous_two_row_mean(series):
    """Mean over a 2-row window (at least 1 row), shifted down by one row."""
    return series.rolling(window=2, min_periods=1).mean().shift(1)


for col in feature_cols:
    df[f'{col}_ma2'] = df.groupby('team')[col].transform(_previous_two_row_mean)

# Drop rows where every lag feature is missing.
lag_columns = df.filter(like='_lag1').columns
df = df.dropna(subset=lag_columns, how='all')
df = df.reset_index(drop=True)

# Split: train on <=2014, valid 2015, test >=2016
season_start = df['season_start']
train = df[season_start.le(2014)]
valid = df[season_start.eq(2015)]
test = df[season_start.ge(2016)]

# Combine train and valid for final model training (after hyperparameter tuning)
train_val = pd.concat([train, valid])

# Function to get features and target
def get_X_y(df, target_col, classification=True):
    """Split a modelling frame into a feature matrix and a target vector.

    The features are every column whose name ends in ``_lag1`` or ``_ma2``,
    plus a ``team_encoded`` column holding label-encoded ``df['team']``.

    Args:
        df: Frame containing the engineered feature columns, a ``team``
            column and ``target_col``.
        target_col: Name of the column to use as the target.
        classification: When true, the target is cast to ``int`` unless its
            dtype is already ``int64`` or ``int32``.

    Returns:
        Tuple ``(X, y, feat_cols, le)``: the feature frame (including
        ``team_encoded``), the target series, the list of engineered feature
        column names (without ``team_encoded``) and the fitted
        ``LabelEncoder``.
    """
    # Match on the literal suffixes. The previous regex-style strings
    # ('_lag1$', '_ma2$') were tested with substring `in`, so the '$' was
    # taken literally, no column ever matched, and X had no real features.
    feat_suffixes = ('_lag1', '_ma2')
    feat_cols = [c for c in df.columns if str(c).endswith(feat_suffixes)]
    X = df[feat_cols].copy()
    # NOTE(review): a fresh encoder is fitted on every call, so the integer
    # code for a given team can differ between frames (e.g. train vs. test).
    # Confirm that callers do not rely on consistent codes across splits.
    le = LabelEncoder()
    X['team_encoded'] = le.fit_transform(df['team'])
    y = df[target_col].copy()
    if classification and y.dtype not in ['int64', 'int32']:
        y = y.astype(int)
    return X, y, feat_cols, le


# [notebook cell output residue] from .autonotebook import tqdm as notebook_tqdm
