In [1]:
#import libraries 

import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import (accuracy_score, f1_score, roc_auc_score,
                             mean_squared_error, mean_absolute_error, r2_score)
import xgboost as xgb
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

# Load cleaned data: both tables are read from CSV files located in the
# notebook's working directory (stats first, then results).
stats, results = (
    pd.read_csv(csv_name)
    for csv_name in ('cleaned_stats.csv', 'cleaned_results.csv')
)


In [4]:
# 3.1 Problem Formulation & Target Creation 

# We need to compute final standings from results for each season
# Create a table of points per team per season from results
def compute_season_points(df):
    """Build a per-season league table from match-level results.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per match. The columns read are ``season``, ``home_team``,
        ``away_team``, ``home_goals``, ``away_goals`` and ``result``.
        ``result`` is compared against ``'H'`` (counted as a home win),
        ``'D'`` (draw) and ``'A'`` (counted as an away win).

    Returns
    -------
    pandas.DataFrame
        One row per (season, team) holding the home/away splits, the totals
        ``wins``, ``draws``, ``losses``, ``points``, ``goals_for``,
        ``goals_against``, ``goal_diff`` and a 1-based ``rank`` within the
        season. Rows are sorted by season, then by points, goal difference
        and goals scored (all descending).

    Notes
    -----
    Losses are counted from the match results themselves instead of being
    derived as ``38 - wins - draws``, so the table stays correct when a
    season has a different number of recorded matches per team.
    """
    # Home record: the home side wins on 'H' and loses on 'A'.
    home = df.groupby(['season', 'home_team']).agg(
        home_wins=('result', lambda x: (x == 'H').sum()),
        home_draws=('result', lambda x: (x == 'D').sum()),
        home_goals_for=('home_goals', 'sum'),
        home_goals_against=('away_goals', 'sum'),
        home_losses=('result', lambda x: (x == 'A').sum()),
    ).reset_index().rename(columns={'home_team': 'team'})

    # Away record: the away side wins on 'A' and loses on 'H'.
    away = df.groupby(['season', 'away_team']).agg(
        away_wins=('result', lambda x: (x == 'A').sum()),
        away_draws=('result', lambda x: (x == 'D').sum()),
        away_goals_for=('away_goals', 'sum'),
        away_goals_against=('home_goals', 'sum'),
        away_losses=('result', lambda x: (x == 'H').sum()),
    ).reset_index().rename(columns={'away_team': 'team'})

    # Outer merge keeps teams that appear on only one side; their missing
    # half is filled with zeros.
    standings = home.merge(away, on=['season', 'team'], how='outer').fillna(0)
    standings['wins'] = standings['home_wins'] + standings['away_wins']
    standings['draws'] = standings['home_draws'] + standings['away_draws']
    standings['losses'] = standings['home_losses'] + standings['away_losses']
    standings['points'] = 3 * standings['wins'] + standings['draws']
    standings['goals_for'] = standings['home_goals_for'] + standings['away_goals_for']
    standings['goals_against'] = standings['home_goals_against'] + standings['away_goals_against']
    standings['goal_diff'] = standings['goals_for'] - standings['goals_against']

    # The loss splits are only helpers; dropping them keeps the returned
    # column set and order identical to what callers already expect.
    standings = standings.drop(columns=['home_losses', 'away_losses'])

    # Rank within season: points first, then goal difference, then goals scored.
    standings = standings.sort_values(['season', 'points', 'goal_diff', 'goals_for'],
                                       ascending=[True, False, False, False])
    standings['rank'] = standings.groupby('season').cumcount() + 1
    return standings

standings = compute_season_points(results)

# Combine the per-team stats (feature source) with the standings computed
# above (target source), keyed on season and team.
# stats already carries wins, draws, losses, goals, goals_conceded, etc.;
# those aggregates should be consistent with the totals in standings.
# The season key is cast to string on both frames so the join keys share a dtype.
for frame in (stats, standings):
    frame['season'] = frame['season'].astype(str)

standings_cols = ['season', 'team', 'rank', 'points', 'goals_for', 'goals_against',
                  'wins', 'draws', 'losses']
# Left join keeps every stats row; where a column name exists in both frames,
# the copy coming from standings gets the '_target' suffix.
df = pd.merge(
    stats,
    standings[standings_cols],
    how='left',
    on=['season', 'team'],
    suffixes=('', '_target'),
)

# Verify consistency (optional)
# assert (df['wins'] == df['wins_target']).all()  # might fail if stats are from a different source; we'll trust stats.

# Create binary targets
# The relegation cut-off is derived from the number of teams found in each
# season's standings rather than assuming a 20-team league; for a 20-team
# season this is exactly rank >= 18.
TOP_N = 4             # finishing places counted as a "top 4" season
RELEGATION_SPOTS = 3  # bottom places counted as relegated
teams_per_season = standings.groupby('season')['team'].nunique()
season_size = df['season'].map(teams_per_season)

# Rows without a rank (no match in standings) compare as False and get 0 for
# both targets.
df['top4'] = (df['rank'] <= TOP_N).astype(int)
df['relegated'] = (df['rank'] > season_size - RELEGATION_SPOTS).astype(int)

# Regression targets (goals scored, goals conceded, clean sheets, shots on target,
# big chances missed, total passes, touches) are taken directly from the stats columns.
# Some of them, such as big chances missed (big_chance_missed), exist only in stats.
# To avoid data leakage, a season's own stats must never be used as features for
# that same season's targets. Features therefore come only from *previous* seasons,
# which requires creating lagged features.




In [5]:
# 3.2 Feature Engineering (Lagged Features) 



# Sort by team and season so that a one-row shift inside a team group steps
# back to that team's previous recorded season.
# NOTE(review): shift(1) is positional. If a team has no row for some season,
# its "_lag1" values come from its last recorded season, not necessarily the
# season immediately before — confirm this is the intended behaviour.
df = df.sort_values(['team', 'season_start'])

# Feature candidates: every stats column except identifiers and the
# outcome-style columns listed here (kept out to avoid leaking targets).
# Assumes the remaining columns are numeric — TODO confirm.
non_feature_cols = ['team', 'season', 'season_start', 'wins', 'losses', 'draws',
                    'points', 'goal_diff', 'total_matches', 'strength', 'cluster']
feature_cols = [c for c in stats.columns if c not in non_feature_cols]

by_team = df.groupby('team')[feature_cols]

# Value of each feature in the team's previous row.
lagged = by_team.shift(1).add_suffix('_lag1')

# 2-season moving average for stability. The rolling window ends on the
# current row, so the trailing shift(1) moves it back one row and the result
# only uses the (up to) two preceding rows.
# Computed for all columns in one grouped transform instead of one groupby
# per column.
ma2 = by_team.transform(
    lambda x: x.rolling(2, min_periods=1).mean().shift(1)
).add_suffix('_ma2')

# Attach all derived columns in a single concat rather than inserting them
# one by one, which fragments the frame.
df = pd.concat([df, lagged, ma2], axis=1)

# Drop rows where no lag features exist (first season for each team)
df = df.dropna(subset=df.filter(like='_lag1').columns, how='all').reset_index(drop=True)
