In [2]:
import pandas as pd
import numpy as np


np.random.seed(42)

# LOAD DATA 
df = pd.read_csv("data1.csv")
season_stats = pd.read_csv(r"D:\Predictor\khel_metrics\science_exhibition\Seasonal_stats.csv")

# CLEAN & PREP
df['match_date'] = pd.to_datetime(df['match_date'], errors='coerce')
df = df[df['match_date'].notna()]
df['home_team'] = df['home_team'].astype(str)
df['away_team'] = df['away_team'].astype(str)
df = df.sort_values('match_date').reset_index(drop=True)

#  CONFIG 
numeric_cols = [
    'home_xg','away_xg','home_shots','away_shots','home_shots_on_target','away_shots_on_target',
    'home_fouls','away_fouls','home_corners','away_corners',
    'home_yellow_cards','away_yellow_cards','home_red_cards','away_red_cards',
    'attendance','home_possession','away_possession'
]
round_cols = [
    'home_shots','away_shots','home_shots_on_target','away_shots_on_target',
    'home_fouls','away_fouls','home_corners','away_corners',
    'home_yellow_cards','away_yellow_cards','home_red_cards','away_red_cards'
]

# ELO 
seed_elo = {
    "Manchester City": 2047.451538, "Arsenal": 1873.716309 , "Liverpool": 2015.505615 , "Chelsea": 1925.982666,
    "Manchester United": 1845.414917, "Tottenham": 1842.690674 , "Newcastle Utd": 1751.814697, "Brighton": 1741.710449,
    "Brentford": 1723.809448, "Crystal Palace": 1761.731445, "Aston Villa": 1750.061646, "Fulham": 1715.741943,
    "West Ham": 1810.744629, "Wolves": 1745.918213, "Everton": 1698.745483, "Bournemouth": 1679.013306,
    "Leeds United": 1721.379883, "Southampton": 1706.036865 , "Burnley": 1728.838623, "Leicester City": 1791.860107,
    "Nott'ham Forest": 1635.789429, "Sheffield Utd": 1544.853638, "Luton Town": 1584.418335,
    "Ipswich Town": 1568.325562, "Sunderland": 1710.204224
}
default_elo = 1450
team_elo = seed_elo.copy()
K = 40
home_adv = 50

def update_elo(team_elo_dict, home, away, home_goals, away_goals):
    if home not in team_elo_dict: team_elo_dict[home] = default_elo
    if away not in team_elo_dict: team_elo_dict[away] = default_elo
    R_home, R_away = team_elo_dict[home], team_elo_dict[away]
    exp_home = 1 / (1 + 10 ** ((R_away - (R_home + home_adv)) / 400))
    s_home = 1.0 if home_goals > away_goals else 0.5 if home_goals == away_goals else 0.0
    s_away = 1.0 - s_home if s_home in (1.0,0.0) else 0.5
    team_elo_dict[home] = R_home + K * (s_home - exp_home)
    team_elo_dict[away] = R_away + K * (s_away - (1 - exp_home))

# SEASONAL STATS 
season_weights = {'2022/23': 0.05, '2023/24': 0.2, '2024/25': 1.0, '2025/26': 1.5}
season_col_mapping = {
    'home_xg':'xG_per_game', 'away_xg':'xG_per_game',
    'home_shots':'shots_for', 'away_shots':'shots_for',
    'home_shots_on_target':'sot_for', 'away_shots_on_target':'sot_for',
    'home_fouls':'fouls_for', 'away_fouls':'fouls_for',
    'home_corners':'corners_for', 'away_corners':'corners_for',
    'home_yellow_cards':'yellow_crd_per_game', 'away_yellow_cards':'yellow_crd_per_game',
    'home_red_cards':'red_crd_per_game', 'away_red_cards':'red_crd_per_game',
    'home_possession':'possesion_per_game', 'away_possession':'possesion_per_game',
    'attendance':'attendance_per_game'
}

def weighted_avg(series):
    if len(series)==0: return np.nan
    return np.average(series, weights=np.linspace(1,1.5,len(series)))

def cap_stat(col,val):
    caps = {
        'home_xg':(0,5), 'away_xg':(0,5),
        'home_shots':(0,30), 'away_shots':(0,30),
        'home_shots_on_target':(0,15), 'away_shots_on_target':(0,15),
        'home_corners':(0,15), 'away_corners':(0,15),
        'home_fouls':(0,25), 'away_fouls':(0,25),
        'home_yellow_cards':(0,5), 'away_yellow_cards':(0,5),
        'home_red_cards':(0,2), 'away_red_cards':(0,2),
        'home_possession':(20,80), 'away_possession':(20,80)
    }
    return min(max(val,caps.get(col,(val,val))[0]), caps.get(col,(val,val))[1])

#  SPLIT
SPLIT_DATE = pd.to_datetime("2025-11-07")
df_played = df[df['match_date']<SPLIT_DATE].copy()
df_future = df[df['match_date']>=SPLIT_DATE].copy()
df_out = df.copy()

# Update ELO with played matches
for _, r in df_played.sort_values('match_date').iterrows():
    if pd.notna(r.get('home_goals')) and pd.notna(r.get('away_goals')):
        update_elo(team_elo, r['home_team'], r['away_team'], int(r['home_goals']), int(r['away_goals']))

# FILL FUTURE MATCHES 
for idx,row in df_out.iterrows():
    if pd.isna(row.get('home_xg')):
        match_date = row['match_date']
        home, away = row['home_team'], row['away_team']
        stadium = row.get('stadium', np.nan)

        # masks for safe selection
        mask_team_home = (df_out['home_team']==home) | (df_out['away_team']==home)
        mask_team_away = (df_out['home_team']==away) | (df_out['away_team']==away)
        mask_date = df_out['match_date']<match_date

        last_home = df_out.loc[mask_team_home & mask_date].sort_values('match_date',ascending=False).head(3)
        last_away = df_out.loc[mask_team_away & mask_date].sort_values('match_date',ascending=False).head(3)

        mask_h2h = (((df_out['home_team']==home)&(df_out['away_team']==away))|
                    ((df_out['home_team']==away)&(df_out['away_team']==home)))
        h2h = df_out.loc[mask_h2h & mask_date].sort_values('match_date',ascending=False).head(4)

        # deterministic RNG
        seed_val = (hash((home,away,str(match_date))) & 0xffffffff)
        rng = np.random.RandomState(seed_val)

        for col in numeric_cols:
            combined_series = pd.Series(dtype=float)
            if col.startswith('home_'):
                base = col.replace('home_','')
                combined_series = pd.concat([
                    last_home.get(col,pd.Series(dtype=float)).dropna(),
                    h2h.loc[h2h['home_team']==home,f'home_{base}'].dropna(),
                    h2h.loc[h2h['away_team']==home,f'away_{base}'].dropna()
                ])
            elif col.startswith('away_'):
                base = col.replace('away_','')
                combined_series = pd.concat([
                    last_away.get(col,pd.Series(dtype=float)).dropna(),
                    h2h.loc[h2h['away_team']==away,f'away_{base}'].dropna(),
                    h2h.loc[h2h['home_team']==away,f'home_{base}'].dropna()
                ])
            combined_val = weighted_avg(combined_series.tail(5)) if len(combined_series)>0 else np.nan

            # seasonal contribution
            season_values=[]
            for season,weight in season_weights.items():
                team_stats = season_stats[(season_stats['Team']==(home if col.startswith('home_') else away)) &
                                          (season_stats['Season']==season)]
                if not team_stats.empty and season_col_mapping[col] in team_stats.columns:
                    season_values.append(float(team_stats[season_col_mapping[col]].values[0])*weight)
            if season_values:
                seasonal_val = np.mean(season_values)
                combined_val = 0.7*combined_val+0.3*seasonal_val if pd.notna(combined_val) else seasonal_val

            # ELO scaling + bias
            if pd.notna(combined_val):
                e_home, e_away = team_elo.get(home,default_elo), team_elo.get(away,default_elo)
                combined_val *= (e_home/1500)**0.4 / (e_away/1500)**0.4
                combined_val *= rng.uniform(1.05,1.15) if col.startswith('home_') else rng.uniform(0.90,0.98)

                # stat-specific noise
                if 'xg' in col or 'shots' in col:
                    combined_val *= rng.normal(1.0,0.10)
                elif 'corners' in col or 'fouls' in col:
                    combined_val *= rng.normal(1.0,0.15)
                elif 'yellow_cards' in col:
                    combined_val *= rng.normal(1.0,0.20)
                elif 'red_cards' in col:
                    combined_val *= rng.normal(1.0,0.35)
                elif 'possession' in col:
                    combined_val *= rng.normal(1.0,0.05)

            # attendance special
            if col=='attendance':
                avg_att = df_out.loc[df_out['stadium']==stadium,'attendance'].dropna().mean()
                if not np.isnan(avg_att): combined_val = avg_att

            # cap and round
            combined_val = cap_stat(col,combined_val)
            write_val = round(combined_val,4 if 'red_cards' in col else 0) if col in round_cols else round(combined_val,2)
            df_out.at[idx,col] = write_val

        # Update ELO with predicted xG
        update_elo(team_elo, home, away, int(round(df_out.at[idx,'home_xg'])), int(round(df_out.at[idx,'away_xg'])))

#  DERIVED FEATURES 
df_out['xg_diff'] = df_out['home_xg'] - df_out['away_xg']
df_out['shots_ratio'] = (df_out['home_shots']+0.1)/(df_out['away_shots']+0.1)
df_out['possession_ratio'] = (df_out['home_possession']+0.1)/(df_out['away_possession']+0.1)
df_out['home_elo'] = df_out['home_team'].map(team_elo)
df_out['away_elo'] = df_out['away_team'].map(team_elo)
df_out['elo_diff'] = df_out['home_elo'] - df_out['away_elo']


#  ROLLING AVERAGES 
rolling_cols = ['home_xg','away_xg','home_shots','away_shots','home_shots_on_target','away_shots_on_target']
for team_col, base_col in [('home_team','home'), ('away_team','away')]:
    for col in rolling_cols:
        new_col = f"{team_col}_{col}_last3"
        df_out[new_col] = np.nan
        for team in df_out[team_col].unique():
            team_mask = df_out[team_col]==team
            df_out.loc[team_mask, new_col] = df_out.loc[team_mask, col].rolling(3, min_periods=1).mean().values
# SINCE LAST MATCH & MATCH NUMBER 
df_out['days_since_last_home'] = np.nan
df_out['days_since_last_away'] = np.nan
df_out['match_num_home'] = 0
df_out['match_num_away'] = 0

teams = pd.concat([df_out['home_team'], df_out['away_team']]).unique()

for team in teams:
    team_matches = df_out[(df_out['home_team']==team) | (df_out['away_team']==team)].sort_values('match_date')
    last_home_date = None
    last_away_date = None
    home_count = 0
    away_count = 0
    for idx, row in team_matches.iterrows():
        # Home team
        if row['home_team'] == team:
            home_count += 1
            df_out.at[idx, 'match_num_home'] = home_count
            if last_home_date is not None:
                df_out.at[idx, 'days_since_last_home'] = (row['match_date'] - last_home_date).days
            last_home_date = row['match_date']
        # Away team
        if row['away_team'] == team:
            away_count += 1
            df_out.at[idx, 'match_num_away'] = away_count
            if last_away_date is not None:
                df_out.at[idx, 'days_since_last_away'] = (row['match_date'] - last_away_date).days
            last_away_date = row['match_date']

df_out['days_since_last_home'].fillna(18, inplace=True)
df_out['days_since_last_away'].fillna(18, inplace=True)

#SAVE
df_out.to_csv("data2_filled_safe.csv", index=False)
print("✅ Enhanced CSV ready with derived features, ELO diff, rolling stats, H2H and seasonal contributions!")


✅ Enhanced CSV ready with derived features, ELO diff, rolling stats, H2H and seasonal contributions!


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_out['days_since_last_home'].fillna(18, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_out['days_since_last_away'].fillna(18, inplace=True)


In [2]:
import pandas as pd
df = pd.read_csv("data1.csv")
print(df.columns.tolist())

['week', 'home_team', 'home_xg', 'home_goals', 'away_goals', 'away_xg', 'away_team', 'stadium', 'season', 'result', 'ht_home_goals', 'ht_away_goals', 'ht_result', 'home_shots', 'away_shots', 'home_shots_on_target', 'away_shots_on_target', 'home_fouls', 'away_fouls', 'home_corners', 'away_corners', 'home_yellow_cards', 'away_yellow_cards', 'home_red_cards', 'away_red_cards', 'match_date', 'kickoff_time', 'date_code', 'attendance', 'home_possession', 'away_possession']


In [16]:
from penaltyblog.scrapers import ClubElo
import pandas as pd

DATE = "2022-05-01"
seed_elo = {
    "Manchester City": 2047.451538, "Arsenal": 1873.716309 , "Liverpool": 2015.505615 , "Chelsea": 1925.982666,
    "Manchester United": 1845.414917, "Tottenham": 1842.690674 , "Newcastle Utd": 1751.814697, "Brighton": 1741.710449,
    "Brentford": 1723.809448, "Crystal Palace": 1761.731445, "Aston Villa": 1750.061646, "Fulham": 1715.741943,
    "West Ham": 1810.744629, "Wolves": 1745.918213, "Everton": 1698.745483, "Bournemouth": 1679.013306,
    "Leeds United": 1721.379883, "Southampton": 1706.036865 , "Burnley": 1728.838623, "Leicester City": 1791.860107,
    "Nott'ham Forest": 1635.789429, "Sheffield Utd": 1544.853638, "Luton Town": 1584.418335,
    "Ipswich Town": 1568.325562, "Sunderland": 1710.204224
}

# ---------------- FETCH ELO ----------------
ce = ClubElo()
df = ce.get_elo_by_date(DATE)

# Reset index so team names are a column
df = df.reset_index().rename(columns={'team': 'team_name'})

# Filter only teams in seed_elo
elo_df = df[df['team_name'].isin(seed_elo.keys())].copy()

# Sort by Elo descending
elo_df = elo_df.sort_values('elo', ascending=False).reset_index(drop=True)

# Display and save
print(elo_df)
elo_df.to_csv(f"elo_ratings_{DATE}.csv", index=False)
print(f"✅ Elo ratings saved to elo_ratings_{DATE}.csv")


         team_name  rank country  level          elo       from         to
0        Liverpool   1.0     ENG      1  2055.505615 2022-05-01 2022-05-03
1          Chelsea   5.0     ENG      1  1925.982666 2022-04-29 2022-05-01
2          Arsenal   8.0     ENG      1  1873.716309 2022-04-29 2022-05-01
3        Tottenham  13.0     ENG      1  1842.690674 2022-05-01 2022-05-01
4         West Ham  18.0     ENG      1  1810.744629 2022-04-29 2022-05-01
5   Crystal Palace  27.0     ENG      1  1761.731445 2022-05-01 2022-05-01
6      Aston Villa  32.0     ENG      1  1750.061646 2022-05-01 2022-05-03
7           Wolves  36.0     ENG      1  1745.918213 2022-05-01 2022-05-01
8         Brighton  39.0     ENG      1  1741.710449 2022-05-01 2022-05-01
9        Brentford  45.0     ENG      1  1723.809448 2022-05-01 2022-05-01
10         Burnley  47.0     ENG      1  1717.114136 2022-05-01 2022-05-01
11     Southampton  52.0     ENG      1  1706.036865 2022-05-01 2022-05-03
12         Everton  55.0 

In [12]:
from penaltyblog.scrapers import ClubElo
import pandas as pd

# ---------------- CONFIG ----------------
DATE = "2024-08-01"  # Replace with any date in YYYY-MM-DD format

# Initialize scraper
ce = ClubElo()

# Fetch Elo ratings for that specific date
df = ce.get_elo_by_date(DATE)

# Reset index so team names are a column
df = df.reset_index().rename(columns={'team': 'team_name'})

# Filter to England top division (level == 1, country == 'ENG')
eng_div1 = df[(df['country'] == 'ENG') & (df['level'] == 1)].copy()

# Sort by Elo descending
eng_div1 = eng_div1.sort_values('elo', ascending=False).reset_index(drop=True)

# Display
print(eng_div1[['team_name', 'elo', 'rank', 'from', 'to']])




         team_name          elo  rank       from         to
0         Man City  2050.572998   1.0 2024-05-20 2024-08-18
1          Arsenal  1946.902832   4.0 2024-05-23 2024-08-17
2        Liverpool  1900.688354   6.0 2024-06-02 2024-08-17
3          Chelsea  1810.119629  17.0 2024-06-02 2024-08-09
4        Newcastle  1801.797119  20.0 2024-05-25 2024-08-17
5        Tottenham  1790.556641  24.0 2024-05-27 2024-08-01
6       Man United  1779.043945  26.0 2024-05-30 2024-08-01
7      Aston Villa  1770.435059  30.0 2024-06-03 2024-08-08
8   Crystal Palace  1759.708008  36.0 2024-05-26 2024-08-01
9         West Ham  1726.242920  41.0 2024-05-20 2024-08-01
10          Fulham  1716.276367  46.0 2024-07-24 2024-08-04
11        Brighton  1713.163208  48.0 2024-05-27 2024-08-13
12       Brentford  1711.083984  49.0 2024-05-20 2024-08-07
13         Everton  1706.850830  51.0 2024-05-26 2024-08-07
14     Bournemouth  1691.123657  60.0 2024-05-25 2024-08-07
15          Wolves  1677.862305  63.0 20

In [None]:
seed_elo = {
    "Manchester City": 1750, "Arsenal": 1680, "Liverpool": 1650, "Chelsea": 1620,
    "Manchester Utd": 1620, "Tottenham": 1600, "Newcastle Utd": 1550, "Brighton": 1530,
    "Brentford": 1500, "Crystal Palace": 1515, "Aston Villa": 1500, "Fulham": 1470,
    "West Ham": 1480, "Wolves": 1460, "Everton": 1450, "Bournemouth": 1430,
    "Leeds United": 1420, "Southampton": 1400, "Burnley": 1380, "Leicester City": 1425,
    "Nott'ham Forest": 1390, "Sheffield Utd": 1380, "Luton Town": 1375,
    "Ipswich Town": 1370, "Sunderland": 1600
}