In [1]:
import pandas as pd
# Read the raw match results and take a first look at them:
# dimensions, then the full list of column names, then the leading rows.
df = pd.read_csv(filepath_or_buffer="../data/raw/matches.csv")
print(df.shape, list(df.columns), sep="\n")
df.head(n=5)

(380, 106)
['Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG', 'HTAG', 'HTR', 'Referee', 'HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC', 'HY', 'AY', 'HR', 'AR', 'B365H', 'B365D', 'B365A', 'BWH', 'BWD', 'BWA', 'IWH', 'IWD', 'IWA', 'PSH', 'PSD', 'PSA', 'WHH', 'WHD', 'WHA', 'VCH', 'VCD', 'VCA', 'MaxH', 'MaxD', 'MaxA', 'AvgH', 'AvgD', 'AvgA', 'B365>2.5', 'B365<2.5', 'P>2.5', 'P<2.5', 'Max>2.5', 'Max<2.5', 'Avg>2.5', 'Avg<2.5', 'AHh', 'B365AHH', 'B365AHA', 'PAHH', 'PAHA', 'MaxAHH', 'MaxAHA', 'AvgAHH', 'AvgAHA', 'B365CH', 'B365CD', 'B365CA', 'BWCH', 'BWCD', 'BWCA', 'IWCH', 'IWCD', 'IWCA', 'PSCH', 'PSCD', 'PSCA', 'WHCH', 'WHCD', 'WHCA', 'VCCH', 'VCCD', 'VCCA', 'MaxCH', 'MaxCD', 'MaxCA', 'AvgCH', 'AvgCD', 'AvgCA', 'B365C>2.5', 'B365C<2.5', 'PC>2.5', 'PC<2.5', 'MaxC>2.5', 'MaxC<2.5', 'AvgC>2.5', 'AvgC<2.5', 'AHCh', 'B365CAHH', 'B365CAHA', 'PCAHH', 'PCAHA', 'MaxCAHH', 'MaxCAHA', 'AvgCAHH', 'AvgCAHA']


Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,AvgC<2.5,AHCh,B365CAHH,B365CAHA,PCAHH,PCAHA,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA
0,E0,11/08/2023,20:00,Burnley,Man City,0,3,A,0,2,...,2.28,1.5,1.95,1.98,1.95,1.97,,,1.92,1.95
1,E0,12/08/2023,12:30,Arsenal,Nott'm Forest,2,1,H,2,0,...,2.63,-2.0,1.95,1.98,1.93,1.97,2.01,2.09,1.95,1.92
2,E0,12/08/2023,15:00,Bournemouth,West Ham,1,1,D,0,0,...,2.12,0.0,2.02,1.91,2.01,1.92,2.06,1.96,1.96,1.91
3,E0,12/08/2023,15:00,Brighton,Luton,4,1,H,1,0,...,2.48,-1.75,2.01,1.92,2.0,1.91,2.14,1.93,2.0,1.86
4,E0,12/08/2023,15:00,Everton,Fulham,0,1,A,0,0,...,1.71,-0.25,2.06,1.87,2.04,1.88,2.08,1.99,1.98,1.88


In [4]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# Input file and figure output directory, both relative to the notebook's
# working directory.
RAW, FIGS = Path("../data/raw/matches.csv"), Path("../reports/figures")

# Create the figure directory (and any missing parents); no error if it is
# already there.
FIGS.mkdir(exist_ok=True, parents=True)

# Load the raw match table used by every section below.
df = pd.read_csv(filepath_or_buffer=RAW)

# 1) Points by team
# Award league points for every fixture in one vectorised pass rather than a
# Python-level iterrows() loop. Home win -> 3/0, away win -> 0/3, and any other
# full-time result value is scored 1/1 (the same fall-through the loop had).
full_time = df['FTR']
home_award = np.where(full_time == 'H', 3, np.where(full_time == 'A', 0, 1))
away_award = np.where(full_time == 'A', 3, np.where(full_time == 'H', 0, 1))

# `pts` remains a flat list of (team, points) tuples: all home entries first,
# then all away entries. Order does not matter because it is summed per team.
pts = [
    *zip(df['HomeTeam'], home_award.tolist()),
    *zip(df['AwayTeam'], away_award.tolist()),
]
pts_df = pd.DataFrame(pts, columns=['Team', 'Pts']).groupby('Team', as_index=False).sum()
# Highest points first, so row 0 is the table leader.
pts_df = pts_df.sort_values('Pts', ascending=False)

# Horizontal bar chart; the y-axis is inverted so the leader is drawn at the top.
fig, ax = plt.subplots()
ax.barh(pts_df['Team'], pts_df['Pts'])
ax.set_title("Premier League 2023–24 — Points by Team")
ax.invert_yaxis()
fig.tight_layout()
fig.savefig(FIGS/"points_by_team.png", dpi=180)
plt.close(fig)

# 2) Goals For vs Goals Against
# Season totals of home-side goals (FTHG) and away-side goals (FTAG), grouped
# once by the home team and once by the away team.
home_totals = df.groupby('HomeTeam')[['FTHG', 'FTAG']].sum()
away_totals = df.groupby('AwayTeam')[['FTHG', 'FTAG']].sum()

# Goals for  = scored at home + scored away;
# goals against = conceded at home + conceded away.
# fill_value=0 keeps a team that appears in only one of the two groupings.
gf = home_totals['FTHG'].add(away_totals['FTAG'], fill_value=0)
ga = home_totals['FTAG'].add(away_totals['FTHG'], fill_value=0)

goals = pd.DataFrame(
    {
        'Team': gf.index,
        'GF': gf.values,
        'GA': ga.reindex(gf.index).values,  # align GA to GF's team order
    }
).sort_values(by='GF', ascending=False)

fig, ax = plt.subplots()
ax.bar(goals['Team'], goals['GF'], label='GF')
# GA is drawn below the axis (negated) for visual contrast with GF.
ax.bar(goals['Team'], -goals['GA'], label='GA')
plt.setp(ax.get_xticklabels(), rotation=90)
ax.set_title("Goals For vs Goals Against")
ax.legend()
fig.tight_layout()
fig.savefig(FIGS/"gf_vs_ga.png", dpi=180)
plt.close(fig)

# 3) Home vs Away performance
# Points earned from a full-time result, seen from each side of the fixture.
home_scale = {'H': 3, 'D': 1, 'A': 0}
away_scale = {'H': 0, 'D': 1, 'A': 3}

# Mean points per game, per team, split by venue.
home_pts = df['FTR'].map(home_scale).rename('Pts').groupby(df['HomeTeam']).mean()
away_pts = df['FTR'].map(away_scale).rename('Pts').groupby(df['AwayTeam']).mean()
home_away = pd.DataFrame(
    {
        'Team': home_pts.index,
        'HomePPG': home_pts.values,
        'AwayPPG': away_pts.reindex(home_pts.index).values,  # same team order as home
    }
)

# Grouped bars: one slot per team, home bar shifted left and away bar right.
fig, ax = plt.subplots()
x = np.arange(len(home_away))
for shift, column, legend_label in ((-0.2, 'HomePPG', 'Home PPG'), (0.2, 'AwayPPG', 'Away PPG')):
    ax.bar(x + shift, home_away[column], width=0.4, label=legend_label)
ax.set_xticks(x)
ax.set_xticklabels(home_away['Team'], rotation=90)
ax.set_title("Points per Game: Home vs Away")
ax.legend()
fig.tight_layout()
fig.savefig(FIGS/"home_vs_away_ppg.png", dpi=180)
plt.close(fig)

print("[OK] Figures saved in reports/figures/")

# --- 4) Arsenal vs Leader Comparison
# pts_df is sorted by points in descending order, so its first row is the leader.
leader_team = pts_df.iloc[0]['Team']
arsenal_pts = pts_df.loc[pts_df['Team'] == 'Arsenal', 'Pts'].values[0]
leader_pts = pts_df.iloc[0]['Pts']
gap = leader_pts - arsenal_pts  # positive when Arsenal are behind the leader

print(f"Leader: {leader_team} with {leader_pts} points")
print(f"🔴 Arsenal: {arsenal_pts} points ({gap:+d} gap)")

# One row of season goal totals (GF / GA) for each of the two teams.
arsenal_goals = goals[goals['Team']=='Arsenal'].iloc[0]
leader_goals = goals[goals['Team']==leader_team].iloc[0]

labels = ['Goals For', 'Goals Against']
arsenal_values = [arsenal_goals['GF'], arsenal_goals['GA']]
leader_values = [leader_goals['GF'], leader_goals['GA']]

# Side-by-side bars per metric: leader on the left, Arsenal on the right.
x = np.arange(len(labels))
fig, ax = plt.subplots()
ax.bar(x-0.15, leader_values, width=0.3, label=leader_team, color='skyblue')
ax.bar(x+0.15, arsenal_values, width=0.3, label='Arsenal', color='red')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.set_title(f"Arsenal vs {leader_team} — Goals Comparison")
ax.legend()
fig.tight_layout()
# Save through FIGS like every other figure instead of a hard-coded path string.
fig.savefig(FIGS/"arsenal_vs_leader.png", dpi=180)
plt.close(fig)

# Index the home/away table by team once rather than repeating a boolean mask
# for each of the four lookups.
ppg_by_team = home_away.set_index('Team')
ars_home = ppg_by_team.loc['Arsenal', 'HomePPG']
ars_away = ppg_by_team.loc['Arsenal', 'AwayPPG']
ldr_home = ppg_by_team.loc[leader_team, 'HomePPG']
ldr_away = ppg_by_team.loc[leader_team, 'AwayPPG']

print(f"Arsenal Home PPG: {ars_home:.2f}, Away PPG: {ars_away:.2f}")
print(f"{leader_team} Home PPG: {ldr_home:.2f}, Away PPG: {ldr_away:.2f}")

# --- 5) Goals per Shot (Efficiency)
# Total shots per team: shots taken at home (HS) plus shots taken away (AS).
home_shots = df.groupby('HomeTeam')['HS'].sum()
away_shots = df.groupby('AwayTeam')['AS'].sum()
shots = home_shots.add(away_shots, fill_value=0)

# Goals scored per shot taken, best conversion rate first.
efficiency = gf.div(shots).sort_values(ascending=False)

fig, ax = plt.subplots()
efficiency.plot(kind='bar', ax=ax)
ax.set_ylabel("Goals per Shot")
ax.set_title("Goals per Shot (Efficiency by Team)")
fig.tight_layout()
fig.savefig(FIGS/"goals_per_shot.png", dpi=180)
plt.close(fig)

print(f"Arsenal Goals/Shot: {efficiency.loc['Arsenal']:.3f} | {leader_team}: {efficiency.loc[leader_team]:.3f}")

# --- 6) Discipline (Fouls & Cards)
def _season_total(home_col, away_col):
    """Sum a per-match column pair over each team's home and away fixtures.

    home_col is summed per HomeTeam, away_col per AwayTeam; the two are added
    with fill_value=0 so a team present in only one grouping is kept.
    """
    at_home = df.groupby('HomeTeam')[home_col].sum()
    on_the_road = df.groupby('AwayTeam')[away_col].sum()
    return at_home.add(on_the_road, fill_value=0)


fouls = _season_total('HF', 'AF')
yellows = _season_total('HY', 'AY')
reds = _season_total('HR', 'AR')

# One row per team, ordered from most to fewest fouls.
discipline = pd.DataFrame(
    {'Fouls': fouls, 'Yellows': yellows, 'Reds': reds}
).sort_values(by='Fouls', ascending=False)

# Only fouls and yellow cards are charted; reds are reported in the printout.
ax = discipline[['Fouls', 'Yellows']].plot(kind='bar', figsize=(10, 5), rot=90)
ax.set_title("Team Discipline — Fouls & Yellow Cards")
ax.set_ylabel("Count")
fig = ax.get_figure()
fig.tight_layout()
fig.savefig(FIGS/"team_discipline.png", dpi=180)
plt.close(fig)

print(f"Arsenal fouls: {discipline.loc['Arsenal','Fouls']} | Yellows: {discipline.loc['Arsenal','Yellows']} | Reds: {discipline.loc['Arsenal','Reds']}")

# --- 7) Points Trend Over Time
# Dates in the raw file are day-first (dd/mm/yyyy); the column is overwritten
# with parsed datetimes.
df['Date'] = pd.to_datetime(df['Date'], dayfirst=True)

# Points per side for every fixture, computed in one vectorised pass instead of
# an iterrows() loop. Any result other than 'H' or 'A' is scored as a draw (1/1).
full_time = df['FTR']
home_award = np.where(full_time == 'H', 3, np.where(full_time == 'A', 0, 1))
away_award = np.where(full_time == 'A', 3, np.where(full_time == 'H', 0, 1))

# `match_pts` remains a flat list of (date, team, points) tuples: all home
# entries first, then all away entries. It is sorted by date right below.
match_pts = [
    *zip(df['Date'], df['HomeTeam'], home_award.tolist()),
    *zip(df['Date'], df['AwayTeam'], away_award.tolist()),
]
pts_by_date = pd.DataFrame(match_pts, columns=['Date', 'Team', 'Pts']).sort_values('Date')


def _cumulative_points(team):
    """Return the running points total for `team`, indexed by match date."""
    own_matches = pts_by_date[pts_by_date['Team'] == team]
    return own_matches.groupby('Date')['Pts'].sum().cumsum()


ars_pts_trend = _cumulative_points('Arsenal')
ldr_pts_trend = _cumulative_points(leader_team)

fig, ax = plt.subplots()
ax.plot(ars_pts_trend.index, ars_pts_trend.values, label='Arsenal', color='red')
ax.plot(ldr_pts_trend.index, ldr_pts_trend.values, label=leader_team, color='skyblue')
ax.set_title(f"Cumulative Points Trend — Arsenal vs {leader_team}")
ax.set_xlabel("Date")
ax.set_ylabel("Points")
ax.legend()
fig.tight_layout()
fig.savefig(FIGS/"points_trend.png", dpi=180)
plt.close(fig)

print("[OK] Extra figures saved: goals_per_shot.png, team_discipline.png, points_trend.png")

[OK] Figures saved in reports/figures/
Leader: Man City with 91 points
🔴 Arsenal: 89 points (+2 gap)
Arsenal Home PPG: 2.47, Away PPG: 2.21
Man City Home PPG: 2.47, Away PPG: 2.32
Arsenal Goals/Shot: 0.139 | Man City: 0.139
Arsenal fouls: 391 | Yellows: 62 | Reds: 2
[OK] Extra figures saved: goals_per_shot.png, team_discipline.png, points_trend.png
