In [2]:
# Imports & display preferences
from pathlib import Path
from typing import Dict, List

import os
import numpy as np
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Notebook-wide pandas display options: show more columns and use a wider text layout
for _option, _value in {"display.max_columns": 100, "display.width": 140}.items():
    pd.set_option(_option, _value)


In [3]:
# Paths & basic config
# The data root can be overridden with the FOOTBALL_DATA_DIR environment variable so
# the notebook is not tied to a single machine; the original location stays the default.
BASE_DIR = Path(
    os.environ.get("FOOTBALL_DATA_DIR", r"C:\Users\Vex\Desktop\football\dataset\season")
)
TEAM_DIR = BASE_DIR / "bundesliga23_24"  # per-team / per-player CSVs for the season
HIGHLIGHT_TEAM = "Bayer Leverkusen"  # team to highlight in charts

# Required metric columns: {csv stem -> column header to plot}
# Keys must match the lowercase snake_case keys produced when the CSVs are loaded;
# values are the column headers looked up inside each table.
METRIC_COLUMNS: Dict[str, str] = {
    "touches_in_opp_box_team": "Touches in Opposition Box",
    "corner_taken_team": "Corners Taken",
    "penalty_won_team": "Penalties Won",
    "expected_goals_team": "Expected Goals",
    "team_goals_per_match": "Goals per Match",
    "ontarget_scoring_att_team": "Shots on Target per Match",
    "possession_won_att_3rd_team": "Possession Won Final 3rd per Match",
    "big_chance_team": "Big Chances",
    "won_tackle_team": "Successful Tackles per Match",
    "clean_sheet_team": "Clean Sheets",
    "goals_conceded_team_match": "Goals Conceded per Match",
    "expected_goals_conceded_team": "Expected Goals Conceded",
    "interception_team": "Interceptions per Match",
    "penalty_conceded_team": "Penalties Conceded",
    "saves_team": "Saves per Match",
    "effective_clearance_team": "Clearances per Match",
}


In [4]:
# Lightweight IO/utils
def _key_from_filename(p: Path) -> str:
    """Turn a filename stem into a lowercase snake_case key."""
    return p.stem.lower().replace(" ", "_").replace("-", "_")

def _read_csv_robust(p: Path) -> pd.DataFrame:
    """Read a CSV; fall back to latin-1 if UTF-8 fails."""
    try:
        return pd.read_csv(p)
    except UnicodeDecodeError:
        return pd.read_csv(p, encoding="latin-1")

def load_csvs(team_dir: Path, base_dir: Path) -> Dict[str, pd.DataFrame]:
    """
    Read every ``*.csv`` directly inside ``team_dir`` and ``base_dir`` into one dict.

    Keys come from ``_key_from_filename``; tables found in ``base_dir`` get a
    ``season_`` prefix. Files are visited in sorted path order, team tables first.
    """
    team_tables = {
        _key_from_filename(csv_path): _read_csv_robust(csv_path)
        for csv_path in sorted(team_dir.glob("*.csv"))
    }
    season_tables = {
        "season_" + _key_from_filename(csv_path): _read_csv_robust(csv_path)
        for csv_path in sorted(base_dir.glob("*.csv"))
    }
    # Season entries are merged last, so they win on a key clash.
    return {**team_tables, **season_tables}

def numify(df: pd.DataFrame, cols: List[str]) -> pd.DataFrame:
    """
    Convert the listed columns to numeric dtype in place and return the same frame.

    Columns not present in ``df`` are skipped; values that cannot be parsed become NaN.
    """
    present = [name for name in cols if name in df.columns]
    for name in present:
        df[name] = pd.to_numeric(df[name], errors="coerce")
    return df

def get_df(store: Dict[str, pd.DataFrame], key: str) -> pd.DataFrame:
    """
    Return a copy of the table stored under ``key``.

    Raises:
        KeyError: if ``key`` is absent from ``store`` (or maps to ``None``).
    """
    table = store.get(key)
    if table is not None:
        return table.copy()
    raise KeyError(f"'{key}' not found in dataframes_dict")


In [5]:
# Load every team-level and season-level CSV into one {key -> DataFrame} store
dataframes_dict: Dict[str, pd.DataFrame] = load_csvs(team_dir=TEAM_DIR, base_dir=BASE_DIR)

# Sanity check: how many tables were found, plus the first 20 keys alphabetically
loaded_keys = sorted(dataframes_dict)
print(f"Loaded {len(loaded_keys)} tables. Sample keys:", loaded_keys[:20])


Loaded 62 tables. Sample keys: ['accurate_cross_team', 'accurate_long_balls_team', 'accurate_pass_team', 'big_chance_missed_team', 'big_chance_team', 'clean_sheet_team', 'corner_taken_team', 'effective_clearance_team', 'expected_goals_conceded_team', 'expected_goals_team', 'fk_foul_lost_team', 'goals_conceded_team_match', 'interception_team', 'ontarget_scoring_att_team', 'penalty_conceded_team', 'penalty_won_team', 'player_accurate_long_balls', 'player_accurate_passes', 'player_big_chances_created', 'player_big_chances_missed']


In [6]:
# Audit each required metric table and collect the usable ones for plotting
print("== Column audit ==")
plots_data: Dict[str, pd.DataFrame] = {}  # {pretty_name -> df[['Team', pretty]]}

for key, pretty in METRIC_COLUMNS.items():
    df = dataframes_dict.get(key)
    if df is None:
        print(f"[MISS] {key:30s} -> file not loaded")
        continue

    has_team, has_metric = "Team" in df.columns, pretty in df.columns
    usable = has_team and has_metric
    status = "OK  " if usable else "WARN"
    print(f"[{status}] {key:30s} -> Team:{has_team}, '{pretty}':{has_metric}")

    if usable:
        # "Conceded" metrics are better when lower, so those are sorted ascending;
        # everything else is sorted with the largest value first.
        ascending = "conceded" in pretty.lower()
        df_sorted = df.sort_values(by=pretty, ascending=ascending).reset_index(drop=True)
        plots_data[pretty] = df_sorted[["Team", pretty]]

# Nothing to plot means every table was missing or malformed; stop here.
if len(plots_data) == 0:
    raise ValueError("No valid metric tables found; check the audit above.")


== Column audit ==
[OK  ] touches_in_opp_box_team        -> Team:True, 'Touches in Opposition Box':True
[OK  ] corner_taken_team              -> Team:True, 'Corners Taken':True
[OK  ] penalty_won_team               -> Team:True, 'Penalties Won':True
[OK  ] expected_goals_team            -> Team:True, 'Expected Goals':True
[OK  ] team_goals_per_match           -> Team:True, 'Goals per Match':True
[OK  ] ontarget_scoring_att_team      -> Team:True, 'Shots on Target per Match':True
[OK  ] possession_won_att_3rd_team    -> Team:True, 'Possession Won Final 3rd per Match':True
[OK  ] big_chance_team                -> Team:True, 'Big Chances':True
[OK  ] won_tackle_team                -> Team:True, 'Successful Tackles per Match':True
[OK  ] clean_sheet_team               -> Team:True, 'Clean Sheets':True
[OK  ] goals_conceded_team_match      -> Team:True, 'Goals Conceded per Match':True
[OK  ] expected_goals_conceded_team   -> Team:True, 'Expected Goals Conceded':True
[OK  ] interception_team

In [7]:
# Grid of horizontal bars across key metrics
# The grid shape is derived from how many metrics passed the audit, so the cell keeps
# working if METRIC_COLUMNS grows past 16 entries or some tables are missing.
N_GRID_COLS = 4
ROW_HEIGHT_PX = 500  # 4 rows -> 2000 px, the original figure height
n_grid_rows = max(1, -(-len(plots_data) // N_GRID_COLS))  # ceiling division

fig = make_subplots(
    rows=n_grid_rows, cols=N_GRID_COLS,
    subplot_titles=list(plots_data.keys()),
    specs=[[{"type": "bar"}] * N_GRID_COLS for _ in range(n_grid_rows)],
)

for i, (metric_name, df_metric) in enumerate(plots_data.items()):
    # Fill the grid row by row; plotly subplot indices are 1-based.
    row = i // N_GRID_COLS + 1
    col = i % N_GRID_COLS + 1
    metric_values = df_metric[df_metric.columns[1]]  # second column holds the metric
    colors = ["crimson" if t == HIGHLIGHT_TEAM else "skyblue" for t in df_metric["Team"]]
    fig.add_trace(
        go.Bar(
            y=df_metric["Team"],
            x=metric_values,
            name=metric_name,
            marker_color=colors,
            orientation="h",
            text=metric_values,
            textposition="auto",
            hovertemplate="<b>%{y}</b><br>" + metric_name + ": %{x}<extra></extra>",
        ),
        row=row, col=col
    )

fig.update_layout(
    title_text=f"{HIGHLIGHT_TEAM} vs All Teams — Key Metrics",
    height=ROW_HEIGHT_PX * n_grid_rows,
    width=1200,
    showlegend=False,
    margin=dict(l=120, r=40, t=80, b=40),
    template="plotly_white",
)
# Each metric has its own scale, so x axes must not be linked.
fig.update_xaxes(matches=None)
# Horizontal bars are drawn bottom-up in data order; reverse the axis so the
# first (best-ranked) row of each sorted table appears at the top.
fig.update_yaxes(autorange="reversed")
fig.show()


In [8]:
# Pull the season standings table out of the store and inspect it
from IPython.display import display

SEASON_KEY = "season_bundesliga_table_2023_24"
df_season = dataframes_dict.get(SEASON_KEY)
if df_season is None:
    raise KeyError(f"Missing '{SEASON_KEY}' in dataframes_dict. Check your loading cell.")

# Work on a copy so the stored table stays untouched
df_season = df_season.copy()

# Display settings for notebook output: show all rows/cols of the table
for _option, _value in (
    ("display.max_rows", 200),
    ("display.max_columns", 100),
    ("display.width", 140),
):
    pd.set_option(_option, _value)

print("Columns:", df_season.columns.tolist())
print("Rows:", df_season.shape[0])
display(df_season)


Columns: ['idx', 'name', 'played', 'wins', 'draws', 'losses', 'scoresStr', 'goalConDiff', 'pts']
Rows: 18


Unnamed: 0,idx,name,played,wins,draws,losses,scoresStr,goalConDiff,pts
0,1,Bayer Leverkusen,34,28,6,0,89-24,65,90
1,2,VfB Stuttgart,34,23,4,7,78-39,39,73
2,3,Bayern München,34,23,3,8,94-45,49,72
3,4,RB Leipzig,34,19,8,7,77-39,38,65
4,5,Borussia Dortmund,34,18,9,7,68-43,25,63
5,6,Eintracht Frankfurt,34,11,14,9,51-50,1,47
6,7,Hoffenheim,34,13,7,14,66-66,0,46
7,8,FC Heidenheim,34,10,12,12,50-55,-5,42
8,9,Werder Bremen,34,11,9,14,48-54,-6,42
9,10,Freiburg,34,11,9,14,45-58,-13,42


In [9]:
# Normalize common season columns and parse goals for/against
def _coerce_int(series: pd.Series) -> pd.Series:
    """Parse a column as numbers, replace unparseable/missing values with 0, cast to int."""
    return pd.to_numeric(series, errors="coerce").fillna(0).astype(int)


def _first_present(frame: pd.DataFrame, candidates):
    """Return the first candidate header that exists in ``frame``, or None."""
    return next((c for c in candidates if c in frame.columns), None)


df = df_season.copy()

# Mirror 'Team' -> 'name' if not present
if "Team" in df.columns and "name" not in df.columns:
    df["name"] = df["Team"]

# Standard field -> accepted source headers, in priority order
numeric_like = {
    "pts": ["pts", "Points"],
    "wins": ["wins", "W"],
    "draws": ["draws", "D"],
    "losses": ["losses", "L"],
    "goalConDiff": ["goalConDiff", "Goal Difference", "goal_diff", "GD"],
}
for target, candidates in numeric_like.items():
    if target in df.columns:
        continue  # already in standard form
    source = _first_present(df, candidates)
    if source is not None:
        df[target] = df[source]

if "scoresStr" in df.columns:
    # Compact score strings (e.g., "72-24"): first number is goals for, second goals against
    parsed = df["scoresStr"].astype(str).str.extract(r"(\d+)\s*[-:]\s*(\d+)")
    parsed = parsed.rename(columns={0: "goals_scored", 1: "goals_conceded"})
    for col in ("goals_scored", "goals_conceded"):
        df[col] = _coerce_int(parsed[col])
else:
    # No score string: fall back to GF/GA variants if present
    goal_fallbacks = {
        "goals_scored": ["GF", "Goals For", "goalsFor"],
        "goals_conceded": ["GA", "Goals Against", "goalsAgainst"],
    }
    for target, candidates in goal_fallbacks.items():
        if target in df.columns:
            continue
        source = _first_present(df, candidates)
        if source is not None:
            df[target] = _coerce_int(df[source])

# Final type cleanup: standings fields as plain ints
for c in ("pts", "wins", "draws", "losses", "goalConDiff"):
    if c in df.columns:
        df[c] = _coerce_int(df[c])

# Quick peek of normalized columns (relies on display settings)
summary_cols = ["name", "pts", "wins", "draws", "losses", "goalConDiff", "goals_scored", "goals_conceded"]
df[[c for c in summary_cols if c in df.columns]]


Unnamed: 0,name,pts,wins,draws,losses,goalConDiff,goals_scored,goals_conceded
0,Bayer Leverkusen,90,28,6,0,65,89,24
1,VfB Stuttgart,73,23,4,7,39,78,39
2,Bayern München,72,23,3,8,49,94,45
3,RB Leipzig,65,19,8,7,38,77,39
4,Borussia Dortmund,63,18,9,7,25,68,43
5,Eintracht Frankfurt,47,11,14,9,1,51,50
6,Hoffenheim,46,13,7,14,0,66,66
7,FC Heidenheim,42,10,12,12,-5,50,55
8,Werder Bremen,42,11,9,14,-6,48,54
9,Freiburg,42,11,9,14,-13,45,58


In [10]:
# Bar chart of teams ordered by points (skipped when the needed columns are absent)
if any(c not in df.columns for c in ["name", "pts"]):
    print("Skipping rankings plot (missing 'name' or 'pts').")
else:
    df_by_pts = df.sort_values(by="pts", ascending=False)
    ranking_fig = px.bar(
        df_by_pts,
        x="name",
        y="pts",
        text="pts",
        color="pts",
        title="Overall Team Rankings by Points",
        labels={"name": "Team", "pts": "Points"},
        template="plotly_white",
    )
    # Point totals above the bars; tilt team names so they do not overlap
    ranking_fig.update_traces(textposition="outside")
    ranking_fig.update_layout(xaxis_tickangle=-30)
    ranking_fig.show()


In [11]:
# Grouped bars of wins / draws / losses per team
outcome_cols = ["wins", "draws", "losses"]
if any(c not in df.columns for c in ["name", *outcome_cols]):
    print("Skipping W/D/L plot (missing wins/draws/losses).")
else:
    win_loss_fig = px.bar(
        df,
        x="name",
        y=outcome_cols,
        barmode="group",
        title="Win / Draw / Loss Analysis",
        labels={"value": "Matches", "name": "Team"},
        template="plotly_white",
    )
    win_loss_fig.update_layout(legend_title="Outcome", xaxis_tickangle=-30)
    win_loss_fig.show()

# Goal difference per team, largest first
if any(c not in df.columns for c in ["name", "goalConDiff"]):
    print("Skipping Goal Difference plot (missing goalConDiff).")
else:
    df_by_goal_diff = df.sort_values(by="goalConDiff", ascending=False)
    goal_diff_fig = px.bar(
        df_by_goal_diff,
        x="name",
        y="goalConDiff",
        text="goalConDiff",
        color="goalConDiff",
        title="Goal Difference Analysis",
        labels={"name": "Team", "goalConDiff": "Goal Difference"},
        template="plotly_white",
    )
    goal_diff_fig.update_traces(textposition="outside")
    goal_diff_fig.update_layout(xaxis_tickangle=-30)
    goal_diff_fig.show()


In [12]:
# Scatter of goals scored against goals conceded; bubbles are sized by points when available
if any(col not in df.columns for col in ["name", "goals_scored", "goals_conceded"]):
    print("Skipping scoring efficiency plot (missing goals_scored/goals_conceded).")
else:
    points_size = "pts" if "pts" in df.columns else None
    scoring_efficiency_fig = px.scatter(
        df,
        x="goals_scored",
        y="goals_conceded",
        size=points_size,
        color="name",
        hover_name="name",
        title="Scoring Efficiency: Goals Scored vs. Goals Conceded",
        labels={"goals_scored": "Goals Scored", "goals_conceded": "Goals Conceded", "name": "Team"},
    )
    scoring_efficiency_fig.update_layout(legend_title="Teams")
    scoring_efficiency_fig.show()

# Scatter of goal difference against points; bubbles are sized by wins when available
if any(col not in df.columns for col in ["goalConDiff", "pts", "name"]):
    print("Skipping correlation plot (missing goalConDiff/pts/name).")
else:
    wins_size = "wins" if "wins" in df.columns else None
    corr_fig = px.scatter(
        df,
        x="goalConDiff",
        y="pts",
        size=wins_size,
        color="name",
        hover_name="name",
        title="Correlation Between Goal Difference and Points",
        labels={"goalConDiff": "Goal Difference", "pts": "Points"},
    )
    corr_fig.update_layout(legend_title="Teams")
    corr_fig.show()


In [13]:
# Top 15 goal scorers
df_scorers = numify(get_df(dataframes_dict, "player_top_scorers"), ["Goals"])
top_scorers = (
    df_scorers[["Player", "Team", "Goals"]]
    .sort_values("Goals", ascending=False)
    .head(15)
    # Axis label combining player and team, e.g. "Name (Club)"
    .assign(Player_Team=lambda t: t["Player"] + " (" + t["Team"] + ")")
)

fig_goals = px.bar(
    top_scorers,
    x="Goals",
    y="Player_Team",
    orientation="h",
    title="Top 15 Players by Goals",
    text="Goals",
    color="Team",
    labels={"Goals": "Goals", "Player_Team": "Player (Team)"},
    template="plotly_white",
)
fig_goals.update_traces(textposition="outside")
fig_goals.update_layout(
    xaxis_title="Goals",
    yaxis_title="Player (Team)",
    # color="Team" creates one trace per team, and the default category order follows
    # the traces, which groups players by team instead of by goals. Order the axis by
    # bar value so the top scorer sits at the top of the chart.
    yaxis_categoryorder="total ascending",
)
fig_goals.show()


In [14]:
# Top 15 by assists
df_assists = numify(get_df(dataframes_dict, "player_top_assists"), ["Assists"])
top_assists = (
    df_assists[["Player", "Team", "Assists"]]
    .sort_values("Assists", ascending=False)
    .head(15)
    # Axis label combining player and team, e.g. "Name (Club)"
    .assign(Player_Team=lambda t: t["Player"] + " (" + t["Team"] + ")")
)

fig_assists = px.bar(
    top_assists,
    x="Assists",
    y="Player_Team",
    orientation="h",
    title="Top 15 Players by Assists",
    text="Assists",
    color="Team",
    labels={"Assists": "Assists", "Player_Team": "Player (Team)"},
    template="plotly_white",
)
fig_assists.update_traces(textposition="outside")
fig_assists.update_layout(
    xaxis_title="Assists",
    yaxis_title="Player (Team)",
    # color="Team" creates one trace per team, and the default category order follows
    # the traces, which groups players by team instead of by assists. Order the axis by
    # bar value so the top provider sits at the top of the chart.
    yaxis_categoryorder="total ascending",
)
fig_assists.show()


In [15]:
# Combined goals + assists (stacked bars)
# Both tables are re-read here so the cell does not depend on earlier plotting cells.
df_assists = numify(get_df(dataframes_dict, "player_top_assists"), ["Assists"])
df_scorers = numify(get_df(dataframes_dict, "player_top_scorers"), ["Goals"])

A = df_assists[["Player", "Team", "Assists"]]
G = df_scorers[["Player", "Team", "Goals"]]

# Outer merge keeps players who appear in only one table; their missing count is 0.
# Only the count columns are filled, so a missing Player/Team is not turned into 0
# (which would break the string concatenation below).
top_combined = (
    pd.merge(A, G, on=["Player", "Team"], how="outer")
    .fillna({"Assists": 0, "Goals": 0})
)
top_combined["Total"] = top_combined["Goals"] + top_combined["Assists"]
top_combined["Player_Team"] = top_combined["Player"] + " (" + top_combined["Team"] + ")"
top_combined = top_combined.sort_values("Total", ascending=False).head(20)

# Simple team color mapping (palette is cycled if there are more teams than colors)
palette = px.colors.qualitative.Plotly
teams = list(top_combined["Team"].unique())
team_colors = {t: palette[i % len(palette)] for i, t in enumerate(teams)}
bar_colors = [team_colors[t] for t in top_combined["Team"]]

fig = go.Figure()
fig.add_trace(go.Bar(
    x=top_combined["Goals"],
    y=top_combined["Player_Team"],
    name="Goals",
    orientation="h",
    marker=dict(color=bar_colors),
    text=top_combined["Goals"],
    textposition="inside"
))
# Assists share the team color but are drawn lighter to tell the two segments apart.
fig.add_trace(go.Bar(
    x=top_combined["Assists"],
    y=top_combined["Player_Team"],
    name="Assists",
    orientation="h",
    marker=dict(color=bar_colors, opacity=0.6),
    text=top_combined["Assists"],
    textposition="inside"
))

fig.update_layout(
    title="Top 20 Players by Combined Goals + Assists",
    xaxis_title="Count (Goals + Assists)",
    yaxis_title="Player (Team)",
    template="plotly_white",
    barmode="stack",
    showlegend=True,
    # Horizontal bars are drawn bottom-up; order by stacked total so the player
    # with the highest combined count is at the top of the chart.
    yaxis_categoryorder="total ascending",
)
fig.show()


In [16]:
# Expected vs. actual output for the top 20 players of each "expected" metric.
# The three charts share one pipeline, so it lives in two small helpers.
def _top_by_expected(table_key, expected_col, actual_col, n=20):
    """Return (numeric player table, its top-n rows by expected_col) for table_key."""
    table = numify(get_df(dataframes_dict, table_key), [expected_col, actual_col])
    top = (
        table[["Player", "Team", expected_col, actual_col]]
        .sort_values(expected_col, ascending=False)
        .head(n)
    )
    return table, top


def _expected_vs_actual_scatter(top, expected_col, actual_col, title, expected_label=None):
    """Scatter expected_col (x) against actual_col (y), colored by team."""
    return px.scatter(
        top,
        x=expected_col,
        y=actual_col,
        color="Team",
        hover_data=["Player"],
        title=title,
        # expected_label overrides the x-axis label; by default the header is used as-is
        labels={expected_col: expected_label or expected_col, actual_col: actual_col},
        template="plotly_white",
    )


# xA vs actual assists
df_xa, xa_top = _top_by_expected(
    "player_expected_assists", "Expected Assists (xA)", "Actual Assists"
)
fig_xa = _expected_vs_actual_scatter(
    xa_top,
    "Expected Assists (xA)",
    "Actual Assists",
    title="Top Players: Expected Assists (xA) vs Actual Assists",
)
fig_xa.show()

# xG vs actual goals
df_xg, xg_top = _top_by_expected(
    "player_expected_goals", "Expected Goals (xG)", "Actual Goals"
)
fig_xg = _expected_vs_actual_scatter(
    xg_top,
    "Expected Goals (xG)",
    "Actual Goals",
    title="Top Players: Expected Goals (xG) vs Actual Goals",
)
fig_xg.show()

# xGOT vs actual goals
df_xgot, xgot_top = _top_by_expected(
    "player_expected_goals_on_target", "Expected Goals on Target (xGOT)", "Actual Goals"
)
fig_xgot = _expected_vs_actual_scatter(
    xgot_top,
    "Expected Goals on Target (xGOT)",
    "Actual Goals",
    title="Top Players: Expected Goals on Target (xGOT) vs Actual Goals",
    expected_label="xGOT",
)
fig_xgot.show()
