# **Copa America 2024**

# Step 1: Imports

In [14]:
!pip install kagglehub

Collecting kagglehub
  Downloading kagglehub-0.3.13-py3-none-any.whl.metadata (38 kB)
Downloading kagglehub-0.3.13-py3-none-any.whl (68 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.3/68.3 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: kagglehub
Successfully installed kagglehub-0.3.13

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


Downloading from https://www.kaggle.com/api/v1/datasets/download/thamersekhri/copa-america-2024-matches-stats?dataset_version_number=1...


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4.93k/4.93k [00:00<00:00, 1.81MB/s]

Extracting files...
Path to dataset files: /Users/derrick/.cache/kagglehub/datasets/thamersekhri/copa-america-2024-matches-stats/versions/1





In [19]:
from soccerdata import FBref
from pathlib import Path
import kagglehub
import pandas as pd
from typing import Dict
import os
from euros import _flatten_cols, _safe_div, _pick_player_pos, player_info


In [24]:

# Fetch the Kaggle dataset (kagglehub returns the local folder it was
# downloaded/extracted to) and wrap that location in a Path.
path = Path(kagglehub.dataset_download("thamersekhri/copa-america-2024-matches-stats"))

# The match-level statistics CSV inside the dataset folder.
file_path = path.joinpath("Copa_2024_Matches.csv")
copa_raw = pd.read_csv(file_path)

# Preview the first rows as the cell output.
copa_raw.head()


Unnamed: 0,stadium,attendance,home_team,away_team,home_goals,away_goals,Home Expected goals(xG),Home Total shots,Home Shots on target,Home Big chances,...,Away Red cards,Away Tackles won,Away Interceptions,Away Blocks,Away Clearances,Away Keeper saves,Away Duels won,Away Ground duels won,Away Aerial duels won,Away Successful dribbles
0,MercedesBenz Stadium,70564,Argentina,Canada,2,0,3.02,19,9,9,...,0,16(55%),7,4,18,6,50,46(58%),4(40%),5(45%)
1,ATT Stadium,43030,Peru,Chile,0,0,0.75,7,4,1,...,0,15(79%),8,1,10,4,67,40(45%),27(56%),5(28%)
2,Levis Stadium,29864,Ecuador,Venezuela,1,2,0.86,9,4,2,...,0,10(67%),5,1,19,4,71,43(52%),28(62%),9(60%)
3,NRG Stadium,53763,Mexico,Jamaica,1,0,0.97,20,9,1,...,0,13(68%),10,5,35,8,61,37(54%),24(52%),10(67%)
4,ATT Stadium,47873,USA,Bolivia,2,0,2.51,20,8,3,...,0,6(43%),16,5,28,6,40,29(40%),11(50%),7(50%)


# Step 2: Output directories

In [3]:
# All pipeline artefacts live under <cwd>/outputs_copa, one folder per stage.
BASE = Path.cwd()
OUT_RAW, OUT_STAGING, OUT_PROCESSED = (
    BASE / "outputs_copa" / stage for stage in ("raw", "staging", "processed")
)

# Create the folders up front; existing ones are left alone.
for p in (OUT_RAW, OUT_STAGING, OUT_PROCESSED):
    p.mkdir(parents=True, exist_ok=True)

# Step 3: Data retrieval

In [5]:
# Print the leagues reported by FBref.available_leagues().
# NOTE(review): the recorded output of this cell has no Copa América entry —
# confirm the league is configured before requesting it from FBref.
print(FBref.available_leagues())

['Big 5 European Leagues Combined', 'ENG-Premier League', 'ESP-La Liga', 'FRA-Ligue 1', 'GER-Bundesliga', 'INT-European Championship', "INT-Women's World Cup", 'INT-World Cup', 'ITA-Serie A']


In [12]:
import io, re, pandas as pd
from bs4 import BeautifulSoup, Comment

# Parse the "standard" player table out of the HTML in `r.text`.
# NOTE(review): `r` is not defined in any visible cell — presumably a
# requests-style response fetched earlier; it must exist before this cell runs.
soup = BeautifulSoup(r.text, "lxml")

# Prefer wrapper(s) like all_stats_standard or all_stats_*standard*
wrapper = soup.find("div", id=re.compile(r"^all_stats_.*standard"))
if wrapper is None:
    # No matching wrapper: list whatever all_stats_* wrappers the page does
    # have, then fall back to scanning every HTML comment on the page.
    candidates = [
        div.get("id")
        for div in soup.find_all("div", id=True)
        if str(div.get("id")).startswith("all_stats_")
    ]
    print("Found wrappers:", candidates)
    comments_scope = soup.find_all(string=lambda s: isinstance(s, Comment))
else:
    comments_scope = wrapper.find_all(string=lambda s: isinstance(s, Comment))

# First comment that embeds an HTML table with player rows and mentions
# "standard" (this also matches the more specific "stats_standard" id).
comment_with_table = next(
    (
        c for c in comments_scope
        if "<table" in c and 'data-stat="player"' in c and "standard" in c
    ),
    None,
)

if comment_with_table is None:
    # Put the diagnostics in the error itself: no earlier cell prints a page
    # preview, so without this the failure cannot be investigated.
    status = getattr(r, "status_code", "unknown")
    preview = " ".join(r.text[:300].split())
    raise RuntimeError(
        "Found no commented standard table; you may be blocked. "
        f"HTTP status: {status}; page starts with: {preview!r}"
    )

# The comment body is plain HTML, so pandas can parse it directly.
tables = pd.read_html(io.StringIO(comment_with_table), flavor="lxml", displayed_only=False)
copa_df = tables[0]

print("Copa parsed shape:", copa_df.shape)
print(copa_df.columns[:12])
print(copa_df.head(3))

Found wrappers: []


RuntimeError: Found no commented standard table. Check the page preview prints above; you may be blocked.

In [None]:
def build_player_agg(year: str = "2024") -> pd.DataFrame:
    """
    Build a one-row-per-player aggregate for Copa América {year}.

    Pipeline: fetch the FBref "standard" player season stats, flatten the
    columns, rename them to the canonical schema given by ``player_info``,
    coerce numerics, filter/de-duplicate, derive per-90 and usage metrics,
    and save each stage to disk.

    Files written:
      * raw       -> ``OUT_RAW / player_standard_copa_{year}.pkl`` (pickle)
      * staging   -> ``OUT_STAGING / player_standard_copa_{year}.parquet``
      * processed -> ``OUT_PROCESSED / player_agg_copa_{year}.parquet``

    Parameters
    ----------
    year : str, default "2024"
        Season passed to ``FBref(seasons=[year])`` and embedded in file names.

    Returns
    -------
    pd.DataFrame
        The processed aggregate restricted to the columns listed in step 8.
    """
    # League requested from FBref and used again by the QA filter below, so
    # the two can never disagree.
    # NOTE(review): the available_leagues() output recorded in this notebook
    # does not list this league; presumably it needs a custom league entry in
    # the soccerdata configuration — confirm before running.
    league = "INT-Copa América"

    # 1) fetch raw standard table
    fb = FBref(leagues=[league], seasons=[year])
    season_stats = fb.read_player_season_stats(stat_type="standard")

    # The raw snapshot is a pickle, so name (and later report) it as such.
    raw_path = OUT_RAW / f"player_standard_copa_{year}.pkl"
    season_stats.to_pickle(raw_path)

    # 2) flatten & stage
    stats = _flatten_cols(season_stats)
    stats_path = OUT_STAGING / f"player_standard_copa_{year}.parquet"
    stats.to_parquet(stats_path, engine='fastparquet')

    # 3) ensure all mapped columns exist
    missing = [k for k in player_info.keys() if k not in stats.columns]
    for m in missing:
        stats[m] = pd.NA

    # 4) rename to canonical schema
    agg = stats[list(player_info.keys())].rename(columns=player_info).copy()
    agg["tournament"] = "COPA"

    # 5) coerce types for numerics (prevents string math weirdness)
    numeric_cols = [
        "age","birth_year","mp","starts","minutes","nineties",
        "gls","ast","ga","g_pk","xg","xag","xg_xag","npxg","npxg_xag",
        "gls_90","ast_90","ga_90","g_pk_90",
    ]
    for c in numeric_cols:
        if c in agg.columns:
            agg[c] = pd.to_numeric(agg[c], errors="coerce")

    # tournament year as int for clean grouping
    agg["tournament_year"] = pd.to_numeric(agg["tournament_year"], errors="coerce").astype("Int64")

    # 6) QA guards — run before the derived metrics so duplicate rows cannot
    #    inflate the per-team minute totals used for minutes_share.
    #    .copy() keeps later column assignments off a filtered slice.
    agg = agg[agg["league"].eq(league)]
    agg = agg.drop_duplicates(subset=["player_name", "player_country", "tournament_year"]).copy()

    # 7) derivatives
    agg["primary_pos"] = agg["pos_raw"].apply(_pick_player_pos)

    n90 = agg["nineties"]
    for src, dst in [
        ("xg", "xg_90"), ("xag", "xag_90"), ("xg_xag", "xg_xag_90"),
        ("npxg", "npxg_90"), ("npxg_xag", "npxg_xag_90"),
    ]:
        # only fill per-90 columns the source table did not already provide
        if dst not in agg.columns:
            agg[dst] = _safe_div(agg[src], n90)

    # usage
    team_minutes = agg.groupby(["team_name", "tournament_year"])["minutes"].transform("sum")
    agg["minutes_share"] = _safe_div(agg["minutes"], team_minutes)
    agg["starter_rate"] = _safe_div(agg["starts"], agg["mp"])

    # Bounds carried over unchanged; 720 = 8 x 90 minutes.
    # NOTE(review): presumably a generous per-tournament ceiling — confirm it
    # suits the Copa América format.
    agg["minutes"] = agg["minutes"].clip(lower=0, upper=720)
    agg["minutes_share"] = agg["minutes_share"].clip(lower=0, upper=1.25)

    # 8) final column order
    cols = [
        "player_name","player_country","team_name","tournament","tournament_year",
        "age","birth_year","pos_raw","primary_pos",
        "mp","starts","minutes","nineties","minutes_share","starter_rate",
        "gls","ast","ga","xg","xag","xg_xag","npxg","npxg_xag",
        "gls_90","ast_90","ga_90","xg_90","xag_90","xg_xag_90","npxg_90","npxg_xag_90",
    ]
    for c in cols:
        if c not in agg.columns:
            agg[c] = pd.NA
    agg = agg[cols]

    # 9) save processed
    out_path = OUT_PROCESSED / f"player_agg_copa_{year}.parquet"
    agg.to_parquet(out_path, engine='fastparquet')

    # tiny sanity prints (optional)
    print(f"Saved raw      → {raw_path}")
    print(f"Saved staging  → {stats_path}")
    print(f"Saved processed→ {out_path} ({len(agg)} rows, {len(agg.columns)} cols)")
    return agg

In [None]:
# Keep the result so later cells can reuse it without re-fetching, and
# preview a few rows instead of echoing the whole frame.
copa_player_agg = build_player_agg()
copa_player_agg.head()