In [29]:
import json
import sys
from pathlib import Path
import pandas as pd

## Single-match functions

In [30]:
def load_match_data(match_id, base_dir="../data/open-data/data"):
    """
    Load and join Events + 360 freeze-frame data for a single match.
    
    Args:
        match_id: Match ID (string or int)
        base_dir: Base directory for StatsBomb data
    
    Returns:
        List of combined event dictionaries with freeze-frame data attached
    """
    
    base = Path(base_dir)
    events_file = base / "events" / f"{match_id}.json"
    three_sixty_file = base / "three-sixty" / f"{match_id}.json"

    # Load events
    with open(events_file, "r", encoding="utf-8") as f:
        events = json.load(f)

    # Load freeze frames
    with open(three_sixty_file, "r", encoding="utf-8") as f:
        freeze_frames = json.load(f)

    # Build lookup dict for freeze frames
    freeze_lookup = {ff["event_uuid"]: ff for ff in freeze_frames}

    # Merge where possible
    combined = []
    for ev in events:
        ev_id = ev.get("id")
        if ev_id in freeze_lookup:
            ff = freeze_lookup[ev_id]
            ev["freeze_frame"] = ff.get("freeze_frame", [])
            ev["visible_area"] = ff.get("visible_area", [])
        combined.append(ev)

    return combined

In [31]:
match_id = "3788741"

match_data = load_match_data(match_id)
print(f"Loaded {len(match_data)} events")

Loaded 3803 events


In [33]:
def build_events_dataframe(combined):
    """
    Build events DataFrame from combined event list.
    One row per event, no nested lists.
    
    Args:
        combined: List of combined event dictionaries
    
    Returns:
        DataFrame with one row per event
    """
    
    rows = []
    for ev in combined:
        row = {
            "event_id": ev.get("id"),
            "match_id": ev.get("match_id"),
            "team": ev.get("team", {}).get("name"),
            "player": ev.get("player", {}).get("name"),
            "type": ev.get("type", {}).get("name"),
            "minute": ev.get("minute"),
            "second": ev.get("second"),
            "location": ev.get("location"),  # [x,y]
            "freeze_frame": ev.get("freeze_frame", []),
            "visible_area": ev.get("visible_area", []),
        }

        # For Pass, Carry, Shot: add end_location
        if row["type"] == "Pass":
            row["end_location"] = ev.get("pass", {}).get("end_location")
        elif row["type"] == "Carry":
            row["end_location"] = ev.get("carry", {}).get("end_location")
        elif row["type"] == "Shot":
            row["end_location"] = ev.get("shot", {}).get("end_location")

        rows.append(row)

    return pd.DataFrame(rows)


def build_freeze_frames_dataframe(events_df):
    """
    Create freeze-frames DataFrame from combined event list.
    One row per event, no nested lists.
    
    Args:
        combined: List of combined event dictionaries
    """
    rows = []
    for _, ev in events_df.iterrows():
        event_id = ev["event_id"]
        team = ev["team"]
        player = ev.get("player")  # actor name from events JSON
        ev_type = ev["type"]
        minute, second = ev["minute"], ev["second"]
        location = ev["location"]
        end_location = ev.get("end_location")

        for ff in ev["freeze_frame"]:
            rows.append({
                "event_id": event_id,
                "team": team,
                "player_name": player, 
                "event_type": ev_type,
                "minute": minute,
                "second": second,
                "event_location": location,
                "end_location": end_location,
                "ff_location": ff.get("location"),   # [x,y]
                "teammate": ff.get("teammate"),
                "actor": ff.get("actor"),
                "keeper": ff.get("keeper"),
            })

    return pd.DataFrame(rows)

In [34]:
combined = load_match_data("3788741")

# Build events DataFrame (one row per event)
events_df = build_events_dataframe(combined)

freeze_frames_df = build_freeze_frames_dataframe(events_df)

x = freeze_frames_df[freeze_frames_df['event_id'] == "5c888f58-fe77-459b-ab3b-a2fa5fb8ab16"]
x

Unnamed: 0,event_id,team,player_name,event_type,minute,second,event_location,end_location,ff_location,teammate,actor,keeper
0,5c888f58-fe77-459b-ab3b-a2fa5fb8ab16,Turkey,Burak Yılmaz,Pass,0,0,"[60.0, 40.0]","[32.5, 28.8]","[43.65321648946649, 31.98843233703487]",False,False,False
1,5c888f58-fe77-459b-ab3b-a2fa5fb8ab16,Turkey,Burak Yılmaz,Pass,0,0,"[60.0, 40.0]","[32.5, 28.8]","[43.997359300214555, 45.59948277519069]",False,False,False
2,5c888f58-fe77-459b-ab3b-a2fa5fb8ab16,Turkey,Burak Yılmaz,Pass,0,0,"[60.0, 40.0]","[32.5, 28.8]","[49.675715677557776, 35.43426789099837]",False,False,False
3,5c888f58-fe77-459b-ab3b-a2fa5fb8ab16,Turkey,Burak Yılmaz,Pass,0,0,"[60.0, 40.0]","[32.5, 28.8]","[54.49371502803081, 30.093222782354946]",False,False,False
4,5c888f58-fe77-459b-ab3b-a2fa5fb8ab16,Turkey,Burak Yılmaz,Pass,0,0,"[60.0, 40.0]","[32.5, 28.8]","[58.27928594625963, 48.52844299605966]",False,False,False
5,5c888f58-fe77-459b-ab3b-a2fa5fb8ab16,Turkey,Burak Yılmaz,Pass,0,0,"[60.0, 40.0]","[32.5, 28.8]","[60.0, 40.0]",True,True,False
6,5c888f58-fe77-459b-ab3b-a2fa5fb8ab16,Turkey,Burak Yılmaz,Pass,0,0,"[60.0, 40.0]","[32.5, 28.8]","[60.344142810748075, 30.782389893147645]",False,False,False
7,5c888f58-fe77-459b-ab3b-a2fa5fb8ab16,Turkey,Burak Yılmaz,Pass,0,0,"[60.0, 40.0]","[32.5, 28.8]","[61.20449983761826, 48.35615121836148]",True,False,False
8,5c888f58-fe77-459b-ab3b-a2fa5fb8ab16,Turkey,Burak Yılmaz,Pass,0,0,"[60.0, 40.0]","[32.5, 28.8]","[67.3990704310836, 33.022183003223915]",True,False,False
9,5c888f58-fe77-459b-ab3b-a2fa5fb8ab16,Turkey,Burak Yılmaz,Pass,0,0,"[60.0, 40.0]","[32.5, 28.8]","[68.94771307944993, 43.704273220510764]",True,False,False


In [35]:
def load_lineup_dataframe(match_id, base_dir="../data/open-data/data"):
    """
    Load lineup JSON into a DataFrame.
    
    Args:
        match_id: Match ID (string or int)
        base_dir: Base directory for StatsBomb data
    
    Returns:
        DataFrame with columns: team_name, player_name, player_id, position
    """
    lineup_file = Path(base_dir) / "lineups" / f"{match_id}.json"
    with open(lineup_file, "r", encoding="utf-8") as f:
        lineup_data = json.load(f)
    
    rows = []
    for team in lineup_data:
        team_name = team["team_name"]
        for p in team["lineup"]:
            pname = p["player_name"]
            pid = p["player_id"]
            # take first listed position (simplification)
            pos = p["positions"][0]["position"] if p.get("positions") else None
            rows.append({
                "team_name": team_name,
                "player_name": pname,
                "player_id": pid,
                "position": pos
            })
    
    return pd.DataFrame(rows)

def attach_actor_metadata(events_df, lineup_df):
    """
    Merge player_id and position from lineup into events DataFrame.
    
    Args:
        events_df: Events DataFrame
        lineup_df: Lineup DataFrame from load_lineup_dataframe()
    
    Returns:
        DataFrame with player_id and position columns added
    """
    events_df = events_df.copy()
    
    # Create lowercase columns for matching
    events_df["team_lower"] = events_df["team"].str.lower()
    events_df["player_lower"] = events_df["player"].str.lower()
    
    lineup_df = lineup_df.copy()
    lineup_df["team_name_lower"] = lineup_df["team_name"].str.lower()
    lineup_df["player_name_lower"] = lineup_df["player_name"].str.lower()
    
    # Merge
    merged = events_df.merge(
        lineup_df[["team_name_lower", "player_name_lower", "player_id", "position"]],
        left_on=["team_lower", "player_lower"],
        right_on=["team_name_lower", "player_name_lower"],
        how="left"
    )
    
    # Keep only the columns we want (drop the temporary lowercase columns)
    events_df["player_id"] = merged["player_id"]
    events_df["position"] = merged["position"]
    events_df = events_df.drop(columns=["team_lower", "player_lower"])
    
    return events_df

In [36]:
# Load all data
combined = load_match_data("3788741")
events_df = build_events_dataframe(combined)
lineup_lookup = load_lineup_dataframe("3788741")
events_df = attach_actor_metadata(events_df, lineup_lookup)
# lineup_lookup['Turkey']['Burak Yılmaz']
# {'id': 11086, 'position': 'Center Forward'}
events_df

Unnamed: 0,event_id,match_id,team,player,type,minute,second,location,freeze_frame,visible_area,end_location,player_id,position
0,59645e89-9c7f-46f2-b662-332c5b8bba12,,Turkey,,Starting XI,0,0,,[],[],,,
1,f063a4b6-f5ad-4576-b31d-a8dbe50f693e,,Italy,,Starting XI,0,0,,[],[],,,
2,4d9693c5-29bb-4e73-8913-7ee7a4e465fc,,Turkey,,Half Start,0,0,,[],[],,,
3,5c53ec59-b0d0-4515-8fee-d973e0037b56,,Italy,,Half Start,0,0,,[],[],,,
4,5c888f58-fe77-459b-ab3b-a2fa5fb8ab16,,Turkey,Burak Yılmaz,Pass,0,0,"[60.0, 40.0]","[{'teammate': False, 'actor': False, 'keeper':...","[0.0, 27.7649873094419, 19.2332231959735, 17.3...","[32.5, 28.8]",11086.0,Center Forward
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3798,87f19c86-543d-4a7a-93bb-b576fe9be63d,,Turkey,Mehmet Zeki Çelik,Duel,92,59,"[10.7, 60.5]","[{'teammate': True, 'actor': False, 'keeper': ...","[0.0, 80.0, 0.0, 33.0363392144773, 12.95951717...",,10349.0,Right Back
3799,9a782a08-27d0-4d77-908f-fdfde7e5f3a4,,Italy,Andrea Belotti,Pass,93,2,"[109.9, 10.1]","[{'teammate': False, 'actor': False, 'keeper':...","[104.1532652775, 67.3224942956356, 84.40446024...","[99.1, 12.7]",7471.0,Center Forward
3800,feb59f80-cefc-4dea-85ea-dfb9e4efb799,,Italy,Bryan Cristante,Ball Receipt*,93,3,"[97.9, 12.7]","[{'teammate': False, 'actor': False, 'keeper':...","[104.046891906434, 63.4526447290838, 84.382165...",,6993.0,Left Center Midfield
3801,18b56d9c-fb81-4a5f-84f0-7169ee8aead9,,Turkey,,Half End,93,3,,[],[],,,


In [37]:
def load_full_match(match_id, base_dir="../data/open-data/data"):
    """
    Load all data for a match into DataFrames:
    - events_df: one row per event with player_id and position
    - freeze_frames_df: one row per player in freeze-frames
    - lineup_df: DataFrame with player lineup info
    
    Args:
        match_id: Match ID (string or int)
        base_dir: Base directory for StatsBomb data
    
    Returns:
        Tuple of (events_df, freeze_frames_df, lineup_df)
    """
    # Step 1: Load events + freeze frames
    combined = load_match_data(match_id, base_dir=base_dir)
    
    # Step 2: Build events DataFrame
    events_df = build_events_dataframe(combined)
    
    # Step 3: Build freeze frames DataFrame
    freeze_frames_df = build_freeze_frames_dataframe(events_df)
    
    # Step 4: Load lineup data (using DataFrame version)
    lineup_df = load_lineup_dataframe(match_id, base_dir=base_dir)
    
    # Step 5: Attach player metadata to events
    events_df = attach_actor_metadata(events_df, lineup_df)
    
    return events_df, freeze_frames_df, lineup_df

In [None]:
# Load full match
events_df, freeze_frames_df, lineup_lookup = load_full_match("3788741")

print(f"Events: {events_df.shape}")
print(f"Freeze frames: {freeze_frames_df.shape}")
print(f"Teams in lineup: {list(lineup_lookup.keys())}")

print(f"\nEvent types: {events_df['type'].value_counts().head()}")

Events: (3803, 13)
Freeze frames: (45737, 12)
Teams in lineup: ['team_name', 'player_name', 'player_id', 'position']

Event types: type
Pass             1059
Ball Receipt*    1021
Carry             862
Pressure          382
Ball Recovery      92
Name: count, dtype: int64


## Multi-match/team collection functions

In [41]:
def load_competitions(base_dir="../data/open-data/data"):
    """
    Load all available competitions from competitions.json.
    
    Args:
        base_dir: Base directory for StatsBomb data
    
    Returns:
        List of competition dictionaries
    """
    base_path = Path(base_dir)
    competitions_file = base_path / "competitions.json"
        
    with open(competitions_file, "r", encoding="utf-8") as f:
        competitions = json.load(f)
    
    return competitions



def find_competitions(competition_name=None, country_name=None, base_dir="../data/open-data/data"):
    """
    Find competitions matching given criteria.
    
    Args:
        competition_name: Competition name (partial match, case-insensitive)
        country_name: Country name (partial match, case-insensitive)
        base_dir: Base directory for StatsBomb data
    
    Returns:
        List of matching competition dictionaries
    """
    competitions = load_competitions(base_dir)
    
    if competition_name is None and country_name is None:
        return competitions
    
    matches = []
    for comp in competitions:
        match = True
        
        if competition_name:
            comp_name = comp.get("competition_name", "")
            if competition_name.lower() not in comp_name.lower():
                match = False
        
        if country_name:
            country = comp.get("country_name", "")
            if country_name.lower() not in country.lower():
                match = False
        
        if match:
            matches.append(comp)
    
    return matches


def find_seasons_for_competition(competition_id, base_dir="../data/open-data/data"):
    """
    Find all seasons available for a given competition.
    
    Args:
        competition_id: Competition ID
        base_dir: Base directory for StatsBomb data
    
    Returns:
        List of season dictionaries with competition_id, season_id, season_name, etc.
    """
    competitions = load_competitions(base_dir)
    
    seasons = [
        {
            "competition_id": comp["competition_id"],
            "season_id": comp["season_id"],
            "season_name": comp.get("season_name", ""),
            "competition_name": comp.get("competition_name", ""),
            "country_name": comp.get("country_name", ""),
            "match_available_360": comp.get("match_available_360"),
        }
        for comp in competitions
        if comp.get("competition_id") == competition_id
    ]
    
    return seasons

In [42]:
all_comps = load_competitions()
print(f"Total competitions/seasons: {len(all_comps)}")

# Find specific competition
bundesliga = find_competitions(competition_name="Bundesliga")
print(f"Found {len(bundesliga)} Bundesliga seasons")

# Find by country
german_comps = find_competitions(country_name="Germany")
print(f"Found {len(german_comps)} German competitions")

# Find all seasons for a competition
bundesliga_seasons = find_seasons_for_competition(competition_id=9)
print(f"Bundesliga has {len(bundesliga_seasons)} seasons available")
for season in bundesliga_seasons:
    print(f"  - {season['season_name']} (ID: {season['season_id']})")

Total competitions/seasons: 75
Found 2 Bundesliga seasons
Found 2 German competitions
Bundesliga has 2 seasons available
  - 2023/2024 (ID: 281)
  - 2015/2016 (ID: 27)


In [43]:
def load_team_matches(team_name, competition_id=None, season_id=None, 
                     base_dir="../data/open-data/data", team_name_flexible=True):
    """
    Load all matches for a specific team from a competition and season.
    
    Args:
        team_name: Team name
        competition_id: Competition ID (optional)
        season_id: Season ID (optional)
        base_dir: Base directory for StatsBomb data
        team_name_flexible: If True, do case-insensitive partial matching for team name
    
    Returns:
        List of match dictionaries with match_id, match_date, teams, scores, etc.
    """
    base_path = Path(base_dir)
    
    # If specific competition and season provided, load directly
    if competition_id is not None and season_id is not None:
        matches_file = base_path / "matches" / str(competition_id) / f"{season_id}.json"
        
        with open(matches_file, "r", encoding="utf-8") as f:
            matches = json.load(f)
    else:
        # Search across competitions/seasons
        competitions = load_competitions(base_dir)
        
        if competition_id is not None:
            competitions = [c for c in competitions if c.get("competition_id") == competition_id]
        if season_id is not None:
            competitions = [c for c in competitions if c.get("season_id") == season_id]
        
        matches = []
        for comp in competitions:
            comp_id = comp.get("competition_id")
            seas_id = comp.get("season_id")
            
            matches_file = base_path / "matches" / str(comp_id) / f"{seas_id}.json"
            
            if not matches_file.exists():
                alt_paths = [
                    Path("data/open-data/data/matches") / str(comp_id) / f"{seas_id}.json",
                    Path("../data/open-data/data/matches") / str(comp_id) / f"{seas_id}.json",
                ]
                for alt_path in alt_paths:
                    if alt_path.exists():
                        matches_file = alt_path
                        break
                else:
                    continue  # Skip this season if file not found
            
            try:
                with open(matches_file, "r", encoding="utf-8") as f:
                    season_matches = json.load(f)
                    matches.extend(season_matches)
            except Exception as e:
                print(f"Warning: Could not load matches from {matches_file}: {e}")
                continue
    
    # Filter for team matches
    team_matches = []
    for match in matches:
        home_team = match.get("home_team", {}).get("home_team_name", "")
        away_team = match.get("away_team", {}).get("away_team_name", "")
        
        # Match team name (always case-insensitive)
        match_home = team_name.lower() in home_team.lower()
        match_away = team_name.lower() in away_team.lower()
        
        if match_home or match_away:
            team_matches.append({
                "match_id": match["match_id"],
                "match_date": match.get("match_date", ""),
                "home_team": home_team,
                "away_team": away_team,
                "home_score": match.get("home_score", 0),
                "away_score": match.get("away_score", 0),
                "is_home": match_home,
                "competition_id": match.get("competition", {}).get("competition_id"),
                "season_id": match.get("season", {}).get("season_id"),
                "competition_name": match.get("competition", {}).get("competition_name", ""),
                "season_name": match.get("season", {}).get("season_name", ""),
            })
    
    return team_matches


def find_matches_with_360(matches, base_dir="../data/open-data/data"):
    """
    Find which matches have 360 freeze-frame data available.
    
    Args:
        matches: List of match dictionaries (from load_team_matches)
        base_dir: Base directory for StatsBomb data
    
    Returns:
        List of match IDs (integers) with 360 data
    """
    base_path = Path(base_dir)
    
    # Try different path formats
    possible_paths = [
        base_path / "three-sixty",
        Path("data/open-data/data/three-sixty"),
        Path("../data/open-data/data/three-sixty"),
    ]
    
    three_sixty_dir = None
    for path in possible_paths:
        if path.exists():
            three_sixty_dir = path
            break
    
    if three_sixty_dir is None:
        print(f"Warning: three-sixty directory not found. Tried: {possible_paths}")
        return []
    
    matches_with_360 = []
    for match in matches:
        match_id = match["match_id"]
        three_sixty_file = three_sixty_dir / f"{match_id}.json"
        
        if three_sixty_file.exists():
            matches_with_360.append(match_id)
    
    return matches_with_360

In [44]:
# Load matches for a team
leverkusen_matches = load_team_matches(
    team_name="Bayer Leverkusen",
    competition_id=9,  # Bundesliga
    season_id=281     # 2023/24
)
print(f"Found {len(leverkusen_matches)} Leverkusen matches")

# Or search across all competitions/seasons
leverkusen_all = load_team_matches(team_name="Leverkusen")
print(f"Found {len(leverkusen_all)} Leverkusen matches across all competitions")

# Find which matches have 360 data
matches_with_360 = find_matches_with_360(leverkusen_matches)
print(f"Matches with 360 data: {len(matches_with_360)} out of {len(leverkusen_matches)}")

# View sample matches
for match in leverkusen_matches[:3]:
    print(f"  {match['match_date']}: {match['home_team']} vs {match['away_team']} "
          f"({match['home_score']}-{match['away_score']})")

Found 34 Leverkusen matches
Found 68 Leverkusen matches across all competitions
Matches with 360 data: 34 out of 34
  2024-04-14: Bayer Leverkusen vs Werder Bremen (5-0)
  2024-04-06: Union Berlin vs Bayer Leverkusen (0-1)
  2024-05-05: Eintracht Frankfurt vs Bayer Leverkusen (1-5)


In [45]:
def load_all_team_data(team_name, matches, matches_with_360, base_dir="../data/open-data/data",
                       max_matches=None, team_name_flexible=True):
    """
    Load all team matches with 360 data and combine into unified DataFrames.
    
    Args:
        team_name: Team name (used for filtering events)
        matches: List of match dictionaries (from load_team_matches)
        matches_with_360: List of match IDs with 360 data (from find_matches_with_360)
        base_dir: Base directory for StatsBomb data
        max_matches: Maximum number of matches to load (None = all)
        team_name_flexible: If True, do case-insensitive partial matching for team name
    
    Returns:
        Tuple of:
        - combined_events_df: DataFrame with all team events
        - combined_freeze_frames_df: DataFrame with expanded freeze-frame data
        - player_stats: Dictionary of player statistics
    """
    from collections import defaultdict
    
    all_events = []
    all_freeze_frames = []
    player_stats = defaultdict(lambda: {"events": 0, "passes": 0, "matches": set()})
    
    matches_to_load = matches[:max_matches] if max_matches else matches
    
    # Determine correct base_dir
    test_match_id = matches_with_360[0] if matches_with_360 else None
    if test_match_id:
        test_paths = [
            Path(base_dir) / "events" / f"{test_match_id}.json",
            Path("data/open-data/data/events") / f"{test_match_id}.json",
        ]
        for test_path in test_paths:
            if test_path.exists():
                base_dir = str(test_path.parent.parent)
                break
    
    for match in matches_to_load:
        match_id = match["match_id"]
        match_id_str = str(match_id)
        
        # Check if match has 360 data
        if match_id not in matches_with_360:
            continue
        
        try:
            # Load full match data
            events_df, freeze_frames_df, lineup_df = load_full_match(match_id_str, base_dir=base_dir)
            
            # Filter for team events (always case-insensitive)
            team_events = events_df[
                events_df["team"].str.contains(team_name, case=False, na=False)
            ].copy()
            
            if len(team_events) > 0:
                # Filter freeze frames for team events
                team_event_ids = set(team_events["event_id"])
                team_freeze_frames = freeze_frames_df[
                    freeze_frames_df["event_id"].isin(team_event_ids)
                ].copy()
                
                all_events.append(team_events)
                all_freeze_frames.append(team_freeze_frames)
                
                # Track player stats
                for _, row in team_events.iterrows():
                    player = row.get("player")
                    if player:
                        player_stats[player]["events"] += 1
                        player_stats[player]["matches"].add(match_id_str)
                        if row["type"] == "Pass":
                            player_stats[player]["passes"] += 1
        
        except Exception as e:
            continue
    
    # Combine all events
    if all_events:
        combined_events_df = pd.concat(all_events, ignore_index=True)
        combined_freeze_frames_df = pd.concat(all_freeze_frames, ignore_index=True)
    else:
        # Create empty DataFrames with expected columns
        combined_events_df = pd.DataFrame(columns=[
            "event_id", "match_id", "team", "player", "type", 
            "minute", "second", "location", "freeze_frame", 
            "visible_area", "end_location", "player_id", "position"
        ])
        combined_freeze_frames_df = pd.DataFrame(columns=[
            "event_id", "team", "player_name", "event_type",
            "minute", "second", "event_location", "end_location",
            "ff_location", "teammate", "actor", "keeper"
        ])
    
    # Convert sets to counts for player_stats
    player_stats_dict = {
        player: {
            "events": stats["events"],
            "passes": stats["passes"],
            "matches": len(stats["matches"])
        }
        for player, stats in player_stats.items()
    }
    
    return combined_events_df, combined_freeze_frames_df, player_stats_dict

In [50]:
# Step 1: Find Leverkusen matches
leverkusen_matches = load_team_matches(
    team_name="Bayer Leverkusen",
    competition_id=9, # Bundesliga
    season_id=281 # 2023/24
)
print(f"Found {len(leverkusen_matches)} Leverkusen matches")

# Step 2: Find matches with 360 data
matches_with_360 = find_matches_with_360(leverkusen_matches)
print(f"Matches with 360 data: {len(matches_with_360)} out of {len(leverkusen_matches)}")

# Step 3: Load all team data (start with 3 matches for testing)
events_df, freeze_frames_df, player_stats = load_all_team_data(
    team_name="Bayer Leverkusen",
    matches=leverkusen_matches,
    matches_with_360=matches_with_360,
    max_matches=34  # Test with 3 matches first
)

# Step 4: Check results
print(f"\n Loaded {len(events_df):,} events")
print(f" Loaded {len(freeze_frames_df):,} freeze-frame rows")
print(f" Unique players: {len(player_stats)}")
print(f" Unique matches: {events_df['match_id'].nunique()}")

# Step 5: View event types
print(f"\nEvent types:")
print(events_df['type'].value_counts().head(10))

# Step 6: View top players
print(f"\nTop 5 players by events:")
top_players = sorted(player_stats.items(), key=lambda x: x[1]['events'], reverse=True)[:5]
for player, stats in top_players:
    print(f"  {player}: {stats['events']} events ({stats['passes']} passes) in {stats['matches']} matches")

# Step 7: Sample data inspection
print(f"\nSample events:")
print(events_df[['match_id', 'type', 'player', 'minute', 'location']].head())

print(f"\nSample freeze frames:")
print(freeze_frames_df[['event_id', 'event_type', 'player_name', 'actor', 'teammate']].head())

Found 34 Leverkusen matches
Matches with 360 data: 34 out of 34

 Loaded 81,440 events
 Loaded 1,181,537 freeze-frame rows
 Unique players: 24
 Unique matches: 0

Event types:
type
Pass             24244
Ball Receipt*    23739
Carry            20593
Pressure          4786
Ball Recovery     1751
Duel               864
Shot               623
Block              616
Dribble            580
Clearance          552
Name: count, dtype: int64

Top 5 players by events:
  Granit Xhaka: 10127 events (3299 passes) in 33 matches
  Florian Wirtz: 7234 events (1841 passes) in 32 matches
  Alejandro Grimaldo García: 6651 events (2159 passes) in 33 matches
  Exequiel Alejandro Palacios: 6514 events (2022 passes) in 24 matches
  Jonathan Tah: 6469 events (2167 passes) in 31 matches

Sample events:
  match_id         type                 player  minute      location
0     None  Starting XI                   None       0          None
1     None   Half Start                   None       0          None
2   

## Utility functions

In [51]:
def get_data_summary(combined_events, combined_freeze_frames, player_stats, team_name="Team"):
    """
    Generate summary statistics for the loaded data.
    
    Args:
        combined_events: Combined events DataFrame
        combined_freeze_frames: Combined freeze frames DataFrame
        player_stats: Player statistics dictionary
        team_name: Team name for summary
    
    Returns:
        Dictionary with summary statistics
    """
    summary = {
        "team_name": team_name,
        "total_events": len(combined_events),
        "total_freeze_frames": len(combined_freeze_frames),
        "unique_matches": combined_events["match_id"].nunique() if len(combined_events) > 0 else 0,
        "unique_players": len(player_stats),
        "event_types": combined_events["type"].value_counts().to_dict() if len(combined_events) > 0 else {},
        "top_players": dict(sorted(
            player_stats.items(), 
            key=lambda x: x[1]["events"], 
            reverse=True
        )[:10]),
        "events_with_player_id": combined_events["player_id"].notna().sum() if "player_id" in combined_events.columns else 0,
        "events_with_location": combined_events["location"].notna().sum() if len(combined_events) > 0 else 0,
    }
    
    return summary
