## 1. Install and Import Libraries

In [5]:
# Install html5lib (missing dependency)
import sys
!{sys.executable} -m pip install html5lib -q

# Try importing soccerdata (should already be installed from previous attempt)
# If lxml 5.3 causes issues at runtime, we'll see it in the next cell

In [6]:
import soccerdata as sd
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

print(f'soccerdata version: {sd.__version__}')
print('Libraries imported successfully')

soccerdata version: 1.5.1
Libraries imported successfully


## 2. Configure Data Fetching

Set up the FBref scraper for Premier League seasons 2017-18 through 2024-25.

In [7]:
# Initialize FBref scraper
print('='*80)
print('INITIALIZING FBREF DATA SCRAPER')
print('='*80)

# Seasons to fetch (2017-18 to 2024-25)
seasons = list(range(2017, 2025))  # [2017, 2018, ..., 2024]

print(f'\nTarget League: Premier League (England)')
print(f'Seasons: {seasons[0]}-{seasons[0]+1} to {seasons[-1]}-{seasons[-1]+1}')
print(f'Total seasons: {len(seasons)}')

# Create scraper instance
fbref = sd.FBref(leagues='ENG-Premier League', seasons=seasons)

print('\nFBref scraper initialized successfully')
print('Note: First fetch will be slow (downloads data), subsequent runs use cache')

INITIALIZING FBREF DATA SCRAPER

Target League: Premier League (England)
Seasons: 2017-2018 to 2024-2025
Total seasons: 8



FBref scraper initialized successfully
Note: First fetch will be slow (downloads data), subsequent runs use cache


## 3. Fetch Team Season Statistics

We'll fetch multiple stat types and merge them into a comprehensive dataset.

**Stat Types Available:**
- `standard` - Goals, shots, cards
- `shooting` - Shot quality and locations
- `passing` - Pass completion, distance, types
- `passing_types` - Crosses, corners, through balls
- `possession` - Possession %, touches, dribbles
- `defense` - Tackles, pressures, blocks
- `misc` - Fouls, offsides, aerial duels

In [8]:
print('='*80)
print('FETCHING TEAM SEASON STATISTICS')
print('='*80)

# Dictionary to store different stat types
stats_dict = {}

# List of stat types to fetch
stat_types = [
    'standard',      # Goals, shots, cards
    'shooting',      # Shot quality
    'passing',       # Passing stats
    'passing_types', # Pass types
    'possession',    # Possession and touches
    'defense',       # Defensive actions
    'misc'           # Fouls, aerial duels
]

print(f'\nFetching {len(stat_types)} stat categories...\n')

for stat_type in stat_types:
    try:
        print(f'Fetching {stat_type}...', end=' ')
        df = fbref.read_team_season_stats(stat_type=stat_type)
        stats_dict[stat_type] = df
        print(f'✓ ({df.shape[0]} rows, {df.shape[1]} columns)')
    except Exception as e:
        print(f'✗ Error: {str(e)}')
        stats_dict[stat_type] = None

print(f'\nData fetching complete!')
print(f'Successfully fetched: {sum(1 for v in stats_dict.values() if v is not None)}/{len(stat_types)} stat types')

FETCHING TEAM SEASON STATISTICS

Fetching 7 stat categories...

Fetching standard... 

✗ Error: Could not download https://fbref.com/en/comps/.
Fetching shooting... 

✗ Error: Could not download https://fbref.com/en/comps/.
Fetching passing... 

✗ Error: Could not download https://fbref.com/en/comps/.
Fetching passing_types... 

✗ Error: Could not download https://fbref.com/en/comps/.
Fetching possession... 

KeyboardInterrupt: 

## 4. Explore Fetched Data

Let's examine the structure and available columns in each stat type.

In [None]:
# Display sample data and column info for each stat type
for stat_type, df in stats_dict.items():
    if df is not None:
        print('='*80)
        print(f'{stat_type.upper()} STATS')
        print('='*80)
        print(f'\nShape: {df.shape}')
        print(f'\nColumns ({len(df.columns)}):') 
        print(df.columns.tolist())
        print(f'\nSample data:')
        print(df.head(3))
        print('\n')

## 5. Merge and Engineer Playstyle Features

Combine all stat types and create features specifically for tactical playstyle analysis.

In [None]:
print('='*80)
print('MERGING DATASETS AND ENGINEERING PLAYSTYLE FEATURES')
print('='*80)

# Start with standard stats as base
base_df = stats_dict['standard'].copy() if stats_dict['standard'] is not None else None

if base_df is None:
    raise ValueError('Standard stats not available - cannot proceed')

print(f'\nBase dataframe: {base_df.shape}')
print(f'Index levels: {base_df.index.names}')

# Merge other stat types
for stat_type, df in stats_dict.items():
    if stat_type != 'standard' and df is not None:
        try:
            # Add suffix to avoid column name conflicts
            df_suffixed = df.add_suffix(f'_{stat_type}')
            base_df = base_df.join(df_suffixed, how='left')
            print(f'Merged {stat_type}: {base_df.shape}')
        except Exception as e:
            print(f'Error merging {stat_type}: {str(e)}')

print(f'\nFinal merged dataframe: {base_df.shape}')
print(f'Total columns: {len(base_df.columns)}')

## 6. Reset Index and Clean Team Names

Prepare the dataframe for integration with existing data.

In [None]:
# Reset index to get season and team as columns
fbref_data = base_df.reset_index()

print('='*80)
print('DATA STRUCTURE')
print('='*80)
print(f'\nShape: {fbref_data.shape}')
print(f'\nColumns: {fbref_data.columns.tolist()[:10]}...')  # Show first 10
print(f'\nFirst few rows:')
print(fbref_data.head())

# Check team names
if 'team' in fbref_data.columns:
    print(f'\nUnique teams: {fbref_data["team"].nunique()}')
    print(f'\nTeam names:')
    print(sorted(fbref_data['team'].unique()))

## 7. Select and Engineer Playstyle Features

Extract the most relevant columns for tactical analysis.

In [None]:
print('='*80)
print('SELECTING PLAYSTYLE FEATURES')
print('='*80)

# List available columns to help identify correct names
print(f'\nAvailable columns ({len(fbref_data.columns)}):') 
for i, col in enumerate(fbref_data.columns, 1):
    print(f'{i:3d}. {col}')

print('\nNOTE: Review columns above and identify the ones needed for playstyle features')
print('Common patterns to look for:')
print('  - Possession: "Poss", "possession"')
print('  - Passing: "Cmp%", "pass", "progressive"')
print('  - Pressing: "Press", "tackles"')
print('  - Shots: "Sh", "SoT", "goals"')
print('  - Discipline: "Fls", "CrdY", "CrdR"')

In [None]:
# This cell will be updated after we see the actual column names
# For now, create a placeholder structure

print('='*80)
print('CREATING PLAYSTYLE FEATURE SET')
print('='*80)

# TODO: Update this mapping based on actual column names from above
# Example structure (will be customized based on actual data):
playstyle_features = {
    # Format: 'new_feature_name': 'actual_column_name_from_fbref'
    
    # Core identity columns
    'Season': 'season' if 'season' in fbref_data.columns else fbref_data.index.names[0],
    'Team': 'team' if 'team' in fbref_data.columns else fbref_data.index.names[1],
    
    # Features to be mapped after reviewing columns
}

print('\nFeature mapping will be completed after reviewing column names above')
print('Re-run this cell after identifying the correct column names')

## 8. Save Processed Data

Export the cleaned FBref data for use in BO3.

In [None]:
# Save raw merged data first
output_dir = Path('../data/processed')
output_dir.mkdir(parents=True, exist_ok=True)

raw_output_path = output_dir / 'fbref_team_stats_raw.csv'
fbref_data.to_csv(raw_output_path, index=False)

print('='*80)
print('DATA EXPORT')
print('='*80)
print(f'\nRaw FBref data saved to:')
print(f'  {raw_output_path}')
print(f'\nShape: {fbref_data.shape}')
print(f'Seasons: {fbref_data["season"].min() if "season" in fbref_data.columns else "N/A"} to {fbref_data["season"].max() if "season" in fbref_data.columns else "N/A"}')
print(f'Teams: {fbref_data["team"].nunique() if "team" in fbref_data.columns else "N/A"}')

print('\nNext steps:')
print('1. Review column names in cell above')
print('2. Map relevant columns to playstyle features')
print('3. Create cleaned feature set for BO3')
print('4. Update BO3 notebook to use FBref data')

## Summary

This notebook fetches advanced team statistics from FBref for Premier League seasons 2017-18 through 2024-25.

**Data includes:**
- Standard stats (goals, shots, cards)
- Shooting quality metrics
- Passing and build-up play
- Possession statistics
- Defensive actions
- Miscellaneous tactical indicators

**Next Actions:**
1. Review fetched columns and identify playstyle-relevant features
2. Create cleaned feature set focused on tactics (not performance)
3. Integrate with BO3 for pure playstyle clustering