# Webscrape data


In [None]:
import pandas as pd
import os
from pathlib import Path
import time
import requests
from random import uniform
from bs4 import BeautifulSoup
import random
from fake_useragent import UserAgent



### Naming convention for the tables

In [12]:
# CSV base names for the tables of a league season page, in page order.
# Full layout: 24 names (the two league tables followed by squad/opponent pairs).
titles_12 = [
    'serie_A_overall', 'serie_A_homeaway',
    'squad_standard_stats', 'squad_standard_stats_opp',
    'squad_goalkeeping', 'squad_goalkeeping_opp',
    'squad_advanced_goalkeeping', 'squad_advanced_goalkeeping_opp',
    'squad_shooting', 'squad_shooting_opp',
    'squad_passing', 'squad_passing_opp',
    'squad_pass_types', 'squad_pass_types_opp',
    'squad_goal_shot_creation', 'squad_goal_shot_creation_opp',
    'squad_defensive_actions', 'squad_defensive_actions_opp',
    'squad_possession', 'squad_possession_opp',
    'squad_playing_time', 'squad_playing_time_opp',
    'squad_miscellaneous', 'squad_miscellaneous_opp',
]

# Reduced layout (12 names): league tables, standard stats, goalkeeping,
# shooting, playing time and miscellaneous.
titles_6 = [*titles_12[:6], *titles_12[8:10], *titles_12[20:]]

# Smallest layout (10 names): the reduced layout without the two shooting tables.
titles_5 = [*titles_6[:6], *titles_6[8:]]

### Acquire the league data

In [None]:
# Make sure the output folders for the club and league CSVs exist.
for folder in ('data/clubes/', 'data/league/'):
    os.makedirs(folder, exist_ok=True)

In [None]:
# Download every Serie A season page (2024 back to 2014) and save each table as a CSV.
# The CSVs are written to data/league, so create that folder (and its parent) here.
os.makedirs('data/league', exist_ok=True)
for page in range(11):
    year = 2024 - page
    url = f"https://fbref.com/en/comps/24/{year}/{year}-Serie-A-Stats"

    # Pick the list of table names that applies to this season
    if year > 2018:
        titles = titles_12
    elif year > 2015:
        titles = titles_6
    else:
        titles = titles_5

    try:
        # Pause before each request to avoid 429 (Too Many Requests) responses
        time.sleep(uniform(3, 20))
        tables = pd.read_html(url)
    except Exception as e:
        # Best-effort, like the club scraping loop: report the season and move on
        print(f"Failed to scrape league {year}: {str(e)}")
        continue

    # More tables than names would mean the positional naming no longer lines up;
    # save nothing for that season instead of failing with an IndexError.
    if len(tables) > len(titles):
        print(f"Skipping league {year}: found {len(tables)} tables but only {len(titles)} names")
        continue

    for title, table in zip(titles, tables):
        table.to_csv(f"data/league/{title}_{year}.csv", index=False)


### Acquire player data

In [18]:
## fbref squad-page ID of each club, keyed by the club name used in the page URL
clubes = {
    nome: {'id': squad_id}
    for nome, squad_id in (
        ('Botafogo-RJ', 'd9fdd9d9'),
        ('Palmeiras', 'abdce579'),
        ('Flamengo', '639950ae'),
        ('Fortaleza', 'a9d0ab0e'),
        ('Internacional', '6f7e1f03'),
        ('São-Paulo', '5f232eb1'),
        ('Corinthians', 'bf4acd28'),
        ('Bahia', '157b7fee'),
        ('Cruzeiro', '03ff5eeb'),
        ('Vasco-da-Gama', '83f55dbe'),
        ('Vitória', '33f95fe0'),
        ('Atlético-Mineiro', '422bb734'),
        ('Fluminense', '84d9701c'),
        ('Grêmio', 'd5ae3703'),
        ('Juventude', 'd081b697'),
        ('RB-Bragantino', 'f98930d1'),
        ('Ath-Paranaense', '2091c619'),
        ('Criciúma', '3f7595bb'),
        ('Atl-Goianiense', '32d508ca'),
        ('Cuiabá', 'f0e6fb14'),
        ('Santos', '712c528f'),
        ('Goiás', '78c617cc'),
        ('Coritiba', 'd680d257'),
        ('América-MG', '1f68d780'),
        ('Ceará', '2f335e17'),
        ('Avaí', 'f205258a'),
        ('Sport-Recife', 'ece66b78'),
        ('Chapecoense', 'baa296ad'),
        ('CSA', '05aff519'),
        # NOTE(review): same id as 'Ath-Paranaense' above - looks like a copy/paste slip, confirm
        ('Paraná', '2091c619'),
        ('Ponte-Preta', 'b162ebe7'),
        ('Figueirense', '0ce4436d'),
        ('Santa-Cruz', 'ad0c1246'),
        ('Joinville', 'da0666a2'),
    )
}

In [3]:
# Start every club with an empty list of Serie A seasons.
for dados in clubes.values():
    dados['years_SA'] = []

In [None]:
# Check which clubs were in Serie A in each season and record the year under the 'years_SA' key

for i in range(11):
    year = 2024 - i
    df = pd.read_csv(f'data/league/serie_A_overall_{year}.csv')

    for squad in df['Squad'].dropna().unique():
        # Convert the table's squad name to the key format of `clubes`:
        # spaces become hyphens and parentheses are removed
        key = squad.replace(" ", "-").replace("(", "").replace(")", "")

        if key not in clubes:
            # An unmapped squad would otherwise raise KeyError and abort every remaining season
            print(f"Unknown squad {squad!r} in {year} - add it to clubes")
            continue

        years = clubes[key].setdefault('years_SA', [])
        # Re-running this cell must not record the same season twice
        if year not in years:
            years.append(year)
        

### Scraping with pd.read_html()

In [None]:
from urllib.parse import quote

# Download every club/season page with pd.read_html and save each table as
# data/clubes/<club>/<year>_<table index>.csv
for clube, dados in clubes.items():
    # Create club directory if it doesn't exist
    club_dir = Path(f'data/clubes/{clube}')
    club_dir.mkdir(parents=True, exist_ok=True)

    for year in dados['years_SA']:
        # Skip if we already have files for this year
        if any(club_dir.glob(f"{year}_*.csv")):
            print(f"Skipping {clube} {year} - already downloaded")
            continue

        # Club names such as 'São-Paulo' contain non-ASCII characters; pd.read_html
        # fetches through urllib, which cannot send them raw, so percent-encode the name.
        url = f"https://fbref.com/en/squads/{dados['id']}/{year}/{quote(clube)}-Stats"

        try:
            # Add delay to avoid 429 errors
            time.sleep(uniform(3, 20))
            tables = pd.read_html(url)

            # Save each table
            for i, table in enumerate(tables):
                filename = club_dir / f"{year}_{i}.csv"
                table.to_csv(filename, index=False)
                print(f"Saved {filename}")

        except Exception as e:
            # Best-effort: report the failure and carry on with the next season
            print(f"Failed to scrape {clube} {year}: {str(e)}")
            continue


#### Test scraping

In [None]:
# Scrape one known page (Botafogo-RJ, 2022) to check that pd.read_html works
year = 2022
url = f"https://fbref.com/en/squads/d9fdd9d9/{year}/Botafogo-RJ-Stats"

tables = pd.read_html(url)

# Save each table; the folder must be a Path so that `/` joins path segments
club_dir = Path('data/clubes/Botafogo-RJ')
club_dir.mkdir(parents=True, exist_ok=True)
for i, table in enumerate(tables):
    filename = club_dir / f"{year}_{i}.csv"
    table.to_csv(filename, index=False)

### Scraping with BeautifulSoup (generated by GPT)

In [None]:

# Configure scraping
 # Random delay between requests

def scrape_table_without_readhtml(url, delay=(3, 7), timeout=30):
    """Download a page and return its HTML tables as DataFrames, without pd.read_html().

    Args:
        url: Address of the page to download.
        delay: (min, max) in seconds; a random pause in this range is taken
            before the request is sent.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list with one DataFrame for each table that has header cells and at
        least one non-empty row. Every value is the stripped text of its cell.
        The list is empty when the request fails; the error is printed.
    """
    ua = UserAgent()
    request_headers = {'User-Agent': ua.random}

    try:
        # Make request with delay
        time.sleep(random.uniform(*delay))
        # Without a timeout a stalled connection would block the scraping loop forever
        response = requests.get(url, headers=request_headers, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error scraping {url}: {str(e)}")
        return []

    # Parse HTML with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    dataframes = []
    for index, table in enumerate(soup.find_all('table')):
        # Extract column names. When the header has several rows, only the last
        # one is used: collecting every <th> of the header would produce more
        # names than there are cells in a data row.
        column_names = []
        thead = table.find('thead')
        if thead:
            header_rows = thead.find_all('tr')
            header_source = header_rows[-1] if header_rows else thead
            column_names = [th.get_text(strip=True) for th in header_source.find_all('th')]

        # Extract rows
        rows = []
        tbody = table.find('tbody') or table
        for tr in tbody.find_all('tr'):
            cells = [cell.get_text(strip=True) for cell in tr.find_all(['td', 'th'])]
            if cells:
                rows.append(cells)

        if not (column_names and rows):
            continue

        # Create DataFrame; a table whose rows do not fit the header is reported
        # and skipped so it cannot discard the other tables of the page
        try:
            dataframes.append(pd.DataFrame(rows, columns=column_names))
        except ValueError as e:
            print(f"Error scraping {url}: table {index}: {str(e)}")

    return dataframes

# # Example usage
# url = "https://fbref.com/en/squads/18bb7c10/2022/Arsenal-Stats"
# tables = scrape_table_without_readhtml(url)

# for i, df in enumerate(tables):
#     print(f"Table {i+1}:")
#     print(df.head())
#     df.to_csv(f"table_{i+1}.csv", index=False)

#### Test beautifulsoup

In [15]:
# Try the BeautifulSoup scraper on one page: preview every table and write it to a CSV.
url = "https://fbref.com/en/squads/d9fdd9d9/2022/Botafogo-RJ-Stats"
tables = scrape_table_without_readhtml(url)

for number, df in enumerate(tables, start=1):
    print(f"Table {number}:")
    print(df.head())
    df.to_csv(f"table_{number}.csv", index=False)

Error scraping https://fbref.com/en/squads/d9fdd9d9/2022/Botafogo-RJ-Stats: 403 Client Error: Forbidden for url: https://fbref.com/en/squads/d9fdd9d9/2022/Botafogo-RJ-Stats


#### Scrape with beautifulsoup

In [None]:
# Same club/season download as the pd.read_html loop, using the BeautifulSoup scraper.
for clube, dados in clubes.items():
    # Every club gets its own folder
    club_dir = Path(f'data/clubes/{clube}')
    club_dir.mkdir(parents=True, exist_ok=True)

    for year in dados['years_SA']:
        # A season counts as downloaded once any CSV for that year exists
        already_saved = any(club_dir.glob(f"{year}_*.csv"))
        if already_saved:
            print(f"Skipping {clube} {year} - already downloaded")
            continue

        url = f"https://fbref.com/en/squads/{dados['id']}/{year}/{clube}-Stats"

        try:
            for i, table in enumerate(scrape_table_without_readhtml(url)):
                filename = club_dir / f"{year}_{i}.csv"
                table.to_csv(filename, index=False)
                print(f"Saved {filename}")
        except Exception as e:
            # Report the failure and go on with the next season
            print(f"Failed to scrape {clube} {year}: {str(e)}")