In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
from io import StringIO
import json
from IPython.display import display


import re

def extract_core_table_name(long_table_name):
    """Return the part of ``long_table_name`` that precedes its first digit.

    The prefix is cut at the first numeric character and any trailing
    underscores are removed from it. A name containing no digit is returned
    exactly as given (trailing underscores included).
    """
    first_digit = re.search(r'\d', long_table_name)

    # Guard clause: nothing to trim when the name carries no number.
    if first_digit is None:
        return long_table_name

    prefix = long_table_name[:first_digit.start()]
    return prefix.rstrip('_')


def save_to_json(df, league, season, table_name, filepath):
    """Write a table and its identifying metadata to ``filepath`` as JSON.

    The file contains a single object with the keys ``league``, ``season``,
    ``table_name`` and ``data``, where ``data`` is the frame converted with
    ``DataFrame.to_dict(orient='records')``.

    MultiIndex column headers are flattened by joining their levels with a
    space and stripping surrounding whitespace. The flattening is applied to
    a shallow copy, so the caller's frame is left untouched.

    Args:
        df: DataFrame to serialise.
        league: Value stored under the ``league`` key.
        season: Value stored under the ``season`` key.
        table_name: Value stored under the ``table_name`` key.
        filepath: Destination path. Missing parent directories are created;
            a bare file name (no directory part) is written to the current
            working directory.
    """
    if isinstance(df.columns, pd.MultiIndex):
        # Work on a shallow copy: relabelling the caller's frame in place
        # would silently destroy its MultiIndex header.
        df = df.copy(deep=False)
        # str() keeps the join from failing on non-string header levels.
        df.columns = [
            ' '.join(str(level) for level in col).strip()
            for col in df.columns.values
        ]

    data = df.to_dict(orient='records')

    content = {
        "league": league,
        "season": season,
        "table_name": table_name,
        "data": data
    }

    # os.makedirs('') raises FileNotFoundError, so only create a parent
    # directory when the path actually has one.
    parent_dir = os.path.dirname(filepath)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(filepath, 'w') as f:
        json.dump(content, f, indent=4)

def get_tables_with_names(url, timeout=30):
    """Download ``url`` and return its captioned tables with their captions.

    Each ``<table>`` is paired with the ``<caption>`` found inside that same
    table, so the two returned lists always have equal length and matching
    positions. Tables without a caption are left out, because the caption is
    the only source of a name for them.

    Args:
        url: Page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        ``(tables, table_names)``: the table elements and, at the same
        index, the stripped caption text of each.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
            Without this check an error page would be parsed like a normal
            page and yield empty lists with no indication of failure.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    tables = []
    table_names = []
    for table in soup.find_all('table'):
        # Look the caption up inside its own table; collecting tables and
        # captions independently lets one caption-less table shift every
        # later name onto the wrong table.
        caption = table.find('caption')
        if caption is None:
            continue
        tables.append(table)
        table_names.append(caption.text.strip())

    return tables, table_names

def parse_tables(url, league, season, directory):
    """Save every captioned table found at ``url`` as a JSON file in ``directory``.

    For each (table, caption) pair returned by ``get_tables_with_names``:
    the caption has spaces and slashes replaced by underscores and is
    shortened with ``extract_core_table_name``; pairs at odd positions get a
    ``_versus`` suffix; the table HTML is parsed into a DataFrame and handed
    to ``save_to_json`` under ``{league}_{season}_{table_name}.json``.
    """
    tables, table_names = get_tables_with_names(url)

    for position, (table_tag, caption) in enumerate(zip(tables, table_names)):
        # Make the caption usable inside a file name.
        normalized_caption = caption.replace(' ', '_').replace('/', '_')

        # Tables at odd positions are treated as the "versus" variant.
        suffix = "_versus" if position % 2 == 1 else ""
        table_name = extract_core_table_name(normalized_caption) + suffix

        # read_html is given a file-like object holding the table markup.
        df = pd.read_html(StringIO(str(table_tag)))[0]

        json_filepath = os.path.join(directory, f"{league}_{season}_{table_name}.json")
        save_to_json(df, league, season, table_name, json_filepath)


In [None]:
# Fetch the FBref competitions index page and collect its <table> elements.
# NOTE(review): none of `response`, `soup` or `tables` is read by a later cell
# visible in this notebook, and `url` is reassigned further down — this looks
# like a leftover exploratory request; confirm before keeping it in Run All.
url = "https://fbref.com/en/comps/"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
tables = soup.find_all('table')

Data Scraping Loop

In [None]:
def scrap_data(season_list,directory,league,num):
    """Scrape one competition for each season in ``season_list``.

    For every season the FBref stats URL is built from ``num`` (competition
    id), the season label and ``league`` (URL slug), printed, and passed to
    ``parse_tables`` together with ``league``, the season and ``directory``.
    """
    for season_label in season_list:
        season_url = f"https://fbref.com/en/comps/{num}/{season_label}/{season_label}-{league}-Stats"
        print(season_url)
        parse_tables(season_url, league, season_label, directory)

In [None]:
# Shared scrape settings read by the league loops in the cells below.
# Season labels are interpolated verbatim into the FBref URL by scrap_data.
season_list = ["2022-2023","2021-2022","2020-2021","2019-2020","2018-2019"]
# Output folder for the JSON files (rebound to another folder in a later cell).
directory = "data"
# Single-league example values; both names are overwritten by the loop
# variables of the league loops that follow.
league = 'Bundesliga'
num = '20'

# Disabled single-league run, kept for reference.
#scrap_data(season_list,directory,league,num)

# First Tier

In [None]:
# (URL slug, FBref competition id) pairs for this group of leagues.
league_num_list = [('Premier-League',9),('La-Liga',12),('Serie-A',11),('Bundesliga',20),('Ligue-1',13)]
# Scrape every season in `season_list` for each league into `directory`.
# Note: `league` and `num` are notebook globals, so the values from the last
# iteration remain bound after this cell finishes.
for league,num in league_num_list:
    print(f"scraping data for {league}")
    scrap_data(season_list,directory,league,num)

scraping data for Premier-League
https://fbref.com/en/comps/9/2022-2023/2022-2023-Premier-League-Stats
https://fbref.com/en/comps/9/2021-2022/2021-2022-Premier-League-Stats
https://fbref.com/en/comps/9/2020-2021/2020-2021-Premier-League-Stats
https://fbref.com/en/comps/9/2019-2020/2019-2020-Premier-League-Stats
https://fbref.com/en/comps/9/2018-2019/2018-2019-Premier-League-Stats
scraping data for La-Liga
https://fbref.com/en/comps/12/2022-2023/2022-2023-La-Liga-Stats
https://fbref.com/en/comps/12/2021-2022/2021-2022-La-Liga-Stats
https://fbref.com/en/comps/12/2020-2021/2020-2021-La-Liga-Stats
https://fbref.com/en/comps/12/2019-2020/2019-2020-La-Liga-Stats
https://fbref.com/en/comps/12/2018-2019/2018-2019-La-Liga-Stats
scraping data for Serie-A
https://fbref.com/en/comps/11/2022-2023/2022-2023-Serie-A-Stats
https://fbref.com/en/comps/11/2021-2022/2021-2022-Serie-A-Stats
https://fbref.com/en/comps/11/2020-2021/2020-2021-Serie-A-Stats
https://fbref.com/en/comps/11/2019-2020/2019-2020-Ser

# Second Tier

In [None]:
# (URL slug, FBref competition id) pairs for this group of leagues.
# NOTE(review): ('Serie-A',24) reuses the slug already scraped as ('Serie-A',11)
# in the first-tier cell. `directory` is still "data" here and parse_tables
# names files only by league, season and table name, so both runs produce the
# same file names — presumably the later run overwrites the earlier files;
# confirm and disambiguate the label or the output folder.
league_num_list = [('Championship',10),('Major-League-Soccer',22),('Eredivisie',23),('Primeira-Liga',32),('Liga-MX',31),
                   ('Serie-A',24),('Segunda-Division',17),('Belgian-Pro-League',37),('2-Bundesliga',33),('Ligue-2',60),
                   ('Serie-B',18),('Primera-Division',21)]
# Scrape every season in `season_list` for each league into `directory`.
for league,num in league_num_list:
    print(f"scraping data for {league}")
    scrap_data(season_list,directory,league,num)

scraping data for Championship
https://fbref.com/en/comps/10/2022-2023/2022-2023-Championship-Stats
https://fbref.com/en/comps/10/2021-2022/2021-2022-Championship-Stats
https://fbref.com/en/comps/10/2020-2021/2020-2021-Championship-Stats
https://fbref.com/en/comps/10/2019-2020/2019-2020-Championship-Stats
https://fbref.com/en/comps/10/2018-2019/2018-2019-Championship-Stats
scraping data for Major-League-Soccer
https://fbref.com/en/comps/22/2022-2023/2022-2023-Major-League-Soccer-Stats
https://fbref.com/en/comps/22/2021-2022/2021-2022-Major-League-Soccer-Stats
https://fbref.com/en/comps/22/2020-2021/2020-2021-Major-League-Soccer-Stats
https://fbref.com/en/comps/22/2019-2020/2019-2020-Major-League-Soccer-Stats
https://fbref.com/en/comps/22/2018-2019/2018-2019-Major-League-Soccer-Stats
scraping data for Eredivisie
https://fbref.com/en/comps/23/2022-2023/2022-2023-Eredivisie-Stats
https://fbref.com/en/comps/23/2021-2022/2021-2022-Eredivisie-Stats
https://fbref.com/en/comps/23/2020-2021/202

# Female Data

In [None]:
# Rebinds the global output folder; any cell run after this one that reads
# `directory` writes here instead of "data".
directory = "female_data"
# (URL slug, FBref competition id) pairs for this group of leagues.
# NOTE(review): the saved output of this cell lists Liga-F (230), which is not
# in this list — the output appears to come from an earlier version of the
# cell; re-run to refresh it or restore the missing entry.
league_num_list = [('Womens-Super-League',189),('NWSL',182),('A-League-Women',196),('Division-1-Feminine',193),
                   ('Frauen-Bundesliga',183),('Serie-A',208)]
# Scrape every season in `season_list` for each league into `directory`.
for league,num in league_num_list:
    print(f"scraping data for {league}")
    scrap_data(season_list,directory,league,num)

scraping data for Womens-Super-League
https://fbref.com/en/comps/189/2022-2023/2022-2023-Womens-Super-League-Stats
https://fbref.com/en/comps/189/2021-2022/2021-2022-Womens-Super-League-Stats
https://fbref.com/en/comps/189/2020-2021/2020-2021-Womens-Super-League-Stats
https://fbref.com/en/comps/189/2019-2020/2019-2020-Womens-Super-League-Stats
https://fbref.com/en/comps/189/2018-2019/2018-2019-Womens-Super-League-Stats
scraping data for NWSL
https://fbref.com/en/comps/182/2022-2023/2022-2023-NWSL-Stats
https://fbref.com/en/comps/182/2021-2022/2021-2022-NWSL-Stats
https://fbref.com/en/comps/182/2020-2021/2020-2021-NWSL-Stats
https://fbref.com/en/comps/182/2019-2020/2019-2020-NWSL-Stats
https://fbref.com/en/comps/182/2018-2019/2018-2019-NWSL-Stats
scraping data for Liga-F
https://fbref.com/en/comps/230/2022-2023/2022-2023-Liga-F-Stats
https://fbref.com/en/comps/230/2021-2022/2021-2022-Liga-F-Stats
https://fbref.com/en/comps/230/2020-2021/2020-2021-Liga-F-Stats
https://fbref.com/en/comps/

In [None]:
# NOTE(review): `league` is whatever the most recently executed loop left
# bound, while the competition id is fixed at 9, and the resulting `url` is
# overwritten by a later cell without being used — looks like scratch work.
url = f"https://fbref.com/en/comps/9/{league}-Stats"

In [None]:
# Re-declares `season_list` with the same five seasons as the configuration
# cell earlier in the notebook.
season_list = ["2022-2023","2021-2022","2020-2021","2019-2020","2018-2019"]

In [None]:
# Preview the per-season Premier League (competition id 9) stats URLs.
# Only prints them; nothing is downloaded here.
for season in season_list:
    url = f"https://fbref.com/en/comps/9/{season}/{season}-Premier-League-Stats"
    print(url)

https://fbref.com/en/comps/9/2022-2023/2022-2023-Premier-League-Stats
https://fbref.com/en/comps/9/2021-2022/2021-2022-Premier-League-Stats
https://fbref.com/en/comps/9/2020-2021/2020-2021-Premier-League-Stats
https://fbref.com/en/comps/9/2019-2020/2019-2020-Premier-League-Stats
https://fbref.com/en/comps/9/2018-2019/2018-2019-Premier-League-Stats


In [None]:
# NOTE(review): scratch arithmetic; the notebook does not say what 151 and 24
# stand for — label the operands or remove the cell.
151*24

3624