In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import re
import requests
from bs4 import BeautifulSoup
import os
import time

In [2]:
# Root of the FBref site; relative match links are appended to this prefix.
main_url = "https://fbref.com"

# Seasons to scrape, newest first. For split-year competitions the value is the
# year the season ends in (the schedule URL is built as "{season - 1}-{season}").
temporadas = list(range(2024, 2018, -1))

# League code -> FBref competition id used when building the schedule URL.
# Only the leagues listed in the dict are scraped. Ids of the leagues that are
# currently switched off, kept here so they can be re-enabled quickly:
#   MX1=31, BR1=24, ARG1=21, MLS=22, ENG1=9, ESP1=12,
#   ITA1=11, GER1=20, FRA1=13, HOL1=23
leagues_dictionary = dict(
    POR1=32,
    ENG2=10,
)

In [3]:
# Function that gets all the URLs of the matches
def get_urls(url, timeout=30):
    """Collect the match-report links found on a schedule page.

    Parameters
    ----------
    url : str
        Absolute URL of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests`` can block forever on a stalled connection.

    Returns
    -------
    list of str
        Every distinct ``href`` on the page that contains the word
        ``"matches"`` and is longer than 30 characters, in the order the
        links first appear on the page.

    Raises
    ------
    requests.RequestException
        If the download fails, times out, or the server answers with an HTTP
        error status. Raising here prevents an error page from being silently
        interpreted as "this season has no matches".
    """
    # Download the web page
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all the links that contain the word "matches" in the href attribute
    enlaces_matches = (a['href'] for a in soup.find_all('a', href=True) if 'matches' in a['href'])

    # Keep only links longer than 30 characters. dict.fromkeys removes
    # duplicates while preserving page order, so repeated runs return the
    # links in the same sequence (set() ordering changes between runs).
    enlaces_filtrados = list(dict.fromkeys(enlace for enlace in enlaces_matches if len(enlace) > 30))

    return enlaces_filtrados


# Helpers and entry point for scraping one match report
# (the match URLs come from the get_urls function).

# Date slug expected inside a match URL, e.g. "August-12-2023". Spelling the
# month names out avoids matching other "word-number-number" parts of the slug.
_MATCH_DATE_PATTERN = re.compile(
    r'(?:January|February|March|April|May|June|July|August|September|October|November|December)'
    r'-\d{1,2}-\d{4}'
)


def _split_team_header(header):
    """Split a lineup-table header into ``(team_name, formation)``.

    The header is assumed to look like ``"Team Name (4-3-3)"`` (possibly with
    a suffix after the closing parenthesis) -- TODO confirm against the page.
    When no parentheses are present the whole header is returned as the team
    name and the formation is ``'N/A'``.
    """
    opening = header.find("(")
    closing = header.find(")", opening + 1)
    if opening == -1 or closing == -1:
        return header.strip(), 'N/A'
    return header[:opening].strip(), header[opening + 1:closing]


def _scorebox_value(meta, label):
    """Return the text of the <small> tag that follows ``<strong>label</strong>``, or None."""
    strong_tag = meta.find('strong', string=label)
    if not strong_tag:
        return None
    small_tag = strong_tag.find_next_sibling('small')
    return small_tag.text if small_tag is not None else None


def _flatten_stats_table(table):
    """Return a copy of a stats table with the top level of its column header removed."""
    flat = table.copy()
    flat.columns = flat.columns.droplevel(0)
    return flat


def _label_side(table, team, rival, formation, rival_formation,
                goals_for, goals_against, goals_first):
    """Return a copy of one team's player table with the per-team match columns appended.

    ``goals_first`` only controls the order in which the column groups are
    appended, so each output table keeps its established column layout.
    """
    if goals_for > goals_against:
        result = 'W'
    elif goals_for < goals_against:
        result = 'L'
    else:
        result = 'T'

    identity = {"team": team, "rival": rival}
    score = {'t_gls': goals_for, 'tc_gls': goals_against, 'result': result}
    formations = {"formacion": formation, "formacion_rival": rival_formation}
    groups = (identity, score, formations) if goals_first else (identity, formations, score)

    # Work on a copy so the assignments never hit a view of another frame.
    labelled = table.copy()
    for group in groups:
        for column, value in group.items():
            labelled[column] = value
    return labelled


def _combine_sides(home, away, info):
    """Stack the home and away tables and append the match-level columns."""
    combined = pd.concat([home, away], ignore_index=True)
    combined['date'] = info['date']
    combined['match'] = info['match']
    # Rows are stacked home first, so the location follows from the row
    # position; comparing team-name prefixes mislabels rows when one name is
    # a prefix of the other.
    combined['location'] = ['home'] * len(home) + ['away'] * len(away)
    for column in ('winner', 'season', 'attendance', 'time', 'stadium'):
        combined[column] = info[column]
    return combined


def match_scrap(url, season=None, timeout=30):
    """Scrape one match report and return its player tables.

    Parameters
    ----------
    url : str
        Match link relative to ``main_url`` (as returned by ``get_urls``).
    season : optional
        Value written to the ``season`` column. When omitted, the
        module-level variable ``season`` is used, which is how the scraping
        loop has always passed it.
    timeout : float, optional
        Seconds to wait for the server before giving up.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(summary, defense, possession)``: home rows followed by away rows,
        each with the match-level columns appended and rows containing
        missing values dropped.

    Raises
    ------
    NameError
        If ``season`` is neither passed nor defined at module level.
    ValueError
        If the URL contains no match date or the page has no scorebox
        metadata.
    requests.RequestException
        If the download fails, times out, or returns an HTTP error status.
    IndexError
        If the page has fewer tables than the hard-coded positions below.

    Notes
    -----
    * Team names are the full text before the formation (e.g. "Sheffield
      Utd"), not just the first word; taking only the first word made
      different clubs that share a first word indistinguishable.
    * A missing kick-off time or a missing manager yields ``'N/A'`` instead
      of aborting the whole match.
    """
    # Local import: only this function needs it.
    from io import StringIO

    if season is None:
        try:
            season = globals()["season"]
        except KeyError:
            raise NameError(
                "name 'season' is not defined; pass season=... to match_scrap"
            ) from None

    url = main_url + url

    # The match date is encoded in the URL; validate it before downloading.
    date_match = _MATCH_DATE_PATTERN.search(url)
    if date_match is None:
        raise ValueError(f"no match date found in URL: {url}")
    formatted_date = datetime.strptime(date_match.group(0), '%B-%d-%Y').strftime('%Y-%m-%d')

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Attendance, stadium and local kick-off time live in the scorebox metadata.
    meta = soup.find('div', class_='scorebox_meta')
    if meta is None:
        raise ValueError(f"no scorebox metadata found at {url}")

    attendance = _scorebox_value(meta, 'Attendance')
    attendance = 'N/A' if attendance is None else attendance.replace(',', '')

    stadium = _scorebox_value(meta, 'Venue')
    if stadium is None:
        stadium = 'N/A'

    venue_time = meta.find('span', class_='venuetime')
    local_time = 'N/A' if venue_time is None else venue_time.get('data-venue-time', 'N/A')

    # Parse every table on the page. StringIO is used because passing literal
    # HTML text to read_html is deprecated in recent pandas releases.
    pagina_entera = pd.read_html(StringIO(response.text))

    # Tables 0 and 1 are read as the home / away lineups; their second column
    # header carries the team name and formation.
    local_name, local_formacion = _split_team_header(pagina_entera[0].columns.values[1])
    away_name, away_formacion = _split_team_header(pagina_entera[1].columns.values[1])

    # Managers' names, in page order.
    managers = []
    for datapoint in soup.find_all('div', class_='datapoint'):
        strong_tag = datapoint.find('strong')
        if strong_tag and strong_tag.text.strip() == 'Manager':
            managers.append(datapoint.text.replace('Manager:', '').strip())
    if len(managers) >= 2:
        home_manager, away_manager = managers[0], managers[1]
    else:
        # With fewer than two names we cannot tell which team a name belongs to.
        home_manager = away_manager = 'N/A'

    # Hard-coded table positions: (home, away) for each stats category.
    # NOTE(review): these assume the page layout with the full set of stats
    # tables; pages with fewer tables raise IndexError -- confirm per league.
    summary_idx, defense_idx, possession_idx = (3, 10), (6, 13), (7, 14)

    summary_home = _flatten_stats_table(pagina_entera[summary_idx[0]]).dropna()
    summary_away = _flatten_stats_table(pagina_entera[summary_idx[1]]).dropna()

    # NOTE(review): goals are summed over the player rows that survive
    # dropna(); own goals credited to the opponent are presumably not part of
    # any player's 'Gls', so the score/winner may be off in such matches --
    # verify against the scoreline shown on the page.
    local_goals = np.sum(summary_home['Gls'])
    away_goals = np.sum(summary_away['Gls'])

    if local_goals > away_goals:
        winner = "home"
    elif local_goals < away_goals:
        winner = "away"
    else:
        winner = "tie"

    info = {
        'date': formatted_date,
        'match': local_name + "-" + away_name + "-" + formatted_date,
        'winner': winner,
        'season': season,
        'attendance': attendance,
        'time': local_time,
        'stadium': stadium,
    }

    def build(home_table, away_table, goals_first):
        # Label each side from its own point of view, then stack them.
        home = _label_side(home_table, local_name, away_name, local_formacion,
                           away_formacion, local_goals, away_goals, goals_first)
        away = _label_side(away_table, away_name, local_name, away_formacion,
                           local_formacion, away_goals, local_goals, goals_first)
        return _combine_sides(home, away, info)

    # Summary table (the only one that also carries the managers).
    df_final = build(summary_home, summary_away, goals_first=True)
    df_final['home_manager'] = home_manager
    df_final['away_manager'] = away_manager

    # Defensive actions table.
    df_def_final = build(
        _flatten_stats_table(pagina_entera[defense_idx[0]]),
        _flatten_stats_table(pagina_entera[defense_idx[1]]),
        goals_first=False,
    )

    # Possession table.
    df_pos_final = build(
        _flatten_stats_table(pagina_entera[possession_idx[0]]),
        _flatten_stats_table(pagina_entera[possession_idx[1]]),
        goals_first=False,
    )

    return df_final.dropna(), df_def_final.dropna(), df_pos_final.dropna()

In [4]:
import os
from collections import defaultdict
from tqdm import tqdm
import time

# Pause (seconds) after every match request, successful or not.
# NOTE(review): check this against the site's current rate-limit policy; too
# short a pause may get the scraper blocked.
REQUEST_DELAY_SECONDS = 1.5


def _dedupe_columns(frame):
    """Return a copy of ``frame`` in which repeated column names get a numeric suffix.

    The first occurrence keeps its name; later ones become ``name_1``,
    ``name_2``, ... Parquet requires unique column names.
    """
    seen = defaultdict(int)
    renamed = []
    for col in frame.columns:
        renamed.append(f"{col}_{seen[col]}" if seen[col] > 0 else col)
        seen[col] += 1
    deduped = frame.copy()
    deduped.columns = renamed
    return deduped


# Iterate over leagues and seasons.
# The loop variable must stay named `season`: match_scrap reads it from the
# module namespace when it is not given explicitly.
for league_code, league_id in leagues_dictionary.items():
    for season in temporadas:
        if league_code in ("BR1", "MLS"):
            # Calendar-year competitions use a single year in the URL.
            url = f"https://fbref.com/en/comps/{league_id}/{season}/schedule/{season}-Scores-and-Fixtures"
        else:
            url = f"https://fbref.com/en/comps/{league_id}/{season - 1}-{season}/schedule/{season - 1}-{season}-Scores-and-Fixtures"

        try:
            match_list = get_urls(url)
        except requests.RequestException as exc:
            # Skip this season rather than aborting the whole run.
            tqdm.write(f"{league_code} - {season}: could not load schedule ({exc!r}); skipping")
            continue

        # Per-match frames are collected in lists and concatenated once at the
        # end; growing a DataFrame with concat inside the loop is quadratic.
        sum_parts, def_parts, pos_parts = [], [], []

        error_count = 0
        first_error = None

        # Use tqdm for the progress bar
        progress_bar = tqdm(match_list, desc=f"{league_code} - {season} | Errors: {error_count}", total=len(match_list), leave=False, mininterval=0.1)
        for match in progress_bar:
            try:
                # Column names are made unique per match so the final concat
                # aligns on unambiguous labels.
                scraped = [_dedupe_columns(frame) for frame in match_scrap(match)]
            except Exception as exc:
                # `Exception` (not a bare except) so Ctrl-C / kernel interrupt
                # still stops a long scrape.
                error_count += 1
                if first_error is None:
                    first_error = (match, exc)
                # Refresh the description so the error counter is actually shown.
                progress_bar.set_description(f"{league_code} - {season} | Errors: {error_count}")
            else:
                sum_parts.append(scraped[0])
                def_parts.append(scraped[1])
                pos_parts.append(scraped[2])
            finally:
                time.sleep(REQUEST_DELAY_SECONDS)

        if error_count:
            tqdm.write(
                f"{league_code} - {season}: {error_count} of {len(match_list)} matches failed; "
                f"first failure {first_error[0]}: {first_error[1]!r}"
            )

        df_sum = pd.concat(sum_parts, ignore_index=True) if sum_parts else pd.DataFrame()
        df_def = pd.concat(def_parts, ignore_index=True) if def_parts else pd.DataFrame()
        df_pos = pd.concat(pos_parts, ignore_index=True) if pos_parts else pd.DataFrame()

        if not sum_parts:
            # Nothing was scraped: do not overwrite existing files with empty ones.
            tqdm.write(f"{league_code} - {season}: no match data scraped; nothing written")
            continue

        # Create the output directory if it does not exist
        os.makedirs(f"Data/{league_code}", exist_ok=True)

        df_sum.to_parquet(f"Data/{league_code}/{league_code}_{season}_Sum.gzip", index=False, compression='gzip')
        df_def.to_parquet(f"Data/{league_code}/{league_code}_{season}_Def.gzip", index=False, compression='gzip')
        df_pos.to_parquet(f"Data/{league_code}/{league_code}_{season}_Pos.gzip", index=False, compression='gzip')

                                                                           