***ARCHIVO LEAGUE_MANAGER.PY***

TODO CORRECTO Y ACABADO

In [1]:
import requests
from bs4 import BeautifulSoup,Comment
import re
import json
import time
import random
import pandas as pd
import numpy as np

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'

class LeagueManager:
    """
    Clase para gestionar ligas de fútbol y generar URLs de estadísticas de jugadores desde FBref.
    """
    def __init__(self):
        """
        Inicializa los atributos necesarios para acceder a las ligas, temporadas y tipos de estadísticas disponibles.
        """
        self.base_url = "https://fbref.com/en/comps/"
        # Diccionario con ligas disponibles, cada una con su ID, slug para la URL y temporadas disponibles
        self.possible_leagues = {
            'Fbref': {
                'Premier League': {
                    'id': 9,
                    'slug': 'Premier-League',
                    'seasons': ['2024-2025', '2023-2024', '2022-2023', '2021-2022', '2020-2021']
                },
                'La Liga': {
                    'id': 12,
                    'slug': 'La-Liga',
                    'seasons': ['2024-2025', '2023-2024', '2022-2023', '2021-2022', '2020-2021']
                },
                'Ligue 1': {
                    'id': 13,
                    'slug': 'Ligue-1',
                    'seasons': ['2024-2025', '2023-2024', '2022-2023', '2021-2022', '2020-2021']    
                },
                'Bundesliga': {
                    'id': 20,
                    'slug': 'Bundesliga',
                    'seasons': ['2024-2025', '2023-2024', '2022-2023', '2021-2022', '2020-2021']
                },
                'Serie A': {
                    'id': 11,
                    'slug': 'Serie-A',
                    'seasons': ['2024-2025', '2023-2024', '2022-2023', '2021-2022', '2020-2021']
                },
            }
        }

        # Tipos de estadísticas disponibles para jugadores
        self.player_tables = {
            "Standard Stats": "stats",
            "Goalkeeping": "keepers",
            "Advanced Goalkeeping": "keepersadv",
            "Shooting": "shooting",
            "Passing": "passing",
            "Pass Types": "passing_types",
            "Goal and Shot Creation": "gca",
            "Defensive Actions": "defense",
            "Possession": "possession",
            "Playing Time": "playingtime",
            "Miscellaneous Stats": "misc",
        }
        self.team_tables = {
            'stats': 'stats',
            'shooting': 'shooting',
            'passing': 'passing',
            'passingtypes': 'passing_types',
            'gca': 'gca',
            'defensive': 'defense',
            'possession': 'possession',
            'playingtime': 'playingtime',
            'misc': 'misc',
            'keepers': 'keepers',
            'keepersadv': 'keepersadv'
        }

        self.headers = {'User-Agent': USER_AGENT}

    def get_available_leagues(self) -> dict:
        """
        Devuelve un diccionario con las ligas disponibles, sus identificadores y temporadas.

        Return:
            dict: Ligas disponibles con su ID y temporadas.
        """
        return {
            league_name: {
                'id': data['id'],
                'seasons': data['seasons']
            }
            for league_name, data in self.possible_leagues['Fbref'].items()
        }

    def get_league_info(self, league_name: str) -> dict | None:
        """
        Devuelve la información de una liga específica.

        Args:
            league_name (str): Nombre de la liga.

        Return:
            dict or None: Información de la liga seleccionada (id, slug, seasons) o None si no existe.
        """
        return self.possible_leagues['Fbref'].get(league_name)

    def get_all_league_names(self) -> list:
        """
        Devuelve la lista de nombres de todas las ligas disponibles.

        Return:
            list: Nombres de las ligas.
        """
        return list(self.possible_leagues['Fbref'].keys())

    def generate_player_urls(self)-> dict:
        """
        Genera URLs completas para acceder a estadísticas de jugadores por liga, temporada y tipo de estadística.

        Return:
            dict: Diccionario anidado con URLs organizadas por liga y temporada.
                  Formato: {liga: {temporada: {tipo_estadistica: url}}}
        """
        urls = {}

        for league_name, league_data in self.possible_leagues['Fbref'].items():
            league_id = league_data['id']
            seasons = league_data['seasons']
            urls[league_name] = {}

            for season in seasons:
                season_urls = {}
                for stat_name, path in self.player_tables.items():
                    url = (
                            f"{self.base_url}{league_id}/{season}/{path}/{season}/"
                            f"{league_data['slug']}-Stats"
                        )
                    season_urls[stat_name] = url

                urls[league_name][season] = season_urls

        return urls
    
    def generate_team_urls(self) -> dict:

        """
        Genera URLs completas para acceder a estadísticas de equipos por liga, temporada y tipo de estadística.

        Return:
            dict: Diccionario anidado con URLs organizadas por liga y temporada.
                Formato: {liga: {temporada: {tipo_estadistica: url}}}
        """
        
        urls = {}

        for league_name, league_data in self.possible_leagues['Fbref'].items():
            league_id = league_data['id']
            slug = league_data['slug']
            seasons = league_data['seasons']
            urls[league_name] = {}

            for season in seasons:
                season_urls = {}
                for stat_name, path in self.team_tables.items():
                    url = f"{self.base_url}{league_id}/{path}/{slug}-Stats"
                    season_urls[stat_name] = url
                urls[league_name][season] = season_urls

        return urls

# Build the manager once and precompute the player and team URL dictionaries
# ({league: {season: {stat: url}}}) that later cells index into.
manager = LeagueManager()
player_urls = manager.generate_player_urls()
teams_urls= manager.generate_team_urls()


***ARCHIVO PLAYER_DATA.PY***

In [29]:
def get_players_data(url, metrica_general=None):
    """
    Download an FBref player-statistics page and return its player table as a raw DataFrame.

    Column names are built as ``<data-stat>_<over-header>_<metric>``, where the
    over-header falls back to ``General`` when a header cell has none.

    Args:
        url (str): URL of the FBref player-statistics page.
        metrica_general (str | None): Label appended to every column name. When
            None it is derived from the URL segment preceding the season.

    Returns:
        pd.DataFrame | None: Uncleaned table with every cell as text, or None
        when no player table could be located in the page.

    Raises:
        ValueError: If ``metrica_general`` is None and cannot be derived from ``url``.
        requests.RequestException: If the request fails, times out or returns an
            HTTP error status.
    """
    # Resolve the label first so that an unusable URL fails before any request is made.
    if metrica_general is None:
        stat_match = re.search(r'/([^/]+)/\d{4}-\d{4}/[^/]+-Stats', url)
        if not stat_match:
            raise ValueError(f"❌ No se pudo extraer 'metrica_general' desde la URL: {url}")
        metrica_general = stat_match.group(1).replace('-', ' ').title()

    metrica_general_clean = metrica_general.replace(' ', '_')

    # A timeout avoids hanging forever; an HTTP error page is reported as such
    # instead of being parsed as if it were a stats page.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    def _find_player_table(fragment):
        """Return the first table in *fragment* holding a 'player' data cell, else None."""
        for table in fragment.find_all('table'):
            if table.find('td', {'data-stat': 'player'}):
                return table
        return None

    # The table is looked up inside HTML comments first (hidden tables), and
    # in the live document only as a fallback.
    player_table = None
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        player_table = _find_player_table(BeautifulSoup(comment, 'html.parser'))
        if player_table is not None:
            break
    if player_table is None:
        player_table = _find_player_table(soup)

    thead = player_table.find('thead') if player_table is not None else None
    tbody = player_table.find('tbody') if player_table is not None else None
    header_rows = thead.find_all('tr') if thead is not None else []

    if not header_rows or tbody is None:
        print("No se encontró la tabla de jugadores.")
        return None

    # Only the last header row carries the per-column data-stat identifiers.
    column_names = []
    for th in header_rows[-1].find_all('th'):
        data_stat = th.get('data-stat')
        data_over_header = (th.get('data-over-header') or 'General').replace(' ', '_')
        column_names.append(f"{data_stat}_{data_over_header}_{metrica_general_clean}")

    # One list of cell texts per body row; rows without cells are skipped.
    data_rows = []
    for row in tbody.find_all('tr'):
        row_data = [cell.get_text(strip=True) for cell in row.find_all(['th', 'td'])]
        if row_data:
            data_rows.append(row_data)

    return pd.DataFrame(data_rows, columns=column_names)




In [30]:
def limpieza_df_players(df, url):
    """
    Clean a raw player-statistics table and return a new DataFrame.

    Steps: drop the ``ranker_*`` / ``matches_*`` columns, remove repeated header
    rows, reduce the nationality column to its trailing upper-case code, replace
    empty cells and missing values with 0, and add a ``competition`` column when
    no column name contains 'competition'. The input DataFrame is not modified.

    Args:
        df (pd.DataFrame): Raw table with string column names.
        url (str): URL the table was scraped from; used to derive the competition name.

    Returns:
        pd.DataFrame: Cleaned table with a fresh 0..n-1 index.

    Raises:
        ValueError: If ``url`` does not contain the ``/<stat>/<season>/<slug>-Stats`` pattern.
    """
    # Kept as input validation: a URL without this pattern is rejected.
    if not re.search(r'/([^/]+)/\d{4}-\d{4}/[^/]+-Stats', url):
        raise ValueError(f"❌ No se pudo extraer 'metrica_general' desde la URL: {url}")

    # Work on a private copy so the caller's DataFrame is never altered.
    df = df.copy()

    columns_to_drop = [col for col in df.columns if col.startswith(("ranker_", "matches_"))]
    if columns_to_drop:
        df = df.drop(columns=columns_to_drop)

    # Repeated header rows carry the literal header text in the player column.
    player_cols = [col for col in df.columns if col.lower().startswith('player')]
    if player_cols:
        # .copy() detaches the filtered rows so the assignments below are safe.
        df = df[~df[player_cols[0]].isin(['Player', 'Team', 'Totals'])].copy()

    # Keep only the trailing upper-case code of the nationality cell.
    nationality_cols = [col for col in df.columns if 'nationality' in col]
    if nationality_cols:
        col_name = nationality_cols[0]
        df[col_name] = df[col_name].astype(str).str.extract(r'([A-Z]+)$', expand=False)

    # Text columns are held as plain objects so the numeric placeholder 0 can
    # sit next to strings (string-typed columns may reject a non-string fill).
    text_cols = {
        col: object
        for col, dtype in df.dtypes.items()
        if pd.api.types.is_string_dtype(dtype)
    }
    if text_cols:
        df = df.astype(text_cols)

    # Empty cells become NaN and every NaN becomes 0.
    df = df.replace('', np.nan).fillna(0)

    # The competition name is the URL slug before '-Stats', with hyphens as spaces.
    competition_name_match = re.search(r'/([^/]+)-Stats(?:/|$)', url)
    if competition_name_match:
        competition_name = competition_name_match.group(1).replace('-', ' ')
    else:
        competition_name = 'Desconocida'

    if not any('competition' in col for col in df.columns):
        df['competition'] = competition_name

    return df.reset_index(drop=True)


In [None]:
player_urls['La Liga']

In [None]:
player_urls['Ligue 1']

In [None]:
player_urls['Serie A']

In [None]:
player_urls['Premier League']

In [162]:
player_urls['Bundesliga']

{'2024-2025': {'Standard Stats': 'https://fbref.com/en/comps/20/2024-2025/stats/2024-2025/Bundesliga-Stats',
  'Goalkeeping': 'https://fbref.com/en/comps/20/2024-2025/keepers/2024-2025/Bundesliga-Stats',
  'Advanced Goalkeeping': 'https://fbref.com/en/comps/20/2024-2025/keepersadv/2024-2025/Bundesliga-Stats',
  'Shooting': 'https://fbref.com/en/comps/20/2024-2025/shooting/2024-2025/Bundesliga-Stats',
  'Passing': 'https://fbref.com/en/comps/20/2024-2025/passing/2024-2025/Bundesliga-Stats',
  'Pass Types': 'https://fbref.com/en/comps/20/2024-2025/passing_types/2024-2025/Bundesliga-Stats',
  'Goal and Shot Creation': 'https://fbref.com/en/comps/20/2024-2025/gca/2024-2025/Bundesliga-Stats',
  'Defensive Actions': 'https://fbref.com/en/comps/20/2024-2025/defense/2024-2025/Bundesliga-Stats',
  'Possession': 'https://fbref.com/en/comps/20/2024-2025/possession/2024-2025/Bundesliga-Stats',
  'Playing Time': 'https://fbref.com/en/comps/20/2024-2025/playingtime/2024-2025/Bundesliga-Stats',
  'Mi

In [27]:
# Step 3: choose the league, season and statistic type
liga = 'Bundesliga'
temporada = '2024-2025'

# Available statistics: ['Standard Stats', 'Goalkeeping', 'Advanced Goalkeeping', 'Shooting', 'Passing','Pass Types',
# 'Goal and Shot Creation', 'Defensive Actions', 'Possession', 'Playing Time', 'Miscellaneous Stats']
tipo_estadistica =  'Standard Stats'

# Look up the precomputed URL for that league / season / statistic.
url_player = player_urls[liga][temporada][tipo_estadistica]
url_player

'https://fbref.com/en/comps/20/2024-2025/stats/2024-2025/Bundesliga-Stats'

ESTO FUNCIONA!!!

In [None]:
def creacion_df_jugadores_estadistica_unica(url: str, guardar_csv=False, league='La Liga', season='2024', metrica_general=None):
    """
    Scrape one player-statistics table, clean it and optionally save it as CSV.

    Args:
        url (str): URL of the player-statistics page.
        guardar_csv (bool): If True, write the cleaned table to
            ``./df_players_<metric>_<league>_<season>.csv``.
        league (str): League name; used in the CSV file name and in messages.
        season (str): Season; used in the CSV file name and in messages.
        metrica_general (str | None): Metric label for the CSV file name. When
            None it is taken from the URL segment preceding the season.

    Returns:
        pd.DataFrame | None: Cleaned table, or None when the table could not be
        obtained or any step failed (the reason is printed).
    """
    try:
        # Fetch the raw table.
        df_sucio = get_players_data(url)

        # The scraper signals "no table found" without a DataFrame; stop here with
        # a clear message instead of letting the cleaning step fail on it.
        if not isinstance(df_sucio, pd.DataFrame):
            print(f"⚠️ No se obtuvo la tabla de jugadores para {league} - {season}: {url}")
            return None

        # Clean the data; the URL is needed to derive the competition name.
        df_limpio = limpieza_df_players(df_sucio, url=url)

        # Derive the metric label from the URL when the caller did not supply one.
        if metrica_general is None:
            metrica_general_match = re.search(r'/([^/]+)/\d{4}-\d{4}/[^/]+-Stats', url)
            if metrica_general_match:
                metrica_general = metrica_general_match.group(1)
            else:
                raise ValueError(f"❌ No se pudo extraer la métrica general desde la URL: {url}")

        if guardar_csv:
            # File-name friendly league ("la_liga") and season ("2024_2025").
            league_clean = league.lower().replace(' ', '_')
            season_clean = season.replace('-', '_')
            df_limpio.to_csv(f'./df_players_{metrica_general}_{league_clean}_{season_clean}.csv', index=False)

        return df_limpio

    except Exception as e:
        # Boundary handler: callers rely on None (not an exception) to skip a stat.
        print(f"❌ Error en creacion_df_jugadores_estadistica_unica para {league} - {season} - {metrica_general}: {e}")
        return None



In [40]:
creacion_df_jugadores_estadistica_unica(url_player, guardar_csv=True, league=liga, season=temporada)

Unnamed: 0,player_General_Stats,nationality_General_Stats,position_General_Stats,team_General_Stats,age_General_Stats,birth_year_General_Stats,games_Playing_Time_Stats,games_starts_Playing_Time_Stats,minutes_Playing_Time_Stats,minutes_90s_Playing_Time_Stats,...,assists_per90_Per_90_Minutes_Stats,goals_assists_per90_Per_90_Minutes_Stats,goals_pens_per90_Per_90_Minutes_Stats,goals_assists_pens_per90_Per_90_Minutes_Stats,xg_per90_Per_90_Minutes_Stats,xg_assist_per90_Per_90_Minutes_Stats,xg_xg_assist_per90_Per_90_Minutes_Stats,npxg_per90_Per_90_Minutes_Stats,npxg_xg_assist_per90_Per_90_Minutes_Stats,competition
0,Junior Adamu,AUT,FW,Freiburg,23,2001,25,19,1545,17.2,...,0.12,0.23,0.12,0.23,0.32,0.14,0.46,0.32,0.46,Bundesliga
1,Karim Adeyemi,GER,"FW,MF",Dortmund,22,2002,25,17,1433,15.9,...,0.38,0.82,0.44,0.82,0.34,0.38,0.72,0.34,0.72,Bundesliga
2,Amine Adli,MAR,"MF,FW",Leverkusen,24,2000,20,6,766,8.5,...,0.00,0.23,0.23,0.23,0.32,0.08,0.40,0.32,0.40,Bundesliga
3,Oladapo Afolayan,ENG,"FW,MF",St. Pauli,26,1998,32,17,1639,18.2,...,0.05,0.22,0.16,0.22,0.17,0.21,0.38,0.17,0.38,Bundesliga
4,Felix Agu,NGA,DF,Werder Bremen,24,1999,22,21,1751,19.5,...,0.00,0.15,0.15,0.15,0.07,0.09,0.17,0.07,0.17,Bundesliga
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
487,Robin Zentner,GER,GK,Mainz 05,29,1994,32,32,2880,32.0,...,0.00,0.00,0.00,0.00,0.00,0.01,0.01,0.00,0.01,Bundesliga
488,Cedric Zesiger,SUI,DF,Augsburg,26,1998,15,15,1221,13.6,...,0.15,0.15,0.00,0.15,0.09,0.09,0.18,0.09,0.18,Bundesliga
489,Cedric Zesiger,SUI,DF,Wolfsburg,26,1998,7,5,522,5.8,...,0.00,0.00,0.00,0.00,0.01,0.00,0.01,0.01,0.01,Bundesliga
490,Michael Zetterer,GER,GK,Werder Bremen,29,1995,34,34,3060,34.0,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,Bundesliga


TODO LO ANTERIOR FUNCIONA

In [58]:
def creacion_df_general_torneo_fbref(league='La Liga',  season="2024-2025", stat_list=None,
                                      player_urls=None, guardar_csv=False,guardar_csv_individuales=False):
    """
    Build one wide DataFrame of outfield-player statistics for a league and season.

    Each requested statistic is scraped and cleaned separately, then the tables
    are placed side by side and a ``competition`` column is inserted.

    Args:
        league (str): League name (e.g., 'La Liga').
        season (str): Season (e.g., '2024-2025').
        stat_list (list | None): Statistics to process (e.g., ['Standard Stats',
            'Shooting', ...]). Defaults to the eight outfield tables.
        player_urls (dict | None): URLs as {league: {season: {stat_name: url}}}.
            Statistics without a URL (including when this is None) are skipped.
        guardar_csv (bool): If True, save the combined table as
            ``estadisticas_jugadores_<league>_<season>.csv``.
        guardar_csv_individuales (bool): If True, also save one CSV per statistic.

    Returns:
        pd.DataFrame: Combined statistics; empty when nothing could be processed.
    """
    if stat_list is None:
        stat_list = ["Standard Stats", "Shooting", "Passing", "Pass Types",
                     "Goal and Shot Creation", "Defensive Actions",
                     "Possession", "Miscellaneous Stats"]

    dfs = []

    for stat_name in stat_list:
        # TypeError covers player_urls (or a nested level) being None.
        try:
            url_actual = player_urls[league][season][stat_name]
        except (KeyError, TypeError):
            print(f"⚠️ No se encontró URL para la estadística '{stat_name}'. Se omite.")
            continue

        try:
            df_temp = creacion_df_jugadores_estadistica_unica(
                url=url_actual,
                guardar_csv=guardar_csv_individuales,
                league=league,
                season=season,
                metrica_general=stat_name
            )
            if df_temp is not None:
                # The per-table competition column is re-added once at the end.
                if 'competition' in df_temp.columns:
                    df_temp = df_temp.drop(columns=['competition'])
                dfs.append(df_temp)
            else:
                print(f"⚠️ No se pudo procesar '{stat_name}'. Se omite.")

        except Exception as e:
            print(f"❌ Error procesando '{stat_name}' para {league} - {season}: {e}")

        # Random pause between requests.
        time.sleep(random.uniform(2, 5))

    if not dfs:
        print("⚠️ No se generó ningún DataFrame.")
        return pd.DataFrame()

    # Side-by-side concatenation, keeping the first of any repeated column name.
    # NOTE(review): rows are paired by index, which assumes every table lists
    # the same players in the same order - confirm.
    df_general_final = pd.concat(dfs, axis=1)
    df_general_final = df_general_final.loc[:, ~df_general_final.columns.duplicated(keep='first')].copy()

    # Insert the competition column at position 3, or last on narrower frames.
    df_general_final.insert(min(3, df_general_final.shape[1]), 'competition', league)

    if guardar_csv:
        league_clean = league.lower().replace(' ', '_')
        season_clean = season.replace('-', '_')
        df_general_final.to_csv(f'estadisticas_jugadores_{league_clean}_{season_clean}.csv', index=False)

    return df_general_final

In [48]:
#Para poder ejecutar la funcion hace falta poner solo la variable liga y temporada 
liga = 'Ligue 1'
temporada = '2024-2025'


In [46]:
# Build (and save) the combined outfield-player table for the selected league/season.
# NOTE(review): despite the variable name, the league and season come from
# `liga` / `temporada`, so this does not necessarily hold La Liga data.
df_la_liga_2024 = creacion_df_general_torneo_fbref( league=liga,  season=temporada,
      stat_list=["Standard Stats", "Shooting", "Passing", "Pass Types","Goal and Shot Creation", "Defensive Actions", 
                     "Possession", "Miscellaneous Stats"], player_urls=player_urls, guardar_csv=True, guardar_csv_individuales=False)
df_la_liga_2024

Unnamed: 0,player_General_Stats,nationality_General_Stats,position_General_Stats,competition,team_General_Stats,age_General_Stats,birth_year_General_Stats,games_Playing_Time_Stats,games_starts_Playing_Time_Stats,minutes_Playing_Time_Stats,...,crosses_Performance_Misc,interceptions_Performance_Misc,tackles_won_Performance_Misc,pens_won_Performance_Misc,pens_conceded_Performance_Misc,own_goals_Performance_Misc,ball_recoveries_Performance_Misc,aerials_won_Aerial_Duels_Misc,aerials_lost_Aerial_Duels_Misc,aerials_won_pct_Aerial_Duels_Misc
0,Keyliane Abdallah,FRA,FW,Ligue 1,Marseille,18,2006,1,0,3,...,0,0,1,0,0,0,0,0,0,0
1,Yunis Abdelhamid,MAR,DF,Ligue 1,Saint-Étienne,36,1987,16,11,1033,...,0,14,11,0,0,0,42,19,10,65.5
2,Himad Abdelli,ALG,"MF,FW",Ligue 1,Angers,24,1999,32,32,2842,...,44,26,32,0,0,0,193,13,20,39.4
3,Mohamed Abdelmoneim,EGY,DF,Ligue 1,Nice,25,1999,12,10,855,...,2,15,14,0,0,0,50,31,19,62.0
4,Ali Abdi,TUN,"DF,MF",Ligue 1,Nice,30,1993,25,17,1393,...,51,20,25,0,0,0,72,22,14,61.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
548,Melvin Zinga,FRA,GK,Ligue 1,Angers,22,2002,1,1,90,...,0,0,0,0,1,0,0,0,0,0
549,Luck Zogbé,CIV,DF,Ligue 1,Brest,19,2005,13,7,638,...,14,14,18,0,0,0,51,12,4,75.0
550,Aristide Zossou,CIV,MF,Ligue 1,Auxerre,19,2005,3,0,16,...,1,0,0,0,0,0,0,0,0,0
551,Yanis Zouaoui,ALG,"DF,FW",Ligue 1,Le Havre,30,1994,17,9,976,...,110,8,10,0,0,0,41,7,4,63.6


PORTEROS (GOALKEEPERS) --- ESTO FUNCIONA (FALTA ADAPTARLO PARA USAR PLAYER_URLS)

In [55]:
def creacion_df_porteros_torneo_fbref(league='La Liga',  season="2024-2025", stat_list=None, player_urls=None, guardar_csv=False, guardar_csv_individuales=False):
    """
    Build one wide DataFrame of goalkeeper statistics for a league and season.

    Each requested statistic is scraped and cleaned separately, then the tables
    are placed side by side and a ``competition`` column is inserted.

    Args:
        league (str): League name (e.g., 'La Liga').
        season (str): Season (e.g., '2024-2025').
        stat_list (list | None): Statistics to process. Defaults to
            ['Goalkeeping', 'Advanced Goalkeeping'].
        player_urls (dict | None): URLs as {league: {season: {stat_name: url}}}.
            Statistics without a URL (including when this is None) are skipped.
        guardar_csv (bool): If True, save the combined table as
            ``estadisticas_porteros_<league>_<season>.csv``.
        guardar_csv_individuales (bool): If True, also save one CSV per statistic.

    Returns:
        pd.DataFrame: Combined statistics; empty when nothing could be processed.
    """
    if stat_list is None:
        stat_list = ['Goalkeeping', 'Advanced Goalkeeping']

    dfs = []

    for stat_name in stat_list:
        # TypeError covers player_urls (or a nested level) being None.
        try:
            url_actual = player_urls[league][season][stat_name]
        except (KeyError, TypeError):
            print(f"⚠️ No se encontró URL para la estadística '{stat_name}'. Se omite.")
            continue

        try:
            df_temp = creacion_df_jugadores_estadistica_unica(
                url=url_actual,
                guardar_csv=guardar_csv_individuales,
                league=league,
                season=season,
                metrica_general=stat_name,
            )

            if df_temp is not None:
                # The per-table competition column is re-added once at the end.
                if 'competition' in df_temp.columns:
                    df_temp = df_temp.drop(columns=['competition'])
                dfs.append(df_temp)
            else:
                print(f"⚠️ No se pudo procesar '{stat_name}'. Se omite.")

        except Exception as e:
            print(f"❌ Error procesando '{stat_name}' para {league} - {season}: {e}")

        # Random pause between requests.
        time.sleep(random.uniform(2, 5))

    if not dfs:
        print("⚠️ No se generó ningún DataFrame.")
        return pd.DataFrame()

    # Side-by-side concatenation, keeping the first of any repeated column name.
    # NOTE(review): rows are paired by index, which assumes every table lists
    # the same goalkeepers in the same order - confirm.
    df_general_final = pd.concat(dfs, axis=1)
    df_general_final = df_general_final.loc[:, ~df_general_final.columns.duplicated(keep='first')].copy()

    # Insert the competition column at position 3, or last on narrower frames.
    df_general_final.insert(min(3, df_general_final.shape[1]), 'competition', league)

    if guardar_csv:
        league_clean = league.lower().replace(' ', '_')
        season_clean = season.replace('-', '_')
        df_general_final.to_csv(f'estadisticas_porteros_{league_clean}_{season_clean}.csv', index=False)

    return df_general_final

In [56]:
#Para poder ejecutar la funcion hace falta poner solo la variable liga y temporada 
liga = 'La Liga'
temporada = '2024-2025'

In [57]:
# Build (and save) the combined goalkeeper table for the league/season chosen
# in `liga` / `temporada`.
df_porteros= creacion_df_porteros_torneo_fbref(league=liga,  season=temporada,
      stat_list=['Goalkeeping', 'Advanced Goalkeeping'], player_urls=player_urls, guardar_csv=True, guardar_csv_individuales=False)

In [54]:
df_porteros

Unnamed: 0,player_General_Keepers,nationality_General_Keepers,position_General_Keepers,competition,team_General_Keepers,age_General_Keepers,birth_year_General_Keepers,gk_games_Playing_Time_Keepers,gk_games_starts_Playing_Time_Keepers,gk_minutes_Playing_Time_Keepers,...,gk_passes_length_avg_Passes_Keepersadv,gk_goal_kicks_Goal_Kicks_Keepersadv,gk_pct_goal_kicks_launched_Goal_Kicks_Keepersadv,gk_goal_kick_length_avg_Goal_Kicks_Keepersadv,gk_crosses_Crosses_Keepersadv,gk_crosses_stopped_Crosses_Keepersadv,gk_crosses_stopped_pct_Crosses_Keepersadv,gk_def_actions_outside_pen_area_Sweeper_Keepersadv,gk_def_actions_outside_pen_area_per90_Sweeper_Keepersadv,gk_avg_distance_def_actions_Sweeper_Keepersadv
0,Ben Alexander Voll,GER,GK,Bundesliga,St. Pauli,23,2000,2,1,91,...,44.7,13,46.2,39.7,15,0,0.0,0,0.0,0.0
1,Noah Atubolu,GER,GK,Bundesliga,Freiburg,22,2002,26,26,2307,...,33.1,126,68.3,51.5,377,34,9.0,28,1.09,12.7
2,Oliver Baumann,GER,GK,Bundesliga,Hoffenheim,34,1990,28,28,2520,...,36.5,136,69.1,50.8,407,18,4.4,31,1.11,14.5
3,Tiago Cardoso,LUX,GK,Bundesliga,Gladbach,18,2006,5,4,380,...,29.2,31,71.0,48.9,96,14,14.6,1,0.24,5.6
4,Finn Dahmen,GER,GK,Bundesliga,Augsburg,26,1998,19,19,1710,...,34.9,145,71.7,53.2,337,9,2.7,26,1.37,14.7
5,Thomas Dähne,GER,GK,Bundesliga,Holstein Kiel,30,1994,9,9,810,...,30.3,91,60.4,45.6,177,15,8.5,5,0.56,9.3
6,Patrick Drewes,GER,GK,Bundesliga,Bochum,31,1993,21,21,1890,...,37.6,177,80.8,52.6,285,12,4.2,33,1.57,15.8
7,Frank Feller,GER,GK,Bundesliga,Heidenheim,20,2004,2,1,121,...,38.0,11,100.0,68.4,25,1,4.0,1,0.74,14.8
8,Kamil Grabara,POL,GK,Bundesliga,Wolfsburg,25,1999,29,29,2585,...,36.7,154,66.9,51.9,413,15,3.6,19,0.66,12.2
9,Péter Gulácsi,HUN,GK,Bundesliga,RB Leipzig,34,1990,30,30,2631,...,28.6,196,33.7,34.3,434,23,5.3,34,1.16,14.8


In [65]:
# Maps the player-table display names to short stat keys
# ('stats', 'passingtypes', 'defensive', ...).
clave_equivalente = {
    'Standard Stats': 'stats',
    'Shooting': 'shooting',
    'Passing': 'passing',
    'Pass Types': 'passingtypes',
    'Goal and Shot Creation': 'gca',
    'Defensive Actions': 'defensive',
    'Possession': 'possession',
    'Playing Time': 'playingtime',
    'Miscellaneous Stats': 'misc',
    # Goalkeeper tables can be included or excluded:
    'Goalkeeping': 'keepers',
    'Advanced Goalkeeping': 'keepersadv'
}


In [67]:
# Select the league and season whose URLs will be re-keyed.
liga = 'La Liga'
temporada = '2024-2025'

# {display name: url} for the selected league and season.
urls_original = player_urls[liga][temporada]

# Same URLs, re-keyed with the short stat keys; names without an entry in
# clave_equivalente are dropped.
urls_convertidas = {
    clave_equivalente[k]: v
    for k, v in urls_original.items()
    if k in clave_equivalente
}

In [64]:
player_urls

{'Premier League': {'2024-2025': {'Standard Stats': 'https://fbref.com/en/comps/9/2024-2025/stats/2024-2025/Premier-League-Stats',
   'Goalkeeping': 'https://fbref.com/en/comps/9/2024-2025/keepers/2024-2025/Premier-League-Stats',
   'Advanced Goalkeeping': 'https://fbref.com/en/comps/9/2024-2025/keepersadv/2024-2025/Premier-League-Stats',
   'Shooting': 'https://fbref.com/en/comps/9/2024-2025/shooting/2024-2025/Premier-League-Stats',
   'Passing': 'https://fbref.com/en/comps/9/2024-2025/passing/2024-2025/Premier-League-Stats',
   'Pass Types': 'https://fbref.com/en/comps/9/2024-2025/passing_types/2024-2025/Premier-League-Stats',
   'Goal and Shot Creation': 'https://fbref.com/en/comps/9/2024-2025/gca/2024-2025/Premier-League-Stats',
   'Defensive Actions': 'https://fbref.com/en/comps/9/2024-2025/defense/2024-2025/Premier-League-Stats',
   'Possession': 'https://fbref.com/en/comps/9/2024-2025/possession/2024-2025/Premier-League-Stats',
   'Playing Time': 'https://fbref.com/en/comps/9/20

***ARCHIVO TEAM_DATA.PY***

DATOS EQUIPOS -- CORRECTO Y ACABADO

In [85]:
# Step 3: choose the league, season and statistic type
liga = 'La Liga'
temporada = '2024-2025'
tipo_estadistica = 'passing'  # can be 'stats', 'shooting', etc.

In [86]:
url = teams_urls[liga][temporada][tipo_estadistica]

In [87]:
def obtener_tabla_equipos_estadistica_unica(
    url_general: str,
    stats_vs: bool = False,
    guardar_csv: bool = False,
    league: str = 'La Liga',
    season: str = '2024',
) -> pd.DataFrame:
    """
    Read a team-statistics table from a URL and tidy its columns and rows.

    Args:
        url_general (str): URL of the page holding the tables.
        stats_vs (bool): If True, use the second table on the page instead of the first.
        guardar_csv (bool): If True, write the tidied table to a CSV file.
        league (str): League name, used in the CSV file name.
        season (str): Season, used in the CSV file name.

    Returns:
        pd.DataFrame: Table with flattened column names and title rows removed.
    """
    posicion_tabla = 1 if stats_vs else 0
    df = pd.read_html(url_general)[posicion_tabla]

    # The metric label is the second-to-last URL segment, normalised.
    metrica_general = url_general.split('/')[-2].replace('-', '_').lower()

    def _normalizar(etiqueta):
        """Trim, snake_case and lowercase one header label."""
        return etiqueta.strip().replace(' ', '_').lower()

    # Two-level headers are flattened to "<stat>_<group>_<metric>".
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = [
            f"{_normalizar(niveles[1])}_{_normalizar(niveles[0])}_{metrica_general}"
            for niveles in df.columns
        ]

    # Rows whose first cell mentions 'Squad' are title rows, not data.
    es_titulo = df.iloc[:, 0].astype(str).str.contains('Squad', case=False, na=False)
    if es_titulo.any():
        df = df[~es_titulo].copy()

    df.reset_index(drop=True, inplace=True)

    # Optional CSV export.
    if guardar_csv:
        prefijo_vs = 'vs_' if stats_vs else ''
        league_clean = league.lower().replace(' ', '_')
        nombre_archivo = f'./df_equipos_{prefijo_vs}{metrica_general}_{league_clean}_{season}.csv'
        df.to_csv(nombre_archivo, index=False)

    return df

In [89]:
# Fetch the first table for the selected URL and save it as CSV.
# NOTE(review): league/season are passed as literals rather than `liga` / `temporada`.
obtener_tabla_equipos_estadistica_unica(url, stats_vs=False, guardar_csv= True, league='La Liga', season='2024')

Unnamed: 0,squad_unnamed:_0_level_0_passing,#_pl_unnamed:_1_level_0_passing,90s_unnamed:_2_level_0_passing,cmp_total_passing,att_total_passing,cmp%_total_passing,totdist_total_passing,prgdist_total_passing,cmp_short_passing,att_short_passing,...,cmp%_long_passing,ast_unnamed:_17_level_0_passing,xag_unnamed:_18_level_0_passing,xa_expected_passing,a-xag_expected_passing,kp_unnamed:_21_level_0_passing,1/3_unnamed:_22_level_0_passing,ppa_unnamed:_23_level_0_passing,crspa_unnamed:_24_level_0_passing,prgp_unnamed:_25_level_0_passing
0,Alavés,29,38.0,10534,14746,71.4,199550,84871,4546,5494,...,46.2,20,25.4,25.9,-5.4,284,963,229,99,1160
1,Athletic Club,31,38.0,13665,17375,78.6,253362,90647,5552,6396,...,51.0,40,36.7,33.1,3.3,351,1283,274,90,1579
2,Atlético Madrid,24,38.0,17826,21418,83.2,295213,100382,8814,9752,...,59.0,51,47.9,45.3,3.1,348,1233,365,96,1559
3,Barcelona,28,38.0,22321,25721,86.8,371914,116385,10452,11341,...,65.1,72,67.5,64.7,4.5,500,2020,479,77,2212
4,Betis,36,38.0,15770,19284,81.8,264738,91436,7583,8423,...,54.9,37,37.5,34.8,-0.5,417,1127,329,75,1486
5,Celta Vigo,30,38.0,17798,21253,83.7,312877,104182,7549,8367,...,58.7,40,38.4,36.9,1.6,339,1424,300,53,1659
6,Espanyol,27,38.0,10580,13996,75.6,199624,80026,4245,4942,...,49.3,27,24.5,23.4,2.5,256,772,204,81,1000
7,Getafe,30,38.0,8695,12922,67.3,169125,74933,3711,4579,...,42.4,21,23.9,23.8,-2.9,296,958,226,104,981
8,Girona,30,38.0,18273,21401,85.4,322614,100730,8002,8697,...,59.5,31,31.6,37.4,-0.6,308,1494,302,87,1691
9,Las Palmas,34,38.0,14352,18209,78.8,254010,92291,6396,7255,...,50.3,27,26.3,26.5,0.7,304,1059,246,79,1210



GENERACION DATAFRAME DE TODAS LAS ESTADISTICAS PARA LOS EQUIPOS -- CORRECTO Y ACABADO

In [34]:
def renombrar_columna_squad(df):
    """
    Return a copy of the DataFrame where every column whose name contains
    'squad' (case-insensitive) is renamed to the lowercase 'squad'.

    Args:
        df (pd.DataFrame): Input DataFrame; it is left untouched.

    Returns:
        pd.DataFrame: Copy with the matching columns renamed.
    """
    def _nuevo_nombre(columna):
        if 'squad' in str(columna).lower():
            return 'squad'
        return columna

    resultado = df.copy()
    resultado.columns = list(map(_nuevo_nombre, resultado.columns))
    return resultado

In [102]:

def creacion_df_general_estadisticas_equipos(urls: dict, stats_vs: bool = False, guardar_csv: bool = False,
                                             guardar_csv_individuales: bool = False,
                                             league: str = 'La Liga', season: str = '2024'):
    """
    Build one combined DataFrame of team statistics (goalkeeper tables excluded).

    For every statistic type with a URL in ``urls`` the per-statistic table is
    fetched, its team column is normalised to ``'squad'`` and all tables are
    outer-merged on ``'squad'``.

    Args:
        urls (dict): Maps a statistic type ('stats', 'shooting', 'passing',
            'passingtypes', 'gca', 'defensive', 'possession', 'playingtime',
            'misc') to its URL. Missing or empty entries are skipped.
        stats_vs (bool): If True, also build the 'versus' (opponent) DataFrame.
        guardar_csv (bool): If True, write the merged result(s) to CSV in the
            current working directory.
        guardar_csv_individuales (bool): Forwarded to the per-statistic fetcher
            as its ``guardar_csv`` argument.
        league (str): League name; also used (lowercased, spaces replaced by
            underscores) in the CSV file name.
        season (str): Season label; also used in the CSV file name.

    Returns:
        tuple: ``(df_merged, df_merged_vs)``. ``df_merged_vs`` is ``None`` when
        ``stats_vs`` is False. When ``urls`` has no usable entry, the merged
        frames are empty DataFrames instead of raising.
    """
    # Imported locally so the function does not rely on another notebook cell
    # having brought ``reduce`` into scope.
    from functools import reduce

    tipos_estadisticas = [
        'stats', 'shooting', 'passing', 'passingtypes',
        'gca', 'defensive', 'possession', 'playingtime', 'misc'
    ]

    def procesar_dfs(stats_vs_flag):
        """Fetch every available statistic table, with the team column named 'squad'."""
        dataframes = []
        for stat in tipos_estadisticas:
            url = urls.get(stat)
            if not url:
                continue
            # obtener_tabla_equipos_estadistica_unica is defined elsewhere in this file
            df = obtener_tabla_equipos_estadistica_unica(
                url_general=url,
                stats_vs=stats_vs_flag,
                guardar_csv=guardar_csv_individuales,
                league=league,
                season=season
            )
            dataframes.append(renombrar_columna_squad(df))
            # Pause between requests (presumably to stay under FBref's rate limit)
            time.sleep(5)
        return dataframes

    def combinar(dataframes):
        """Outer-merge the tables on 'squad'; an empty list yields an empty DataFrame."""
        if not dataframes:
            # reduce() without an initial value raises TypeError on an empty list
            return pd.DataFrame()
        return reduce(lambda left, right: pd.merge(left, right, on='squad', how='outer'), dataframes)

    def guardar(df, nombre_archivo):
        """Write ``df`` to CSV unless it has no columns at all (nothing was fetched)."""
        if not df.columns.empty:
            df.to_csv(nombre_archivo, index=False)

    # Standard (own-team) statistics
    df_merged = combinar(procesar_dfs(stats_vs_flag=False))

    # "vs" (opponent) statistics, only when requested
    df_merged_vs = None
    if stats_vs:
        df_merged_vs = combinar(procesar_dfs(stats_vs_flag=True))

    # Final CSV export
    if guardar_csv:
        league_clean = league.lower().replace(' ', '_')
        guardar(df_merged, f'estadisticas_equipos_{league_clean}_{season}.csv')
        if stats_vs and df_merged_vs is not None:
            guardar(df_merged_vs, f'estadisticas_equipos_vs_{league_clean}_{season}.csv')

    return df_merged, df_merged_vs

In [93]:
# League and season whose team-statistics URLs are used by the cells below.
liga = 'La Liga'
temporada = '2024-2025'
# teams_urls is defined outside this section; it is indexed by league name, then season.
urls_equipos = teams_urls[liga][temporada]
urls_equipos

{'stats': 'https://fbref.com/en/comps/12/stats/La-Liga-Stats',
 'shooting': 'https://fbref.com/en/comps/12/shooting/La-Liga-Stats',
 'passing': 'https://fbref.com/en/comps/12/passing/La-Liga-Stats',
 'passingtypes': 'https://fbref.com/en/comps/12/passing_types/La-Liga-Stats',
 'gca': 'https://fbref.com/en/comps/12/gca/La-Liga-Stats',
 'defensive': 'https://fbref.com/en/comps/12/defense/La-Liga-Stats',
 'possession': 'https://fbref.com/en/comps/12/possession/La-Liga-Stats',
 'playingtime': 'https://fbref.com/en/comps/12/playingtime/La-Liga-Stats',
 'misc': 'https://fbref.com/en/comps/12/misc/La-Liga-Stats',
 'keepers': 'https://fbref.com/en/comps/12/keepers/La-Liga-Stats',
 'keepersadv': 'https://fbref.com/en/comps/12/keepersadv/La-Liga-Stats'}

In [103]:
# Build the combined team DataFrame and its 'vs' counterpart; guardar_csv=True also writes both to CSV.
df_equipos, df_equiposvs= creacion_df_general_estadisticas_equipos(urls_equipos, stats_vs=True, guardar_csv=True,guardar_csv_individuales=False, league='La Liga', season='2024')

ESTADÍSTICAS DE EQUIPOS PARA PORTEROS (FUNCIONA) -- CORRECTO Y ACABADO

In [110]:
def creacion_df_general_estadisticas_equipos_porteros(urls: dict, stats_vs: bool = False, guardar_csv: bool = False,  guardar_csv_individuales: bool = False,
            league: str = 'La Liga',  season: str = '2024') -> tuple[pd.DataFrame, pd.DataFrame | None]:
    """
    Crea dataframes combinados con estadísticas de equipos para porteros y porteros avanzados.

    Args:
        urls (dict): Diccionario con claves 'keepers' y 'keepersadv' y sus respectivas URLs.
        stats_vs (bool): Si es True, incluye estadísticas 'versus' (comparativas).
        guardar_csv (bool): Si es True, guarda el resultado como CSV.
        league (str): Nombre de la liga.
        season (str): Temporada.

    Returns:
        tuple: (df_merged, df_merged_vs) si stats_vs=True, si no (df_merged, None)
    """
    tipos_estadisticas = [
        'keepers', 'keepersadv'
    ]
    
    def procesar_dfs(stats_vs_flag):
        dataframes = []
        for stat in tipos_estadisticas:
            url = urls.get(stat)
            if url:
                df = obtener_tabla_equipos_estadistica_unica(
                    url_general=url,
                    stats_vs=stats_vs_flag,
                    guardar_csv=guardar_csv_individuales,
                    league=league,
                    season=season
                )
                df_final = renombrar_columna_squad(df)
                dataframes.append(df_final)
                time.sleep(5)
        return dataframes

    # Procesar datos estándar
    dataframes = procesar_dfs(stats_vs_flag=False)
    df_merged = reduce(lambda left, right: pd.merge(left, right, on='squad', how='outer'), dataframes)

    # Procesar datos "vs" si aplica
    df_merged_vs = None
    if stats_vs:
        dataframes_vs = procesar_dfs(stats_vs_flag=True)
        df_merged_vs = reduce(lambda left, right: pd.merge(left, right, on='squad', how='outer'), dataframes_vs)


    # Guardado CSV final
    if guardar_csv:
        league_clean = league.lower().replace(' ', '_')
        df_merged.to_csv(f'estadisticas_equipos_keepers_{league_clean}_{season}.csv', index=False)
        if stats_vs and df_merged_vs is not None:
            df_merged_vs.to_csv(f'estadisticas_equipos_keepers_vs_{league_clean}_{season}.csv', index=False)

    return df_merged, df_merged_vs
    


In [117]:
# Build the combined goalkeeping DataFrame and its 'vs' counterpart; guardar_csv=True also writes both to CSV.
df_porteros, df_porteros_vs= creacion_df_general_estadisticas_equipos_porteros(urls_equipos, stats_vs=True, guardar_csv=True, guardar_csv_individuales= False, league='La Liga', season='2024')

OBTENER TABLA CLASIFICATORIA DE LA LIGA -- CORRECTO Y YA ACABADO

In [154]:
def obtener_tabla_liga_principal(url_general: str) -> pd.DataFrame:
    """
    Read the first HTML table of a league page and tidy it as a standings table.

    Args:
        url_general (str): URL of the league page (English version of FBref).

    Returns:
        pd.DataFrame: The first table found on the page, with the 'Rk' column
        renamed to 'Position' and the 'Notes' column removed when present.
        If anything fails (download, parsing, no tables), the error is printed
        and an empty DataFrame is returned.
    """
    try:
        clasificacion = pd.read_html(url_general)[0].rename(columns={'Rk': 'Position'})
        if 'Notes' not in clasificacion.columns:
            return clasificacion
        return clasificacion.drop(columns=['Notes'])
    except Exception as exc:
        # Best-effort helper: report the failure and hand back an empty frame
        print(f"[ERROR] No se pudo obtener la tabla principal de liga desde {url_general}: {exc}")
        return pd.DataFrame()


In [155]:
# Example: standings table for the Bundesliga page (output shown below).
obtener_tabla_liga_principal('https://fbref.com/en/comps/20/Bundesliga-Stats')

Unnamed: 0,Position,Squad,MP,W,D,L,GF,GA,GD,Pts,Pts/MP,xG,xGA,xGD,xGD/90,Attendance,Top Team Scorer,Goalkeeper
0,1,Bayern Munich,34,25,7,2,99,32,67,82,2.41,81.7,25.5,56.2,1.65,75000,Harry Kane - 26,Manuel Neuer
1,2,Leverkusen,34,19,12,3,72,43,29,69,2.03,56.9,36.2,20.6,0.61,29961,Patrik Schick - 21,Lukáš Hrádecký
2,3,Eint Frankfurt,34,17,9,8,68,46,22,60,1.76,65.1,48.0,17.1,0.5,57600,"Omar Marmoush, Hugo Ekitike - 15",Kevin Trapp
3,4,Dortmund,34,17,6,11,71,51,20,57,1.68,61.2,42.8,18.4,0.54,81365,Serhou Guirassy - 21,Gregor Kobel
4,5,Freiburg,34,16,7,11,49,53,-4,55,1.62,44.3,42.4,1.8,0.05,34188,Ritsu Doan - 10,Noah Atubolu
5,6,Mainz 05,34,14,10,10,55,43,12,52,1.53,50.1,48.1,2.0,0.06,32354,Jonathan Burkardt - 18,Robin Zentner
6,7,RB Leipzig,34,13,12,9,53,48,5,51,1.5,46.6,53.3,-6.8,-0.2,44803,Benjamin Šeško - 13,Péter Gulácsi
7,8,Werder Bremen,34,14,9,11,54,57,-3,51,1.5,49.1,48.1,1.0,0.03,41350,Jens Stage - 10,Michael Zetterer
8,9,Stuttgart,34,14,8,12,64,53,11,50,1.47,62.3,46.9,15.4,0.45,59265,Ermedin Demirović - 15,Alexander Nübel
9,10,Gladbach,34,13,6,15,55,57,-2,45,1.32,50.6,63.4,-12.7,-0.37,53056,Tim Kleindienst - 16,Moritz Nicolas


DATOS DE UN JUGADOR CONCRETO - INFORME DE RECLUTAMIENTO

In [156]:
# Scouting-report URL of the example player used by the following cells.
url_jugador= 'https://fbref.com/en/players/a755db8c/scout/365_m1/Rodrigo-Muniz-Scouting-Report'
url_jugador

'https://fbref.com/en/players/a755db8c/scout/365_m1/Rodrigo-Muniz-Scouting-Report'

In [162]:
def obtener_jugadores_similares(url_jugador: str)-> pd.DataFrame:
    """
    Return the similar-players table from a player's FBref scouting report.

    Args:
        url_jugador (str): URL of the player's scouting report on FBref.

    Returns:
        pd.DataFrame: The second table of the page without the 'Rk' and
        'Compare' columns. In the first column whose name contains 'Nation',
        each value is reduced to its trailing run of uppercase letters
        (the country code).
    """
    # Index 1 is assumed to be the similar-players table among the tables on the page
    similares = pd.read_html(url_jugador)[1].drop(columns=['Rk', 'Compare'])

    # Only the first nationality-like column is normalised
    columnas_pais = [nombre for nombre in similares.columns if 'Nation' in nombre]
    for nombre in columnas_pais[:1]:
        como_texto = similares[nombre].astype(str)
        similares[nombre] = como_texto.str.extract(r'([A-Z]+)$')

    return similares
    

In [158]:
# Example: similar players for the scouting report in url_jugador (output shown below).
obtener_jugadores_similares(url_jugador)

Unnamed: 0,Player,Nation,Squad
0,Beto,GNB,Everton
1,Alexander Sørloth,NOR,Atlético Madrid
2,Tim Kleindienst,GER,Mönchengladbach
3,Yoane Wissa,COD,Brentford
4,Phil Harres,GER,Holstein Kiel
5,Cristhian Stuani,URU,Girona
6,Raúl Jiménez,MEX,Fulham
7,Ante Budimir,CRO,Osasuna
8,Gorka Guruzeta,ESP,Athletic Club
9,Erling Haaland,NOR,Manchester City


In [163]:
def obtener_tabla_datos_jugador_por90_percentiles(url_jugador: str)-> pd.DataFrame:
    """
    Return the cleaned 'Per 90' / 'Percentile' table of a player's scouting report.

    Args:
        url_jugador (str): URL (English version) of the player's scouting
            report on FBref.

    Returns:
        pd.DataFrame: The third table of the page, restricted to rows without
        missing values whose 'Per 90' entry contains no letters, with
        'Per 90' (percent sign removed) and 'Percentile' converted to numbers
        and the index reset.
    """
    # Read every HTML table on the scouting-report page
    tablas = pd.read_html(url_jugador)

    # Index 2 is assumed to be the 'Per 90' percentile table
    tabla = tablas[2]

    # Drop the top header level only when the columns really are a MultiIndex;
    # droplevel() raises on a flat Index
    if isinstance(tabla.columns, pd.MultiIndex):
        tabla.columns = tabla.columns.droplevel(0)

    # Drop rows that contain any missing value (this covers the blank separator rows)
    tabla = tabla.dropna()

    # Keep only rows whose 'Per 90' value has no letters (repeated header rows
    # are text). copy() makes the result an independent frame so the column
    # assignments below do not write into a slice of the parsed table.
    es_texto = tabla['Per 90'].str.contains(r'[a-zA-Z]', na=False)
    tabla = tabla[~es_texto].copy()

    # Strip the literal '%' sign and convert 'Per 90' to numeric
    tabla['Per 90'] = pd.to_numeric(
        tabla['Per 90'].str.replace('%', '', regex=False), errors='coerce'
    )

    # Convert 'Percentile' to numeric
    tabla['Percentile'] = pd.to_numeric(tabla['Percentile'], errors='coerce')

    return tabla.reset_index(drop=True)

In [161]:
# Example: per-90 values and percentiles for the scouting report in url_jugador (output shown below).
obtener_tabla_datos_jugador_por90_percentiles(url_jugador)

Unnamed: 0,Statistic,Per 90,Percentile
0,Goals,0.75,87
1,Assists,0.09,37
2,Goals + Assists,0.84,86
3,Non-Penalty Goals,0.75,95
4,Penalty Kicks Made,0.00,21
...,...,...,...
130,Own Goals,0.00,52
131,Ball Recoveries,1.31,8
132,Aerials Won,6.26,98
133,Aerials Lost,5.88,2


TABLA DE TIROS DE UN PARTIDO -- CORRECTO Y YA ACABADO

In [128]:
# Match-report URL of the example match used by the following cells.
url_partido = 'https://fbref.com/en/matches/cc5b4244/Manchester-United-Fulham-August-16-2024-Premier-League'


In [None]:
def obtener_tabla_tiros_partido( url_partido: str,   tiros_por_equipo: bool = False) -> tuple[pd.DataFrame, pd.DataFrame | None, pd.DataFrame | None]:
    """
    Extrae las tablas de tiros de un partido de fútbol desde la URL de FBref.

    Args:
        url_partido (str): URL de la página del partido en FBref.
        tiros_por_equipo (bool): Si True, devuelve también tablas individuales de tiros por equipo.

    Return:
        tuple:
            - pd.DataFrame: Tabla de tiros general (ambos equipos).
            - pd.DataFrame | None: Tabla tiros equipo local (si se solicita).
            - pd.DataFrame | None: Tabla tiros equipo visitante (si se solicita).
    """
    tablas = pd.read_html(url_partido)

    # La tabla general está en el índice 17 (puede cambiar si FBref actualiza la página)
    tabla_tiros = tablas[17]
    tabla_tiros.columns = tabla_tiros.columns.droplevel(0)
  

    tabla_local = None
    tabla_visitante = None

    if tiros_por_equipo:
        tabla_local = tablas[18]
        tabla_local.columns = tabla_local.columns.droplevel(0)

        tabla_visitante = tablas[19]
        tabla_visitante.columns = tabla_visitante.columns.droplevel(0)

    return tabla_tiros, tabla_local, tabla_visitante

In [None]:
# Fetch the overall shot table plus the home and away shot tables for url_partido.
tabla_tiros_completo,tabla_tiros_local, tabla_tiros_visitante= obtener_tabla_tiros_partido(url_partido, tiros_por_equipo= True)

RESUMEN DE ESTADÍSTICAS GENERALES POR EQUIPO DE UN PARTIDO -- CORRECTO Y YA ACABADO

In [149]:
def limpiar_df_estadisticas_partido(df: pd.DataFrame) -> pd.DataFrame:
    """
    Normalise the nationality and age columns of a match-statistics DataFrame.

    - Nationality: in the first column whose name contains 'nation'
      (case-insensitive), each value is reduced to its trailing run of
      uppercase letters (e.g. 'es ESP' -> 'ESP').
    - Age: in the first column whose name contains 'age' (case-insensitive),
      each value is reduced to the text before the first hyphen
      (e.g. '25-123' -> '25'). The result stays a string.

    Columns are matched by substring, so the first match wins. Column labels
    that are not strings (ints, tuples) are compared through ``str()``.

    Args:
        df (pd.DataFrame): Match statistics. It is modified in place.

    Return:
        pd.DataFrame: The same ``df`` object, after normalisation.
    """
    def _primera_columna(fragmento):
        """Return the first column label containing ``fragmento``, or None."""
        return next(
            (col for col in df.columns if fragmento in str(col).lower()),
            None,
        )

    # Resolve both targets up front, from the untouched column labels
    col_nacion = _primera_columna('nation')
    col_edad = _primera_columna('age')

    if col_nacion is not None:
        # expand=False yields a Series, which is what a single column expects
        df[col_nacion] = df[col_nacion].astype(str).str.extract(r'([A-Z]+)$', expand=False)

    if col_edad is not None:
        df[col_edad] = df[col_edad].astype(str).str.split('-').str[0]

    return df

In [148]:
def bajada_nivel_porteros(df: pd.DataFrame) -> pd.DataFrame:
    """
    Flatten and normalise the column labels of a goalkeeper DataFrame in place.

    Each label is stripped, lowercased and has its spaces replaced by
    underscores. For MultiIndex columns the new name is
    ``<second level>_<first level>``; for flat columns it is the normalised
    label itself.

    Args:
        df (pd.DataFrame): DataFrame with MultiIndex or flat columns.

    Return:
        pd.DataFrame: The same ``df`` object with single-level column names.
    """
    def _normalizar(etiqueta):
        return str(etiqueta).strip().replace(' ', '_').lower()

    if not isinstance(df.columns, pd.MultiIndex):
        df.columns = [_normalizar(etiqueta) for etiqueta in df.columns]
        return df

    # Stat name (second level) goes first, followed by its group header (first level)
    df.columns = [
        f"{_normalizar(niveles[1])}_{_normalizar(niveles[0])}"
        for niveles in df.columns
    ]
    return df

In [None]:
def obtener_tabla_estadisticas_principales_partido(url_partido: str,    keepers: bool = False
        ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame | None, pd.DataFrame | None]:
    """
    Extrae y limpia las estadísticas principales de un partido de fútbol desde FBref.

    Args:
        url_partido (str): URL de la página del partido en FBref.
        keepers (bool): Extraer estadísticas de porteros si True.

    Return:
        tuple:
            - estadisticas_local (pd.DataFrame): Estadísticas equipo local.
            - estadisticas_visitante (pd.DataFrame): Estadísticas equipo visitante.
            - keeper_local (pd.DataFrame | None): Estadísticas portero local.
            - keeper_visitante (pd.DataFrame | None): Estadísticas portero visitante.
    """
    tablas = pd.read_html(url_partido)

    # Estadísticas equipo local
    estad_local = tablas[3]
    estad_local.columns = estad_local.columns.droplevel(0)
    estad_local = estad_local.iloc[:-1, :].copy()
    estad_local = limpiar_df_estadisticas_partido(estad_local)

    # Estadísticas equipo visitante
    estad_visit = tablas[10]
    estad_visit.columns = estad_visit.columns.droplevel(0)
    estad_visit = estad_visit.iloc[:-1, :].copy()
    estad_visit = limpiar_df_estadisticas_partido(estad_visit)

    keeper_local = None
    keeper_visitante = None

    if keepers:
        keeper_local_raw = tablas[9]
        keeper_local = bajada_nivel_porteros(keeper_local_raw)
        keeper_local = limpiar_df_estadisticas_partido(keeper_local)

        keeper_visitante_raw = tablas[16]
        keeper_visitante = bajada_nivel_porteros(keeper_visitante_raw)
        keeper_visitante = limpiar_df_estadisticas_partido(keeper_visitante)

    return estad_local, estad_visit, keeper_local, keeper_visitante

In [151]:
# Fetch home/away player statistics and both goalkeeper tables for url_partido.
# NOTE(review): `kepper_visitante` looks like a typo for `keeper_visitante`; left unchanged because later cells may reference it.
estadisticas_local, estadisticas_visitante, keeper_local, kepper_visitante= obtener_tabla_estadisticas_principales_partido(url_partido, keepers= True)
