In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time
import os
import pickle
import pandas as pd

In [2]:
# Launch a Chrome session driven by Selenium.
driver = webdriver.Chrome()

# Premier League statistics page on fbref (this URL carries no season segment).
url = "https://fbref.com/es/comps/9/Estadisticas-de-Premier-League"

try:
    driver.get(url)

    # Fixed 5-second pause to let the page finish loading before its HTML is read.
    time.sleep(5)

    # Snapshot of the page's HTML as the browser currently holds it.
    html = driver.page_source
finally:
    # Close the browser even when navigation or the read fails, so a failed
    # run does not leave a Chrome process behind.
    driver.quit()

# Parse the captured HTML with BeautifulSoup.
soup = BeautifulSoup(html, 'html.parser')

def extract_team_data(page=None):
    """Collect one record per table row from a parsed fbref league page.

    Args:
        page: Parsed document to scan (any object exposing a BeautifulSoup-style
            ``find_all``). When omitted, the module-level ``soup`` is used,
            which is how the function was always called.

    Returns:
        list[list]: One list per ``<tr>`` carrying a ``data-row`` attribute,
        laid out as ``[rank, team, *home_stats, *away_stats]``. Values are the
        stripped cell texts. A missing rank or team cell becomes ``None``; a
        row without any ``home_*`` (or ``away_*``) cells contributes 13
        ``None`` placeholders for that venue.
    """
    # Width of each venue group in the DataFrame header (Games ... xG Diff
    # Per 90). The placeholder used to be 10 wide, which left incomplete rows
    # shorter than, and misaligned with, the 13-column groups.
    stats_per_venue = 13

    def with_prefix(prefix):
        # Matcher for data-stat values starting with `prefix`; tags without
        # the attribute are passed as None and must not match.
        return lambda value: value and value.startswith(prefix)

    def venue_stats(row, prefix):
        # Search the row once and reuse the result for both the emptiness
        # check and the text extraction.
        cells = row.find_all('td', {'data-stat': with_prefix(prefix)})
        if not cells:
            return [None] * stats_per_venue
        return [cell.text.strip() for cell in cells]

    document = soup if page is None else page

    data = []
    for row in document.find_all('tr', {'data-row': True}):
        rank_cell = row.find('th', {'data-stat': 'rank'})
        team_cell = row.find('td', {'data-stat': 'team'})
        data.append(
            [
                rank_cell.text.strip() if rank_cell else None,
                team_cell.text.strip() if team_cell else None,
            ]
            + venue_stats(row, 'home_')
            + venue_stats(row, 'away_')
        )

    return data

# DataFrame header: rank and team first, then the same thirteen statistics
# for the home venue followed by the away venue.
columns = ['Rank', 'Team'] + [
    venue + ' ' + label
    for venue in ('Home', 'Away')
    for label in (
        'Games', 'Wins', 'Ties', 'Losses',
        'Goals For', 'Goals Against', 'Goal Diff', 'Points',
        'Points Avg', 'xG For', 'xG Against', 'xG Diff',
        'xG Diff Per 90',
    )
]

# Scrape the rows, load them into a DataFrame and keep only the rows that
# have a value in every column.
team_data = extract_team_data()
Estadisticas_temparada_2024_2025 = pd.DataFrame(team_data, columns=columns).dropna()

In [3]:
# Tag every row with its competition and season ("2024/2025").
# NOTE(review): 'Premier leaqgue' looks like a typo for 'Premier League'. It is
# kept verbatim because the same literal labels every season in this notebook;
# confirm and correct all occurrences together.
Estadisticas_temparada_2024_2025 = Estadisticas_temparada_2024_2025.assign(
    Competicion='Premier leaqgue',
    Temporada="2024/2025",
)

In [4]:
# Preview the first two rows of the season table.
Estadisticas_temparada_2024_2025.head(n=2)

Unnamed: 0,Rank,Team,Home Games,Home Wins,Home Ties,Home Losses,Home Goals For,Home Goals Against,Home Goal Diff,Home Points,...,Away Goals Against,Away Goal Diff,Away Points,Away Points Avg,Away xG For,Away xG Against,Away xG Diff,Away xG Diff Per 90,Competicion,Temporada
20,1,Manchester City,1,1,0,0,4,1,3,3,...,1,4,6,3.0,3.8,1.7,2.1,1.04,Premier leaqgue,2024/2025
21,2,Liverpool,1,1,0,0,2,0,2,3,...,0,5,6,3.0,4.3,1.8,2.5,1.27,Premier leaqgue,2024/2025


In [5]:
# Launch a Chrome session driven by Selenium.
driver = webdriver.Chrome()

# fbref statistics page for the 2023-2024 Premier League season.
url = "https://fbref.com/es/comps/9/2023-2024/Estadisticas-2023-2024-Premier-League"

try:
    driver.get(url)

    # Fixed 5-second pause to let the page finish loading before its HTML is read.
    time.sleep(5)

    # Snapshot of the page's HTML as the browser currently holds it.
    html = driver.page_source
finally:
    # Close the browser even when navigation or the read fails, so a failed
    # run does not leave a Chrome process behind.
    driver.quit()

# Parse the captured HTML with BeautifulSoup.
soup = BeautifulSoup(html, 'html.parser')

def extract_team_data(page=None):
    """Collect one record per table row from a parsed fbref league page.

    Args:
        page: Parsed document to scan (any object exposing a BeautifulSoup-style
            ``find_all``). When omitted, the module-level ``soup`` is used,
            which is how the function was always called.

    Returns:
        list[list]: One list per ``<tr>`` carrying a ``data-row`` attribute,
        laid out as ``[rank, team, *home_stats, *away_stats]``. Values are the
        stripped cell texts. A missing rank or team cell becomes ``None``; a
        row without any ``home_*`` (or ``away_*``) cells contributes 13
        ``None`` placeholders for that venue.
    """
    # Width of each venue group in the DataFrame header (Games ... xG Diff
    # Per 90). The placeholder used to be 10 wide, which left incomplete rows
    # shorter than, and misaligned with, the 13-column groups.
    stats_per_venue = 13

    def with_prefix(prefix):
        # Matcher for data-stat values starting with `prefix`; tags without
        # the attribute are passed as None and must not match.
        return lambda value: value and value.startswith(prefix)

    def venue_stats(row, prefix):
        # Search the row once and reuse the result for both the emptiness
        # check and the text extraction.
        cells = row.find_all('td', {'data-stat': with_prefix(prefix)})
        if not cells:
            return [None] * stats_per_venue
        return [cell.text.strip() for cell in cells]

    document = soup if page is None else page

    data = []
    for row in document.find_all('tr', {'data-row': True}):
        rank_cell = row.find('th', {'data-stat': 'rank'})
        team_cell = row.find('td', {'data-stat': 'team'})
        data.append(
            [
                rank_cell.text.strip() if rank_cell else None,
                team_cell.text.strip() if team_cell else None,
            ]
            + venue_stats(row, 'home_')
            + venue_stats(row, 'away_')
        )

    return data

# DataFrame header: rank and team first, then the same thirteen statistics
# for the home venue followed by the away venue.
columns = ['Rank', 'Team'] + [
    venue + ' ' + label
    for venue in ('Home', 'Away')
    for label in (
        'Games', 'Wins', 'Ties', 'Losses',
        'Goals For', 'Goals Against', 'Goal Diff', 'Points',
        'Points Avg', 'xG For', 'xG Against', 'xG Diff',
        'xG Diff Per 90',
    )
]

# Scrape the rows, load them into a DataFrame and keep only the rows that
# have a value in every column.
team_data = extract_team_data()
Estadisticas_temparada_2023_2024 = pd.DataFrame(team_data, columns=columns).dropna()

In [6]:
# Tag every row with its competition and season ("2023/2024").
# NOTE(review): 'Premier leaqgue' looks like a typo for 'Premier League'. It is
# kept verbatim because the same literal labels every season in this notebook;
# confirm and correct all occurrences together.
Estadisticas_temparada_2023_2024 = Estadisticas_temparada_2023_2024.assign(
    Competicion='Premier leaqgue',
    Temporada="2023/2024",
)

In [7]:
# Preview the first two rows of the filtered season table.
Estadisticas_temparada_2023_2024.head(n=2)

Unnamed: 0,Rank,Team,Home Games,Home Wins,Home Ties,Home Losses,Home Goals For,Home Goals Against,Home Goal Diff,Home Points,...,Away Goals Against,Away Goal Diff,Away Points,Away Points Avg,Away xG For,Away xG Against,Away xG Diff,Away xG Diff Per 90,Competicion,Temporada
20,1,Manchester City,19,14,5,0,51,16,35,47,...,18,27,44,2.32,39.8,21.6,18.2,0.96,Premier leaqgue,2023/2024
21,2,Arsenal,19,15,2,2,48,16,32,47,...,13,30,42,2.21,32.6,14.5,18.2,0.96,Premier leaqgue,2023/2024


In [8]:
# Launch a Chrome session driven by Selenium.
driver = webdriver.Chrome()

# fbref statistics page for the 2022-2023 Premier League season.
url = "https://fbref.com/es/comps/9/2022-2023/Estadisticas-2022-2023-Premier-League"

try:
    driver.get(url)

    # Fixed 5-second pause to let the page finish loading before its HTML is read.
    time.sleep(5)

    # Snapshot of the page's HTML as the browser currently holds it.
    html = driver.page_source
finally:
    # Close the browser even when navigation or the read fails, so a failed
    # run does not leave a Chrome process behind.
    driver.quit()

# Parse the captured HTML with BeautifulSoup.
soup = BeautifulSoup(html, 'html.parser')

def extract_team_data(page=None):
    """Collect one record per table row from a parsed fbref league page.

    Args:
        page: Parsed document to scan (any object exposing a BeautifulSoup-style
            ``find_all``). When omitted, the module-level ``soup`` is used,
            which is how the function was always called.

    Returns:
        list[list]: One list per ``<tr>`` carrying a ``data-row`` attribute,
        laid out as ``[rank, team, *home_stats, *away_stats]``. Values are the
        stripped cell texts. A missing rank or team cell becomes ``None``; a
        row without any ``home_*`` (or ``away_*``) cells contributes 13
        ``None`` placeholders for that venue.
    """
    # Width of each venue group in the DataFrame header (Games ... xG Diff
    # Per 90). The placeholder used to be 10 wide, which left incomplete rows
    # shorter than, and misaligned with, the 13-column groups.
    stats_per_venue = 13

    def with_prefix(prefix):
        # Matcher for data-stat values starting with `prefix`; tags without
        # the attribute are passed as None and must not match.
        return lambda value: value and value.startswith(prefix)

    def venue_stats(row, prefix):
        # Search the row once and reuse the result for both the emptiness
        # check and the text extraction.
        cells = row.find_all('td', {'data-stat': with_prefix(prefix)})
        if not cells:
            return [None] * stats_per_venue
        return [cell.text.strip() for cell in cells]

    document = soup if page is None else page

    data = []
    for row in document.find_all('tr', {'data-row': True}):
        rank_cell = row.find('th', {'data-stat': 'rank'})
        team_cell = row.find('td', {'data-stat': 'team'})
        data.append(
            [
                rank_cell.text.strip() if rank_cell else None,
                team_cell.text.strip() if team_cell else None,
            ]
            + venue_stats(row, 'home_')
            + venue_stats(row, 'away_')
        )

    return data

# DataFrame header: rank and team first, then the same thirteen statistics
# for the home venue followed by the away venue.
columns = ['Rank', 'Team'] + [
    venue + ' ' + label
    for venue in ('Home', 'Away')
    for label in (
        'Games', 'Wins', 'Ties', 'Losses',
        'Goals For', 'Goals Against', 'Goal Diff', 'Points',
        'Points Avg', 'xG For', 'xG Against', 'xG Diff',
        'xG Diff Per 90',
    )
]

# Scrape the rows, load them into a DataFrame and keep only the rows that
# have a value in every column.
team_data = extract_team_data()
Estadisticas_temparada_2022_2023 = pd.DataFrame(team_data, columns=columns).dropna()

In [9]:
# Tag every row with its competition and season ("2022/2023").
# NOTE(review): 'Premier leaqgue' looks like a typo for 'Premier League'. It is
# kept verbatim because the same literal labels every season in this notebook;
# confirm and correct all occurrences together.
Estadisticas_temparada_2022_2023 = Estadisticas_temparada_2022_2023.assign(
    Competicion='Premier leaqgue',
    Temporada="2022/2023",
)

In [10]:
# Preview the first two rows of the filtered season table.
Estadisticas_temparada_2022_2023.head(n=2)

Unnamed: 0,Rank,Team,Home Games,Home Wins,Home Ties,Home Losses,Home Goals For,Home Goals Against,Home Goal Diff,Home Points,...,Away Goals Against,Away Goal Diff,Away Points,Away Points Avg,Away xG For,Away xG Against,Away xG Diff,Away xG Diff Per 90,Competicion,Temporada
20,1,Manchester City,19,17,1,1,60,17,43,52,...,16,18,37,1.95,35.9,19.3,16.6,0.87,Premier leaqgue,2022/2023
21,2,Arsenal,19,14,3,2,53,25,28,45,...,18,17,39,2.05,29.1,22.7,6.4,0.34,Premier leaqgue,2022/2023


In [12]:
# Launch a Chrome session driven by Selenium.
driver = webdriver.Chrome()

# fbref statistics page for the 2021-2022 Premier League season.
url = "https://fbref.com/es/comps/9/2021-2022/Estadisticas-2021-2022-Premier-League"

try:
    driver.get(url)

    # Fixed 5-second pause to let the page finish loading before its HTML is read.
    time.sleep(5)

    # Snapshot of the page's HTML as the browser currently holds it.
    html = driver.page_source
finally:
    # Close the browser even when navigation or the read fails, so a failed
    # run does not leave a Chrome process behind.
    driver.quit()

# Parse the captured HTML with BeautifulSoup.
soup = BeautifulSoup(html, 'html.parser')

def extract_team_data(page=None):
    """Collect one record per table row from a parsed fbref league page.

    Args:
        page: Parsed document to scan (any object exposing a BeautifulSoup-style
            ``find_all``). When omitted, the module-level ``soup`` is used,
            which is how the function was always called.

    Returns:
        list[list]: One list per ``<tr>`` carrying a ``data-row`` attribute,
        laid out as ``[rank, team, *home_stats, *away_stats]``. Values are the
        stripped cell texts. A missing rank or team cell becomes ``None``; a
        row without any ``home_*`` (or ``away_*``) cells contributes 13
        ``None`` placeholders for that venue.
    """
    # Width of each venue group in the DataFrame header (Games ... xG Diff
    # Per 90). The placeholder used to be 10 wide, which left incomplete rows
    # shorter than, and misaligned with, the 13-column groups.
    stats_per_venue = 13

    def with_prefix(prefix):
        # Matcher for data-stat values starting with `prefix`; tags without
        # the attribute are passed as None and must not match.
        return lambda value: value and value.startswith(prefix)

    def venue_stats(row, prefix):
        # Search the row once and reuse the result for both the emptiness
        # check and the text extraction.
        cells = row.find_all('td', {'data-stat': with_prefix(prefix)})
        if not cells:
            return [None] * stats_per_venue
        return [cell.text.strip() for cell in cells]

    document = soup if page is None else page

    data = []
    for row in document.find_all('tr', {'data-row': True}):
        rank_cell = row.find('th', {'data-stat': 'rank'})
        team_cell = row.find('td', {'data-stat': 'team'})
        data.append(
            [
                rank_cell.text.strip() if rank_cell else None,
                team_cell.text.strip() if team_cell else None,
            ]
            + venue_stats(row, 'home_')
            + venue_stats(row, 'away_')
        )

    return data

# DataFrame header: rank and team first, then the same thirteen statistics
# for the home venue followed by the away venue.
columns = ['Rank', 'Team'] + [
    venue + ' ' + label
    for venue in ('Home', 'Away')
    for label in (
        'Games', 'Wins', 'Ties', 'Losses',
        'Goals For', 'Goals Against', 'Goal Diff', 'Points',
        'Points Avg', 'xG For', 'xG Against', 'xG Diff',
        'xG Diff Per 90',
    )
]

# Scrape the rows, load them into a DataFrame and keep only the rows that
# have a value in every column.
team_data = extract_team_data()
Estadisticas_temparada_2021_2022 = pd.DataFrame(team_data, columns=columns).dropna()

In [13]:
# Tag every row with its competition and season ("2021/2022").
# NOTE(review): 'Premier leaqgue' looks like a typo for 'Premier League'. It is
# kept verbatim because the same literal labels every season in this notebook;
# confirm and correct all occurrences together.
Estadisticas_temparada_2021_2022 = Estadisticas_temparada_2021_2022.assign(
    Competicion='Premier leaqgue',
    Temporada="2021/2022",
)

In [14]:
# Preview the first two rows of the season table.
Estadisticas_temparada_2021_2022.head(n=2)

Unnamed: 0,Rank,Team,Home Games,Home Wins,Home Ties,Home Losses,Home Goals For,Home Goals Against,Home Goal Diff,Home Points,...,Away Goals Against,Away Goal Diff,Away Points,Away Points Avg,Away xG For,Away xG Against,Away xG Diff,Away xG Diff Per 90,Competicion,Temporada
20,1,Manchester City,19,15,2,2,58,15,43,47,...,11,30,46,2.42,43.1,13.2,29.8,1.57,Premier leaqgue,2021/2022
21,2,Liverpool,19,15,4,0,49,9,40,49,...,17,28,43,2.26,39.8,20.9,18.8,0.99,Premier leaqgue,2021/2022


In [15]:
# Launch a Chrome session driven by Selenium.
driver = webdriver.Chrome()

# fbref statistics page for the 2020-2021 Premier League season.
url = "https://fbref.com/es/comps/9/2020-2021/Estadisticas-2020-2021-Premier-League"

try:
    driver.get(url)

    # Fixed 5-second pause to let the page finish loading before its HTML is read.
    time.sleep(5)

    # Snapshot of the page's HTML as the browser currently holds it.
    html = driver.page_source
finally:
    # Close the browser even when navigation or the read fails, so a failed
    # run does not leave a Chrome process behind.
    driver.quit()

# Parse the captured HTML with BeautifulSoup.
soup = BeautifulSoup(html, 'html.parser')

def extract_team_data(page=None):
    """Collect one record per table row from a parsed fbref league page.

    Args:
        page: Parsed document to scan (any object exposing a BeautifulSoup-style
            ``find_all``). When omitted, the module-level ``soup`` is used,
            which is how the function was always called.

    Returns:
        list[list]: One list per ``<tr>`` carrying a ``data-row`` attribute,
        laid out as ``[rank, team, *home_stats, *away_stats]``. Values are the
        stripped cell texts. A missing rank or team cell becomes ``None``; a
        row without any ``home_*`` (or ``away_*``) cells contributes 13
        ``None`` placeholders for that venue.
    """
    # Width of each venue group in the DataFrame header (Games ... xG Diff
    # Per 90). The placeholder used to be 10 wide, which left incomplete rows
    # shorter than, and misaligned with, the 13-column groups.
    stats_per_venue = 13

    def with_prefix(prefix):
        # Matcher for data-stat values starting with `prefix`; tags without
        # the attribute are passed as None and must not match.
        return lambda value: value and value.startswith(prefix)

    def venue_stats(row, prefix):
        # Search the row once and reuse the result for both the emptiness
        # check and the text extraction.
        cells = row.find_all('td', {'data-stat': with_prefix(prefix)})
        if not cells:
            return [None] * stats_per_venue
        return [cell.text.strip() for cell in cells]

    document = soup if page is None else page

    data = []
    for row in document.find_all('tr', {'data-row': True}):
        rank_cell = row.find('th', {'data-stat': 'rank'})
        team_cell = row.find('td', {'data-stat': 'team'})
        data.append(
            [
                rank_cell.text.strip() if rank_cell else None,
                team_cell.text.strip() if team_cell else None,
            ]
            + venue_stats(row, 'home_')
            + venue_stats(row, 'away_')
        )

    return data

# DataFrame header: rank and team first, then the same thirteen statistics
# for the home venue followed by the away venue.
columns = ['Rank', 'Team'] + [
    venue + ' ' + label
    for venue in ('Home', 'Away')
    for label in (
        'Games', 'Wins', 'Ties', 'Losses',
        'Goals For', 'Goals Against', 'Goal Diff', 'Points',
        'Points Avg', 'xG For', 'xG Against', 'xG Diff',
        'xG Diff Per 90',
    )
]

# Scrape the rows, load them into a DataFrame and keep only the rows that
# have a value in every column.
team_data = extract_team_data()
Estadisticas_temparada_2020_2021 = pd.DataFrame(team_data, columns=columns).dropna()

In [16]:
# Tag every row with its competition and season ("2020/2021").
# NOTE(review): 'Premier leaqgue' looks like a typo for 'Premier League'. It is
# kept verbatim because the same literal labels every season in this notebook;
# confirm and correct all occurrences together.
Estadisticas_temparada_2020_2021 = Estadisticas_temparada_2020_2021.assign(
    Competicion='Premier leaqgue',
    Temporada="2020/2021",
)

In [17]:
# Preview the first two rows of the season table.
Estadisticas_temparada_2020_2021.head(n=2)

Unnamed: 0,Rank,Team,Home Games,Home Wins,Home Ties,Home Losses,Home Goals For,Home Goals Against,Home Goal Diff,Home Points,...,Away Goals Against,Away Goal Diff,Away Points,Away Points Avg,Away xG For,Away xG Against,Away xG Diff,Away xG Diff Per 90,Competicion,Temporada
20,1,Manchester City,19,13,2,4,43,17,26,41,...,15,25,45,2.37,31.0,15.3,15.7,0.83,Premier leaqgue,2020/2021
21,2,Manchester Utd,19,9,4,6,38,28,10,31,...,16,19,43,2.26,28.9,19.1,9.7,0.51,Premier leaqgue,2020/2021


In [18]:
# Season tables in the order they are stacked: newest season first.
dataframes = [
    Estadisticas_temparada_2024_2025,
    Estadisticas_temparada_2023_2024,
    Estadisticas_temparada_2022_2023,
    Estadisticas_temparada_2021_2022,
    Estadisticas_temparada_2020_2021,
]

# Stack the seasons row-wise and renumber the index from 0 across the result.
Premier_league_temporadas = pd.concat(objs=dataframes, axis=0, ignore_index=True)

In [19]:
# Display the combined table holding every scraped season.
Premier_league_temporadas

Unnamed: 0,Rank,Team,Home Games,Home Wins,Home Ties,Home Losses,Home Goals For,Home Goals Against,Home Goal Diff,Home Points,...,Away Goals Against,Away Goal Diff,Away Points,Away Points Avg,Away xG For,Away xG Against,Away xG Diff,Away xG Diff Per 90,Competicion,Temporada
0,1,Manchester City,1,1,0,0,4,1,+3,3,...,1,+4,6,3.00,3.8,1.7,+2.1,+1.04,Premier leaqgue,2024/2025
1,2,Liverpool,1,1,0,0,2,0,+2,3,...,0,+5,6,3.00,4.3,1.8,+2.5,+1.27,Premier leaqgue,2024/2025
2,3,Brighton,1,1,0,0,2,1,+1,3,...,1,+3,4,2.00,3.2,2.6,+0.6,+0.28,Premier leaqgue,2024/2025
3,4,Arsenal,2,1,1,0,3,1,+2,4,...,0,+2,3,3.00,0.9,1.2,-0.4,-0.38,Premier leaqgue,2024/2025
4,5,Newcastle Utd,2,2,0,0,3,1,+2,6,...,1,0,1,1.00,1.6,2.2,-0.6,-0.60,Premier leaqgue,2024/2025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,16,Brighton,19,4,9,6,22,22,0,21,...,24,-6,20,1.05,20.2,21.1,-0.8,-0.04,Premier leaqgue,2020/2021
96,17,Burnley,19,4,6,9,14,27,-13,18,...,28,-9,21,1.11,18.8,27.3,-8.6,-0.45,Premier leaqgue,2020/2021
97,18,Fulham,19,2,4,13,9,28,-19,10,...,25,-7,18,0.95,21.8,26.7,-4.9,-0.26,Premier leaqgue,2020/2021
98,19,West Brom,19,3,6,10,15,39,-24,15,...,37,-17,11,0.58,17.4,32.5,-15.1,-0.80,Premier leaqgue,2020/2021
