In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time
import os
import pickle
import pandas as pd

In [3]:
# Configura el navegador con Selenium
driver = webdriver.Chrome()

# Abre la página web
url = "https://fbref.com/es/comps/9/Estadisticas-de-Premier-League"
driver.get(url)

# Espera para asegurarte de que la página se cargue completamente
time.sleep(5)

# Obtener el contenido HTML de la página
html = driver.page_source

# Cierra el navegador
driver.quit()

# Usar BeautifulSoup para analizar el HTML
soup = BeautifulSoup(html, 'html.parser')

# Función para extraer datos de la tabla
def extract_team_data():
    # Buscar todas las filas en la tabla que contienen datos de equipos
    rows = soup.find_all('tr', {'data-row': True})
    data = []
    
    for row in rows:
        # Extraer el ranking, verificando que el elemento exista
        rank_cell = row.find('th', {'data-stat': 'rank'})
        rank = rank_cell.text.strip() if rank_cell else None
        
        # Extraer el nombre del equipo, verificando que el elemento exista
        team_cell = row.find('td', {'data-stat': 'team'})
        team_name = team_cell.text.strip() if team_cell else None
        
        # Extraer estadísticas de local
        home_stats = [
            cell.text.strip() for cell in row.find_all('td', {'data-stat': lambda x: x and x.startswith('home_')})
        ] if row.find_all('td', {'data-stat': lambda x: x and x.startswith('home_')}) else [None] * 10
        
        # Extraer estadísticas de visitante
        away_stats = [
            cell.text.strip() for cell in row.find_all('td', {'data-stat': lambda x: x and x.startswith('away_')})
        ] if row.find_all('td', {'data-stat': lambda x: x and x.startswith('away_')}) else [None] * 10
        
        # Agregar la fila de datos completa a la lista
        data.append([rank, team_name] + home_stats + away_stats)
    
    return data

# Definir las columnas para el DataFrame
columns = [
    'Rank', 'Team',
    'Home Games', 'Home Wins', 'Home Ties', 'Home Losses', 
    'Home Goals For', 'Home Goals Against', 'Home Goal Diff', 'Home Points', 
    'Home Points Avg', 'Home xG For', 'Home xG Against', 'Home xG Diff', 
    'Home xG Diff Per 90', 
    'Away Games', 'Away Wins', 'Away Ties', 'Away Losses', 
    'Away Goals For', 'Away Goals Against', 'Away Goal Diff', 'Away Points', 
    'Away Points Avg', 'Away xG For', 'Away xG Against', 'Away xG Diff', 
    'Away xG Diff Per 90'
]

# Extraer los datos y crear un DataFrame
team_data = extract_team_data()
Estadisticas_temparada_2024_2025 = pd.DataFrame(team_data, columns=columns)

# Filtrar filas que tienen valores válidos en todas las columnas relevantes
Estadisticas_temparada_2024_2025= Estadisticas_temparada_2024_2025.dropna()

In [4]:
# Añadir la columna 'Competicion' con el valor 'Liga Profesional'
Estadisticas_temparada_2024_2025['Competicion'] = 'Premier leaqgue'

# Añadir la columna 'Temporada' con el valor 2022
Estadisticas_temparada_2024_2025['Temporada'] = "2024/2025"

In [5]:
Estadisticas_temparada_2024_2025.head(2)

Unnamed: 0,Rank,Team,Home Games,Home Wins,Home Ties,Home Losses,Home Goals For,Home Goals Against,Home Goal Diff,Home Points,...,Away Goals Against,Away Goal Diff,Away Points,Away Points Avg,Away xG For,Away xG Against,Away xG Diff,Away xG Diff Per 90,Competicion,Temporada
20,1,Manchester City,2,2,0,0,6,2,4,6,...,1,4,6,3.0,3.8,1.7,2.1,1.04,Premier leaqgue,2024/2025
21,2,Arsenal,2,1,1,0,3,1,2,4,...,0,3,6,3.0,1.6,2.0,-0.4,-0.19,Premier leaqgue,2024/2025


In [6]:
# Configura el navegador con Selenium
driver = webdriver.Chrome()

# Abre la página web
url = "https://fbref.com/es/comps/9/2023-2024/Estadisticas-2023-2024-Premier-League"
driver.get(url)

# Espera para asegurarte de que la página se cargue completamente
time.sleep(5)

# Obtener el contenido HTML de la página
html = driver.page_source

# Cierra el navegador
driver.quit()

# Usar BeautifulSoup para analizar el HTML
soup = BeautifulSoup(html, 'html.parser')

# Función para extraer datos de la tabla
def extract_team_data():
    # Buscar todas las filas en la tabla que contienen datos de equipos
    rows = soup.find_all('tr', {'data-row': True})
    data = []
    
    for row in rows:
        # Extraer el ranking, verificando que el elemento exista
        rank_cell = row.find('th', {'data-stat': 'rank'})
        rank = rank_cell.text.strip() if rank_cell else None
        
        # Extraer el nombre del equipo, verificando que el elemento exista
        team_cell = row.find('td', {'data-stat': 'team'})
        team_name = team_cell.text.strip() if team_cell else None
        
        # Extraer estadísticas de local
        home_stats = [
            cell.text.strip() for cell in row.find_all('td', {'data-stat': lambda x: x and x.startswith('home_')})
        ] if row.find_all('td', {'data-stat': lambda x: x and x.startswith('home_')}) else [None] * 10
        
        # Extraer estadísticas de visitante
        away_stats = [
            cell.text.strip() for cell in row.find_all('td', {'data-stat': lambda x: x and x.startswith('away_')})
        ] if row.find_all('td', {'data-stat': lambda x: x and x.startswith('away_')}) else [None] * 10
        
        # Agregar la fila de datos completa a la lista
        data.append([rank, team_name] + home_stats + away_stats)
    
    return data

# Definir las columnas para el DataFrame
columns = [
    'Rank', 'Team',
    'Home Games', 'Home Wins', 'Home Ties', 'Home Losses', 
    'Home Goals For', 'Home Goals Against', 'Home Goal Diff', 'Home Points', 
    'Home Points Avg', 'Home xG For', 'Home xG Against', 'Home xG Diff', 
    'Home xG Diff Per 90', 
    'Away Games', 'Away Wins', 'Away Ties', 'Away Losses', 
    'Away Goals For', 'Away Goals Against', 'Away Goal Diff', 'Away Points', 
    'Away Points Avg', 'Away xG For', 'Away xG Against', 'Away xG Diff', 
    'Away xG Diff Per 90'
]

# Extraer los datos y crear un DataFrame
team_data = extract_team_data()
Estadisticas_temparada_2023_2024 = pd.DataFrame(team_data, columns=columns)

# Filtrar filas que tienen valores válidos en todas las columnas relevantes
Estadisticas_temparada_2023_2024 = Estadisticas_temparada_2023_2024.dropna()

In [7]:
# Añadir la columna 'Competicion' con el valor 'Liga Profesional'
Estadisticas_temparada_2023_2024['Competicion'] = 'Premier leaqgue'

# Añadir la columna 'Temporada' con el valor 2022
Estadisticas_temparada_2023_2024['Temporada'] = "2023/2024"

In [8]:
# Mostrar las primeras filas del DataFrame filtrado
Estadisticas_temparada_2023_2024.head(2)

Unnamed: 0,Rank,Team,Home Games,Home Wins,Home Ties,Home Losses,Home Goals For,Home Goals Against,Home Goal Diff,Home Points,...,Away Goals Against,Away Goal Diff,Away Points,Away Points Avg,Away xG For,Away xG Against,Away xG Diff,Away xG Diff Per 90,Competicion,Temporada
20,1,Manchester City,19,14,5,0,51,16,35,47,...,18,27,44,2.32,39.8,21.6,18.2,0.96,Premier leaqgue,2023/2024
21,2,Arsenal,19,15,2,2,48,16,32,47,...,13,30,42,2.21,32.6,14.5,18.2,0.96,Premier leaqgue,2023/2024


In [9]:
# Configura el navegador con Selenium
driver = webdriver.Chrome()

# Abre la página web
url = "https://fbref.com/es/comps/9/2022-2023/Estadisticas-2022-2023-Premier-League"
driver.get(url)

# Espera para asegurarte de que la página se cargue completamente
time.sleep(5)

# Obtener el contenido HTML de la página
html = driver.page_source

# Cierra el navegador
driver.quit()

# Usar BeautifulSoup para analizar el HTML
soup = BeautifulSoup(html, 'html.parser')

# Función para extraer datos de la tabla
def extract_team_data():
    # Buscar todas las filas en la tabla que contienen datos de equipos
    rows = soup.find_all('tr', {'data-row': True})
    data = []
    
    for row in rows:
        # Extraer el ranking, verificando que el elemento exista
        rank_cell = row.find('th', {'data-stat': 'rank'})
        rank = rank_cell.text.strip() if rank_cell else None
        
        # Extraer el nombre del equipo, verificando que el elemento exista
        team_cell = row.find('td', {'data-stat': 'team'})
        team_name = team_cell.text.strip() if team_cell else None
        
        # Extraer estadísticas de local
        home_stats = [
            cell.text.strip() for cell in row.find_all('td', {'data-stat': lambda x: x and x.startswith('home_')})
        ] if row.find_all('td', {'data-stat': lambda x: x and x.startswith('home_')}) else [None] * 10
        
        # Extraer estadísticas de visitante
        away_stats = [
            cell.text.strip() for cell in row.find_all('td', {'data-stat': lambda x: x and x.startswith('away_')})
        ] if row.find_all('td', {'data-stat': lambda x: x and x.startswith('away_')}) else [None] * 10
        
        # Agregar la fila de datos completa a la lista
        data.append([rank, team_name] + home_stats + away_stats)
    
    return data

# Definir las columnas para el DataFrame
columns = [
    'Rank', 'Team',
    'Home Games', 'Home Wins', 'Home Ties', 'Home Losses', 
    'Home Goals For', 'Home Goals Against', 'Home Goal Diff', 'Home Points', 
    'Home Points Avg', 'Home xG For', 'Home xG Against', 'Home xG Diff', 
    'Home xG Diff Per 90', 
    'Away Games', 'Away Wins', 'Away Ties', 'Away Losses', 
    'Away Goals For', 'Away Goals Against', 'Away Goal Diff', 'Away Points', 
    'Away Points Avg', 'Away xG For', 'Away xG Against', 'Away xG Diff', 
    'Away xG Diff Per 90'
]

# Extraer los datos y crear un DataFrame
team_data = extract_team_data()
Estadisticas_temparada_2022_2023 = pd.DataFrame(team_data, columns=columns)

# Filtrar filas que tienen valores válidos en todas las columnas relevantes
Estadisticas_temparada_2022_2023 = Estadisticas_temparada_2022_2023.dropna()

In [10]:
# Añadir la columna 'Competicion' con el valor 'Liga Profesional'
Estadisticas_temparada_2022_2023['Competicion'] = 'Premier leaqgue'

# Añadir la columna 'Temporada' con el valor 2022
Estadisticas_temparada_2022_2023['Temporada'] = "2022/2023"

In [11]:
# Mostrar las primeras filas del DataFrame filtrado
Estadisticas_temparada_2022_2023.head(2)

Unnamed: 0,Rank,Team,Home Games,Home Wins,Home Ties,Home Losses,Home Goals For,Home Goals Against,Home Goal Diff,Home Points,...,Away Goals Against,Away Goal Diff,Away Points,Away Points Avg,Away xG For,Away xG Against,Away xG Diff,Away xG Diff Per 90,Competicion,Temporada
20,1,Manchester City,19,17,1,1,60,17,43,52,...,16,18,37,1.95,35.9,19.3,16.6,0.87,Premier leaqgue,2022/2023
21,2,Arsenal,19,14,3,2,53,25,28,45,...,18,17,39,2.05,29.1,22.7,6.4,0.34,Premier leaqgue,2022/2023


In [16]:
# Configura el navegador con Selenium
driver = webdriver.Chrome()

# Abre la página web
url = "https://fbref.com/es/comps/9/2021-2022/Estadisticas-2021-2022-Premier-League"
driver.get(url)

# Espera para asegurarte de que la página se cargue completamente
time.sleep(5)

# Obtener el contenido HTML de la página
html = driver.page_source

# Cierra el navegador
driver.quit()

# Usar BeautifulSoup para analizar el HTML
soup = BeautifulSoup(html, 'html.parser')

# Función para extraer datos de la tabla
def extract_team_data():
    # Buscar todas las filas en la tabla que contienen datos de equipos
    rows = soup.find_all('tr', {'data-row': True})
    data = []
    
    for row in rows:
        # Extraer el ranking, verificando que el elemento exista
        rank_cell = row.find('th', {'data-stat': 'rank'})
        rank = rank_cell.text.strip() if rank_cell else None
        
        # Extraer el nombre del equipo, verificando que el elemento exista
        team_cell = row.find('td', {'data-stat': 'team'})
        team_name = team_cell.text.strip() if team_cell else None
        
        # Extraer estadísticas de local
        home_stats = [
            cell.text.strip() for cell in row.find_all('td', {'data-stat': lambda x: x and x.startswith('home_')})
        ] if row.find_all('td', {'data-stat': lambda x: x and x.startswith('home_')}) else [None] * 10
        
        # Extraer estadísticas de visitante
        away_stats = [
            cell.text.strip() for cell in row.find_all('td', {'data-stat': lambda x: x and x.startswith('away_')})
        ] if row.find_all('td', {'data-stat': lambda x: x and x.startswith('away_')}) else [None] * 10
        
        # Agregar la fila de datos completa a la lista
        data.append([rank, team_name] + home_stats + away_stats)
    
    return data

# Definir las columnas para el DataFrame
columns = [
    'Rank', 'Team',
    'Home Games', 'Home Wins', 'Home Ties', 'Home Losses', 
    'Home Goals For', 'Home Goals Against', 'Home Goal Diff', 'Home Points', 
    'Home Points Avg', 'Home xG For', 'Home xG Against', 'Home xG Diff', 
    'Home xG Diff Per 90', 
    'Away Games', 'Away Wins', 'Away Ties', 'Away Losses', 
    'Away Goals For', 'Away Goals Against', 'Away Goal Diff', 'Away Points', 
    'Away Points Avg', 'Away xG For', 'Away xG Against', 'Away xG Diff', 
    'Away xG Diff Per 90'
]

# Extraer los datos y crear un DataFrame
team_data = extract_team_data()
Estadisticas_temparada_2021_2022 = pd.DataFrame(team_data, columns=columns)

# Filtrar filas que tienen valores válidos en todas las columnas relevantes
Estadisticas_temparada_2021_2022 = Estadisticas_temparada_2021_2022.dropna()

In [17]:
# Añadir la columna 'Competicion' con el valor 'Liga Profesional'
Estadisticas_temparada_2021_2022['Competicion'] = 'Premier leaqgue'

# Añadir la columna 'Temporada' con el valor 2022
Estadisticas_temparada_2021_2022['Temporada'] = "2021/2022"

In [18]:
Estadisticas_temparada_2021_2022.head(2)

Unnamed: 0,Rank,Team,Home Games,Home Wins,Home Ties,Home Losses,Home Goals For,Home Goals Against,Home Goal Diff,Home Points,...,Away Goals Against,Away Goal Diff,Away Points,Away Points Avg,Away xG For,Away xG Against,Away xG Diff,Away xG Diff Per 90,Competicion,Temporada
20,1,Manchester City,19,15,2,2,58,15,43,47,...,11,30,46,2.42,43.1,13.2,29.8,1.57,Premier leaqgue,2021/2022
21,2,Liverpool,19,15,4,0,49,9,40,49,...,17,28,43,2.26,39.8,20.9,18.8,0.99,Premier leaqgue,2021/2022


In [21]:
# Configura el navegador con Selenium
driver = webdriver.Chrome()

# Abre la página web
url = "https://fbref.com/es/comps/9/2020-2021/Estadisticas-2020-2021-Premier-League"
driver.get(url)

# Espera para asegurarte de que la página se cargue completamente
time.sleep(5)

# Obtener el contenido HTML de la página
html = driver.page_source

# Cierra el navegador
driver.quit()

# Usar BeautifulSoup para analizar el HTML
soup = BeautifulSoup(html, 'html.parser')

# Función para extraer datos de la tabla
def extract_team_data():
    # Buscar todas las filas en la tabla que contienen datos de equipos
    rows = soup.find_all('tr', {'data-row': True})
    data = []
    
    for row in rows:
        # Extraer el ranking, verificando que el elemento exista
        rank_cell = row.find('th', {'data-stat': 'rank'})
        rank = rank_cell.text.strip() if rank_cell else None
        
        # Extraer el nombre del equipo, verificando que el elemento exista
        team_cell = row.find('td', {'data-stat': 'team'})
        team_name = team_cell.text.strip() if team_cell else None
        
        # Extraer estadísticas de local
        home_stats = [
            cell.text.strip() for cell in row.find_all('td', {'data-stat': lambda x: x and x.startswith('home_')})
        ] if row.find_all('td', {'data-stat': lambda x: x and x.startswith('home_')}) else [None] * 10
        
        # Extraer estadísticas de visitante
        away_stats = [
            cell.text.strip() for cell in row.find_all('td', {'data-stat': lambda x: x and x.startswith('away_')})
        ] if row.find_all('td', {'data-stat': lambda x: x and x.startswith('away_')}) else [None] * 10
        
        # Agregar la fila de datos completa a la lista
        data.append([rank, team_name] + home_stats + away_stats)
    
    return data

# Definir las columnas para el DataFrame
columns = [
    'Rank', 'Team',
    'Home Games', 'Home Wins', 'Home Ties', 'Home Losses', 
    'Home Goals For', 'Home Goals Against', 'Home Goal Diff', 'Home Points', 
    'Home Points Avg', 'Home xG For', 'Home xG Against', 'Home xG Diff', 
    'Home xG Diff Per 90', 
    'Away Games', 'Away Wins', 'Away Ties', 'Away Losses', 
    'Away Goals For', 'Away Goals Against', 'Away Goal Diff', 'Away Points', 
    'Away Points Avg', 'Away xG For', 'Away xG Against', 'Away xG Diff', 
    'Away xG Diff Per 90'
]

# Extraer los datos y crear un DataFrame
team_data = extract_team_data()
Estadisticas_temparada_2020_2021 = pd.DataFrame(team_data, columns=columns)

# Filtrar filas que tienen valores válidos en todas las columnas relevantes
Estadisticas_temparada_2020_2021 = Estadisticas_temparada_2020_2021.dropna()

In [22]:
# Añadir la columna 'Competicion' con el valor 'Liga Profesional'
Estadisticas_temparada_2020_2021['Competicion'] = 'Premier leaqgue'

# Añadir la columna 'Temporada' con el valor 2022
Estadisticas_temparada_2020_2021['Temporada'] = "2020/2021"

In [23]:
Estadisticas_temparada_2020_2021.head(2)

Unnamed: 0,Rank,Team,Home Games,Home Wins,Home Ties,Home Losses,Home Goals For,Home Goals Against,Home Goal Diff,Home Points,...,Away Goals Against,Away Goal Diff,Away Points,Away Points Avg,Away xG For,Away xG Against,Away xG Diff,Away xG Diff Per 90,Competicion,Temporada
20,1,Manchester City,19,13,2,4,43,17,26,41,...,15,25,45,2.37,31.0,15.3,15.7,0.83,Premier leaqgue,2020/2021
21,2,Manchester Utd,19,9,4,6,38,28,10,31,...,16,19,43,2.26,28.9,19.1,9.7,0.51,Premier leaqgue,2020/2021


In [24]:
dataframes = [
    Estadisticas_temparada_2024_2025,
    Estadisticas_temparada_2023_2024,
    Estadisticas_temparada_2022_2023,
    Estadisticas_temparada_2021_2022,
    Estadisticas_temparada_2020_2021
    
]

# Concatenar los DataFrames a lo largo del eje de las filas
premier_league_temporadas = pd.concat(dataframes, ignore_index=True)

In [25]:
premier_league_temporadas

Unnamed: 0,Rank,Team,Home Games,Home Wins,Home Ties,Home Losses,Home Goals For,Home Goals Against,Home Goal Diff,Home Points,...,Away Goals Against,Away Goal Diff,Away Points,Away Points Avg,Away xG For,Away xG Against,Away xG Diff,Away xG Diff Per 90,Competicion,Temporada
0,1,Manchester City,2,2,0,0,6,2,+4,6,...,1,+4,6,3.00,3.8,1.7,+2.1,+1.04,Premier leaqgue,2024/2025
1,2,Arsenal,2,1,1,0,3,1,+2,4,...,0,+3,6,3.00,1.6,2.0,-0.4,-0.19,Premier leaqgue,2024/2025
2,3,Newcastle Utd,2,2,0,0,3,1,+2,6,...,2,+1,4,2.00,3.2,3.5,-0.3,-0.16,Premier leaqgue,2024/2025
3,4,Liverpool,2,1,0,1,2,1,+1,3,...,0,+5,6,3.00,4.3,1.8,+2.5,+1.27,Premier leaqgue,2024/2025
4,5,Aston Villa,2,1,0,1,3,4,-1,3,...,2,+2,6,3.00,3.4,2.7,+0.6,+0.31,Premier leaqgue,2024/2025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,16,Brighton,19,4,9,6,22,22,0,21,...,24,-6,20,1.05,20.2,21.1,-0.8,-0.04,Premier leaqgue,2020/2021
96,17,Burnley,19,4,6,9,14,27,-13,18,...,28,-9,21,1.11,18.8,27.3,-8.6,-0.45,Premier leaqgue,2020/2021
97,18,Fulham,19,2,4,13,9,28,-19,10,...,25,-7,18,0.95,21.8,26.7,-4.9,-0.26,Premier leaqgue,2020/2021
98,19,West Brom,19,3,6,10,15,39,-24,15,...,37,-17,11,0.58,17.4,32.5,-15.1,-0.80,Premier leaqgue,2020/2021


In [58]:
premier_league_temporadas.to_csv('Premier_league_tmeporada.csv', index=False)

In [34]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler

# Cargar los datos generales de los equipos
# premier_league_temporadas = pd.read_csv('ruta_a_tu_archivo.csv')

# Características generales que ya tienes (valores agregados por equipo)
features = [
    'Home Goals For', 'Home Goals Against', 'Home xG For', 'Home xG Against',
    'Away Goals For', 'Away Goals Against', 'Away xG For', 'Away xG Against'
]

# Asegurarse de que no haya valores nulos en las características
premier_league_temporadas = premier_league_temporadas.dropna(subset=features)

# Definir X (características) e y (target)
X = premier_league_temporadas[features]
y = premier_league_temporadas['Home Goals For']  # o el target que desees predecir

# Normalización de características
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Validación cruzada con GroupKFold
groups = premier_league_temporadas['Team']  # Agrupar por equipo
gkf = GroupKFold(n_splits=5)

# Entrenar el modelo de RandomForest
model = RandomForestRegressor(random_state=42)
for train_index, test_index in gkf.split(X_scaled, y, groups):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    model.fit(X_train, y_train)

# Función para predecir un partido
def predict_match(home_team_data, away_team_data):
    # Crear un DataFrame con características del partido
    match_data = pd.DataFrame({
        'Home Goals For': [home_team_data['Home Goals For']],
        'Home Goals Against': [home_team_data['Home Goals Against']],
        'Home xG For': [home_team_data['Home xG For']],
        'Home xG Against': [home_team_data['Home xG Against']],
        'Away Goals For': [away_team_data['Away Goals For']],
        'Away Goals Against': [away_team_data['Away Goals Against']],
        'Away xG For': [away_team_data['Away xG For']],
        'Away xG Against': [away_team_data['Away xG Against']]
    })
    
    # Normalizar las características del partido
    match_data_scaled = scaler.transform(match_data)
    
    # Hacer la predicción
    predicted_goals = model.predict(match_data_scaled)[0]
    return predicted_goals

# Lista de próximos partidos
upcoming_matches = pd.DataFrame({
    "Home Team": ["Southampton","Liverpool", "Manchester City",  "Fulham", "Crystal Palace", "Brighton","Aston Villa","Bournemouth","Tottenham", "Wolves"  ],
    "Away Team": ["Manchester Utd", "Nott'ham Forest","Brentford","West Ham", "Leicester City","Ipswich Town", "Everton", "Chelsea","Arsenal","Newcastle Utd" ]
})

# Función para obtener los datos del equipo
def get_team_data(team_name, df):
    try:
        # Extraemos la fila correspondiente a ese equipo
        team_data = df[df['Team'] == team_name].iloc[0]
        return team_data
    except IndexError:
        print(f"No se encontraron datos para el equipo {team_name}")
        return None

# Predecir los goles para cada partido futuro
for index, row in upcoming_matches.iterrows():
    home_team = row['Home Team']
    away_team = row['Away Team']
    
    # Obtener los datos del equipo local y visitante
    home_team_data = get_team_data(home_team, premier_league_temporadas)
    away_team_data = get_team_data(away_team, premier_league_temporadas)
    
    # Solo hacer la predicción si ambos equipos tienen datos disponibles
    if home_team_data is not None and away_team_data is not None:
        prediction = predict_match(home_team_data, away_team_data)
        print(f"Predicción de goles totales en el partido {home_team} vs {away_team}: {prediction}")
    else:
        print(f"Datos insuficientes para el partido {home_team} vs {away_team}")

Predicción de goles totales en el partido Southampton vs Manchester Utd: 0.88
Predicción de goles totales en el partido Liverpool vs Nott'ham Forest: 2.62
Predicción de goles totales en el partido Manchester City vs Brentford: 4.38
Predicción de goles totales en el partido Fulham vs West Ham: 3.53
Predicción de goles totales en el partido Crystal Palace vs Leicester City: 1.91
Predicción de goles totales en el partido Brighton vs Ipswich Town: 1.95
Predicción de goles totales en el partido Aston Villa vs Everton: 2.99
Predicción de goles totales en el partido Bournemouth vs Chelsea: 1.98
Predicción de goles totales en el partido Tottenham vs Arsenal: 3.81
Predicción de goles totales en el partido Wolves vs Newcastle Utd: 2.95


In [35]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Mapeamos las temporadas para asignar más peso a las recientes
season_weights = {
    '2024/2025': 2,  # Mayor peso a la temporada más reciente
    '2023/2024': 1.5,
    '2022/2023': 1,
    '2021/2022': 0.75,
    '2020/2021': 0.5  # Menor peso a la temporada más antigua
}

# Crear una nueva columna de peso basada en la temporada
premier_league_temporadas['season_weight'] = premier_league_temporadas['Temporada'].map(season_weights)

# Definir las características
features = [
    'Home Goals For', 'Home Goals Against', 'Home xG For', 'Home xG Against',
    'Away Goals For', 'Away Goals Against', 'Away xG For', 'Away xG Against'
]

# Asegurarse de que no haya valores nulos en las características
premier_league_temporadas = premier_league_temporadas.dropna(subset=features)

# Definir X (características) e y (target)
X = premier_league_temporadas[features]
y = premier_league_temporadas['Home Goals For']  # o el target que desees predecir

# Normalización de características
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Validación cruzada con GroupKFold
groups = premier_league_temporadas['Team']  # Agrupar por equipo
gkf = GroupKFold(n_splits=5)

# Entrenar el modelo de RandomForest utilizando los pesos de las temporadas
model = RandomForestRegressor(random_state=42)
for train_index, test_index in gkf.split(X_scaled, y, groups):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Aplicar los pesos por temporada en el entrenamiento
    train_weights = premier_league_temporadas['season_weight'].iloc[train_index]
    
    # Entrenar el modelo usando sample_weight para ponderar las temporadas
    model.fit(X_train, y_train, sample_weight=train_weights)

# Función para predecir un partido
def predict_match(home_team_data, away_team_data):
    # Crear un DataFrame con características del partido
    match_data = pd.DataFrame({
        'Home Goals For': [home_team_data['Home Goals For']],
        'Home Goals Against': [home_team_data['Home Goals Against']],
        'Home xG For': [home_team_data['Home xG For']],
        'Home xG Against': [home_team_data['Home xG Against']],
        'Away Goals For': [away_team_data['Away Goals For']],
        'Away Goals Against': [away_team_data['Away Goals Against']],
        'Away xG For': [away_team_data['Away xG For']],
        'Away xG Against': [away_team_data['Away xG Against']]
    })
    
    # Normalizar las características del partido
    match_data_scaled = scaler.transform(match_data)
    
    # Hacer la predicción
    predicted_goals = model.predict(match_data_scaled)[0]
    return predicted_goals

# Lista de próximos partidos
upcoming_matches = pd.DataFrame({
    "Home Team": ["Southampton","Liverpool", "Manchester City",  "Fulham", "Crystal Palace", "Brighton","Aston Villa","Bournemouth","Tottenham", "Wolves"  ],
    "Away Team": ["Manchester Utd", "Nott'ham Forest","Brentford","West Ham", "Leicester City","Ipswich Town", "Everton", "Chelsea","Arsenal","Newcastle Utd" ]
})

# Función para obtener los datos del equipo
def get_team_data(team_name, df):
    try:
        # Extraemos la fila correspondiente a ese equipo
        team_data = df[df['Team'] == team_name].iloc[0]
        return team_data
    except IndexError:
        print(f"No se encontraron datos para el equipo {team_name}")
        return None

# Predecir los goles para cada partido futuro
for index, row in upcoming_matches.iterrows():
    home_team = row['Home Team']
    away_team = row['Away Team']
    
    # Obtener los datos del equipo local y visitante
    home_team_data = get_team_data(home_team, premier_league_temporadas)
    away_team_data = get_team_data(away_team, premier_league_temporadas)
    
    # Solo hacer la predicción si ambos equipos tienen datos disponibles
    if home_team_data is not None and away_team_data is not None:
        prediction = predict_match(home_team_data, away_team_data)
        print(f"Predicción de goles totales en el partido {home_team} vs {away_team}: {prediction}")
    else:
        print(f"Datos insuficientes para el partido {home_team} vs {away_team}")

Predicción de goles totales en el partido Southampton vs Manchester Utd: 0.79
Predicción de goles totales en el partido Liverpool vs Nott'ham Forest: 2.6
Predicción de goles totales en el partido Manchester City vs Brentford: 4.43
Predicción de goles totales en el partido Fulham vs West Ham: 3.57
Predicción de goles totales en el partido Crystal Palace vs Leicester City: 1.92
Predicción de goles totales en el partido Brighton vs Ipswich Town: 1.95
Predicción de goles totales en el partido Aston Villa vs Everton: 3.02
Predicción de goles totales en el partido Bournemouth vs Chelsea: 1.98
Predicción de goles totales en el partido Tottenham vs Arsenal: 3.74
Predicción de goles totales en el partido Wolves vs Newcastle Utd: 2.96


In [32]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler

# Cargar los datos
# premier_league_temporadas = pd.read_csv('ruta_a_tu_archivo.csv')

# Características que vamos a usar para predecir el total de goles
features = [
    'Home Goals For Mean', 'Home Goals Against Mean', 'Home xG For Mean', 'Home xG Against Mean',
    'Away Goals For Mean', 'Away Goals Against Mean', 'Away xG For Mean', 'Away xG Against Mean',
    'Home Goals For Std', 'Home Goals Against Std', 'Home xG For Std', 'Home xG Against Std',
    'Away Goals For Std', 'Away Goals Against Std', 'Away xG For Std', 'Away xG Against Std'
]

# Agregar medias y desviaciones estándar a las características
def calculate_stats(df):
    df['Home Goals For Mean'] = df.groupby('Team')['Home Goals For'].transform('mean')
    df['Home Goals Against Mean'] = df.groupby('Team')['Home Goals Against'].transform('mean')
    df['Home xG For Mean'] = df.groupby('Team')['Home xG For'].transform('mean')
    df['Home xG Against Mean'] = df.groupby('Team')['Home xG Against'].transform('mean')
    df['Away Goals For Mean'] = df.groupby('Team')['Away Goals For'].transform('mean')
    df['Away Goals Against Mean'] = df.groupby('Team')['Away Goals Against'].transform('mean')
    df['Away xG For Mean'] = df.groupby('Team')['Away xG For'].transform('mean')
    df['Away xG Against Mean'] = df.groupby('Team')['Away xG Against'].transform('mean')
    
    df['Home Goals For Std'] = df.groupby('Team')['Home Goals For'].transform('std')
    df['Home Goals Against Std'] = df.groupby('Team')['Home Goals Against'].transform('std')
    df['Home xG For Std'] = df.groupby('Team')['Home xG For'].transform('std')
    df['Home xG Against Std'] = df.groupby('Team')['Home xG Against'].transform('std')
    df['Away Goals For Std'] = df.groupby('Team')['Away Goals For'].transform('std')
    df['Away Goals Against Std'] = df.groupby('Team')['Away Goals Against'].transform('std')
    df['Away xG For Std'] = df.groupby('Team')['Away xG For'].transform('std')
    df['Away xG Against Std'] = df.groupby('Team')['Away xG Against'].transform('std')

calculate_stats(premier_league_temporadas)

# Asegurarse de que no haya valores nulos en las características
premier_league_temporadas = premier_league_temporadas.dropna(subset=features)

# Definir X (características) e y (target)
X = premier_league_temporadas[features]
y = premier_league_temporadas['Home Goals For']  # Puede cambiar según tu objetivo

# Normalización de características
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Validación cruzada con GroupKFold
groups = premier_league_temporadas['Team']  # Agrupar por equipo
gkf = GroupKFold(n_splits=5)

# Entrenar el modelo de RandomForest
model = RandomForestRegressor(random_state=42)
for train_index, test_index in gkf.split(X_scaled, y, groups):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    model.fit(X_train, y_train)

# Función para predecir un partido
def predict_match(home_team_data, away_team_data):
    # Crear un DataFrame con características del partido
    match_data = pd.DataFrame({
        'Home Goals For Mean': [home_team_data['Home Goals For Mean']],
        'Home Goals Against Mean': [home_team_data['Home Goals Against Mean']],
        'Home xG For Mean': [home_team_data['Home xG For Mean']],
        'Home xG Against Mean': [home_team_data['Home xG Against Mean']],
        'Away Goals For Mean': [away_team_data['Away Goals For Mean']],
        'Away Goals Against Mean': [away_team_data['Away Goals Against Mean']],
        'Away xG For Mean': [away_team_data['Away xG For Mean']],
        'Away xG Against Mean': [away_team_data['Away xG Against Mean']],
        'Home Goals For Std': [home_team_data['Home Goals For Std']],
        'Home Goals Against Std': [home_team_data['Home Goals Against Std']],
        'Home xG For Std': [home_team_data['Home xG For Std']],
        'Home xG Against Std': [home_team_data['Home xG Against Std']],
        'Away Goals For Std': [away_team_data['Away Goals For Std']],
        'Away Goals Against Std': [away_team_data['Away Goals Against Std']],
        'Away xG For Std': [away_team_data['Away xG For Std']],
        'Away xG Against Std': [away_team_data['Away xG Against Std']]
    })
    
    # Normalizar las características del partido
    match_data_scaled = scaler.transform(match_data)
    
    # Hacer la predicción
    predicted_goals = model.predict(match_data_scaled)[0]
    return predicted_goals

# Lista de próximos partidos
upcoming_matches = pd.DataFrame({
    "Home Team": ["Wolves", "Brentford", "Leicester", "Chelsea", "Everton", 
                  "Nottingham Forest", "Ipswich", "Manchester Utd", "Arsenal", 
                  "West Ham", "Newcastle Utd"],
    "Away Team": ["Chelsea", "Southampton", "Aston Villa", "Crystal Palace", 
                  "Bournemouth", "Wolves", "Fulham", "Liverpool", 
                  "Brighton", "Manchester City", "Tottenham"]
})

# Función para obtener los datos del equipo
def get_team_data(team_name, df):
    try:
        # Extraemos la última fila disponible de ese equipo
        team_data = df[df['Team'] == team_name].iloc[-1]
        return team_data
    except IndexError:
        print(f"No se encontraron datos para el equipo {team_name}")
        return None

# Predecir los goles para cada partido futuro
for index, row in upcoming_matches.iterrows():
    home_team = row['Home Team']
    away_team = row['Away Team']
    
    # Obtener los datos del equipo local y visitante
    home_team_data = get_team_data(home_team, premier_league_temporadas)
    away_team_data = get_team_data(away_team, premier_league_temporadas)
    
    # Solo hacer la predicción si ambos equipos tienen datos disponibles
    if home_team_data is not None and away_team_data is not None:
        prediction = predict_match(home_team_data, away_team_data)
        print(f"Predicción de goles totales en el partido {home_team} vs {away_team}: {prediction}")
    else:
        print(f"Datos insuficientes para el partido {home_team} vs {away_team}")

TypeError: agg function failed [how->mean,dtype->object]