In [2]:
import requests
from bs4 import BeautifulSoup
from io import StringIO
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

years = list(range(2024,1992,-1))
url_winners = "https://www.premierleague.com/awards?at=2&aw=20&se=-1"
url_PS = "https://fbref.com/en/comps/9/stats/Premier-League-Stats#header"
playerStats= []

#estas lineas son para extraer el javascript en el html ya que la tabla que uso no esta en html directamente
options = Options() 
options.headless = True #esto es para que no se abra una nueva ventana de chrome
options.add_argument('window-size=1920x1080')
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=options) #inicializas el driver

for year in years:
    driver.get(url_PS)
    time.sleep(5)
    html_content = driver.page_source
    
    try:
        # Use BeautifulSoup to find the link to the previous season
        soup = BeautifulSoup(html_content, 'html.parser')
        previousSeasonLink = soup.select_one("a.prev")
        if previousSeasonLink:
            url_PS = "https://fbref.com" + previousSeasonLink.get("href")
        else:
            break  # Exit loop if no previous season link is found

        # Extract the player stats table
        playerStatsTable = pd.read_html(StringIO(html_content), match="Player Standard Stats")[0]
        playerStatsTable.drop_duplicates(inplace=True)
        playerStatsTable["Season"] = year
        playerStatsTable.columns = playerStatsTable.columns.droplevel()
        playerStatsTable.drop_duplicates(inplace=True)
        playerStatsTable.drop(["Matches"], axis=1, inplace=True)
        
        columns = playerStatsTable.columns
        new_columns = []
        seen = {}
        for column in columns:
            seen[column] = seen.get(column, 0) + 1
            if seen[column] > 1:
                new_columns.append(f"{column}_{seen[column] - 1}")
            else:
                new_columns.append(column)
        playerStatsTable.columns = new_columns
        
        playerStats.append(playerStatsTable)
    except ValueError as e:
        print(f"Failed to find table in year {year}: {e}")
        
    time.sleep(5)
driver.quit()

In [4]:
playerStats[0]

Unnamed: 0,Rk,Player,Nation,Pos,Squad,Age,Born,MP,Starts,Min,...,Ast_1,G+A_1,G-PK_1,G+A-PK,xG_1,xAG_1,xG+xAG,npxG_1,npxG+xAG_1,Unnamed: 21
0,1,Max Aarons,eng ENG,DF,Bournemouth,23,2000,20,13,1237,...,0.07,0.07,0.00,0.07,0.00,0.06,0.06,0.00,0.06,2024
1,2,Joshua Acheampong,eng ENG,DF,Chelsea,17,2006,1,0,6,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,2024
2,3,Bénie Adama Traore,ci CIV,"FW,MF",Sheffield Utd,20,2002,8,3,387,...,0.00,0.00,0.00,0.00,0.06,0.13,0.19,0.06,0.19,2024
3,4,Tyler Adams,us USA,MF,Bournemouth,24,1999,3,1,121,...,0.00,0.00,0.00,0.00,0.00,0.06,0.06,0.00,0.06,2024
4,5,Tosin Adarabioyo,eng ENG,DF,Fulham,25,1997,20,18,1617,...,0.00,0.11,0.11,0.11,0.04,0.01,0.05,0.04,0.05,2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
598,576,Nicolò Zaniolo,it ITA,"FW,MF",Aston Villa,24,1999,25,9,839,...,0.00,0.21,0.21,0.21,0.28,0.11,0.39,0.28,0.39,2024
599,577,Anass Zaroury,ma MAR,"FW,MF",Burnley,22,2000,6,1,152,...,0.00,0.00,0.00,0.00,0.10,0.02,0.12,0.10,0.12,2024
600,578,Oleksandr Zinchenko,ua UKR,DF,Arsenal,26,1996,27,20,1722,...,0.10,0.16,0.05,0.16,0.03,0.13,0.16,0.03,0.16,2024
601,579,Kurt Zouma,fr FRA,DF,West Ham,28,1994,33,32,2838,...,0.00,0.10,0.10,0.10,0.06,0.01,0.06,0.06,0.06,2024


In [5]:
statsTable = pd.concat(playerStats)
statsTable = statsTable.drop(statsTable.columns[0], axis=1)
statsTable = statsTable.reset_index(drop=True)
statsTable.to_csv("AllPlayerStats.csv")