### Import and settings

In [1]:
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
import time

pd.set_option('display.max_rows', 25)
pd.set_option('display.max_columns', 50)

### Functions

In [2]:
def get_teams(year: int):
    start_year = year-1
    end_year = year
    
    url = f'https://fbref.com/en/comps/Big5/{start_year}-{end_year}/{start_year}-{end_year}-Big-5-European-Leagues-Stats'
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    result = requests.get(url, headers=headers)
    teams = pd.read_html(result.content)[0]
    soup = BeautifulSoup(result.content, 'html.parser')
    links = soup.find("table", {"id": "big5_table"}).find_all('tr')
    link_list = []
    
    for i in range(1, len(links)):
        link_list.append(links[i].find('a')['href'])

    teams['link'] = link_list
    return teams

def get_players(teams: pd.DataFrame):
    links = teams['link']
    team = teams['Squad']
    
    total = pd.DataFrame()
    
    for i in range(len(links)):
        time.sleep(3)
        url = f'https://fbref.com{links[i]}'
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
        result = requests.get(url, headers=headers)
        players = pd.read_html(result.content)[0]
        players.columns = players.columns.droplevel(0)
        players = players.drop(players.tail(2).index)
        soup = BeautifulSoup(result.content, 'html.parser')
        extra = soup.find("table", {"id": re.compile('stats_standard*')}).find_all('tr')
        id_list = []
        link_list = []
        squad = []
        
        for j in range(2, len(extra)-2):
            id_list.append(extra[j].find('th')['data-append-csv'])
            cur_link = extra[j].find('a')['href']
            if cur_link:
                link_list.append(cur_link)
            squad.append(team[i])
            
        players['link'] = link_list
        players['id'] = id_list
        players['team'] = squad
        
        total = pd.concat([players, total], ignore_index=True)
        
    return total

In [8]:
players = pd.DataFrame(columns = ['name', 'link', 'country', 'league_pos', 'MP', 'W', 'D', 'L', 'GF', 'GA', 'PTS',
                                     'xG', 'xGA'])
    
year = 2023

start_year = year-1
end_year = year

url = f'https://fbref.com/en/comps/Big5/{start_year}-{end_year}/{start_year}-{end_year}-Big-5-European-Leagues-Stats'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
result = requests.get(url, headers=headers)
soup = BeautifulSoup(result.content, 'html.parser')
teams = pd.read_html(result.content)

In [3]:
test = get_teams(2023)
test

Unnamed: 0,Rk,Squad,Country,LgRk,MP,W,D,L,GF,GA,GD,Pts,Pts/MP,xG,xGA,xGD,xGD/90,Attendance,Top Team Scorer,Goalkeeper,link
0,1,Napoli,it ITA,1,38,28,6,4,77,28,49,90,2.37,64.7,31.8,33.0,0.87,46173,Victor Osimhen - 26,Alex Meret,/en/squads/d48ad4ff/Napoli-Stats
1,2,Manchester City,eng ENG,1,38,28,5,5,94,33,61,89,2.34,78.7,32.1,46.6,1.23,53249,Erling Haaland - 36,Ederson,/en/squads/b8fd03ef/Manchester-City-Stats
2,3,Barcelona,es ESP,1,38,28,4,6,70,20,50,88,2.32,75.5,33.2,42.3,1.11,83498,Robert Lewandowski - 23,Marc-André ter Stegen,/en/squads/206d90db/Barcelona-Stats
3,4,Paris S-G,fr FRA,1,38,27,4,7,89,40,49,85,2.24,78.2,48.3,29.9,0.79,46334,Kylian Mbappé - 29,Gianluigi Donnarumma,/en/squads/e2d8892c/Paris-Saint-Germain-Stats
4,5,Arsenal,eng ENG,2,38,26,6,6,88,43,45,84,2.21,71.9,42.0,29.9,0.79,60191,"Martin Ødegaard, Martinelli - 15",Aaron Ramsdale,/en/squads/18bb7c10/Arsenal-Stats
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93,94,Southampton,eng ENG,20,38,6,7,25,36,73,-37,25,0.66,37.7,61.0,-23.3,-0.61,30440,James Ward-Prowse - 9,Gavin Bazunu,/en/squads/33c895d4/Southampton-Stats
94,95,Elche,es ESP,20,38,5,10,23,30,67,-37,25,0.66,37.5,70.0,-32.6,-0.86,19875,Lucas Boyé - 7,Édgar Badía,/en/squads/6c8b07df/Elche-Stats
95,96,Troyes,fr FRA,19,38,4,12,22,45,81,-36,24,0.63,39.1,78.0,-38.9,-1.02,10004,Mama Samba Baldé - 12,Gauthier Gallon,/en/squads/54195385/Troyes-Stats
96,97,Sampdoria,it ITA,20,38,3,10,25,24,71,-47,19,0.50,34.1,66.3,-32.2,-0.85,20133,Manolo Gabbiadini - 7,Emil Audero,/en/squads/8ff9e3b3/Sampdoria-Stats


In [8]:
all_players = get_players(test)

In [9]:
all_players.to_csv("fbref_2023.csv", index=False)

In [67]:
url = f'https://fbref.com/en/squads/54195385/Troyes-Stats'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
result = requests.get(url, headers=headers)
#players = pd.read_html(result.content)[0]
soup = BeautifulSoup(result.content, 'html.parser')
extra = soup.find("table", {"id": re.compile('stats_standard*')}).find_all('tr')
id_list = []
link_list = []

for j in range(2, len(extra)-2):
    print(extra[j].find('th'))
    id_list.append(extra[j].find('th'))
    link_list.append(extra[j].find('a'))

AttributeError: 'NoneType' object has no attribute 'find_all'