### Imports and settings

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Cap how much of a DataFrame is rendered in cell output:
# at most 25 rows and 50 columns before pandas truncates the display.
pd.options.display.max_rows = 25
pd.options.display.max_columns = 50

### Functions

In [27]:
def get_teams(year: int):
    """Scrape the combined Big-5 European leagues table for one season.

    The season is addressed as ``{year-1}-{year}`` in the fbref URL, so
    ``get_teams(2023)`` requests the 2022-2023 table.

    Args:
        year: calendar year in which the season ends.

    Returns:
        DataFrame with columns ``name, link, country, league_pos, MP, W, D,
        L, GF, GA, PTS, xG, xGA``. All values are the raw cell strings.
        Rows are in the reverse of the order in which they appear on the
        page (the first table row ends up last), matching the historical
        behaviour of this function.

    Raises:
        requests.HTTPError: if the page request returns an error status.
    """
    columns = ['name', 'link', 'country', 'league_pos', 'MP', 'W', 'D', 'L', 'GF', 'GA', 'PTS',
               'xG', 'xGA']
    # Positions of the <td> cells feeding the columns after name/link.
    # Cells 9 and 11 are not collected.
    cell_idx = (1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 13)

    start_year = year - 1
    end_year = year

    url = f'https://fbref.com/en/comps/Big5/{start_year}-{end_year}/{start_year}-{end_year}-Big-5-European-Leagues-Stats'
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    result = requests.get(url, headers=headers)
    # Fail with the real HTTP error instead of a confusing AttributeError on
    # a missing table further down.
    result.raise_for_status()
    soup = BeautifulSoup(result.content, 'html.parser')
    table = soup.find("table", {"id": "big5_table"}).find_all('tr')

    # Collect plain records and build the frame once; concatenating inside
    # the loop re-copies the whole frame on every iteration.
    records = []
    for row in table[1:]:  # table[0] is skipped, as before
        team = row.find('a')
        vals = row.find_all('td')
        if team is None or not vals:
            # Not a team row (no link or no data cells); nothing to record.
            continue
        records.append([team.text, team['href'], *(vals[k].text for k in cell_idx)])

    # Pause after the request before handing control back to the caller
    # (presumably to stay polite towards the site when called repeatedly).
    time.sleep(2)

    # Earlier versions prepended each new row, so the result was in reverse
    # page order; keep that ordering for existing callers.
    return pd.DataFrame(records[::-1], columns=columns)

def get_players(teams: pd.DataFrame):
    """Scrape one stats record for every link in ``teams['link']``.

    Each link is appended to ``https://fbref.com`` and the resulting page is
    parsed into a 50-column record: identity fields, club/league/nation,
    rating, 35 outfield attributes and 6 goalkeeper attributes.

    Args:
        teams: frame with a ``link`` column of site-relative paths.

    Returns:
        DataFrame with the 50 columns listed in ``columns`` below. Values
        are raw strings, with ``None`` for attributes that do not apply to
        the record. Rows are in reverse processing order (the last link
        processed is row 0), matching the historical behaviour.

    Raises:
        requests.HTTPError: if a page request returns an error status.

    NOTE(review): ``position``, ``out`` and ``count`` are read from
    notebook-level state that is not defined in the visible part of the
    notebook -- confirm they exist (or turn them into parameters) before
    running this on a fresh kernel.
    NOTE(review): the element ids/classes parsed below (``pcdisplay-rat``,
    ``page-info``, ``card-body``, ``info_content``) may not match the pages
    behind fbref squad links -- confirm the markup before relying on this.
    """
    columns = ['player_id', 'dob', 'club', 'club_id', 'league', 'league_id', 'nation', 'card_id', 'rating',
               'pac', 'acceleration', 'sprint_speed', 'sho', 'positioning', 'finishing', 'shot_power', 'long_shots',
               'volleys', 'penalties', 'pas', 'vision', 'crossing', 'fk_accuracy', 'short_passing', 'long_passing',
               'curve', 'dri', 'agility', 'balance', 'reactions', 'ball_control', 'dribbling', 'composure',
               'def', 'interceptions', 'heading_acc', 'def_awareness', 'standing_tackle', 'sliding_tackle',
               'phy', 'jumping', 'stamina', 'strength', 'aggression',
               'gk_diving', 'gk_handling', 'gk_kicking', 'gk_reflexes', 'gk_speed', 'gk_positioning']
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

    # Use the frame that was passed in, not a notebook global.
    links = teams['link']

    rows = []
    # Iterate positionally so a non-default index on `teams` does not matter.
    for i, link in enumerate(links):
        url = f'https://fbref.com{link}'
        result = requests.get(url, headers=headers)
        result.raise_for_status()
        soup = BeautifulSoup(result.content, 'html.parser')
        rating = soup.find("div", {"class": "pcdisplay-rat"})
        ids = soup.find("div", {"id": "page-info"})
        stat_tab = soup.find("div", {"class": "card-body"})
        ss = stat_tab.find_all("div", {"class": "stat_val"})
        # Third whitespace-separated token of the link's title attribute.
        dob = soup.find("a", {"style": "color : #346fda;"}).get("title").split()[2]

        spec_list = soup.find("div", {"id": "info_content"})
        spec_tab = spec_list.find_all("td")
        club = ""
        league = ""
        nation = ""

        spec_len = len(spec_tab)
        # The two ids are always the last two cells of the info table.
        club_id = spec_tab[spec_len-2].text
        league_id = spec_tab[spec_len-1].text
        year = int(ids.get("data-year"))
        # An "Intl. Rep " header row shifts the club/nation/league cells by one.
        int_rep = spec_list.find_all("th")[spec_len-14].text == "Intl. Rep "

        if year == 23:
            if int_rep:
                club = spec_tab[spec_len-19].find("a").text
                try:
                    league = spec_tab[spec_len-17].find("a").text
                except Exception:
                    # Best effort: report the page and leave `league` empty.
                    # (Narrowed from a bare except so Ctrl-C still works.)
                    print(url)
                nation = spec_tab[spec_len-18].find("a").text
            else:
                club = spec_tab[spec_len-18].find("a").text
                league = spec_tab[spec_len-16].find("a").text
                nation = spec_tab[spec_len-17].find("a").text
        elif 15 <= year < 23:
            # Older layouts keep these cells at fixed positions from the top.
            club = spec_tab[1].find("a").text
            league = spec_tab[3].find("a").text
            nation = spec_tab[2].find("a").text
        else:
            print("L")

        outfield = bool(ss) and position[i] != 'GK'
        base = [ids.get("data-baseid"), dob, club, club_id, league, league_id, nation,
                ids.get('data-id'), rating.text]
        if outfield:
            # Odd entries 1..69 of `ss` fill the 35 outfield columns; gk_* stay empty.
            attrs = [ss[k].text for k in range(1, 70, 2)] + [None] * 6
        else:
            # Outfield columns stay empty; entries 1, 5, ..., 21 fill the 6 gk_* columns.
            attrs = [None] * 35 + [ss[k].text for k in range(1, 22, 4)]
        rows.append(base + attrs)

        # Periodic progress report; this check used to sit after the loop,
        # where it could fire at most once.
        if out and i % count == 0:
            print(i)

    # Earlier versions prepended every new row, so keep the reversed order.
    return pd.DataFrame(rows[::-1], columns=columns)

In [13]:
# Scratch cell: fetch the 2022-2023 Big-5 table by hand and inspect one row
# to work out which <td> index holds which statistic.
#
# NOTE(review): despite its name, `players` is an empty frame that only
# carries the team-table column names. Other cells may read this global, so
# keep the name until they have been checked.
players = pd.DataFrame(columns = ['name', 'link', 'country', 'league_pos', 'MP', 'W', 'D', 'L', 'GF', 'GA', 'PTS',
                                     'xG', 'xGA'])
    
year = 2023

# The season is addressed as "<year-1>-<year>" in the URL.
start_year = year-1
end_year = year

url = f'https://fbref.com/en/comps/Big5/{start_year}-{end_year}/{start_year}-{end_year}-Big-5-European-Leagues-Stats'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
result = requests.get(url, headers=headers)
soup = BeautifulSoup(result.content, 'html.parser')
# NOTE(review): `teams` here is the list of <tr> rows of the table, not a DataFrame.
teams = soup.find("table", {"id": "big5_table"}).find_all('tr')
# Row 1 is the first row inspected (row 0 is not used): show its team link...
print(teams[1].find('a'))
# ...and its cell at index 4 (the recorded output shows data-stat="wins").
teams[1].find_all('td')[4]

<a href="/en/squads/d48ad4ff/Napoli-Stats">Napoli</a>


<td class="right" data-stat="wins">28</td>

In [28]:
# Run the scraper for the season ending in 2023 and display the resulting frame.
test = get_teams(2023)
test

Unnamed: 0,name,link,country,league_pos,MP,W,D,L,GF,GA,PTS,xG,xGA
0,Angers,/en/squads/69236f98/Angers-Stats,fr FRA,20,38,4,6,28,33,81,18,40.9,61.7
1,Sampdoria,/en/squads/8ff9e3b3/Sampdoria-Stats,it ITA,20,38,3,10,25,24,71,19,34.1,66.3
2,Troyes,/en/squads/54195385/Troyes-Stats,fr FRA,19,38,4,12,22,45,81,24,39.1,78.0
3,Elche,/en/squads/6c8b07df/Elche-Stats,es ESP,20,38,5,10,23,30,67,25,37.5,70.0
4,Southampton,/en/squads/33c895d4/Southampton-Stats,eng ENG,20,38,6,7,25,36,73,25,37.7,61.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
93,Arsenal,/en/squads/18bb7c10/Arsenal-Stats,eng ENG,2,38,26,6,6,88,43,84,71.9,42.0
94,Paris S-G,/en/squads/e2d8892c/Paris-Saint-Germain-Stats,fr FRA,1,38,27,4,7,89,40,85,78.2,48.3
95,Barcelona,/en/squads/206d90db/Barcelona-Stats,es ESP,1,38,28,4,6,70,20,88,75.5,33.2
96,Manchester City,/en/squads/b8fd03ef/Manchester-City-Stats,eng ENG,1,38,28,5,5,94,33,89,78.7,32.1
