In [37]:
import time
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup
from unidecode import unidecode

In [2]:
# Download the FBref World Cup overview page and parse it.
# NOTE(review): the URL carries no season, while the cell that reads `soup`
# looks for a 2022-specific element id -- presumably this breaks once the
# page shows a later edition; confirm before re-running.
search_url = "https://fbref.com/en/comps/1/World-Cup-Stats"
# A timeout keeps the kernel from hanging forever on a stalled connection.
response = requests.get(search_url, timeout=30)
# Fail loudly on 4xx/5xx (e.g. rate limiting) instead of parsing an error page.
response.raise_for_status()
# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed (and to avoid bs4's parser-guessing warning).
soup = BeautifulSoup(response.content, "html.parser")

In [3]:
# Build a {team name: FBref squad id} mapping from the overall standings table.
standings_div = soup.find('div', {'id': 'div_results202210_overall'})
all_teams_table = standings_div.find('table') if standings_div is not None else None
if all_teams_table is None:
    # Without this guard the failure is an opaque AttributeError on None.
    raise ValueError(
        "Could not find the 'div_results202210_overall' standings table; "
        "the page layout or the displayed season may have changed."
    )

team_ids = {}
for row in all_teams_table.find_all('tr'):
    curr_row_data = row.find('td', {'data-stat': 'team'})
    # Rows without a linked team cell carry no squad id, so skip them.
    if not (curr_row_data and curr_row_data.a):
        continue
    team_link = curr_row_data.a
    # The squad id is taken as the 4th '/'-separated piece of the link target
    # (expected shape: '/en/squads/<team_id>/...').
    team_ids[team_link.text] = team_link.attrs['href'].split('/')[3]

In [6]:
# Display the scraped {team name: squad id} mapping for a visual sanity check.
team_ids

{'Argentina': 'f9fddd6e',
 'France': 'b1b36dcd',
 'Croatia': '7b08e376',
 'Morocco': 'af41ccda',
 'Netherlands': '5bb5024a',
 'England': '1862c019',
 'Brazil': '304635c3',
 'Portugal': '4a1b4ea8',
 'Japan': 'ffcf1690',
 'Senegal': '9ab5c684',
 'Australia': 'b90bf4f9',
 'Switzerland': '81021a70',
 'Spain': 'b561dd30',
 'United States': '0f66725b',
 'Poland': '8912dcf0',
 'Korea Republic': '473f0fbf',
 'Germany': 'c1e40422',
 'Ecuador': '123acaf8',
 'Cameroon': '896550da',
 'Uruguay': '870e020f',
 'Tunisia': 'a7c7562a',
 'Mexico': 'b009a548',
 'Belgium': '361422b9',
 'Ghana': '9349828d',
 'Saudi Arabia': '6e84edac',
 'IR Iran': '6a08f71e',
 'Costa Rica': '1ea5ab66',
 'Denmark': '29a4e4af',
 'Serbia': '1d6f5c9b',
 'Wales': 'b1bbcad3',
 'Canada': '9c6d90a0',
 'Qatar': '9b696ed1'}

In [12]:
def get_player_data(team_id , team_name):
    """Scrape one team's 2022 roster table from FBref.

    Parameters
    ----------
    team_id : str
        FBref squad identifier used in the squad URL.
    team_name : str
        Team name; used for the URL slug and stored under the ``"team"`` key
        of every returned record.

    Returns
    -------
    list[dict]
        One dict per ``<tr>`` in the roster table body. Each dict holds
        ``"team"`` plus one entry per cell, keyed by the cell's ``data-stat``
        attribute with the cell text as value.

    Raises
    ------
    requests.HTTPError
        If the squad page responds with an error status.
    ValueError
        If the page contains no ``<table id="roster">`` with a ``<tbody>``.
    """
    # Spaces are not valid in a URL path; use hyphens for multi-word names.
    url_slug = team_name.replace(" ", "-")
    url = f"https://fbref.com/en/squads/{team_id}/2022/{url_slug}-Men-Stats"

    response = requests.get(url, timeout=30)
    # Surface rate limiting / missing pages here rather than as a confusing
    # "NoneType has no attribute" error further down.
    response.raise_for_status()
    team_soup = BeautifulSoup(response.content, "html.parser")

    squad_table = team_soup.find('table', id = 'roster')
    if squad_table is None or squad_table.tbody is None:
        raise ValueError(f"Roster table not found for {team_name} ({url})")

    players = []
    for row in squad_table.tbody.find_all('tr'):
        player_dict = {"team": team_name}
        # Only look at the row's own <th>/<td> cells. Iterating the row
        # directly also yields whitespace text nodes, which have no attrs.
        for cell in row.find_all(['th', 'td'], recursive=False):
            stat_name = cell.get('data-stat')
            if stat_name is not None:
                player_dict[stat_name] = cell.text
        players.append(player_dict)

    return players

In [15]:
# Seconds to pause between squad-page requests so the site is not hammered.
REQUEST_DELAY_SECONDS = 30

# One entry per team; each entry is the list of player dicts for that team.
player_data = []

team_items = list(team_ids.items())
for position, (team_name, team_id) in enumerate(team_items, start=1):
    player_data.append(get_player_data(team_id, team_name))
    print(f"{team_name} Done.")
    # The pause only matters between requests; skip it after the last team.
    if position < len(team_items):
        time.sleep(REQUEST_DELAY_SECONDS)


Argentina Done.
France Done.
Croatia Done.
Morocco Done.
Netherlands Done.
England Done.
Brazil Done.
Portugal Done.
Japan Done.
Senegal Done.
Australia Done.
Switzerland Done.
Spain Done.
United States Done.
Poland Done.
Korea Republic Done.
Germany Done.
Ecuador Done.
Cameroon Done.
Uruguay Done.
Tunisia Done.
Mexico Done.
Belgium Done.
Ghana Done.
Saudi Arabia Done.
IR Iran Done.
Costa Rica Done.
Denmark Done.
Serbia Done.
Wales Done.
Canada Done.
Qatar Done.


In [16]:
# Collapse the per-team lists into a single flat list of player records.
player_data_flattened = []
for team_players in player_data:
    player_data_flattened.extend(team_players)

In [56]:
# Build the players table: one row per scraped record, one column per dict key.
players_df = pd.DataFrame(player_data_flattened)

In [57]:
# Preview the first rows of the table before any cleaning is applied.
players_df.head()

Unnamed: 0,team,shirtnumber,player,position,club,birth_place,birth_date,age,games,minutes,goals
0,Argentina,8,Marcos Acuña,"DF,FW",1.es Sevilla,"Zapala, Argentina",1991-10-28,31-023,6,373.0,0.0
1,Argentina,16,Thiago Almada,MF,1.us Atlanta Utd,"Argentina, Argentina",2001-04-26,21-208,1,7.0,0.0
2,Argentina,9,Julián Álvarez,FW,1.eng Manchester City,"Argentina, Argentina",2000-01-31,22-293,7,464.0,4.0
3,Argentina,1,Franco Armani,GK,1.ar River Plate,"Casilda, Argentina",1986-10-16,36-035,0,,
4,Argentina,15,Ángel Correa,MF,1.es Atlético Madrid,"Rosario, Argentina",1995-03-09,27-256,1,5.0,0.0


In [58]:
# Normalise team names: lower-case them, then trim surrounding whitespace.
team_lowercased = players_df['team'].str.lower()
players_df['team'] = team_lowercased.str.strip()

In [59]:
# Normalise player names: transliterate to ASCII (e.g. accents removed),
# then lower-case and trim surrounding whitespace.
player_ascii = players_df['player'].map(unidecode)
players_df['player'] = player_ascii.str.lower().str.strip()

In [60]:
def _drop_first_token(raw_club):
    """Return `raw_club` without its first whitespace-separated token."""
    tokens = raw_club.split()
    return " ".join(tokens[1:])


# Normalise club names: drop the leading token, transliterate to ASCII,
# then lower-case and trim.
# NOTE: not idempotent -- every re-run of this cell removes one more leading
# word from each club name, so rebuild `players_df` before running it again.
club_without_prefix = players_df['club'].map(_drop_first_token)
players_df['club'] = club_without_prefix.map(unidecode).str.lower().str.strip()

In [61]:
# Keep only the first listed position (e.g. "DF,FW" -> "DF").
# The vectorised .str accessor replaces the per-row lambda and passes missing
# values through as NaN instead of raising on them.
players_df['position'] = players_df['position'].str.split(',', n=1).str[0]

In [62]:
# Preview the first rows again to check the cleaned team/player/club/position columns.
players_df.head()

Unnamed: 0,team,shirtnumber,player,position,club,birth_place,birth_date,age,games,minutes,goals
0,argentina,8,marcos acuna,DF,sevilla,"Zapala, Argentina",1991-10-28,31-023,6,373.0,0.0
1,argentina,16,thiago almada,MF,atlanta utd,"Argentina, Argentina",2001-04-26,21-208,1,7.0,0.0
2,argentina,9,julian alvarez,FW,manchester city,"Argentina, Argentina",2000-01-31,22-293,7,464.0,4.0
3,argentina,1,franco armani,GK,river plate,"Casilda, Argentina",1986-10-16,36-035,0,,
4,argentina,15,angel correa,MF,atletico madrid,"Rosario, Argentina",1995-03-09,27-256,1,5.0,0.0


In [None]:
# Write the cleaned table to disk without the positional index column.
output_path = Path('../data/players.csv')
# to_csv does not create missing folders; make sure the target exists so the
# (slow to rebuild) scrape result is not lost to an OSError at the last step.
output_path.parent.mkdir(parents=True, exist_ok=True)
players_df.to_csv(output_path, index = False)