In [1]:
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

## Crawling

In [2]:
# En caso de que quiera interactuar con el usuario
def create_list_of_links(url):
    """Return the unique ``href`` values of every ``<a>`` tag found at ``url``.

    Parameters
    ----------
    url : str
        Address of the page to download and parse.

    Returns
    -------
    list
        The ``href`` attribute of each anchor, de-duplicated and kept in order
        of first appearance. Anchors without an ``href`` are skipped.
    """
    reqs = requests.get(url)
    soup = BeautifulSoup(reqs.text, 'html.parser')

    # link.get('href') is None for anchors that carry no href attribute.
    hrefs = (link.get('href') for link in soup.find_all('a'))

    # dict.fromkeys de-duplicates while preserving insertion order. Filtering
    # None here (instead of calling list.remove(None) afterwards) avoids a
    # ValueError on pages where every anchor has an href.
    return list(dict.fromkeys(href for href in hrefs if href is not None))

https://en.wikipedia.org/wiki/2022_FIFA_World_Cup_squads

In [3]:
listA = create_list_of_links('https://en.wikipedia.org/wiki/2022_FIFA_World_Cup_squads')

In [4]:
len(listA)

1995

 - We need to list all the web pages which contain __'/wiki/xxxx_FIFA_World_Cup_squads'__ , where xxxx represents a year. Web [root](https://en.wikipedia.org) 

Let's filter the urls we want

In [5]:
# Length of a reference squads path; filter_list compares the length of each
# candidate link against this value.
sudDirsStringLength = len('/wiki/1930_FIFA_World_Cup_squads')  # we could try to filter by the subdirectories string length

In [6]:
# Inputs for filter_list: the keyword a link must contain and the prefix that
# turns a relative link into an absolute URL.
subString = 'squads'                   # keyword
rootURL = 'https://en.wikipedia.org'   # website root address

The function `filter_list` receives three parameters: a list with all the links we got after parsing, a substring to look for, and the website root address.

In [7]:
def filter_list(aList, mySubString, rootAddress, expectedLength=None):
    """Build absolute URLs for the links that look like squads pages.

    A link is kept when it contains ``mySubString`` and its length equals
    ``expectedLength``.

    Parameters
    ----------
    aList : iterable
        Relative links (e.g. ``'/wiki/1930_FIFA_World_Cup_squads'``).
        Entries that are not strings are ignored.
    mySubString : str
        Keyword every accepted link must contain.
    rootAddress : str
        Prefix prepended to each accepted link.
    expectedLength : int, optional
        Exact length an accepted link must have. When omitted, the
        notebook-level ``sudDirsStringLength`` is used.

    Returns
    -------
    list of str
        ``rootAddress + link`` for every accepted link, in input order.
    """
    if expectedLength is None:
        # Looked up at call time so existing three-argument calls keep working.
        expectedLength = sudDirsStringLength

    requiredURLS = []
    for element in aList:
        if not isinstance(element, str):
            continue
        # Use the mySubString parameter; the global `subString` was read here
        # before, which silently ignored the caller's argument.
        if mySubString in element and len(element) == expectedLength:
            requiredURLS.append(rootAddress + element)
    return requiredURLS

In [8]:
# Get all the squads of every FIFA World Cup (list)
squadsHystorical = filter_list(listA,subString, rootURL)   

In [9]:
squadsHystorical

['https://en.wikipedia.org/wiki/1930_FIFA_World_Cup_squads',
 'https://en.wikipedia.org/wiki/1934_FIFA_World_Cup_squads',
 'https://en.wikipedia.org/wiki/1938_FIFA_World_Cup_squads',
 'https://en.wikipedia.org/wiki/1950_FIFA_World_Cup_squads',
 'https://en.wikipedia.org/wiki/1954_FIFA_World_Cup_squads',
 'https://en.wikipedia.org/wiki/1958_FIFA_World_Cup_squads',
 'https://en.wikipedia.org/wiki/1962_FIFA_World_Cup_squads',
 'https://en.wikipedia.org/wiki/1966_FIFA_World_Cup_squads',
 'https://en.wikipedia.org/wiki/1970_FIFA_World_Cup_squads',
 'https://en.wikipedia.org/wiki/1974_FIFA_World_Cup_squads',
 'https://en.wikipedia.org/wiki/1978_FIFA_World_Cup_squads',
 'https://en.wikipedia.org/wiki/1982_FIFA_World_Cup_squads',
 'https://en.wikipedia.org/wiki/1986_FIFA_World_Cup_squads',
 'https://en.wikipedia.org/wiki/1990_FIFA_World_Cup_squads',
 'https://en.wikipedia.org/wiki/1994_FIFA_World_Cup_squads',
 'https://en.wikipedia.org/wiki/1998_FIFA_World_Cup_squads',
 'https://en.wikipedia.o

We got all the URLs we needed to start.

# Specify the data we are looking for

In [10]:
# Select a table and parse it to get all the html info

In [11]:
# Download the first URL of squadsHystorical and parse its HTML; `soup` is the
# parsed page that the following cells query.
page = requests.get(squadsHystorical[0]).text    # at the first web page (1st World Cup)
soup = BeautifulSoup(page, 'html.parser')

In [12]:
allTablesInPage = soup.find_all('table')

## Voy a suponer que en todas las páginas voy a encontrarme con el mismo tipo de tabla

In [13]:
# There are several types of table in the webpage. Therefore, we need to find out which one includes the squads data.

In [14]:
def get_tables_classes(setOfTables):
    """Collect the ``class`` attribute of every table, in input order.

    Each item of ``setOfTables`` is queried with ``.get('class')``, so the
    result holds whatever that lookup returns (``None`` when the attribute is
    absent).
    """
    return [tbl.get('class') for tbl in setOfTables]

In [15]:
tablesClasses = get_tables_classes(allTablesInPage)

The `class_=['sortable', 'wikitable', 'plainrowheaders']` is our candidate table class, because the other table classes appear too few times to be the ones containing the __xx__ different squads' data. Let's check this out.

In [16]:
# get the number of participating national teams, this number may vary from cup to cup.
def get_total_squads(setOfTables):
    """Count the tables whose ``class`` is exactly the squads class list.

    A table is counted only when ``table.get('class')`` equals
    ``['sortable', 'wikitable', 'plainrowheaders']`` (same items, same order).
    """
    squadClasses = ['sortable', 'wikitable', 'plainrowheaders']
    return sum(1 for tbl in setOfTables if tbl.get('class') == squadClasses)

In [17]:
totalSquads = get_total_squads(allTablesInPage)

In [18]:
totalSquads

13

In [19]:
def get_first_squad_table(mySoup):
    """Return the first squads table found in ``mySoup``.

    The lookup is ``mySoup.find`` for a ``table`` whose class is
    ``"sortable wikitable plainrowheaders"``; its result is returned as is.
    """
    return mySoup.find('table', class_="sortable wikitable plainrowheaders")

In [20]:
playersTableText = get_first_squad_table(soup)
#playersTableText

In [21]:
# We can get the country names from the 'h3' tags. However there are more 'h3' tags than squads, that's why we'll limit the 
# numbers of tags to get.

In [22]:
listH3CountryNames = soup.find_all("h3", limit = totalSquads)

So, we have thirteen tables, and country names for the thirteen squads of the first World Cup, it seems we are good to go.

In [23]:
# Every fourth year from 1930 through 2022, leaving out the candidates that
# fall strictly between 1938 and 1950 (i.e. 1942 and 1946).
worldCupYears = [
    year
    for year in range(1930, 2023, 4)
    if not (1938 < year < 1950)
]

### Creo el DataFrame

As we've identified that all the squads data is in `['sortable', 'wikitable', 'plainrowheaders']` class tables, we'll get all of these tables.

In [24]:
def get_all_squads_tables(mySoup):
    """Return every squads table found in ``mySoup``.

    Delegates to ``mySoup.find_all`` for ``table`` elements whose class is
    ``"sortable wikitable plainrowheaders"`` and returns that result unchanged.
    """
    return mySoup.find_all('table', class_="sortable wikitable plainrowheaders")

In [25]:
squadsTables = get_all_squads_tables(soup)   

In [26]:
# Not all teams have the same number of players. So, we'll have to update this number for each squad.

In [27]:
def get_total_players_in_squad(mySoup, index):
    """Return the number of rows, minus the first one, of a squads table.

    Parameters
    ----------
    mySoup
        Parsed page; queried with ``find_all`` for the squads tables.
    index : int
        Position of the wanted table among the squads tables.
    """
    squadTable = mySoup.find_all('table', class_="sortable wikitable plainrowheaders")[index]
    # The first <tr> is skipped so that only the remaining rows are counted.
    playerRows = squadTable.find_all('tr')[1:]
    return len(playerRows)

In [28]:
# Index 4 selects the fifth squads table of the page (presumably Yugoslavia,
# the fifth entry of listNationalTeams - confirm).
totalPlayers = get_total_players_in_squad(soup, 4)

In [29]:
totalPlayers

17

In [30]:
# Pandas doesn't fill the column "Date of birth" correctly, so we'll have to work around this issue

This is the table shape and the corresponding tags containing the data 

|No.   |Pos.|Player|Date of birth | Caps | Club |
|------|--------|------|--------------|------|------|
|td tag| th tag |td tag|td tag        |td tag|td tag|

So, we have six columns, and we are interested only in the 'Date of birth' column. In order to get all the dates of birth, we'll look for all the 'td' tags, then filter those containing a date. Normally, we would start filtering at the fourth element, because that's the position of 'Date of birth'. However, the 'Position' column won't be collected because it uses a 'th' tag, which lowers by one the index of the elements to its right. For the same reason, we'll use an offset of five to get each next date of birth, instead of six.

The tables of 2018 and 2022 have an additional column that stores each player's goals with their national team before that year's World Cup. The resulting shape is the following:

|No.   |Pos.|Player|Date of birth | Caps |Goals | Club |
|------|--------|------|--------------|------|------|------|
|td tag| th tag |td tag|td tag        |td tag|td tag|td tag|

The reasoning to get the 'Date of birth' for these tables is pretty much the same; the only difference is that the offset has grown by one.

Let's find out how many columns were created inside a 'td' tag, and how many inside a 'th' tag. Once we get these numbers, we'll use them to calculate an offset to reach the next data of the same column.

In [31]:
def columns_in_tags(aTable):
    """Count the ``td`` and ``th`` cells in the first data row of ``aTable``.

    Parameters
    ----------
    aTable
        A table element exposing ``tbody.contents``.

    Returns
    -------
    tuple of (int, int)
        ``(totalTdTags, amountOfThTags)`` for the second ``<tr>`` of the table
        body (the first one is treated as the header row).

    Raises
    ------
    IndexError
        If the table body holds fewer than two ``<tr>`` rows.
    """
    # tbody.contents mixes row tags with whitespace text nodes, so a fixed
    # index such as contents[2] can land on a newline string depending on the
    # markup. Keep only the <tr> children and index among those instead.
    rows = [node for node in aTable.tbody.contents
            if getattr(node, 'name', None) == 'tr']
    firstDataRow = rows[1]   # rows[0] is the header row

    totalTdTags = len(firstDataRow.find_all('td'))
    amountOfThTags = len(firstDataRow.find_all('th'))
    return (totalTdTags, amountOfThTags)

In [32]:
columns_in_tags(squadsTables[0])

(5, 1)

In [33]:
def get_list_of_birthdays(aTable, number):
    """Extract the 'Date of birth' text of the first ``number`` rows.

    All ``td`` cells of ``aTable`` are read as one flat list. The date of
    birth is the third ``td`` of a row (index 2): it is the fourth column, but
    the 'Pos.' column is a ``th`` and therefore not part of that list. Rows
    are ``columns_in_tags(aTable)[0]`` cells apart.

    Parameters
    ----------
    aTable
        Squads table element.
    number : int
        Number of rows to read.

    Returns
    -------
    list
        One entry per row: the text of the cell's first ``span`` when there
        is one, otherwise the text of the cell itself, or ``None`` when the
        table has no cell at the expected position.
    """
    offset = columns_in_tags(aTable)[0]
    cells = aTable.find_all('td')      # same for every row, so query it once

    listOfBirthDays = []
    for row in range(number):
        position = 2 + row * offset
        if not 0 <= position < len(cells):
            # Missing cell: keep the list aligned with the rows rather than
            # reusing the previous row's value.
            listOfBirthDays.append(None)
            continue
        dateOfBirth = cells[position]
        span = dateOfBirth.span
        listOfBirthDays.append(span.text if span is not None else dateOfBirth.text)
    return listOfBirthDays

In [34]:
listOfBirthDays = get_list_of_birthdays(squadsTables[4], totalPlayers)

In [35]:
#listOfBirthDays

In [36]:
def get_list_of_clubs_federations(aTable, number):
    """Extract the flag ``alt`` text of the 'Club' cell for ``number`` rows.

    All ``td`` cells of ``aTable`` are read as one flat list and the club cell
    is taken as the last ``td`` of each row. With five ``td`` per row that is
    index 4; with the extra 'Goals' column (six ``td``) it is index 5.

    Parameters
    ----------
    aTable
        Squads table element.
    number : int
        Number of rows to read.

    Returns
    -------
    list
        One entry per row: the ``alt`` attribute of the image found inside
        the cell's first ``a`` (or, failing that, its first ``span``), or
        ``None`` when the cell or the image is missing.
    """
    offset = columns_in_tags(aTable)[0]
    cells = aTable.find_all('td')      # same for every row, so query it once
    clubColumn = offset - 1            # 'Club' is the last td of a row

    listOfClubsFederations = []
    for row in range(number):
        position = clubColumn + row * offset
        alt = None
        if 0 <= position < len(cells):
            clubFederation = cells[position]
            # Prefer the image wrapped in a link, then the one in a span.
            for wrapper in (clubFederation.a, clubFederation.span):
                img = wrapper.img if wrapper is not None else None
                if img is not None:
                    alt = img.get("alt")
                    break
        listOfClubsFederations.append(alt)
    return listOfClubsFederations

In [37]:
listOfClubsOrigin = get_list_of_clubs_federations(squadsTables[4], totalPlayers)

In [38]:
#listOfClubsOrigin

In [39]:
# The following function receives both, a beautiful soup object with semi-processed data and the total number of participating
# national teams of a given World Cup; it returns a list with the names of all the teams involved in the event.

In [40]:
def get_listed_national_teams(resultSet, numberOfSquads):
    """Return the ``span`` text of the first ``numberOfSquads`` items.

    Parameters
    ----------
    resultSet
        Indexable collection of elements exposing ``.span.text``.
    numberOfSquads : int
        How many leading items to read.
    """
    return [resultSet[idx].span.text for idx in range(numberOfSquads)]

In [41]:
listNationalTeams= get_listed_national_teams(listH3CountryNames, totalSquads)

In [42]:
listNationalTeams

['Argentina',
 'Chile',
 'France',
 'Mexico',
 'Yugoslavia',
 'Brazil',
 'Bolivia',
 'Uruguay',
 'Romania',
 'Peru',
 'United States',
 'Paraguay',
 'Belgium']

## Files Management

In [56]:
import os 
from os import path

In [54]:
def create_directories_tree(listOfWCYears):
    """Create ``dataSets/<year>_world_cup`` for every year in ``listOfWCYears``.

    Existing directories are left untouched, so the call can be repeated.

    Parameters
    ----------
    listOfWCYears : iterable
        Years used to name the directories.
    """
    for year in listOfWCYears:
        # os.path.join picks the platform separator; a hard-coded backslash
        # would create one folder literally named 'dataSets\<year>_world_cup'
        # on non-Windows systems.
        os.makedirs(os.path.join('dataSets', f'{year}_world_cup'), exist_ok=True)
         

In [66]:
# create_directories_tree uses makedirs(..., exist_ok=True), so calling it
# again is harmless and no pre-check is needed. The previous guard joined its
# two conditions with `&`, which does not short-circuit: os.listdir('dataSets')
# ran even when the folder did not exist yet and raised FileNotFoundError on a
# fresh run.
create_directories_tree(worldCupYears)

In [61]:
# os.listdir returns entries in arbitrary order; sort them so that positional
# indexing is stable (folder names start with the four-digit year, so index 0
# is the earliest World Cup).
listdir = sorted(os.listdir('dataSets'))

## Execution

In [67]:
# Export one CSV per squad of the page held in `soup`.
j = 0   # index into worldCupYears of the World Cup being exported
for i in range(totalSquads):

    # Take the table from the squads-only list: get_total_players_in_squad
    # indexes that same list, whereas allTablesInPage holds every table of the
    # page, so its i-th item is not guaranteed to be the i-th squad.
    myTTable = squadsTables[i]
    totalRows = get_total_players_in_squad(soup, i)

    listOfBirthDays = get_list_of_birthdays(myTTable, totalRows)
    listOfClubsOrigin = get_list_of_clubs_federations(myTTable, totalRows)

    # read_html returns a list of frames. The markup is wrapped in StringIO
    # because passing literal HTML strings to read_html is deprecated.
    df = pd.concat(pd.read_html(StringIO(str(myTTable))))

    # Older tables have no 'Goals' column; add an empty one so every file
    # shares the same layout.
    if 'Goals' not in df.columns:
        df.insert(5, 'Goals', np.nan)

    df['Club Origin'] = listOfClubsOrigin
    df['Date of birth (age)'] = listOfBirthDays

    df.insert(0, 'National Team', listNationalTeams[i])
    players = df.pop('Player')
    df.insert(0, 'Player', players)

    # The year comes from the World Cup index j, not from the squad index i
    # (worldCupYears[i] labelled every squad with a different year).
    nameFile = f'{worldCupYears[j]}_{listNationalTeams[i]}'

    # Build the folder from the year itself so the result does not depend on
    # the order of a directory listing.
    outputDir = os.path.join('dataSets', f'{worldCupYears[j]}_world_cup')
    os.makedirs(outputDir, exist_ok=True)
    df.to_csv(os.path.join(outputDir, f'{nameFile}.csv'), index=False)

In [None]:
# Re-writes the last squad exported by the loop above. The folder is selected
# with the World Cup index `j`; the squad counter `i` left over from the loop
# would point at an unrelated World Cup folder.
df.to_csv(f'dataSets\\{listdir[j]}\\{nameFile}.csv', index=False)

In [46]:
# NOTE(review): this reads '1930Yugoslavia.csv' from the working directory,
# whereas the export loop writes '<year>_<team>.csv' inside a sub-folder of
# dataSets - confirm the file still exists or update the path.
pd.read_csv('1930Yugoslavia.csv')

Unnamed: 0,Player,National Team,No.,Pos.,Date of birth (age),Caps,Goals,Club,Club Origin
0,Milorad Arsenijević,Yugoslavia,,MF,(1906-06-06),16,,BSK Beograd,Kingdom of Yugoslavia
1,Ivan Bek,Yugoslavia,,FW,(1909-10-29),2,,FC Sète,France
2,Momčilo Đokić,Yugoslavia,,MF,(1911-02-27),2,,SK Jugoslavija,Kingdom of Yugoslavia
3,Branislav Hrnjiček,Yugoslavia,,MF,(1908-06-05),4,,SK Jugoslavija,Kingdom of Yugoslavia
4,Milutin Ivković (c),Yugoslavia,,DF,(1906-03-03),22,,SK Soko,Kingdom of Yugoslavia
5,Milovan Jakšić,Yugoslavia,,GK,(1909-09-19),2,,SK Soko,Kingdom of Yugoslavia
6,Blagoje Marjanović,Yugoslavia,,FW,(1907-09-09),15,,BSK Beograd,Kingdom of Yugoslavia
7,Bozidar Marković,Yugoslavia,,FW,(1900-01-01),0,,SK Vojvodina,Kingdom of Yugoslavia
8,Dragoslav Mihajlović,Yugoslavia,,DF,(1906-12-13),1,,BSK Beograd,Kingdom of Yugoslavia
9,Dragutin Najdanović,Yugoslavia,,FW,(1908-04-15),3,,BSK Beograd,Kingdom of Yugoslavia
