In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

## Crawling

In [2]:
def create_list_of_links(url):
    """Collect the distinct ``href`` targets of every anchor on a web page.

    Parameters
    ----------
    url : str
        Address of the page to download and parse.

    Returns
    -------
    list
        The ``href`` values of all ``<a>`` tags in document order, with
        duplicates removed. Anchors without an ``href`` attribute are skipped.
    """
    reqs = requests.get(url)
    soup = BeautifulSoup(reqs.text, 'html.parser')

    # link.get('href') yields None for anchors lacking the attribute. Filter
    # those out here rather than calling list.remove(None), which raises
    # ValueError when every anchor on the page carries an href.
    hrefs = [link.get('href') for link in soup.find_all('a')]

    # dict.fromkeys keeps the first occurrence of each link, preserving order.
    return list(dict.fromkeys(href for href in hrefs if href is not None))

In [3]:
# Crawl the 2022 squads page; the links it returns are filtered later on.
listA = create_list_of_links('https://en.wikipedia.org/wiki/2022_FIFA_World_Cup_squads')

 - We need to list every page whose path has the form __'/wiki/xxxx_FIFA_World_Cup_squads'__, where xxxx is a year. These paths are relative to the website [root](https://en.wikipedia.org).

Let's filter the urls we want

In [4]:
# Length of a squads-page path; filter_list keeps only links of exactly this length.
# NOTE(review): the name reads "sud" where "sub" was presumably intended. It is
# left unchanged because filter_list refers to it by this exact name.
sudDirsStringLength = len('/wiki/1930_FIFA_World_Cup_squads')  # we could try to filter by the subdirectories string length

In [5]:
subString = 'squads'                   # keyword every wanted path must contain (passed to filter_list)
rootURL = 'https://en.wikipedia.org'   # website root address, prepended to each kept path

The function `filter_list` receives three parameters. The first is a list of all the websites we got after crawling. The second, a substring with a keyword shared by all wanted urls. And the third is the website root address. 

In [6]:
def filter_list(aList, mySubString, rootAddress, expectedLength=None):
    """Keep the relative links that look like squads pages and make them absolute.

    Parameters
    ----------
    aList : list of str
        Relative links gathered while crawling.
    mySubString : str
        Keyword that every wanted link must contain.
    rootAddress : str
        Website root address prepended to each kept link.
    expectedLength : int, optional
        Exact length a link must have to be kept. When omitted, the
        notebook-level ``sudDirsStringLength`` is used.

    Returns
    -------
    list of str
        Absolute URLs, in the order the links appear in ``aList``.
    """
    if expectedLength is None:
        expectedLength = sudDirsStringLength

    # Test against the mySubString argument, not the notebook-level
    # `subString`, so the caller's keyword is the one actually applied.
    return [
        rootAddress + element
        for element in aList
        if mySubString in element and len(element) == expectedLength
    ]

## Functions to process the gathered data

The following function, `get_soup`, receives the filtered URL list plus an index and returns a BeautifulSoup object holding the parsed HTML of the page at that index.

In [7]:
def get_soup(WorldCupsList, index):
    """Download one page from a list of URLs and parse its HTML.

    Parameters
    ----------
    WorldCupsList : list of str
        URLs of the pages of interest.
    index : int
        Position in ``WorldCupsList`` of the page to fetch.

    Returns
    -------
    BeautifulSoup
        The page parsed with the built-in ``html.parser``.
    """
    targetURL = WorldCupsList[index]
    response = requests.get(targetURL)
    return BeautifulSoup(response.text, 'html.parser')

The function `get_total_players_in_squad` counts all the data rows of a national team's table;
in other words, all the players belonging to a given team.

In [8]:
def get_total_players_in_squad(mySoup, index):
    """Count the rows, minus the first one, of the squad table at ``index``.

    Parameters
    ----------
    mySoup : BeautifulSoup
        Parsed page containing the squad tables.
    index : int
        Position of the wanted table among the tables whose class is
        ``sortable wikitable plainrowheaders``.

    Returns
    -------
    int
        Number of ``tr`` rows in that table, not counting the first row.
    """
    squadTables = mySoup.find_all('table', class_="sortable wikitable plainrowheaders")
    rows = squadTables[index].find_all('tr')
    # The first row is left out of the count (presumably the header row).
    return len(rows[1:])

The tables we've found have an odd and confusing structure. They used different tags to create the columns instead of using the same. The 'Pos.' column is in a "th" tag, while the rest are inside a "td" tag. This fact has complicated our work a little bit. So, we'll have to create two separate counts, one for each type of tag, because we'll have to reshape these tables later.

The function `count_columns_in_tags` counts and discriminates tags.

In [9]:
def count_columns_in_tags(aTable):
    """Count the 'td' and 'th' cells in the first data row of a squad table.

    Parameters
    ----------
    aTable : bs4 Tag
        A squad ``<table>`` element.

    Returns
    -------
    tuple of (int, int)
        ``(number of 'td' cells, number of 'th' cells)`` found in the row.
    """
    # Walk down to <tbody>; its third child (index 2) is taken as the first
    # data row of the table.
    firstDataRow = aTable.tbody.contents[2]

    tdCount = len(firstDataRow.find_all('td'))
    thCount = len(firstDataRow.find_all("th"))

    return (tdCount, thCount)

As Pandas doesn't fill the column named "Date of birth" we'll work around this issue as follows:
Firstly, we'll use the function below, `get_list_of_birthdays`, to get a list of each footballer's birthday.
Secondly, we'll convert the table to a DataFrame.
Finally, we'll add a new column to the DataFrame, and set the values with each corresponding element in the list of birthdays.

In [10]:
def get_list_of_birthdays(aTable, number):
    """Extract the date of birth of every player listed in a squad table.

    Parameters
    ----------
    aTable : bs4 Tag
        A squad ``<table>`` element.
    number : int
        Number of player rows to read.

    Returns
    -------
    list
        One entry per row: the text of the ``<span>`` inside the birth-date
        cell, the cell's own text when it has no ``<span>``, or ``np.nan``
        when the table has no cell at the expected position.
    """
    offset = count_columns_in_tags(aTable)[0]
    # Collect the 'td' cells once; the result is the same for every row.
    data = aTable.find_all('td')

    listOfBirthDays = []
    for row in range(number):
        # The date of birth is the fourth column, but one of the preceding
        # columns is a 'th' tag, so among a row's 'td' cells it sits at index 2.
        position = 2 + (row * offset)
        try:
            dateOfBirth = data[position]
        except IndexError:
            # No cell at this position. Record a missing value rather than
            # reusing the cell of a previous row (or failing on the first one).
            listOfBirthDays.append(np.nan)
            continue

        try:
            listOfBirthDays.append(dateOfBirth.span.text)
        except AttributeError:
            # The cell has no <span>; fall back to the cell's full text.
            listOfBirthDays.append(dateOfBirth.text)
    return listOfBirthDays

In the original tables there is a column with each player's club. This piece of data is essential, but we'd also like to know the country where a given footballer had been playing until the World Cup kick-off. Each club cell contains a flag icon that holds the name of the country to which the club belongs. We'll use the `get_list_of_clubs_federations` function to extract the country related to each club.

In [11]:
def get_list_of_clubs_federations(aTable, number):
    """Extract, for every player row, the country shown by the club's flag icon.

    Parameters
    ----------
    aTable : bs4 Tag
        A squad ``<table>`` element.
    number : int
        Number of player rows to read.

    Returns
    -------
    list
        One entry per row: the ``alt`` text of the flag image in the club
        cell, or ``np.nan`` when the cell holds no flag image.

    Raises
    ------
    IndexError
        If the table has fewer 'td' cells than ``number`` rows require.
    """
    offset = count_columns_in_tags(aTable)[0]
    # Collect the 'td' cells once; the result is the same for every row.
    data = aTable.find_all('td')

    listOfClubsFederations = []
    for row in range(number):
        # The club is the last 'td' cell of each row.
        clubFederation = data[(offset - 1) + (row * offset)]
        try:
            # Usual layout: the flag image is wrapped in a link.
            country = clubFederation.a.img.get("alt")
        except AttributeError:
            # No linked image. Look inside the flag-icon element itself, and
            # tolerate a flag icon that carries no image.
            flagIcon = clubFederation.find(class_="flagicon")
            flagImage = flagIcon.find('img') if flagIcon is not None else None
            country = flagImage.get("alt") if flagImage is not None else np.nan
        listOfClubsFederations.append(country)
    return listOfClubsFederations

The function `get_listed_national_teams` gets the names of the countries from the 'H3' or 'H2' tags (depending on the web page code) located above each squad's table.

In [12]:
def get_listed_national_teams(resultSet, numberOfSquads):
    """Read the national team names out of a list of heading elements.

    Parameters
    ----------
    resultSet : sequence of bs4 Tag
        Heading elements, one per squad.
    numberOfSquads : int
        How many headings to read, starting from the first.

    Returns
    -------
    list of str
        For each heading, the text of its ``<span>`` child, or the heading's
        own text when it has no ``<span>``.

    Raises
    ------
    IndexError
        If ``resultSet`` holds fewer than ``numberOfSquads`` elements.
    """
    def _team_name(heading):
        try:
            return heading.span.text
        except AttributeError:
            # No <span> inside the heading: use the heading's own text.
            return heading.text

    return [_team_name(resultSet[position]) for position in range(numberOfSquads)]

####  Functions to deal with additional tables

The following function `exists_replacement_table` searches inside the tables containing squads. If it finds a small one, that means it found an additional table with the data of a replacement player in it.

In [13]:
def exists_replacement_table(squadsTables):
    """Tell whether any of the given tables is a small (replacement) table.

    Parameters
    ----------
    squadsTables : sequence of bs4 Tag
        Squad ``<table>`` elements.

    Returns
    -------
    bool
        True when at least one table has fewer than 4 rows after its first row.
    """
    return any(len(table.find_all('tr')[1:]) < 4 for table in squadsTables)

If there are replacement players tables the function `report_additional_tables` will catch and list them, for further treatment.

In [14]:
def report_additional_tables(worldCup, squadsTables, reportingList):
    """Record the position of every small (replacement) table of a World Cup.

    Parameters
    ----------
    worldCup : object
        Identifier stored with each finding (the caller passes the year).
    squadsTables : sequence of bs4 Tag
        Squad ``<table>`` elements.
    reportingList : list
        List that receives one ``(worldCup, table index)`` tuple per table
        with fewer than 4 rows after its first row. It is modified in place.

    Returns
    -------
    list
        The same ``reportingList`` object.
    """
    reportingList.extend(
        (worldCup, position)
        for position, table in enumerate(squadsTables)
        if len(table.find_all('tr')[1:]) < 4
    )
    return reportingList

Once we have found additional tables, we need to set them aside. That's what `take_off_additional_tables` does, by renaming their CSS class.

In [15]:
def take_off_additional_tables(aSoup, squadsTables):
    """Re-label the small (replacement) tables so they stop matching as squads.

    Parameters
    ----------
    aSoup : BeautifulSoup
        Parsed page the tables belong to.
    squadsTables : sequence of bs4 Tag
        Squad ``<table>`` elements taken from ``aSoup``.

    Returns
    -------
    list of bs4 Tag
        The tables of ``aSoup`` that still carry the
        ``sortable wikitable plainrowheaders`` class after the re-labelling.
    """
    for table in squadsTables:
        if len(table.find_all('tr')[1:]) < 4:
            # Change the CSS class of the additional table to tell it apart
            # from the tables holding a whole team.
            table['class'] = 'replacement'

    # Query the soup passed in as an argument, not a notebook-level `soup`,
    # so the function works on whichever page the caller hands over.
    return aSoup.find_all('table', class_="sortable wikitable plainrowheaders")

## Added data

In [16]:
# Years in which a World Cup was played: every fourth year from 1930 to 2022,
# leaving out the years that fall strictly between 1938 and 1950.
worldCupYears = [year for year in range(1930, 2023, 4) if not (1938 < year < 1950)]

In [17]:
# Date of the opening match of each World Cup, paired by position with the
# list of World Cup years. The 1930 tournament opened on 13 July.
wcKickOffDates = ['1930-07-13','1934-05-27','1938-06-04','1950-06-24','1954-06-16','1958-06-08','1962-05-30','1966-07-11',
                    '1970-05-31','1974-06-13','1978-06-01','1982-06-13','1986-05-31','1990-06-08','1994-06-17','1998-06-10',
                    '2002-05-31','2006-06-09','2010-06-11','2014-06-12','2018-06-14','2022-11-20']

In [18]:
# Initialize a list to collect and identify all the additional players tables
# that could break our code. report_additional_tables appends one
# (World Cup, table index) tuple to it per additional table found.
additionalTablesList = []

## Files Management

In [19]:
import os 
from os import path

In [20]:
def create_directories_tree(listOfWCYears):
    """Create one ``<year>_world_cup`` directory per year inside ``dataSets``.

    Parameters
    ----------
    listOfWCYears : iterable
        World Cup years; each one becomes the prefix of a directory name.

    Directories that already exist are left untouched, so the function can be
    called repeatedly.
    """
    for year in listOfWCYears:
        # os.path.join uses the separator of the running OS; a hard-coded
        # backslash nests the directories only on Windows.
        os.makedirs(os.path.join('dataSets', f'{year}_world_cup'), exist_ok=True)

In [21]:
# create_directories_tree calls os.makedirs with exist_ok=True, so it is safe
# to run unconditionally: missing directories are created and existing ones
# are left as they are. This also covers a `dataSets` folder that exists but
# lacks some (or all) of the per-tournament directories.
create_directories_tree(worldCupYears)

In [22]:
# os.listdir returns entries in arbitrary order, but the scraping loop pairs
# listdir[j] with the j-th World Cup. Sorting the '<year>_world_cup' names puts
# them in chronological order; the suffix filter ignores any unrelated entry
# that would shift the positions.
listdir = sorted(name for name in os.listdir('dataSets') if name.endswith('_world_cup'))

## Run the program

In [24]:
%%time
squadsHystorical = filter_list(listA,subString, rootURL)
for j in range(len(squadsHystorical)):
    
    soup = get_soup(squadsHystorical, j)
    allSquadsTables = soup.find_all('table', class_="sortable wikitable plainrowheaders")
    wcKickOff = wcKickOffDates[j]
    
    if exists_replacement_table(allSquadsTables):
        report_additional_tables(worldCupYears[j],allSquadsTables, additionalTablesList)
        take_off_additional_tables(soup, allSquadsTables)
        allSquadsTables = soup.find_all('table', class_="sortable wikitable plainrowheaders")
    
    totalSquads = len(soup.find_all('table',{'class': 'sortable wikitable plainrowheaders'}))
    listH3CountryNames = soup.find_all("h3", limit = totalSquads)
    listH2CountryNames = soup.find_all("span", class_="mw-headline", limit = totalSquads)
    allSquadsTables = soup.find_all('table', class_="sortable wikitable plainrowheaders")

    for i in range(totalSquads):
    
        myTable = allSquadsTables[i]
        totalRows =  get_total_players_in_squad(soup, i) 
    
        tagsTuple = count_columns_in_tags(myTable)
    
        # get the number of columns inside the 'td' tags
        columnsAsTdTags = tagsTuple[0]
            
        try:
            listNationalTeams= get_listed_national_teams(listH3CountryNames, totalSquads)
        except (IndexError):
            listNationalTeams= get_listed_national_teams(listH2CountryNames, totalSquads)
    
        listOfBirthDays =  get_list_of_birthdays(myTable, totalRows)
        listOfClubsOrigin = get_list_of_clubs_federations(myTable, totalRows)
        
        df = create_a_dataframe(str(myTable), listOfClubsOrigin, listOfBirthDays, wcKickOff, listNationalTeams[i])
        
#         df = pd.read_html(str(myTable))
#         df = pd.concat(df)
#         ########################################################### WCKickOff Parámetro día para el dataframe
#         if 'Goals' not in df.columns:
#             df.insert(5, 'Goals', np.nan)
        
#         df['Club Origin'] = listOfClubsOrigin
#         df['Date of birth (age)'] = listOfBirthDays
        
#         df.insert(0, 'National Team', listNationalTeams[i])
#         players = df.pop('Player')
#         df.insert(0,'Player', players) 
        
        nameFile = (f'{worldCupYears[j]}_{listNationalTeams[i]}')           
    
        df.to_csv(f'dataSets\\{listdir[j]}\\{nameFile}.csv', index=False)

CPU times: total: 2min 2s
Wall time: 3min 28s


In [25]:
additionalTablesList

[(1990, 5), (1990, 23)]

In [26]:
# Directory for the scraping report; exist_ok=True makes re-running the cell harmless.
os.makedirs('errors', exist_ok=True)

In [27]:
# Write the string form of additionalTablesList to a text file.
# NOTE(review): the backslash separator is Windows-specific. The cell that
# reads this file back uses the same literal, so change both together.
with open("errors\\additionalTables.txt", "w") as output:
    output.write(str(additionalTablesList))

In [28]:
# Read the first line of the report back. The with-block closes the file
# handle as soon as the line has been printed.
with open("errors\\additionalTables.txt", "r") as f:
    print(f.readline())

[(1990, 5), (1990, 23)]
