In [1]:
import requests
import copy
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Download the IFSC Climbing World Championships article and collect every
# table whose class attribute is exactly "wikitable plainrowheaders".
mainPage = requests.get(
    "https://en.wikipedia.org/wiki/IFSC_Climbing_World_Championships",
    timeout=30,  # do not let a stalled connection hang the notebook forever
)
# Fail here with a clear HTTP error instead of parsing an error page and
# hitting an IndexError later when the expected tables are missing.
mainPage.raise_for_status()
mainSoup = BeautifulSoup(mainPage.content, 'html.parser')
mainTables = list(mainSoup.find_all(class_="wikitable plainrowheaders"))

In [3]:
# Flatten each event table into a plain list of its <td> cells.
# The indices encode the table order this notebook relies on: the four men's
# tables (lead, boulder, speed, combined) followed by the four women's tables
# in the same order.
(
    mensLeadTable,
    mensBoulderTable,
    mensSpeedTable,
    mensCombinedTable,
    womensLeadTable,
    womensBoulderTable,
    womensSpeedTable,
    womensCombinedTable,
) = (list(mainTables[position].find_all('td')) for position in range(8))

In [4]:
#For the names that have their own Wikipedia page,
#extract the name as well as the sub-link to the page
def nameCollection(wikiTable):
    """Collect unique [name, sub-link] pairs from the cells of one event table.

    Parameters
    ----------
    wikiTable : iterable of table-cell elements
        Each element must provide ``has_attr`` and ``find_all`` (as the
        BeautifulSoup ``td`` tags built earlier in this notebook do).

    Returns
    -------
    list of [str, str]
        One ``[link text, href]`` list per distinct athlete, in first-seen
        order. The inner lists are mutable on purpose: later steps append
        further values to them.

    Cells are skipped when they:
      * carry an ``align`` attribute,
      * contain fewer than two links (the second link of a cell is the one
        read; previously such cells raised IndexError),
      * have a second link without an ``href``,
      * point at a ``/w/`` URL rather than an article sub-link.
    """
    nameList = []
    seenPairs = set()  # (name, href) tuples already emitted, for O(1) dedupe

    for data in wikiTable:
        if data.has_attr('align'):
            continue

        anchors = data.find_all('a')
        if len(anchors) < 2:
            # No second link to read; nothing to extract from this cell.
            continue

        athleteLink = anchors[1]
        athleteName = athleteLink.get_text()
        subLink = athleteLink.get('href')
        if not subLink or '/w/' in subLink:
            continue

        pairKey = (athleteName, subLink)
        if pairKey in seenPairs:
            continue
        seenPairs.add(pairKey)
        nameList.append([athleteName, subLink])

    return nameList

In [5]:
# Build the [name, sub-link] list for every event table.
mensLeadNames, mensBoulderNames, mensSpeedNames, mensCombinedNames = map(
    nameCollection,
    (mensLeadTable, mensBoulderTable, mensSpeedTable, mensCombinedTable),
)
womensLeadNames, womensBoulderNames, womensSpeedNames, womensCombinedNames = map(
    nameCollection,
    (womensLeadTable, womensBoulderTable, womensSpeedTable, womensCombinedTable),
)

# One combined list per gender. The entries are deep copies, so values that
# appendNameData later appends to them never leak into the per-event lists.
# An athlete listed in several events appears once per event here.
fullMensNames = copy.deepcopy(
    mensLeadNames + mensBoulderNames + mensSpeedNames + mensCombinedNames
)
fullWomensNames = copy.deepcopy(
    womensLeadNames + womensBoulderNames + womensSpeedNames + womensCombinedNames
)

In [6]:
#go through all athlete wikis and collect height/weight information.
#this function also partially validates data by removing incorrect data
def appendNameData(nameList):
    """Append scraped height and weight strings to each entry of nameList.

    Parameters
    ----------
    nameList : list of list
        Entries of the form ``[name, sub-link, ...]``; ``entry[1]`` is
        appended to "https://en.wikipedia.org" to build the page URL.
        The entries are modified IN PLACE.

    Returns
    -------
    None

    For every entry the athlete page is downloaded and its first
    "infobox vcard" table is searched for "Height" and "Weight" rows.
    A found height is appended first, then a found weight, so the values
    always land in the same order regardless of the infobox row order.
    Values that fail the digit checks, and pages without an infobox, simply
    add nothing (previously a missing infobox raised IndexError).

    Each distinct sub-link is downloaded only once per call; repeated entries
    reuse the result. Calling this function twice on the same list appends
    the values a second time.
    """
    measurementsByLink = {}  # sub-link -> (height, weight), avoids re-fetching

    for name in nameList:
        subLink = name[1]
        if subLink not in measurementsByLink:
            namePage = requests.get(
                "https://en.wikipedia.org" + subLink,
                timeout=30,  # a stalled page must not hang the whole scrape
            )
            nameSoup = BeautifulSoup(namePage.content, 'html.parser')
            measurementsByLink[subLink] = _extractHeightWeight(nameSoup)

        height, weight = measurementsByLink[subLink]
        if height is not None:
            name.append(height)
        if weight is not None:
            name.append(weight)


def _extractHeightWeight(nameSoup):
    """Return (height, weight) strings from the first infobox, None where absent."""
    height = None
    weight = None

    infoTables = nameSoup.find_all(class_="infobox vcard")
    if not infoTables:
        return height, weight

    for row in infoTables[0].find_all("tr"):
        cells = list(row.children)
        # Only rows made of a <th> label followed by a value are of interest.
        if len(cells) < 2 or cells[0].name != "th":
            continue

        label = cells[0].getText()
        value = cells[1].getText()

        # Partial validation: the value must START with three digits (height)
        # or two digits (weight); anything else is treated as unusable.
        # Only the first matching row of each kind is kept.
        # NOTE: the two-character slice would truncate a three-digit weight.
        if label == "Height" and height is None and value[:3].isdigit():
            height = value[:3]
        elif label == "Weight" and weight is None and value[:2].isdigit():
            weight = value[:2]

    return height, weight

In [7]:
#function that removes climber names with no valid data and removes duplicates for combined lists
def validateData(dataList):
    """Return the complete, de-duplicated rows of dataList.

    A row is kept only when it has exactly four entries and no equal row has
    been kept already. Order of first appearance is preserved, and the
    returned list holds the original row objects, not copies.
    """
    keptRows = []
    for candidate in dataList:
        if len(candidate) != 4:
            # Incomplete (or over-long) row: not usable.
            continue
        if candidate in keptRows:
            # Same row already kept.
            continue
        keptRows.append(candidate)
    return keptRows

In [8]:
# Scrape height/weight for the men's list, keep only complete unique rows,
# and show the result as a DataFrame.
# NOTE: appendNameData modifies fullMensNames in place, so re-running this
# cell without rebuilding fullMensNames appends the values a second time and
# validateData then rejects those rows.
appendNameData(fullMensNames)
validMensList = validateData(fullMensNames)
pd.DataFrame(
    data=validMensList,
    columns=["Name", "Wiki sub-link", "Height", "Weight"],
)

Unnamed: 0,Name,Wiki sub-link,Height,Weight
0,François Legrand,/wiki/Fran%C3%A7ois_Legrand_(climber),178,65
1,Yuji Hirayama,/wiki/Yuji_Hirayama,173,65
2,François Petit,/wiki/Fran%C3%A7ois_Petit_(climber),173,58
3,Tomáš Mrázek,/wiki/Tom%C3%A1%C5%A1_Mr%C3%A1zek,178,62
4,Patxi Usobiaga Lakunza,/wiki/Patxi_Usobiaga_Lakunza,174,62
5,Alexandre Chabot,/wiki/Alexandre_Chabot,170,58
6,Ramón Julián Puigblanque,/wiki/Ram%C3%B3n_Juli%C3%A1n_Puigblanque,159,48
7,Adam Ondra,/wiki/Adam_Ondra,185,70
8,Jakob Schubert,/wiki/Jakob_Schubert,173,63
9,Sean McColl,/wiki/Sean_McColl,169,60


In [10]:
# Scrape height/weight for the women's list, keep only complete unique rows,
# and show the result as a DataFrame.
# NOTE: appendNameData modifies fullWomensNames in place, so re-running this
# cell without rebuilding fullWomensNames appends the values a second time
# and validateData then rejects those rows.
appendNameData(fullWomensNames)
validWomensList = validateData(fullWomensNames)
pd.DataFrame(
    data=validWomensList,
    columns=["Name", "Wiki sub-link", "Height", "Weight"],
)

Unnamed: 0,Name,Wiki sub-link,Height,Weight
0,Liv Sansoz,/wiki/Liv_Sansoz,162,45
1,Muriel Sarkany,/wiki/Muriel_Sarkany,154,47
2,Sandrine Levet,/wiki/Sandrine_Levet,170,52
3,Angela Eiter,/wiki/Angela_Eiter,154,46
4,Akiyo Noguchi,/wiki/Akiyo_Noguchi,165,49
5,Maja Vidmar,/wiki/Maja_Vidmar_(climber),162,47
6,Johanna Ernst,/wiki/Johanna_Ernst,158,48
7,Kim Ja-in,/wiki/Kim_Ja-in,153,42
8,Jessica Pilz,/wiki/Jessica_Pilz,163,52
9,Anna Stöhr,/wiki/Anna_St%C3%B6hr,163,53
