# Pull results from the Gymternet for the 2024 Olympics

In [3]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re
import unicodedata

In [4]:
#Download and parse one athlete page (Simone Biles) so its structure can be explored
#NOTE(review): pullAthleteResults fetches pages itself, so soupA looks exploratory only - confirm before removing
athleteURL = 'https://thegymter.net/simone-biles/'
sourceAthlete = requests.get(athleteURL).text
soupA = BeautifulSoup(sourceAthlete,'lxml')

## Define functions for Athlete Extraction

In [76]:
def extractAthleteInfo(aInfoTab):
    """Collect the label/value rows of an athlete info table into a Series.

    Parameters
    ----------
    aInfoTab : table element exposing ``find_all('tr')`` whose rows expose
        ``find_all('td')`` and whose cells expose ``.text`` (e.g. a bs4 Tag).

    Returns
    -------
    pd.Series
        Indexed by the text of each row's first cell, holding the text of
        the row's second cell. Cell text is kept exactly as found.
    """
    athleteDict = {}
    for row in aInfoTab.find_all('tr'):
        rowParts = row.find_all('td')
        #Rows without a label/value pair (e.g. header or spacer rows) hold no
        #info and would otherwise raise IndexError on rowParts[1]
        if len(rowParts) < 2:
            continue
        athleteDict[rowParts[0].text] = rowParts[1].text

    return pd.Series(data=athleteDict)

In [67]:
def obtainYearList(pageSoup):
    """Return the season years announced by '<year> Results' paragraphs.

    Parameters
    ----------
    pageSoup : parsed page exposing ``find_all('p')`` whose paragraphs expose
        ``.text`` (e.g. a BeautifulSoup object).

    Returns
    -------
    list of int
        Years in the order their headers appear on the page.
    """
    yearList = []
    for para in pageSoup.find_all('p'):

        #Search for table year headers
        paraList = para.text.split()
        if len(paraList) != 2 or paraList[1] != 'Results':
            continue
        #A two-word paragraph such as 'Competition Results' is not a year
        #header; skip it instead of letting int() raise ValueError
        if not paraList[0].isdecimal():
            continue
        yearList.append(int(paraList[0]))

    return yearList
    

In [311]:
def extractSeasonScores(seasonTable, year, verbose = 0):
    scoresDict = {'date':[], 'competition':[],'type':[],'vt':[],'ub':[], 'bb':[], 'fx':[], 'aa':[]}
    compRowSpan = 1
    for i in range(1,len(seasonTable.findAll('tr'))):

        row = seasonTable.findAll('tr')[i]
        row_split = row.findAll('td')
        row_len = len(row_split)
        #Deal with competitions that span multiple rows
        if row_len == 6:
            scoresDict['type'].append(row_split[0].text.strip())
            scoresDict['vt'].append(row_split[1].text.strip())
            scoresDict['ub'].append(row_split[2].text.strip())
            scoresDict['bb'].append(row_split[3].text.strip())
            scoresDict['fx'].append(row_split[4].text.strip())
            scoresDict['aa'].append(row_split[5].text.strip())
            compRowSpan -= 1
        elif compRowSpan != 1:
            #Deal with site issues with rowspan being mistakenly too big, aka the 2015 Jade Barbosa problem
            scoresDict['date'].pop()
            scoresDict['competition'].pop()
        else:
            if row.findAll('td')[0].has_attr('rowspan') == False:
                compRowSpan = 1
            else:
                compRowSpan = int(row.findAll('td')[0]['rowspan'])
            for i in range(compRowSpan):
                scoresDict['date'].append(row_split[0].text.strip())
                scoresDict['competition'].append(row_split[1].text.strip())            
           
            scoresDict['type'].append(row_split[2].text.strip())
            scoresDict['vt'].append(row_split[3].text.strip())
            scoresDict['ub'].append(row_split[4].text.strip())
            scoresDict['bb'].append(row_split[5].text.strip())
            scoresDict['fx'].append(row_split[6].text.strip())
            scoresDict['aa'].append(row_split[7].text.strip()) 
            
    #Final pop-off for too big rowspan values
    if compRowSpan != 1:
        scoresDict['date'].pop()
        scoresDict['competition'].pop()
    
    if verbose == 1:
        print(scoresDict)
        for key in scoresDict:
            print(key, len(scoresDict[key]))
    events = ['vt','ub','bb','fx','aa']
    df = pd.DataFrame(scoresDict)
    for event in events:
        df[event] = df[event].str.strip('*')
        df[event] = df[event].str.replace('-','').replace('——','0').replace('','0')
    df[events] = df[events].astype('float')
    df['year'] = year
    
    return df
        

In [274]:
def pullAthleteResults(athleteURL, verbose = 0, timeout = 30):
    """Download one athlete page and return all of her scores in one DataFrame.

    Athlete info is read from the first table on the page. The season tables
    are taken to follow it in the same order as the '<year> Results' headers
    found by obtainYearList.

    Parameters
    ----------
    athleteURL : URL of the athlete page.
    verbose : passed on to extractSeasonScores; when 1 each parsed year is
        also printed.
    timeout : seconds to wait for the server before requests gives up.

    Returns
    -------
    pd.DataFrame
        One row per score line with the season columns plus name, country,
        birthdate (datetime), status, birthyear and senior (True when
        year - birthyear >= 16).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page has no '<year> Results' headers.
    """
    #Pull webpage using URL; fail early on a hung connection or an error page
    response = requests.get(athleteURL, timeout=timeout)
    response.raise_for_status()
    soupA = BeautifulSoup(response.text,'lxml')

    #Obtain all web tables (where results are stored)
    tablesA = soupA.find_all('table')

    #Athlete info is available in the first table
    athleteInfo = extractAthleteInfo(tablesA[0])

    #Pull years the athlete competed
    yearList = obtainYearList(soupA)
    if not yearList:
        #Without this check the code below would fail on an unbound variable
        raise ValueError('No season results found at ' + str(athleteURL))

    #Loop over season score tables to obtain scores (table 0 is the info table)
    seasonFrames = []
    for tableIdx, seasonYear in enumerate(yearList, start=1):
        #Pull season results
        seasonFrames.append(extractSeasonScores(tablesA[tableIdx], seasonYear, verbose))
        if verbose == 1:
            print(seasonYear)

    #Concatenate once instead of growing the frame inside the loop
    athleteScores = pd.concat(seasonFrames, ignore_index=True, axis=0)

    #Add in athlete info
    athleteScores['name'] = athleteInfo['Full Name']
    athleteScores['country'] = athleteInfo['Nation']
    athleteScores['birthdate'] = athleteInfo['Birthdate']
    athleteScores['status'] = athleteInfo['Status']

    #Clean up and mark senior status
    athleteScores['birthdate'] = pd.to_datetime(athleteScores['birthdate'])
    athleteScores['birthyear'] = athleteScores['birthdate'].dt.year
    athleteScores['senior'] = ((athleteScores['year'] - athleteScores['birthyear']) >= 16)

    return athleteScores

In [168]:
%%time
#Try the scraper on a single athlete page and display the resulting frame
adf = pullAthleteResults('https://thegymter.net/jade-carey/')
adf

CPU times: total: 297 ms
Wall time: 701 ms


Unnamed: 0,date,competition,type,vt,ub,bb,fx,aa,year,name,country,birthdate,status,birthyear,senior
0,Jun 27-30,U.S. Olympic Trials,AA,14.675,13.075,13.625,14.150,55.525,2024,Jade Carey,United States,2000-05-27,"Active, NCAA (Oregon State)",2000,True
1,Jun 27-30,U.S. Olympic Trials,QF,14.600,13.575,13.575,14.075,55.825,2024,Jade Carey,United States,2000-05-27,"Active, NCAA (Oregon State)",2000,True
2,May 30-Jun 2,U.S. Championships,AA,14.600,13.650,12.300,13.700,54.250,2024,Jade Carey,United States,2000-05-27,"Active, NCAA (Oregon State)",2000,True
3,May 30-Jun 2,U.S. Championships,QF,14.500,13.500,13.450,13.600,55.050,2024,Jade Carey,United States,2000-05-27,"Active, NCAA (Oregon State)",2000,True
4,May 17-18,U.S. Classic,AA,14.300,12.850,13.450,13.800,54.400,2024,Jade Carey,United States,2000-05-27,"Active, NCAA (Oregon State)",2000,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57,Oct 2-8,World Championships,QF,14.849,0.000,0.000,14.100,0.000,2017,Jade Carey,United States,2000-05-27,"Active, NCAA (Oregon State)",2000,True
58,Aug 17-20,U.S. Championships,AA,14.400,0.000,13.850,14.400,0.000,2017,Jade Carey,United States,2000-05-27,"Active, NCAA (Oregon State)",2000,True
59,Aug 17-20,U.S. Championships,QF,14.700,0.000,12.750,14.100,0.000,2017,Jade Carey,United States,2000-05-27,"Active, NCAA (Oregon State)",2000,True
60,Jul 29,U.S. Classic,AA,14.350,0.000,13.750,13.950,0.000,2017,Jade Carey,United States,2000-05-27,"Active, NCAA (Oregon State)",2000,True


## Define functions for info page extraction

In [170]:
#Download and parse the 2024 Olympic qualifiers page; soupI is read by the table-parsing cells below
infoPageURL = 'https://thegymter.net/2024-olympic-games-wag-qualifiers/'
sourceI = requests.get(infoPageURL).text
soupI = BeautifulSoup(sourceI,'lxml')

### Countries

In [209]:
#Main country table
countryData = soupI.findAll('table')[0].findAll('tr')
countryDict = {'team':[], 'name':[], 'link':[], 'alternate':[]}
for rowIdx, row in enumerate(countryData):
    #Row 0 is never read; row 4 is the 2023 world champs header
    if rowIdx in (0, 4):
        continue
    cells = row.findAll('td')
    country = cells[0].text
    athletes = cells[1]
    #One entry per linked athlete in the second cell
    for position, anchor in enumerate(athletes.findAll('a')):
        countryDict['team'].append(country)
        countryDict['name'].append(anchor.text)
        countryDict['link'].append(anchor['href'])
        #The first five links get alternate = 0, every later one alternate = 1
        countryDict['alternate'].append(0 if position < 5 else 1)

#Everyone in this table belongs to a qualified team
countryDf = pd.DataFrame(countryDict).assign(team_qualified=1)


In [210]:
#Preview the first 14 rows of the team roster built above
countryDf.head(14)

Unnamed: 0,team,name,link,alternate,team_qualified
0,United States,Simone Biles,https://thegymter.net/simone-biles/,0,1
1,United States,Jade Carey,https://thegymter.net/jade-carey/,0,1
2,United States,Jordan Chiles,https://thegymter.net/jordan-chiles/,0,1
3,United States,Sunisa Lee,https://thegymter.net/sunisa-lee/,0,1
4,United States,Hezly Rivera,https://thegymter.net/hezly-rivera/,0,1
5,United States,Joscelyn Roberson,https://thegymter.net/joscelyn-roberson/,1,1
6,United States,Leanne Wong,https://thegymter.net/leanne-wong/,1,1
7,United States,Tiana Sumanasekera,https://thegymter.net/tiana-sumanasekera/,1,1
8,United States,Kaliya Lincoln,https://thegymter.net/kaliya-lincoln/,1,1
9,Great Britain,Becky Downie,https://thegymter.net/becky-downie/,0,1


### Individuals

In [242]:
#Rows of tables 1 to 4 on the qualifiers page, one variable per table
individualData, individualData2, individualData3, individualData4 = [
    soupI.findAll('table')[tableIdx].findAll('tr') for tableIdx in range(1, 5)
]


In [245]:
#Inspect the raw rows of table 4 to see how its cells are laid out
individualData4

[<tr style="background-color: #1e90ff">
 <td style="text-align: left;color: white"><b>TRIPARTITE ATHLETE</b></td>
 </tr>,
 <tr>
 <td><a href="https://thegymter.net/lynnzee-brown/">Lynnzee Brown</a> (Haiti)</td>
 </tr>]

In [240]:
individualDict = {'team':[], 'name':[], 'link':[]}
for rowNum, tableRow in enumerate(individualData):
    #Row 0 is never read; rows 9 and 12 are dodged as headers
    if rowNum in (0, 9, 12):
        continue
    for athlete in tableRow.findAll('td'):
        words = athlete.text.split()
        #Cells without text hold no athlete
        if not words:
            continue
        #Last word (minus parentheses/asterisks) is the team, the rest is the name
        #NOTE(review): only one word is kept, so 'South Africa' would become 'Africa'
        individualDict['team'].append(words[-1].strip(')(*'))
        individualDict['name'].append(" ".join(words[:-1]))
        individualDict['link'].append(athlete.find('a')['href'])

#Individual qualifiers are neither alternates nor part of a qualified team
individualDf = pd.DataFrame(individualDict).assign(alternate=0, team_qualified=0)
    

In [247]:
individualDict = {'team':[], 'name':[], 'link':[]}
for rowNum, tableRow in enumerate(individualData2):
    #Row 0 is never read; rows 9 and 12 are dodged as headers
    #NOTE(review): same row numbers as for the first table - confirm they are headers here too
    if rowNum in (0, 9, 12):
        continue
    for athlete in tableRow.findAll('td'):
        words = athlete.text.split()
        #Cells without text hold no athlete
        if not words:
            continue
        #Last word (minus parentheses/asterisks) is the team, the rest is the name
        individualDict['team'].append(words[-1].strip(')(*'))
        individualDict['name'].append(" ".join(words[:-1]))
        individualDict['link'].append(athlete.find('a')['href'])

#Individual qualifiers are neither alternates nor part of a qualified team
individualDf2 = pd.DataFrame(individualDict).assign(alternate=0, team_qualified=0)
    

In [248]:
individualDict = {'team':[], 'name':[], 'link':[]}
for rowNum, tableRow in enumerate(individualData3):
    #Row 0 is never read; rows 9 and 12 are dodged as headers
    #NOTE(review): same row numbers as for the first table - confirm they are headers here too
    if rowNum in (0, 9, 12):
        continue
    for athlete in tableRow.findAll('td'):
        words = athlete.text.split()
        #Cells without text hold no athlete
        if not words:
            continue
        #Last word (minus parentheses/asterisks) is the team, the rest is the name
        individualDict['team'].append(words[-1].strip(')(*'))
        individualDict['name'].append(" ".join(words[:-1]))
        individualDict['link'].append(athlete.find('a')['href'])

#Individual qualifiers are neither alternates nor part of a qualified team
individualDf3 = pd.DataFrame(individualDict).assign(alternate=0, team_qualified=0)
    

In [249]:
individualDict = {'team':[], 'name':[], 'link':[]}
for rowNum, tableRow in enumerate(individualData4):
    #Row 0 is never read; rows 9 and 12 are dodged as headers
    #NOTE(review): same row numbers as for the first table - confirm they are headers here too
    if rowNum in (0, 9, 12):
        continue
    for athlete in tableRow.findAll('td'):
        words = athlete.text.split()
        #Cells without text hold no athlete
        if not words:
            continue
        #Last word (minus parentheses/asterisks) is the team, the rest is the name
        individualDict['team'].append(words[-1].strip(')(*'))
        individualDict['name'].append(" ".join(words[:-1]))
        individualDict['link'].append(athlete.find('a')['href'])

#Individual qualifiers are neither alternates nor part of a qualified team
individualDf4 = pd.DataFrame(individualDict).assign(alternate=0, team_qualified=0)
    

In [253]:
#Stack the four individual frames into one with a fresh 0..n-1 index
#NOTE: this rebinds individualDf, so running the cell twice appends frames 2-4 a second time
individualDf = pd.concat([individualDf, individualDf2, individualDf3, individualDf4], ignore_index=True)
individualDf

Unnamed: 0,team,name,link,alternate,team_qualified
0,Algeria,Kaylia Nemour,https://thegymter.net/kaylia-nemour/,0,0
1,Germany,Pauline Schäfer,https://thegymter.net/pauline-schafer/,0,0
2,Mexico,Alexa Moreno,https://thegymter.net/alexa-moreno/,0,0
3,Portugal,Filipa Martins,https://thegymter.net/filipa-martins/,0,0
4,Philippines,Aleah Finnegan,https://thegymter.net/aleah-finnegan/,0,0
5,Hungary,Lili Czifra,https://thegymter.net/lili-czifra/,0,0
6,Spain,Alba Petisco,https://thegymter.net/alba-petisco/,0,0
7,Ukraine,Anna Lashchevska,https://thegymter.net/anna-lashchevska/,0,0
8,Switzerland,Lena Bickel,https://thegymter.net/lena-bickel/,0,0
9,Panama,Hillary Heron,https://thegymter.net/hillary-heron/,0,0


In [259]:
#Clean
#Hand-written team/name values for three rows, keyed by row label in individualDf
manualFixes = {
    10: ('South Africa', 'Caitlin Rooskrantz'),
    22: ('North Korea', 'An Chang Ok'),
    24: ('New Zealand', 'Georgia-Rose Brown'),
}
for rowLabel, (fixedTeam, fixedName) in manualFixes.items():
    individualDf.loc[rowLabel, 'team'] = fixedTeam
    individualDf.loc[rowLabel, 'name'] = fixedName
#Remove a non-competing athlete (any row whose team still reads 'Zealand')
individualDf = individualDf[individualDf['team'] != 'Zealand']


In [260]:
#Combine team members and individual qualifiers into one roster with a fresh 0..n-1 index
allDf = pd.concat([countryDf, individualDf], ignore_index=True)
allDf

Unnamed: 0,team,name,link,alternate,team_qualified
0,United States,Simone Biles,https://thegymter.net/simone-biles/,0,1
1,United States,Jade Carey,https://thegymter.net/jade-carey/,0,1
2,United States,Jordan Chiles,https://thegymter.net/jordan-chiles/,0,1
3,United States,Sunisa Lee,https://thegymter.net/sunisa-lee/,0,1
4,United States,Hezly Rivera,https://thegymter.net/hezly-rivera/,0,1
...,...,...,...,...,...
113,Colombia,Luisa Blanco,https://thegymter.net/luisa-blanco/,0,0
114,Belgium,Maellyse Brassart,https://thegymter.net/maellyse-brassart/,0,0
115,Egypt,Jana Mahmoud,https://thegymter.net/jana-mahmoud/,0,0
116,Philippines,Emma Malabuyo,https://thegymter.net/emma-malabuyo/,0,0


## Bring everything together

In [314]:
%%time
for i in range(len(allDf)):
    #Set athlete variables
    athlete = allDf.loc[i, 'name']
    print('Extracting:',athlete)
    team = allDf.loc[i, 'team']
    url = allDf.loc[i, 'link']
    alternate = allDf.loc[i, 'alternate']
    tq = allDf.loc[i,'team_qualified']
    
    #Pull data
    adf = pullAthleteResults(url)
    adf['name'] = athlete
    adf['team'] = team
    adf['link'] = url
    adf['alternate'] = alternate
    adf['team_qualified'] = tq
    
    #Combine results
    if i == 0:
        full_df = adf.copy()
    else:
        full_df = pd.concat([full_df,adf],ignore_index=True)

Extracting: Simone Biles
Extracting: Jade Carey
Extracting: Jordan Chiles
Extracting: Sunisa Lee
Extracting: Hezly Rivera
Extracting: Joscelyn Roberson
Extracting: Leanne Wong
Extracting: Tiana Sumanasekera
Extracting: Kaliya Lincoln
Extracting: Becky Downie
Extracting: Ruby Evans
Extracting: Georgia-Mae Fenton
Extracting: Alice Kinsella
Extracting: Abigail Martin
Extracting: Charlotte Booth
Extracting: Ruby Stacey
Extracting: Ellie Black
Extracting: Cassie Lee
Extracting: Shallon Olsen
Extracting: Ava Stewart
Extracting: Aurélie Tran
Extracting: Emma Spence
Extracting: Sydney Turner
Extracting: Rose Woo
Extracting: Luo Huan
Extracting: Ou Yushan
Extracting: Qiu Qiyuan
Extracting: Zhang Yihan
Extracting: Zhou Yaqin
Extracting: Du Siyu
Extracting: Huang Zhuofan
Extracting: Zhang Qingying
Extracting: Rebeca Andrade
Extracting: Jade Barbosa
Extracting: Lorrane Oliveira
Extracting: Flavia Saraiva
Extracting: Julia Soares
Extracting: Andreza Lima
Extracting: Carolyne Pedro
Extracting: Angel

In [316]:
#Display the combined score table for all scraped athletes
full_df

Unnamed: 0,date,competition,type,vt,ub,bb,fx,aa,year,name,country,birthdate,status,birthyear,senior,team,link,alternate,team_qualified
0,Jun 27-30,U.S. Olympic Trials,AA,15.500,14.200,13.900,14.725,58.325,2024,Simone Biles,United States,1997-03-14,Active,1997,True,United States,https://thegymter.net/simone-biles/,0,1
1,Jun 27-30,U.S. Olympic Trials,QF,15.975,14.425,13.650,14.850,58.900,2024,Simone Biles,United States,1997-03-14,Active,1997,True,United States,https://thegymter.net/simone-biles/,0,1
2,May 30-Jun 2,U.S. Championships,AA,15.000,14.400,14.800,15.100,59.300,2024,Simone Biles,United States,1997-03-14,Active,1997,True,United States,https://thegymter.net/simone-biles/,0,1
3,May 30-Jun 2,U.S. Championships,QF,15.800,14.650,14.800,15.200,60.450,2024,Simone Biles,United States,1997-03-14,Active,1997,True,United States,https://thegymter.net/simone-biles/,0,1
4,May 17-18,U.S. Classic,AA,15.600,14.550,14.550,14.800,59.500,2024,Simone Biles,United States,1997-03-14,Active,1997,True,United States,https://thegymter.net/simone-biles/,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6515,May 22-26,Pan American Championships,QF,12.667,11.000,10.733,12.233,46.633,2024,Lynnzee Brown,"Haiti, formerly United States",1998-09-04,"Active, NCAA (Denver)",1998,True,Haiti,https://thegymter.net/lynnzee-brown/,0,0
6516,Oct 21-25,Pan American Games,AA,13.400,11.000,11.533,12.200,48.133,2023,Lynnzee Brown,"Haiti, formerly United States",1998-09-04,"Active, NCAA (Denver)",1998,True,Haiti,https://thegymter.net/lynnzee-brown/,0,0
6517,Oct 21-25,Pan American Games,QF,13.400,11.066,11.300,11.866,47.632,2023,Lynnzee Brown,"Haiti, formerly United States",1998-09-04,"Active, NCAA (Denver)",1998,True,Haiti,https://thegymter.net/lynnzee-brown/,0,0
6518,Sep 30-Oct 8,World Championships,QF,13.500,11.100,11.166,12.000,47.766,2023,Lynnzee Brown,"Haiti, formerly United States",1998-09-04,"Active, NCAA (Denver)",1998,True,Haiti,https://thegymter.net/lynnzee-brown/,0,0


## Mark domestic and international competitions

In [320]:
#Count how many score rows each competition has, most frequent first.
#The columns are named explicitly because value_counts() labels its output
#differently before and after pandas 2.0; a rename keyed on 'index' would
#leave comp_df without a 'competition' column on newer versions.
comp_df = (
    full_df['competition']
    .value_counts()
    .rename_axis('competition')
    .reset_index(name='number_of_appearances')
)
#Every competition starts out flagged as domestic
comp_df['domestic'] = 1
comp_df

Unnamed: 0,competition,number_of_appearances,domestic
0,World Championships,539,1
1,European Championships,437,1
2,City of Jesolo Trophy,193,1
3,U.S. Championships,136,1
4,Australian Championships,117,1
...,...,...,...
392,Cottbus Challenge Cup,1,1
393,Blume Memorial,1,1
394,Portugese Championships,1,1
395,Portuguese Super Finals,1,1


In [341]:
#Row labels of comp_df to be flagged as not domestic
#NOTE(review): presumably hand-picked from the comp_df listing; the labels follow the value_counts
#ordering, so they need re-checking whenever the scraped data changes
int_meets = [0,1,2,4,5,6,7,9,10,12,15,16,18,19,24,25,26,28,29,30,31,32,35,36,37,45,53,62,68,74,78,79,81,86,87,92,98,105,107,110,112,114,115,119,
             142,163,172,178,182,185,188,191,204,220,234,236,239,252,256,263,264,276,277,280,297,288,291,299,304,325,326,327,392]

In [343]:
#Set domestic = 0 on every row whose index label is listed in int_meets; other rows are left unchanged
comp_df.loc[comp_df.index.isin(int_meets),'domestic'] = 0

In [344]:
#Display comp_df to check the updated domestic flags
comp_df

Unnamed: 0,competition,number_of_appearances,domestic
0,World Championships,539,0
1,European Championships,437,0
2,City of Jesolo Trophy,193,0
3,U.S. Championships,136,1
4,Australian Championships,117,0
...,...,...,...
392,Cottbus Challenge Cup,1,0
393,Blume Memorial,1,1
394,Portugese Championships,1,1
395,Portuguese Super Finals,1,1


In [345]:
#Attach the domestic flag to every score row by matching on the competition name
#(inner join: only rows whose competition also appears in comp_df are kept)
final_df = pd.merge(full_df,comp_df[['competition','domestic']],how='inner',on='competition')
final_df

Unnamed: 0,date,competition,type,vt,ub,bb,fx,aa,year,name,country,birthdate,status,birthyear,senior,team,link,alternate,team_qualified,domestic
0,Jun 27-30,U.S. Olympic Trials,AA,15.500,14.200,13.900,14.725,58.325,2024,Simone Biles,United States,1997-03-14,Active,1997,True,United States,https://thegymter.net/simone-biles/,0,1,1
1,Jun 27-30,U.S. Olympic Trials,QF,15.975,14.425,13.650,14.850,58.900,2024,Simone Biles,United States,1997-03-14,Active,1997,True,United States,https://thegymter.net/simone-biles/,0,1,1
2,Jun 24-27,U.S. Olympic Trials,AA,15.400,13.833,13.700,14.600,57.533,2021,Simone Biles,United States,1997-03-14,Active,1997,True,United States,https://thegymter.net/simone-biles/,0,1,1
3,Jun 24-27,U.S. Olympic Trials,QF,15.466,14.600,15.133,15.366,60.565,2021,Simone Biles,United States,1997-03-14,Active,1997,True,United States,https://thegymter.net/simone-biles/,0,1,1
4,Jul 8-10,U.S. Olympic Trials,AA,16.200,14.900,14.750,15.550,61.400,2016,Simone Biles,United States,1997-03-14,Active,1997,True,United States,https://thegymter.net/simone-biles/,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6515,Aug 18-20,Pharaoh’s Cup,QF,0.000,10.600,0.000,12.050,0.000,2023,Jana Mahmoud,Egypt,2004-08-17,Active,2004,True,Egypt,https://thegymter.net/jana-mahmoud/,0,0,1
6516,Jun 12-15,Asian Junior Cup,EF,12.900,11.050,10.500,12.800,0.000,2019,Jana Mahmoud,Egypt,2004-08-17,Active,2004,False,Egypt,https://thegymter.net/jana-mahmoud/,0,0,1
6517,Jun 12-15,Asian Junior Cup,AA,12.900,10.600,12.800,12.050,48.350,2019,Jana Mahmoud,Egypt,2004-08-17,Active,2004,False,Egypt,https://thegymter.net/jana-mahmoud/,0,0,1
6518,Mar 9,Egyptian Championships,AA,11.900,9.066,10.866,10.500,42.332,2017,Jana Mahmoud,Egypt,2004-08-17,Active,2004,False,Egypt,https://thegymter.net/jana-mahmoud/,0,0,1


In [346]:
#Save the final table to a CSV file in the working directory
#(to_csv also writes the row index as the first, unnamed column by default)
final_df.to_csv('ScoreResults2024AndEarlier.csv')