# **World Championship 2021**

## **49er and 49erFX**

In [1]:
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Command-line flags handed to Chrome; "--headless" is the only one set here.
chrome_flags = ["--headless"]

options = Options()
for chrome_flag in chrome_flags:
    options.add_argument(chrome_flag)

Check your Chrome version by opening chrome://version in the address bar. If your Chrome version is 115 or newer, download the matching chromedriver from https://googlechromelabs.github.io/chrome-for-testing/#stable. Otherwise, download it from https://chromedriver.chromium.org/downloads. After downloading, unzip the file, move it to the same folder as this notebook, and set its path in the next cell.

In [3]:
# Relative path to the chromedriver executable (the instructions above say to
# place the unzipped driver in the same folder as this notebook).
# NOTE(review): no cell visible in this notebook passes this constant to
# Chrome(...) — confirm whether it is still needed or should be wired in.
PATH_TO_CHROMEDRIVER = 'chromedriver-mac-x64/chromedriver'

In [4]:
# Results page of the 2021 World Championship; the '#result-49' fragment is
# meant to point at the 49er results.
URL = 'https://49er.org/events/2021-world-championship/#result-49'

# Load the page in Chrome and capture its HTML. The browser is always shut
# down afterwards — even when loading fails — so no Chrome/chromedriver
# process is left running once this cell finishes.
driver = Chrome(options=options)
try:
    driver.get(URL)
    html = driver.page_source
finally:
    driver.quit()

# Parse the captured HTML so the result tables can be walked row by row.
soup = BeautifulSoup(html, "html.parser")

Even though the URL targets only the 49er results, the 49erFX results are extracted as well. We want both datasets, so we keep them.

In [5]:
# Collect the text of every table row on the page as a list of cell strings.
# Rows that contain no <td> cells (presumably header rows) produce an empty
# list; they must be kept because the 49er/49erFX split further down selects
# rows by absolute position.
data = []

for row in soup.find_all('tr'):
    row_values = []
    for cell in row.find_all('td'):
        cell_text = cell.get_text()
        # If the cell contains a <strike> tag, add () around its text.
        # get_text() is used instead of cell.string because .string is None
        # whenever the cell has more than one child node, which would make
        # the string concatenation raise TypeError.
        if cell.find('strike'):
            cell_text = '(' + cell_text + ')'
        row_values.append(cell_text)
    data.append(row_values)

In [6]:
# Column labels for the scraped table: four leading columns, the race columns
# Q1..Q17, and the total score as the last column.
column_names = (
    ['Posição Geral', 'Sail Number', 'Nome Competidor', 'Nett']
    + [f'Q{race}' for race in range(1, 18)]
    + ['Pontuação Total']
)

# Build the DataFrame from the scraped rows and label its columns.
df = pd.DataFrame(data)
df.columns = column_names

# Remove the sail number column.
df = df.drop(columns='Sail Number')

In [7]:
# Divide the combined table into 49er and 49erFX by row position.
# NOTE(review): the bounds are hard-coded (rows 1-36 -> 49er, rows 38-59 ->
# 49erFX) and depend on the page layout at scraping time — re-check them if
# the page changes.
# .copy() gives each class an independent frame, so the later column drop and
# column assignments do not operate on a slice of `df` (that situation emits
# SettingWithCopyWarning, which the global warnings filter hides here).
df_49er = df.iloc[1:37, :].copy()
df_49erFX = df.iloc[38:60, :].copy()

In [8]:
# Remove the Q17 column from the 49erFX table.
df_49erFX = df_49erFX.drop(columns=['Q17'])

In [9]:
# Reshape both tables from wide to long: every Q column becomes its own row,
# while the identifying columns are repeated on each of those rows.
id_columns = ['Posição Geral', 'Nome Competidor', 'Nett', 'Pontuação Total']

# Q1..Q17 are melted for 49er, Q1..Q16 for 49erFX.
race_columns_49er = [f'Q{race}' for race in range(1, 18)]
race_columns_49erFX = [f'Q{race}' for race in range(1, 17)]

df_49er = pd.melt(df_49er, id_vars=id_columns, value_vars=race_columns_49er)
df_49erFX = pd.melt(df_49erFX, id_vars=id_columns, value_vars=race_columns_49erFX)

In [10]:
# Rename the columns produced by melt: 'variable' -> 'Flotilha' and
# 'value' -> 'Pontuação Regata'.
melt_column_names = {'variable': 'Flotilha', 'value': 'Pontuação Regata'}

df_49er = df_49er.rename(columns=melt_column_names)
df_49erFX = df_49erFX.rename(columns=melt_column_names)

# Copy the race label into its own 'Regata' column.
df_49er["Regata"] = df_49er["Flotilha"]
df_49erFX["Regata"] = df_49erFX["Flotilha"]

In [11]:
# define functions to rename fleet names
def rename_fleet_49er(x):
    """Map a 49er race label to its fleet code.

    Returns 'MR' when ``x`` equals 'Q17' and 'G' for every other value.
    """
    return 'MR' if x == 'Q17' else 'G'

def rename_fleet_49erFX(x):
    """Map a 49erFX race label to its fleet code.

    Returns 'MR' when ``x`` equals 'Q16' and 'G' for every other value.
    """
    return 'MR' if x == 'Q16' else 'G'

# Replace the race labels in 'Flotilha' with fleet codes, using the
# class-specific renaming function for each table.
for frame, fleet_renamer in ((df_49er, rename_fleet_49er),
                             (df_49erFX, rename_fleet_49erFX)):
    frame['Flotilha'] = frame['Flotilha'].apply(fleet_renamer)

In [12]:
# Add the constant metadata columns to both tables; only the sailing class
# differs between them.
for frame, sail_class in ((df_49er, '49er'), (df_49erFX, '49erFX')):
    row_count = len(frame)
    frame['Nome Competição'] = ['World Championship 2021'] * row_count
    frame['ID Competição'] = [11] * row_count
    frame['Classe Vela'] = [sail_class] * row_count
    frame['Punição'] = [''] * row_count

In [13]:
# Column layout applied to both tables.
column_order = [
    'Nome Competidor',
    'ID Competição',
    'Classe Vela',
    'Pontuação Regata',
    'Flotilha',
    'Posição Geral',
    'Punição',
    'Pontuação Total',
    'Nett',
    'Nome Competição',
    'Regata',
]

df_49er = df_49er.loc[:, column_order]
df_49erFX = df_49erFX.loc[:, column_order]

In [14]:
# Preview the first five rows of the 49er table.
df_49er.head(n=5)

Unnamed: 0,Nome Competidor,ID Competição,Classe Vela,Pontuação Regata,Flotilha,Posição Geral,Punição,Pontuação Total,Nett,Nome Competição,Regata
0,Bart LAMBRIEXFloris van de WERKEN,11,49er,3.0,G,1,,118.0,92.0,World Championship 2021,Q1
1,Tim FISCHERFabian GRAF,11,49er,1.0,G,2,,110.0,94.0,World Championship 2021,Q1
2,Frederik RASKJakob PRECHT JENSEN,11,49er,4.0,G,3,,135.0,112.0,World Championship 2021,Q1
3,Ian BARROWSHans HENKEN,11,49er,9.0,G,4,,135.0,113.0,World Championship 2021,Q1
4,Dominik BUKSAKSzymon WIERZBICKI,11,49er,10.0,G,5,,134.0,118.0,World Championship 2021,Q1


In [15]:
# Preview the first five rows of the 49erFX table.
df_49erFX.head(n=5)

Unnamed: 0,Nome Competidor,ID Competição,Classe Vela,Pontuação Regata,Flotilha,Posição Geral,Punição,Pontuação Total,Nett,Nome Competição,Regata
0,Odile van AANHOLTElise RUYTER,11,49erFX,1.0,G,1,,93.0,75.0,World Championship 2021,Q1
1,Helene NÆSSMarie RØNNINGEN,11,49erFX,5.0,G,2,,103.0,83.0,World Championship 2021,Q1
2,Martine GRAELKahena KUNZE,11,49erFX,10.0,G,3,,112.0,94.0,World Championship 2021,Q1
3,Ronja GRÖNBLOMVeera HOKKA,11,49erFX,8.0,G,4,,134.0,112.0,World Championship 2021,Q1
4,Patricia SUÁREZMaria CANTERO IZQUIERDO,11,49erFX,7.0,G,5,,141.0,121.0,World Championship 2021,Q1


In [16]:
# Write each class table to its own CSV file, leaving out the index column.
csv_exports = (
    (df_49er, '../scraped-data/new_World Championship 2021_49er.csv'),
    (df_49erFX, '../scraped-data/new_World Championship 2021_49erFX.csv'),
)
for table, csv_path in csv_exports:
    table.to_csv(csv_path, index=False)