# Berlin Scraper

    Years: 2009 (so far) - 2019
    URL: https://www.bmw-berlin-marathon.com/en/impressions/statistics-and-history/results-archive/
    

In [None]:
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import ElementNotVisibleException, ElementNotSelectableException, NoSuchElementException, TimeoutException, UnexpectedAlertPresentException, StaleElementReferenceException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import time

# Define helper methods

In [None]:
# Parse a page of marathon finishers, adding the results to the master list
def parse_page(html):
    """Parse one page of finisher rows and add them to ``masterResults``.

    The first ``<table>`` in ``html`` is read row by row. Rows fall into two
    groups, told apart by the ``shown`` CSS class:

    * A row carrying the ``shown`` class is treated as a finisher row. Its
      cells are turned into a result dict that is appended to the
      module-level ``masterResults`` list.
    * Any other row is treated as a detail row. If it contains a
      ``<ul class="splits">``, the split times found there are merged into
      the most recently appended result; otherwise the row is skipped.

    Rows that do not have enough cells, and split entries that do not have a
    second ``<span>``, are skipped instead of raising ``IndexError``.

    Args:
        html: HTML source of the results page.

    Returns:
        None. Results are accumulated in ``masterResults``.
    """
    # Result keys in the order the split <li> items are read from the page.
    split_keys = (
        'split_5k',
        'split_10k',
        'split_15k',
        'split_20k',
        'time_half',
        'split_25k',
        'split_30k',
        'split_35k',
        'split_40k',
    )
    # Highest cell index read below is 8, so a finisher row needs 9 cells.
    min_cells = 9

    soup = BeautifulSoup(html, 'html.parser')
    page_finishers = soup.find("table").tbody.find_all("tr")

    for finisher in page_finishers:

        if 'shown' not in finisher.get('class', []):
            split_list = finisher.find("ul", "splits")
            if split_list is None:
                continue

            split_times = {}
            for key, item in zip(split_keys, split_list.find_all('li')):
                spans = item.find_all('span')
                # The time is read from the second <span> of each entry.
                if len(spans) > 1:
                    split_times[key] = spans[1].get_text()

            # Splits belong to the finisher row that was parsed just before.
            if masterResults:
                masterResults[-1].update(split_times)
            continue

        cells = finisher.find_all('td')

        # Skip header rows and any row too short to hold a full result.
        if len(cells) < min_cells:
            continue

        masterResults.append({
            "place_overall": cells[1].get_text(),
            "first_name": cells[2].get_text(),
            "last_name": cells[3].get_text(),
            "nationality": cells[4].get_text(),
            "club": cells[6].get_text(),
            "time_full": cells[8].get_text(),
            "gender": cells[7].get_text(),
        })

    time.sleep(0.25)

In [None]:
def find_pagination_limit(html):
    """Return the number of the last results page.

    Args:
        html: HTML source of a page containing a ``<ul class="pagination">``.

    Returns:
        The text of the second-to-last ``<li>`` of the pagination list,
        converted to ``int``.
    """
    page_items = (
        BeautifulSoup(html, 'html.parser')
        .find("ul", "pagination")
        .find_all('li')
    )

    # The second-to-last pagination entry displays the final page number.
    final_page_item = page_items[len(page_items) - 2]
    return int(final_page_item.text)

In [None]:
def cleanup_and_save(year):
    """Write everything gathered in ``masterResults`` to ``results-<year>.csv``.

    The results are loaded into a DataFrame, tagged with the event name and
    the year, de-duplicated, and saved without the index column. A summary
    line with the number of saved rows is printed afterwards.

    Args:
        year: Value stored in the ``year`` column and used in the file name.
    """
    # Tag every row with the event name and the year being scraped.
    tagged = pd.DataFrame(masterResults).assign(Race='Berlin', year=year)

    # Pages re-scraped after an error produce repeated rows; drop them.
    unique_results = tagged.drop_duplicates(ignore_index=True)

    unique_results.to_csv(f'results-{year}.csv', index=False)

    row_count = unique_results.shape[0]
    print(f"Scraping complete.  Gathered {row_count} results")

# Do the scraping

In [None]:
def try_do_scrape(current_page, last_page):
    """Scrape result pages one after another, starting from the page on screen.

    For every page this expands each table row by clicking it, hands the page
    source to ``parse_page``, and then presses the 'Next' button. It relies on
    the module-level ``driver`` and ``select_element``.

    The loop is only left through an exception. On the final page the 'Next'
    button never goes stale, so the wait below raises ``TimeoutException``;
    the caller uses that to detect completion.

    Args:
        current_page: Page number to start from. It is refreshed from the
            active pagination button on every iteration.
        last_page: Number of the final page of results.

    Raises:
        TimeoutException: If the table rows do not become clickable, or the
            'Next' button does not go stale, within 10 seconds.
    """
    wait = WebDriverWait(driver, 10)
    active_page_xpath = "//li[@class='paginate_button page-item active']"

    # '<=' so that a result set with a single page is still scraped.
    while current_page <= last_page:

        ### Wait for the 'loading' modal to go away
        wait.until(EC.element_to_be_clickable((By.XPATH, '//table/tbody/tr')))

        current_page = int(driver.find_element(By.XPATH, active_page_xpath).text)
        print(f"Page {current_page} of {last_page}: {len(masterResults)} results so far")

        ## First, "click" on each tr to expand it
        table_body = driver.find_element(By.XPATH, '//table/tbody')
        for row in table_body.find_elements(By.TAG_NAME, 'tr'):
            row.click()

        ## Second, send the table to parse_page
        parse_page(driver.page_source)

        ## Last, find and click the next page button (using Keys.ENTER results in fewer errors than .click())
        next_button = driver.find_element(By.XPATH, "//a[text()='Next']")
        next_button.send_keys(Keys.ENTER)

        ### Wait for the next button to become stale, meaning we've successfully navigated to the next page of results
        ### This raises TimeoutException on the last page of results, where pressing 'Next' does not navigate
        ### We'll just catch the exception once, rather than checking the button's existence on every single page
        wait.until(EC.staleness_of(next_button))

        ### Scroll back to the top so that the first result row is in view
        driver.execute_script("arguments[0].scrollIntoView();", select_element)

In [None]:
#initialize the WebDriver
DRIVER_PATH = 'C:/dev/chromedriver.exe'
options = Options()
options.add_argument("--window-size=1920,1200")
# NOTE(review): `executable_path` is deprecated in Selenium 4 in favour of a
# Service object - confirm against the installed Selenium version.
driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)

#Open the results page
driver.get('https://www.bmw-berlin-marathon.com/en/impressions/statistics-and-history/results-archive/')

#Give the page a second to load
time.sleep(1)

#Close the cookie modal if it exists
try:
    driver.find_element(By.XPATH, "//button[@aria-label='Accept all']").click()
except NoSuchElementException:
    print('Cookie modal not present')

#Give the page another second, it can be slow sometimes
time.sleep(1)

#There is a dropdown that controls which year of results are shown
#Get the dropdown, and create a dictionary with each year and the WebDriver element needed to select that year
select_element = driver.find_element(By.XPATH, "//select[@class='events border-input']")
select_options = select_element.find_elements(By.TAG_NAME, 'option')
#The first four characters of each option's text are used as the year key
year_map = {opt.text[0:4]: opt for opt in select_options}

#Create a list to store the results
masterResults = []

##########################
#####   CHOOSE YEAR   ####
##########################
year = '2009'

#Fail with a clear message if the chosen year is not offered by the dropdown
if year not in year_map:
    raise KeyError(f"Year {year} not found in dropdown; available: {sorted(year_map)}")

#Select the year, then wait for results to load
year_map[year].click()
time.sleep(0.25)

#initialize pagination
ACTIVE_PAGE_XPATH = "//li[@class='paginate_button page-item active']"
current_page = 1
last_page = find_pagination_limit(driver.page_source)

#loop through each page and gather the results, giving up after 5 recoverable errors
num_error = 0
while num_error < 5:

    try:
        try_do_scrape(current_page, last_page)

    except TimeoutException:
        current_page = int(driver.find_element(By.XPATH, ACTIVE_PAGE_XPATH).text)
        #TimeoutException is thrown on the final page
        #If we're on the final page, quit the loop and save
        if current_page == last_page:
            break
        #Otherwise fall through and retry from the page currently shown

    except (UnexpectedAlertPresentException, StaleElementReferenceException) as ex:
        print(f"**Encountered exception #{num_error+1}**")
        #Selenium exceptions expose their text as `.msg`
        print(ex.msg)
        time.sleep(1)

        current_page = int(driver.find_element(By.XPATH, ACTIVE_PAGE_XPATH).text)

        if current_page == last_page:
            break
        #Go back a page and start over (we'll dedupe during cleanup)
        current_page -= 1
        prev_button = driver.find_element(By.XPATH, "//a[text()='Previous']")
        prev_button.send_keys(Keys.ENTER)
        driver.execute_script("arguments[0].scrollIntoView();", select_element)
        num_error += 1

    else:
        #try_do_scrape returned without raising, so there is nothing left to
        #scrape; stop instead of calling it again forever
        break

cleanup_and_save(year)