# Berlin Scraper

    Years: X - X
    URL: https://www.bmw-berlin-marathon.com/en/impressions/statistics-and-history/results-archive/
    

In [18]:
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import ElementNotVisibleException, ElementNotSelectableException, NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
import concurrent.futures
import time
import multiprocessing

# Define helper methods

In [9]:
# Parse a page of marathon finishers, adding the results to the master list
def parse_page(html):
    
    soup = BeautifulSoup(html, 'html.parser')

    page_finishers = soup.find("table").tbody.find_all("tr")

    for finisher in page_finishers:
        
        if not finisher.has_attr('class') or 'shown' not in finisher['class']:
            split_list = finisher.find("ul", "splits")
            if split_list is None:
                continue
            splits = split_list.find_all('li')
            
            split_5k = splits[0].find_all('span')[1].get_text()
            split_10k = splits[1].find_all('span')[1].get_text()
            split_15k = splits[2].find_all('span')[1].get_text()
            split_20k = splits[3].find_all('span')[1].get_text()
            time_half = splits[4].find_all('span')[1].get_text()
            split_25k = splits[5].find_all('span')[1].get_text()
            split_30k = splits[6].find_all('span')[1].get_text()
            split_35k = splits[7].find_all('span')[1].get_text()
            split_40k = splits[8].find_all('span')[1].get_text()
            
            split_times = {
                'split_5k' : split_5k,
                'split_10k' : split_10k,
                'split_15k' : split_15k,
                'split_20k' : split_20k,
                'time_half' : time_half,
                'split_25k' : split_25k,
                'split_30k' : split_30k,
                'split_35k' : split_35k,
                'split_40k' : split_40k
            }
            
            if len(masterResults) > 0:
                masterResults[len(masterResults)-1].update(split_times)
            continue
        
        cells = finisher.find_all('td')
        
        #skip header row
        if cells is None or len(cells) == 0:
            continue

        place_overall = cells[1].get_text()
        
        first_name = cells[2].get_text()
        last_name = cells[3].get_text()
        
        nationality = cells[4].get_text()
        club = cells[6].get_text()
        gender = cells[7].get_text()
        
        time_full = cells[8].get_text()

        result = {
            "place_overall" : place_overall,
            "first_name" : first_name,
            "last_name" : last_name,
            "nationality" : nationality,
            "club" : club,
            "time_full" : time_full,
            "gender" : gender
        }

        masterResults.append(result)

    time.sleep(0.25)

In [14]:
def find_pagination_limit(html):
    soup = BeautifulSoup(html, 'html.parser')
    
    #The second to last item in the pagination object displays the final page number, that's what we want
    pagination = soup.find("ul", "pagination").find_all('li')
    li_length = len(pagination)
    last_page = int(pagination[li_length - 2].text)
    
    return last_page

In [11]:
def cleanup_and_save(year):
    df_results = pd.DataFrame(masterResults)
    
    #add the event name
    df_results['Race'] = 'Berlin'
    
    #add the year
    df_results['year'] = year
    
    #save to csv
    df_results.to_csv(f'results-{year}.csv', index=False)
    print(f"Scraping complete.  Gathered {df_results.shape[0]} results")

# Do the scraping

In [17]:
DRIVER_PATH = 'D:/dev/chromedriver.exe'
options = Options()
options.add_argument("--window-size=1920,1200")
driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)
actions = ActionChains(driver)

#open the results page
driver.get('https://www.bmw-berlin-marathon.com/en/impressions/statistics-and-history/results-archive/')

time.sleep(1)

#Close the cookie modal if it exists
try:
    driver.find_element_by_xpath("//button[@aria-label='Accept all']").click()
except NoSuchElementException:
    print('Cookie modal not present')
    
#select the appropriate year from the dropdown
select_element = driver.find_element_by_xpath(f"//select[@class='events border-input']")
select_options = select_element.find_elements_by_tag_name('option')

#there is a dropdown that controls which year of results are shown
#get the dropdown, and create a dictionary with each year and the WebDriver element needed to select that year
year_map = dict()
for opt in select_options:
    year = opt.text[0:4]
    year_map[year] = opt
    
#Create a list to store the results
masterResults = []


#####   CHOOSE YEAR   ####
year = '2018'
    
year_map.get(year).click()
time.sleep(0.25)

#initiatize pagination
current_page = 1
last_page = find_pagination_limit(driver.page_source)

#loop through each page and gather the results
while current_page < last_page:

    ## First, "click" on each tr to expand it

    ### Wait fot the 'loading' modal to go away
    wait = WebDriverWait(driver, 10)
    wait.until(EC.element_to_be_clickable((By.XPATH, '//table/tbody/tr')))
    
    current_page = int(driver.find_element_by_xpath("//li[@class='paginate_button page-item active']").text)
    print(f"On page {current_page} of {last_page}")

    ### Grab the table rows
    result_row = driver.find_element_by_xpath('//table/tbody').find_elements_by_tag_name('tr')

    ### Click 'em all
    for row in result_row:
        row.click()

    ### Second, send the table to parse_page
    parse_page(driver.page_source)

    num_results = len(masterResults)
    print(f"Total results so far: {num_results}")

    ### Last, find and click the next page button
    next_button = driver.find_element_by_xpath("//a[text()='Next']")
    next_button.send_keys(Keys.Enter)
    wait.until(EC.staleness_of(next_button))
    driver.execute_script("arguments[0].scrollIntoView();", select_element)

cleanup_and_save(year)

On page 1 of 2033
Total results so far: 20
Total results so far: 40
Total results so far: 60
Total results so far: 80
Total results so far: 100
Total results so far: 120
Total results so far: 140
Total results so far: 160
Total results so far: 180
Total results so far: 200
Total results so far: 220
Total results so far: 240
Total results so far: 260
Total results so far: 280
Total results so far: 300
Total results so far: 320
Total results so far: 340
Total results so far: 360
Total results so far: 380
Total results so far: 400
Total results so far: 420
Total results so far: 440
Total results so far: 460
Total results so far: 480
Total results so far: 500
Total results so far: 520
Total results so far: 540
Total results so far: 560
Total results so far: 580
Total results so far: 600
Total results so far: 620
Total results so far: 640
Total results so far: 660
Total results so far: 680
Total results so far: 700
Total results so far: 720
Total results so far: 740
Total results so far: 76

Total results so far: 6120
Total results so far: 6140
Total results so far: 6160
Total results so far: 6180
Total results so far: 6200
Total results so far: 6220
Total results so far: 6240
Total results so far: 6260
Total results so far: 6280
Total results so far: 6300
Total results so far: 6320
Total results so far: 6340
Total results so far: 6360
Total results so far: 6380
Total results so far: 6400
Total results so far: 6420
Total results so far: 6440
Total results so far: 6460
Total results so far: 6480
Total results so far: 6500
Total results so far: 6520
Total results so far: 6540
Total results so far: 6560
Total results so far: 6580
Total results so far: 6600
Total results so far: 6620
Total results so far: 6640
Total results so far: 6660
Total results so far: 6680
Total results so far: 6700
Total results so far: 6720
Total results so far: 6740
Total results so far: 6760
Total results so far: 6780
Total results so far: 6800
Total results so far: 6820
Total results so far: 6840
T

UnexpectedAlertPresentException: Alert Text: DataTables warning: table id=DataTables_Table_1 - Ajax error. For more information about this error, please see http://datatables.net/tn/7
Message: unexpected alert open: {Alert text : DataTables warning: table id=DataTables_Table_1 - Ajax error. For more information about this error, please see http://datatables.net/tn/7}
  (Session info: chrome=91.0.4472.77)
