# New York City Scraper

    Years: 2014 - 2019
    URL: https://results.nyrr.org/event/M{year}/finishers
    
    Uses multithreading and Selenium to process roughly 5,100 results per worker thread.  Warning: each worker consumes about 1 GB of RAM, so choose the number of workers carefully.

In [1]:
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import ElementNotVisibleException, ElementNotSelectableException, NoSuchElementException, TimeoutException, UnexpectedAlertPresentException, StaleElementReferenceException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import concurrent.futures
import multiprocessing
import time

# Helper Methods

In [2]:
# Filesystem path of the chromedriver executable; handed to webdriver.Chrome
# as executable_path by the helpers below.
DRIVER_PATH = 'C:/dev/chromedriver.exe'

def find_page_limit(url):
    """Return the finisher count displayed on the results page.

    Opens ``url`` in a fresh Chrome session, reads the text of the counter
    span inside the navigation tabs and returns it unchanged.

    Args:
        url: Address of the finishers page to open.

    Returns:
        The counter's text as a string (the caller converts it to ``int``).

    Raises:
        NoSuchElementException: If the counter element cannot be located.
    """
    options = Options()
    options.add_argument("--window-size=1920,1200")
    driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)

    try:
        # Open the results page
        driver.get(url)

        counter = driver.find_element(
            By.XPATH,
            "//ul[@class='submenu nav nav-tabs']//span[@class='ng-binding']",
        )
        return counter.text
    finally:
        # Always shut the browser down, even when the lookup fails, so the
        # Chrome process does not outlive this call.
        driver.quit()
    

In [3]:
def generate_urls(limit, base_url):
    """Queue one starting URL per worker-sized slice of the results.

    There are 51 results per page and each worker grabs 100 pages, so the
    starting offsets begin at 1 and advance by 5100 for as long as they do
    not exceed ``limit``.  Every URL is appended to the module-level
    ``pages_to_scrape`` list; nothing is returned.

    Args:
        limit: Total number of results available.
        base_url: Finishers page address the offset fragment is added to.
    """
    for first_result in range(1, limit + 1, 5100):
        slice_url = base_url + f'#page=1&opf={first_result}'
        pages_to_scrape.append(slice_url)

In [4]:
def scrape_results(url):
    """Scrape one worker's slice of results, starting at ``url``.

    Opens ``url`` in its own Chrome session, switches the results grid to
    the narrow view, and hands the driver to ``try_do_scrape``.  The browser
    is closed when the function exits, whether or not the scrape succeeded.

    Args:
        url: Results page address, including the starting-offset fragment.
    """
    options = Options()
    options.add_argument("--window-size=1920,1200")
    driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)

    try:
        # Open the results page
        driver.get(url)

        # Give the page some time to load
        time.sleep(5)

        # Change to narrow view
        driver.find_element(
            By.XPATH, "//span[@class='grid-row-btn narrow-row-icon']"
        ).click()

        # We're opening several pages all at once, so give it a little more
        # time to load
        time.sleep(5)

        try_do_scrape(driver)
    finally:
        # Quit in a finally block: a failed click or scrape must not leave a
        # Chrome instance running for the rest of the session.
        driver.quit()

In [5]:
def try_do_scrape(driver):
    """Expand the results list, then parse everything loaded on the page.

    Clicks the 'Load More Results' button up to 100 times and then passes
    the page source to ``parse_page``.

    Args:
        driver: Selenium WebDriver that is already showing the results page.
    """
    for _ in range(100):
        try:
            # Wait (up to 20 seconds) for the button to be clickable, then
            # load the next batch of results.
            next_button = WebDriverWait(driver, 20).until(
                EC.element_to_be_clickable(
                    (By.XPATH, "//a[@class='button-load-more']")
                )
            )
            next_button.click()
        except (NoSuchElementException, TimeoutException):
            # The last page of results has no 'Load More' button, so the wait
            # times out there.  Catch that once and stop, rather than waiting
            # out the full timeout on every remaining iteration.
            break

        # Short pause before looking for the button again (presumably lets
        # the newly loaded rows render; confirm against the site).
        time.sleep(0.5)

    # Send the fully expanded page to parse_page
    parse_page(driver.page_source)

In [6]:
# Parse a page of marathon finishers, adding the results to the master list
def parse_page(html):
    """Parse a page of marathon finishers into the master result list.

    Every ``div`` with class ``cmd-finisher ng-scope`` becomes one dict with
    the keys ``place``, ``name``, ``gender``, ``age``, ``nationality``,
    ``bib_number`` and ``time_full``; each dict is appended to the
    module-level ``masterResults`` list.

    Args:
        html: HTML source of a results page.
    """
    soup = BeautifulSoup(html, 'html.parser')

    page_finishers = soup.find_all('div', 'cmd-finisher ng-scope')

    for finisher in page_finishers:

        name = finisher.find('div', 'name rms-grid-line ng-binding').get_text().strip()

        details = finisher.find('div', 'details rms-grid-line').find_all('span')
        # The first span holds gender and age run together: the first
        # character is taken as gender, the remainder as age.
        gender_age = details[0].get_text()
        gender = gender_age[0:1]
        age = gender_age[1:]
        nationality = details[1].get_text()

        # Remove the literal "Bib" label.  str.strip("Bib ") would instead
        # strip any of the characters 'B', 'i', 'b' and ' ' from both ends
        # and would leave other surrounding whitespace in place.
        bib_number = finisher.find('span', attrs={"ng-if": "eventFinisher.bib"}).get_text().strip()
        if bib_number.startswith("Bib"):
            bib_number = bib_number[len("Bib"):].strip()

        # Drop the leading label characters (presumably "Time" and "Place";
        # confirm against the page markup).
        time_full = finisher.find('span', 'result right-floated-item long-text').get_text()[4:]
        place = finisher.find('span', 'result right-floated-item mid-text').get_text()[5:]

        result = {
            "place": place,
            "name": name,
            "gender": gender,
            "age": age,
            "nationality": nationality,
            "bib_number": bib_number,
            "time_full": time_full,
        }

        masterResults.append(result)

In [7]:
def cleanup_and_save(year, limit):
    """Write the collected results for ``year`` to a CSV file.

    Builds a DataFrame from the module-level ``masterResults`` list, removes
    duplicate rows, tags each row with the race name and year, saves the
    frame as ``results-{year}.csv`` and prints a summary line.

    Args:
        year: Event year; used in the file name and the ``Year`` column.
        limit: Expected number of results, shown in the summary line.
    """
    # Dedupe, just in case we gathered duplicate pages
    finishers = pd.DataFrame(masterResults).drop_duplicates(ignore_index=True)

    # Add the event name and year
    finishers = finishers.assign(Race='New York City', Year=year)

    # Save to csv
    finishers.to_csv(f'results-{year}.csv', index=False)

    gathered = finishers.shape[0]
    print(f"Scraping complete.  Gathered {gathered} of {limit} results.")

# Do the scrape

In [12]:
###### SET YEAR HERE #######
year = '2019'
############################

url = f'https://results.nyrr.org/event/M{year}/finishers'

# Find the total number of results reported by the site
limit = int(find_page_limit(url))

# Generate the urls to scrape (generate_urls fills pages_to_scrape,
# parse_page fills masterResults)
pages_to_scrape = []
masterResults = []
generate_urls(limit, url)

# Scrape the urls

## We'll use multithreading to speed up the scraping process significantly
## However, performance seems to suffer when using more than 4 workers

with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    # Submit each slice explicitly and inspect every future.  executor.map()
    # only re-raises a worker's exception when its results are iterated, so
    # discarding its return value hides failed slices.
    futures = {
        executor.submit(scrape_results, page_url): page_url
        for page_url in pages_to_scrape
    }
    for future in concurrent.futures.as_completed(futures):
        error = future.exception()
        if error is not None:
            # Report the failure but keep going so the slices that did
            # succeed are still saved below.
            print(f"Scrape failed for {futures[future]}: {error!r}")
cleanup_and_save(year, limit)

Scraping complete.  Gathered 53520 of 53520 results.
