# New York City Scraper

    Years: X - X
    URL: https://results.nyrr.org/event/M{year}/finishers
    
    
    

In [1]:
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import ElementNotVisibleException, ElementNotSelectableException, NoSuchElementException, TimeoutException, UnexpectedAlertPresentException, StaleElementReferenceException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import time

# Define helper methods

In [2]:
# Parse a page of marathon finishers, adding the results to the master list
def parse_page(html):
    """Extract every finisher row from a results page into ``masterResults``.

    ``html`` is the page markup (e.g. ``driver.page_source``). One dict per
    finisher is appended to the module-level ``masterResults`` list; nothing
    is returned. The function pauses for a quarter second before returning.
    """
    document = BeautifulSoup(html, 'html.parser')

    for row in document.find_all('div', class_='cmd-finisher ng-scope'):
        runner_name = row.find('div', class_='name rms-grid-line ng-binding').get_text().strip()

        spans = row.find('div', class_='details rms-grid-line').find_all('span')
        # First span packs gender and age together: one character of gender,
        # then the age.
        gender_and_age = spans[0].get_text()
        country = spans[1].get_text()
        # The slices below drop a fixed-width prefix from each field
        # (presumably the "Bib ", "Time" and "Place" labels -- TODO confirm
        # against the live markup).
        bib = spans[2].get_text()[4:]

        finish_time = row.find('span', class_='result right-floated-item long-text').get_text()[4:]
        overall_place = row.find('span', class_='result right-floated-item mid-text').get_text()[5:]

        # Key order determines the column order of the resulting DataFrame.
        masterResults.append({
            "place": overall_place,
            "name": runner_name,
            "gender": gender_and_age[:1],
            "age": gender_and_age[1:],
            "nationality": country,
            "bib_number": bib,
            "time_full": finish_time,
        })

    time.sleep(0.25)

In [3]:
def cleanup_and_save(year):
    """Write the collected results for ``year`` to ``results-{year}.csv``.

    Builds a DataFrame from the module-level ``masterResults`` list, tags each
    row with the race name and the given year, removes duplicate rows, saves
    the table without its index and prints how many rows were written.
    """
    frame = pd.DataFrame(masterResults)

    # Tag every row with the event name and the edition being scraped.
    frame['Race'] = 'New York City'
    frame['year'] = year

    # Drop repeated rows and renumber the index from zero.
    frame = frame.drop_duplicates(ignore_index=True)

    output_name = f'results-{year}.csv'
    frame.to_csv(output_name, index=False)
    print(f"Scraping complete.  Gathered {len(frame)} results")

# Do the scraping

In [6]:
def try_do_scrape():
    """Expand the results list as far as it goes, then parse it once.

    Repeatedly finds and clicks the "load more" link
    (``//a[@class='button-load-more']``) on the module-level ``driver``,
    pausing briefly after each click. As soon as Selenium cannot find or
    click the link, the loop stops and the current page source is handed to
    ``parse_page``.

    Only Selenium errors end the loop. Anything else (a programming error,
    ``KeyboardInterrupt``) propagates instead of being silently treated as
    "no more pages".
    """
    # Imported locally so this function carries its own dependency; it is the
    # base class of every exception Selenium raises.
    from selenium.common.exceptions import WebDriverException

    while True:
        try:
            next_button = driver.find_element_by_xpath("//a[@class='button-load-more']")
            # Alternative kept for reference: sending ENTER instead of
            # clicking was noted as producing fewer errors.
            #next_button.send_keys(Keys.ENTER)
            next_button.click()
        except WebDriverException:
            # The link is missing or not clickable, so stop loading. This is
            # detected by catching the failure once instead of checking for
            # the link's existence before every click.
            break

        # A staleness wait was tried here previously; a short fixed pause is
        # used instead to let the next batch of rows render.
        #wait.until(EC.staleness_of(next_button))
        time.sleep(0.25)

    # Everything that could be loaded is now in the DOM; parse it in one pass.
    parse_page(driver.page_source)

In [32]:
def load_results(place_start):
    """Filter the results list by overall place using the advanced filter.

    Opens the advanced filter panel on the module-level ``driver``, picks the
    third entry of the place-comparison dropdown (the 'Greater than' option),
    types ``place_start`` into the place field and applies the filter.

    ``place_start`` is sent to the input as keystrokes, so it is passed as a
    string such as ``'500'``.
    """
    filter_toggle_xpath = "//div[@class='short-filter button-advanced-filter']"
    compare_select_xpath = "//select[@ng-model='overallPlaceCompareOption']"
    compare_options_xpath = "//select[@ng-model='overallPlaceCompareOption']/option"
    place_field_xpath = "//input[@ng-model='overallPlace']"
    apply_link_xpath = "//a[text()='Apply']"

    # Reveal the advanced filter and give it a moment to open.
    filter_toggle = driver.find_element_by_xpath(filter_toggle_xpath)
    filter_toggle.click()
    time.sleep(0.25)

    # Open the comparison dropdown, then choose its third option
    # ('Greater than').
    compare_select = driver.find_element_by_xpath(compare_select_xpath)
    compare_select.click()
    compare_options = compare_select.find_elements_by_xpath(compare_options_xpath)
    compare_options[2].click()

    # Type the starting place and submit the filter.
    driver.find_element_by_xpath(place_field_xpath).send_keys(place_start)
    driver.find_element_by_xpath(apply_link_xpath).click()

In [10]:
##########################
#####   CHOOSE YEAR   ####
##########################
# Race edition to scrape; interpolated into the results URL below and into
# the output file name when cleanup_and_save(year) is run.
year = '2019'


# Initialize the WebDriver.
# DRIVER_PATH is a machine-specific location of the chromedriver binary;
# adjust it for the local install.
DRIVER_PATH = 'C:/dev/chromedriver.exe'
options = Options()
# Fixed window size so the page is laid out the same way on every run.
options.add_argument("--window-size=1920,1200")
driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)

# Open the finishers page for the chosen year.
driver.get(f'https://results.nyrr.org/event/M{year}/finishers')

# Give the page a second to load before interacting with it.
time.sleep(1)  

# Module-level list that parse_page appends to and cleanup_and_save reads.
masterResults = []

# Switch the results grid to its narrow-row layout
# (presumably the layout parse_page's selectors were written against -- TODO confirm).
driver.find_element_by_xpath("//span[@class='grid-row-btn narrow-row-icon']").click()


##########
## TODO ##
# 1: Find the total number of runners
# 2: load_results(place_start, place_end)
# 3: go directly to starting url: https://results.nyrr.org/event/M2019/finishers#page=1&opf={starting_index}
# 4: click to the end
# 5: load results into dataframe
# 6: combine all dataframes

# The full scrape-and-save run is currently disabled; uncomment both calls to run it.
#try_do_scrape()
        
#cleanup_and_save(year)

In [33]:
# Apply the advanced place filter with a starting place of 500
# (load_results selects the option it labels 'Greater than').
load_results('500')