# Setup

### Import Libraries

In [43]:
from splinter import Browser
from splinter.exceptions import ElementDoesNotExist
from bs4 import BeautifulSoup as bs
import re
import requests
import time
import pandas as pd
import unicodedata

### Set URLs

In [44]:
# Root of every results URL; the race code is appended to it.
base_url = "https://results.nyrr.org/event/"

# Race to scrape. Other race codes tried so far (swap one in to use it):
#   "20WH5K"  - 2020 Washington Heights 5K. 4942 finishers. yes to gun time, no to splits
#   "a70414"  - 2007 Brooklyn Half. 4853 finishers. no gun time, no splits
#   "19nyc60" - 2019 NYC 60K. 375 finishers, gun time, no splits
race_code = "M2019"  # 2019 NYC Marathon. includes gun time and splits

### Set File Locations

In [45]:
# Set path for chromedriver.
# This is meant to be unpacked as keyword arguments, i.e.
# Browser('chrome', **executable_path), so it has to be a dict keyed by
# 'executable_path'. (A bare {'chromedriver.exe'} literal is a set, which
# cannot be unpacked with **.)
executable_path = {'executable_path': 'chromedriver.exe'}

# Set path for output file

# Scraping

In [4]:
# Start a splinter-driven Chrome session with a visible (non-headless) window
browser = Browser("chrome", headless=False)

In [5]:
# Open website
# The result page for one runner is the base url + race code + "/result/" + bib number.
#
# Example bib numbers for M2019:
#   6    - US runner, with team affiliation
#   9197 - US runner, no team
#   5604 - Non-US runner, no team affiliation
#   106  - Non-US runner, with team affiliation
#
# Example bib numbers for 19nyc60:
#   12 - valid
#   13 - invalid
#
# Special test case: Anonymous runner (no name, no location, no age)
#   url = "https://results.nyrr.org/event/880625/result/12566474"

# bib_number has to be assigned in this cell: with every assignment commented
# out, the cell raised NameError when the notebook was run on a fresh kernel.
bib_number = 12

url = base_url + race_code + "/result/" + str(bib_number)
browser.visit(url)
# short pause to give the site a chance to redirect an invalid bib number
time.sleep(1)

In [6]:
# Detect invalid bib number.
# The site answers an unknown bib number by redirecting to the results home
# page, so landing on any URL other than the requested one means "invalid".
if browser.url != url:
    print('Redirected.')
else:
    print(f"Bib number {bib_number} is valid.")

Bib number 12 is valid.


In [7]:
# Grab the page source from the browser and parse it with the stdlib HTML parser
html = browser.html
soup = bs(markup=html, features="html.parser")

In [51]:
### Demonstration Loop 2
# Visit the result page of every bib number in a small range and scrape each
# page the site actually serves.
#   runners      - one dict per successfully scraped finisher
#   scraping_log - bib number -> "success" | "scraping error" | "DNF"
runners = []
scraping_log = {}
for bib_number in range(0, 20):
    url = f"{base_url}{race_code}/result/{bib_number}"
    browser.visit(url)
    # need a short sleep period to give the site a chance to redirect an invalid bib number.
    # we also don't need to beat on the website too hard.
    # (a randomized delay such as random.uniform(3, 7) would need `import random` first)
    time.sleep(3)
    if browser.url != url:
        # redirected away from the requested page: no result for this bib number
        scraping_log[bib_number] = "DNF"
        continue
    try:
        soup = bs(browser.html, "html.parser")
        runners.append(finisher_scraper(soup))
    except Exception as exc:
        # `except Exception` instead of a bare `except:` so that Ctrl-C still
        # stops the loop, and the cause is printed instead of being discarded
        # (e.g. a NameError when finisher_scraper has not been defined yet).
        scraping_log[bib_number] = "scraping error"
        print(f"Bib number {bib_number}: scraping error ({exc!r})")
    else:
        scraping_log[bib_number] = "success"
print('Loop complete')

Loop complete


In [52]:
runners

[{'name': 'Shura Kitata',
  'geo_subregion': 'Addis Ababa',
  'country': 'ETH',
  'gender': 'M',
  'age': '23',
  'bib': '2',
  'team': 'NIKE',
  'official_time': '2:10:39',
  'pace_per_mile': '04:59',
  'place_overall': '5',
  'place_gender': '5',
  'age_group': '20-24',
  'place_age-group': '1',
  'place_country': '3',
  'place_age‐graded': '7',
  'time_age‐graded': '2:10:39',
  'percentage_age‐graded': '94.11%',
  'gun_time': '2:10:39',
  'gun_place': '5',
  '3m': '0:14:48',
  '5k': '0:15:18',
  '4m': '0:19:43',
  '5m': '0:24:36',
  '6m': '0:29:30',
  '10k': '0:30:33',
  '7m': '0:34:22',
  '8m': '0:39:12',
  '9m': '0:44:11',
  '15k': '0:45:44',
  '10m': '0:49:06',
  '11m': '0:54:10',
  '12m': '0:59:05',
  '20k': '1:01:20',
  '13m': '1:04:15',
  'half': '1:04:51',
  '14m': '1:09:17',
  '15m': '1:14:22',
  '25k': '1:17:05',
  '16m': '1:19:17',
  '17m': '1:24:16',
  '18m': '1:29:10',
  '30k': '1:32:26',
  '19m': '1:34:13',
  '20m': '1:39:01',
  '21m': '1:44:08',
  '35k': '1:47:37',
  '

In [48]:
len(runners)

16

In [49]:
scraping_log

{0: 'DNF',
 1: 'DNF',
 2: 'success',
 3: 'success',
 4: 'success',
 5: 'success',
 6: 'success',
 7: 'success',
 8: 'success',
 9: 'success',
 10: 'success',
 11: 'success',
 12: 'success',
 13: 'DNF',
 14: 'success',
 15: 'success',
 16: 'success',
 17: 'DNF',
 18: 'success',
 19: 'success'}

## Extracting data

### Scraping function

In [50]:
def finisher_scraper(soup):
    """Collect one finisher's details from a parsed result page into a flat dict.

    soup is the parsed page; the notebook passes
    bs(browser.html, "html.parser").

    Returns a dict of strings. 'name', 'geo_subregion' and 'country' are
    always present ('country' is empty when the location has no '|'
    separator). 'gender' and 'age' are added when a value of the form
    M35 / F41 follows a <strong> tag. Every other key is derived from a label
    on the page: lower-cased, spaces replaced by underscores and, for the
    <strong> labels, '/' replaced by '_and_'.

    Raises AttributeError when the page has no
    'cursor-pointer results-link' element, and IndexError when that element
    has fewer than two children.
    """
    finisher_dict = {}

    # block 1: name and location live in the clickable header element;
    # the location is formatted as 'sub-location | country'
    thangs = soup.find(class_='cursor-pointer results-link').findChildren()
    finisher_dict['name'] = thangs[0].text
    location_parts = [part.strip() for part in thangs[1].text.split('|')]
    finisher_dict['geo_subregion'] = location_parts[0]
    # a location without the '|' separator yields an empty country, not an IndexError
    finisher_dict['country'] = location_parts[1] if len(location_parts) > 1 else ''

    # block 1: values listed as naked text right after a <strong> label.
    # The whole value must look like M35 / F41 to count as gender + age;
    # a prefix match would also swallow e.g. a team named "M2 Racing".
    age_gender = re.compile(r'[MF][1-9][0-9]{0,2}')
    for thing in soup.find_all('strong'):
        sibling = thing.next_sibling
        # skip labels followed by nothing, or by another tag instead of text
        if not sibling or not isinstance(sibling, str):
            continue
        value = sibling.strip()
        if age_gender.fullmatch(value):
            finisher_dict['gender'] = value[0]
            finisher_dict['age'] = value[1:]
            continue
        key = thing.text.strip().lower().replace(" ", "_").replace("/", "_and_")
        finisher_dict[key] = value

    # block 2 (and block 3, the splits): label/value pairs inside every div
    # whose class contains 'form-group-item'
    form_group = re.compile('.*form-group-item.*')
    for spoonful in soup.find_all("div", {"class": form_group}):
        labels = spoonful.find_all('label')
        values = spoonful.find_all('span', {'class': 'label-value ng-binding'})
        # zip pairs labels with values in order; a label with no matching
        # value is dropped instead of raising IndexError
        for label_tag, value_tag in zip(labels, values):
            # there was a '\xa0' coming in as part of the 'Place Country' label
            # use unicodedata.normalize to replace that with a space.
            label = unicodedata.normalize("NFKD", label_tag.text)
            # NOTE: the page label uses U+2010 (HYPHEN), not the ASCII '-';
            # it is written as an escape because the two look identical.
            if label.endswith("Place Age\u2010Group"):
                # if we find the place in age-group, add the age group
                # (the parenthesised part of the label) to the dictionary.
                age_groups = re.findall(r'\(([^)]*)\)', label)
                if age_groups:
                    finisher_dict["age_group"] = age_groups[0]
                label = "Place Age-Group"
            elif label.endswith("Place Country"):
                label = "Place Country"
            finisher_dict[label.strip().lower().replace(" ", "_")] = value_tag.text

    return finisher_dict

In [42]:
# test scraping function
finisher_scraper(soup)

{'name': 'Alex Lorton',
 'geo_subregion': 'New York, NY',
 'country': 'USA',
 'gender': 'M',
 'age': '35',
 'age_and_gender': 'M35',
 'bib': '19',
 'team': 'Urban Athletics',
 'official_time': '4:18:45',
 'pace_per_mile': '06:57',
 'place_overall': '4',
 'place_gender': '4',
 'age_group': '35-39',
 'place_age-group': '1',
 'place_country': '4',
 'place_age‐graded': '0',
 'time_age‐graded': '0:00:00',
 'percentage_age‐graded': '0%',
 'net_time': '4:18:45',
 'net_place': '4'}

#### Block 1: Extracting name, location, gender, age, team and bib number
Basic runner information is contained in a block color-coded to gender.

In [None]:
# start with an empty dictionary; the block 1 cells below add entries to it
dict_block_1 = dict()

In [None]:
# find name and location:
# these are both within a clickable area with class 'cursor-pointer results-link'.
# Name is the first child element there.
# Location is the second, and it is formatted as 'sub-location | country',
# where 'sub-location' may be City, ST for locations in the USA,
# and 'country' is the IAAF country code (https://www.arrs.run/IAAF_CC2.htm)

thangs = soup.find(class_='cursor-pointer results-link').findChildren()
if thangs[0].text:
    print(thangs[0].text)
    dict_block_1['name'] = thangs[0].text
if thangs[1].text:
    location_parts = [part.strip() for part in thangs[1].text.split('|')]
    print(location_parts[0])
    # same key name as finisher_scraper uses ('geo_subregion')
    dict_block_1['geo_subregion'] = location_parts[0]
    # a location without the '|' separator gives an empty country instead of an IndexError
    country = location_parts[1] if len(location_parts) > 1 else ''
    print(country)
    dict_block_1['country'] = country

dict_block_1

In [None]:
# find gender, age, bib number and team:
# each value is naked text that directly follows a <strong> label. Team is
# optional, so a label with nothing after it is simply skipped.

for strong_tag in soup.find_all('strong'):
    if not strong_tag.next_sibling:
        continue
    dict_block_1[strong_tag.text.lower()] = strong_tag.next_sibling.strip()
dict_block_1

#### Block 2: Extracting race outcome data
Block 2 contains time, pace, place, place by category (gender, age, country), age-graded place and time, and age-graded percentage. Gun time and gun place are included for some, but not all, races.

In [None]:
# start with an empty dictionary; the block 2 cell below adds entries to it
dict_block_2 = dict()

In [None]:
# class-name patterns: result groups, and markers for hidden content
regex_1 = re.compile('.*form-group-item.*')
regex_2 = re.compile('.*ng-hide.*')

# only the first four matching divs are kept here; without the [:4] slice
# block 3 would be picked up as well
block_2_soup = soup.find_all("div", {"class" : regex_1})[:4]
for spoonful in block_2_soup:
    # process a group only when it contains no 'ng-hide' div
    if not spoonful.find("div", {"class" : regex_2}):
        labels = spoonful.find_all('label')
        values = spoonful.find_all('span', {'class': 'label-value ng-binding'})
        for x, label_tag in enumerate(labels):
            # the 'Place Country' label arrives with a '\xa0' in it;
            # NFKD normalization turns that into a plain space.
            label = unicodedata.normalize("NFKD", label_tag.text)
            value_text = values[x].text
            print(label, value_text)
            dict_block_2[label] = value_text

In [None]:
block_2_soup[1]

In [None]:
block_2_soup[2]

In [None]:
dict_block_2

#### Block 3: Extracting Splits
Splits are a more detailed breakdown of the runner's time at checkpoints along the course. Typically, only the longer races have splits. The splits can vary from race to race - it all depends upon where the checkpoints are located.

In [None]:
# locate the container that holds the splits. soup.find returns None when the
# page has no such div, so a race without that section reaches the
# "no splits" branch instead of failing with IndexError on find_all(...)[0].
splits_div = soup.find('div', {'ng-show': 'resultDetails.splitResults.length > 0'})

# extract split labels and make them into a list
if splits_div is not None:
    split_labels_soup = splits_div.find_all('label', {'class': 'ng-binding'})
else:
    split_labels_soup = []

# if this list populates, continue with extracting the splits
if split_labels_soup:
    split_labels = [thing.text for thing in split_labels_soup]

    # extract split values and make them into a list
    split_times_soup = splits_div.find_all('span', {'class': 'label-value ng-binding'})
    split_times = [thing.text for thing in split_times_soup]

    # make a dictionary for the splits
    dict_block_3 = dict(zip(split_labels, split_times))

    print(dict_block_3)

# if the list didn't populate, the race did not have splits.
else:
    print("This race has no splits.")

In [None]:
# extract distance and date
soup.find_all(class_='sub-title')[0].text

In [None]:
things = soup.find_all(class_='label-value ng-binding')