# Setup

### Import Libraries

In [1]:
from splinter import Browser
from splinter.exceptions import ElementDoesNotExist
from bs4 import BeautifulSoup as bs
import re
import requests
import time
import pandas as pd
import unicodedata

### Set URLs

In [2]:
# Every result page lives under this prefix; the race code and a bib number are appended to it.
base_url = "https://results.nyrr.org/event/"

# Race to scrape. Codes tried so far, and what their result pages contain:
#   "M2019"   - 2019 NYC Marathon. includes gun time and splits
#   "20WH5K"  - 2020 Washington Heights 5K. 4942 finishers. gun time, no splits
#   "a70414"  - 2007 Brooklyn Half. 4853 finishers. no gun time, no splits
#   "19nyc60" - 2019 NYC 60K. 375 finishers. gun time, no splits
race_code = "M2019"

### Set File Locations

In [3]:
# Set path for chromedriver.
# Braces around a bare string build a one-element set, which cannot be unpacked
# as keyword arguments. Splinter's documented pattern for pointing at a specific
# driver binary is a keyword dict, used as Browser('chrome', **executable_path).
# NOTE(review): recent splinter/selenium releases configure the driver through a
# Service object instead - confirm against the installed version before passing this on.
executable_path = {'executable_path': 'chromedriver.exe'}

# Set path for output file

# Scraping

In [4]:
# Open browser
browser = Browser('chrome', headless=False)

In [5]:
# Open website
# A result page's address is the base url, the race code, and a bib number.
#
# Bib numbers worth trying for M2019:
#      6 - US runner, with team affiliation
#   9197 - US runner, no team
#   5604 - Non-US runner, no team affiliation
#    106 - Non-US runner, with team affiliation
#
# Bib numbers worth trying for 19nyc60:
#     12 - valid
#     13 - invalid
#
# Special test case: Anonymous runner (no name, no location, no age)
#   url = "https://results.nyrr.org/event/880625/result/12566474"
bib_number = 6

url = f"{base_url}{race_code}/result/{bib_number}"
browser.visit(url)
# brief pause after the visit before the page is inspected
time.sleep(1)

In [6]:
# Detect invalid bib number
# The site answers an invalid bib number by redirecting to the results home page,
# so ending up on any address other than the one requested means the bib was rejected.
if browser.url != url:
    print('Redirected.')
else:
    print(f"Bib number {bib_number} is valid.")

Bib number 6 is valid.


In [7]:
# Scrape page into soup
html = browser.html
soup = bs(html, "html.parser")

In [8]:
len(soup)

1

## Extracting data

#### Block 1: Extracting name, location, gender, age, team and bib number
Basic runner information is contained in a block color-coded to gender.

In [9]:
# create a dictionary for block 1 items
dict_block_1 = {}

In [10]:
# find name and location:
# these are both within a clickable area with class 'cursor-pointer results-link'.
# Name is the first child element there.
# Location is the second, and it is formatted as 'sub-location | country',
# where 'sub-location' may be City, ST for locations in the USA,
# and 'country' is the IAAF country code (https://www.arrs.run/IAAF_CC2.htm)

# The clickable area, or either of its children, can be absent (e.g. the anonymous
# runner test case), so fall back to an empty list rather than failing on None.
results_link = soup.find(class_='cursor-pointer results-link')
thangs = results_link.findChildren() if results_link is not None else []

if len(thangs) > 0 and thangs[0].text:
    print(thangs[0].text)
    dict_block_1['name'] = thangs[0].text
if len(thangs) > 1 and thangs[1].text:
    # split once and reuse the pieces; the country is only present when a '|' is
    location_parts = thangs[1].text.split('|')
    print(location_parts[0].strip())
    dict_block_1['geo_subdivision'] = location_parts[0].strip()
    if len(location_parts) > 1:
        print(location_parts[1].strip())
        dict_block_1['country'] = location_parts[1].strip()

dict_block_1

Jared Ward
Mapleton, UT
USA


{'name': 'Jared Ward', 'geo_subdivision': 'Mapleton, UT', 'country': 'USA'}

In [11]:
# find gender, age, bib number and team:
# these are listed as naked text after a strong tag. Not all runners have a team
# affiliation, so make sure we are OK if there is no text after the strong tag.

for thing in soup.find_all('strong'):
    following = thing.next_sibling
    # Only plain text can be a value: bs4 text nodes are str subclasses, whereas
    # a following element (Tag) has no usable .strip() and would raise.
    if following and isinstance(following, str):
        # strip the label as well, otherwise keys such as 'team ' keep a trailing space
        dict_block_1[thing.text.strip().lower()] = following.strip()
dict_block_1

{'name': 'Jared Ward',
 'geo_subdivision': 'Mapleton, UT',
 'country': 'USA',
 'age/gender': 'M31',
 'bib': '6',
 'team ': 'Saucony'}

#### Block 2: Extracting race outcome data
Block 2 contains time, pace, place, place by category (gender, age, country), age-graded place and time, and age-graded percentage. Gun time and gun place are included for some, but not all, races.

In [12]:
# create a dictionary for block 2 items
dict_block_2 = {}

In [13]:
regex_1 = re.compile('.*form-group-item.*')
regex_2 = re.compile('.*ng-hide.*')

# The first four 'form-group-item' rows make up block 2.
# Dropping the [:4] slice would pick up block 3 (the splits) as well.
block_2_soup = soup.find_all("div", {"class" : regex_1})[:4]

for spoonful in block_2_soup:
    # skip any row that contains a div carrying an 'ng-hide' class
    if spoonful.find("div", {"class" : regex_2}):
        continue
    labels = spoonful.find_all('label')
    values = spoonful.find_all('span', {'class': 'label-value ng-binding'})
    # Pair labels with values via zip so a row holding more labels than value
    # spans is truncated instead of raising IndexError part-way through.
    for label_tag, value_tag in zip(labels, values):
        # there was a '\xa0' coming in as part of the 'Place Country' label;
        # unicodedata.normalize replaces that with a regular space.
        label = unicodedata.normalize("NFKD", label_tag.text)
        print(label, value_tag.text)
        dict_block_2[label] = value_tag.text

Official Time 2:10:45
Pace per Mile 05:00
Place Overall 6
Place Gender 6
(30-34) Place Age‐Group 1
(USA) Place Country 1
Place Age‐Graded 8
Time Age‐Graded 2:10:45
Percentage Age‐Graded 94.04%
Gun Time 2:10:45
Gun Place 6


In [14]:
block_2_soup[1]

<div class="shifting-height row bottom-border form-group-item">
<div class="col-xs-4 col-sm-4 col-md-4">
<label>Place Gender</label>
<span class="label-value ng-binding">6</span>
<span class="label-value-of ng-binding">of 30,801</span>
</div>
<div class="col-xs-4 col-sm-4 col-md-4 no-right-padding-for-mobile" ng-hide="resultDetails.placeAgeGroup == 0">
<label><span class="less-font-for-mobile ng-binding">(30-34)</span> Place Age‑Group</label>
<span class="label-value ng-binding">1</span>
<span class="label-value-of ng-binding">of 4,075</span>
</div>
<div class="col-xs-4 col-sm-4 col-md-4" ng-hide="resultDetails.placeAgeGroup == 0">
<label><span class="less-font-for-mobile ng-binding" ng-show="resultDetails.iaaf">(USA)</span> Place Country</label>
<span class="label-value ng-binding">1</span>
<span class="label-value-of ng-binding">of 28,467</span>
</div>
</div>

In [15]:
block_2_soup[2]

<div class="shifting-height row bottom-border form-group-item" ng-hide="resultDetails.placeAgeGrade == 0 &amp;&amp; resultDetails.timeAgeGrade == '0:00:00' &amp;&amp; resultDetails.percentAgeGrade == 0">
<div class="col-xs-4 col-sm-4 col-md-4">
<label>Place Age‑Graded</label>
<span class="label-value ng-binding">8</span>
<span class="label-value-of ng-binding">of 30,801</span>
</div>
<div class="col-xs-4 col-sm-4 col-md-4">
<label>Time Age‑Graded</label>
<span class="label-value ng-binding">2:10:45</span>
</div>
<div class="col-xs-4 col-sm-4 col-md-4">
<label>Percentage Age‑Graded</label>
<span class="label-value ng-binding">94.04%</span>
</div>
</div>

In [16]:
dict_block_2

{'Official Time': '2:10:45',
 'Pace per Mile': '05:00',
 'Place Overall': '6',
 'Place Gender': '6',
 '(30-34) Place Age‐Group': '1',
 '(USA) Place Country': '1',
 'Place Age‐Graded': '8',
 'Time Age‐Graded': '2:10:45',
 'Percentage Age‐Graded': '94.04%',
 'Gun Time': '2:10:45',
 'Gun Place': '6'}

#### Block 3: Extracting Splits
Splits are a more detailed breakdown of the runner's time at checkpoints along the course. Typically, only the longer races have splits. The splits can vary from race to race - it all depends upon where the checkpoints are located.

In [17]:
# Locate the container that holds the splits. find_all returns an empty list when
# the page has no such container, so check before indexing into it.
split_containers = soup.find_all('div', {'ng-show': 'resultDetails.splitResults.length > 0'})
split_container = split_containers[0] if split_containers else None

# extract split labels and make them into a list
split_labels_soup = split_container.find_all('label', {'class': 'ng-binding'}) if split_container is not None else []

# start from an empty dictionary so dict_block_3 exists even when the race has no splits
dict_block_3 = {}

# if the label list populates, continue with extracting the splits
if split_labels_soup:
    split_labels = [thing.text for thing in split_labels_soup]

    # extract split values and make them into a list
    split_times_soup = split_container.find_all('span', {'class': 'label-value ng-binding'})
    split_times = [thing.text for thing in split_times_soup]

    # make a dictionary for the splits
    dict_block_3 = dict(zip(split_labels, split_times))

    print(dict_block_3)

# if the list didn't populate, the race did not have splits.
else:
    print("This race has no splits.")

{'3M': '0:14:46', '5K': '0:15:19', '4M': '0:19:43', '5M': '0:24:37', '6M': '0:29:30', '10K': '0:30:34', '7M': '0:34:26', '8M': '0:39:15', '9M': '0:44:14', '15K': '0:45:46', '10M': '0:49:10', '11M': '0:54:17', '12M': '0:59:08', '20K': '1:01:20', '13M': '1:04:15', 'HALF': '1:04:50', '14M': '1:09:17', '15M': '1:14:23', '25K': '1:17:10', '16M': '1:19:22', '17M': '1:24:17', '18M': '1:29:10', '30K': '1:32:27', '19M': '1:34:14', '20M': '1:39:13', '21M': '1:44:15', '35K': '1:47:57', '22M': '1:49:12', '23M': '1:54:16', '24M': '1:59:35', '40K': '2:03:47', '25M': '2:04:34', '26M': '2:09:37', 'MAR': '2:10:45'}


In [18]:
# extract distance and date
soup.find_all(class_='sub-title')[0].text

'Marathon | Nov 3, 2019'

In [19]:
things = soup.find_all(class_='label-value ng-binding')

### Scraping function

In [20]:
def finisher_scraper(soup):
    """Collect one finisher's details from a parsed result page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of a single result page.

    Returns
    -------
    dict
        Always contains 'name', 'geo_subregion' and 'country'; each is None
        when the page does not provide it. Every '<strong>' label followed by
        text adds one entry keyed by the lower-cased label (spaces become '_',
        '/' becomes '_and_'), except that a value shaped like 'M31' or 'F29'
        is split into 'gender' and 'age'. Every label/value pair inside a
        'form-group-item' row adds one entry keyed by the lower-cased label,
        which covers the result summary and the splits. 'age_group' is added
        when the age-group place label carries a parenthesised range.

    Notes
    -----
    Labels are NFKD-normalised. That turns '\\xa0' into a space and a
    non-breaking hyphen into U+2010, so keys such as 'place_age\\u2010graded'
    contain U+2010 rather than an ASCII '-'. This is kept unchanged so that
    existing column names stay the same.
    """
    finisher_dict = {}

    # block 1: name and location sit in the clickable header area
    header = soup.find(class_='cursor-pointer results-link')
    thangs = header.findChildren() if header is not None else []
    finisher_dict['name'] = thangs[0].text if len(thangs) > 0 else None

    # location is formatted as 'sub-location | country'; either part may be absent
    location_parts = thangs[1].text.split('|') if len(thangs) > 1 else []
    finisher_dict['geo_subregion'] = location_parts[0].strip() if location_parts else None
    finisher_dict['country'] = location_parts[1].strip() if len(location_parts) > 1 else None

    # Compiled once, and applied with fullmatch so that only a value consisting
    # solely of gender letter + age is split. A prefix match would also fire on
    # a team name such as 'F45 Training' and overwrite the real gender and age.
    age_gender_pattern = re.compile('[MF][1-9][0-9]{0,2}')
    for thing in soup.find_all('strong'):
        sibling = thing.next_sibling
        # only text nodes (str subclasses in bs4) can be stripped and stored
        if not sibling or not isinstance(sibling, str):
            continue
        value = sibling.strip()
        if age_gender_pattern.fullmatch(value):
            finisher_dict['gender'] = value[0]
            finisher_dict['age'] = value[1:]
            continue
        key = thing.text.strip().lower().replace(" ", "_").replace("/", "_and_")
        finisher_dict[key] = value

    # block 2 (and block 3, the splits): every 'form-group-item' row
    regex = re.compile('.*form-group-item.*')
    for spoonful in soup.find_all("div", {"class" : regex}):
        labels = spoonful.find_all('label')
        values = spoonful.find_all('span', {'class': 'label-value ng-binding'})
        # zip stops at the shorter list, so a label without a value span is
        # dropped instead of raising IndexError
        for label_tag, value_tag in zip(labels, values):
            # there was a '\xa0' coming in as part of the 'Place Country' label;
            # unicodedata.normalize replaces that with a space.
            label = unicodedata.normalize("NFKD", label_tag.text)
            # comparison copy only: accept either U+2010 or an ASCII hyphen
            comparable = label.replace("\u2010", "-").strip()
            if comparable.endswith("Place Age-Group"):
                # if we find the place in age-group, add the age group to the dictionary.
                age_group = re.search(r'\(([^)]*)\)', label)
                if age_group:
                    finisher_dict["age_group"] = age_group.group(1)
                label = "Place Age-Group"
            elif comparable.endswith("Place Country"):
                label = "Place Country"
            finisher_dict[label.strip().lower().replace(" ", "_")] = value_tag.text

    return finisher_dict

In [21]:
# test scraping function
finisher_scraper(soup)

{'name': 'Jared Ward',
 'geo_subregion': 'Mapleton, UT',
 'country': 'USA',
 'gender': 'M',
 'age': '31',
 'bib': '6',
 'team': 'Saucony',
 'official_time': '2:10:45',
 'pace_per_mile': '05:00',
 'place_overall': '6',
 'place_gender': '6',
 'age_group': '30-34',
 'place_age-group': '1',
 'place_country': '1',
 'place_age‐graded': '8',
 'time_age‐graded': '2:10:45',
 'percentage_age‐graded': '94.04%',
 'gun_time': '2:10:45',
 'gun_place': '6',
 '3m': '0:14:46',
 '5k': '0:15:19',
 '4m': '0:19:43',
 '5m': '0:24:37',
 '6m': '0:29:30',
 '10k': '0:30:34',
 '7m': '0:34:26',
 '8m': '0:39:15',
 '9m': '0:44:14',
 '15k': '0:45:46',
 '10m': '0:49:10',
 '11m': '0:54:17',
 '12m': '0:59:08',
 '20k': '1:01:20',
 '13m': '1:04:15',
 'half': '1:04:50',
 '14m': '1:09:17',
 '15m': '1:14:23',
 '25k': '1:17:10',
 '16m': '1:19:22',
 '17m': '1:24:17',
 '18m': '1:29:10',
 '30k': '1:32:27',
 '19m': '1:34:14',
 '20m': '1:39:13',
 '21m': '1:44:15',
 '35k': '1:47:57',
 '22m': '1:49:12',
 '23m': '1:54:16',
 '24m': '

### Demonstration Loop

In [22]:
### Demonstration Loop

# Seconds to wait after each page load. The site needs a short period to
# redirect an invalid bib number, and we don't need to beat on the website too hard.
PAGE_LOAD_DELAY_SECONDS = 3

# Create lists and dictionaries for results
runners = []
scraping_log = {}
# bib number -> repr of the exception, for pages that loaded but failed to parse
scraping_errors = {}

# Open up a browser
browser = Browser('chrome', headless=False)

# try/finally so the browser window is closed even if a page visit fails
# or the loop is interrupted.
try:
    for bib_number in range(0, 20):
        url = base_url + race_code + "/result/" + str(bib_number)
        browser.visit(url)
        time.sleep(PAGE_LOAD_DELAY_SECONDS)
        # staying on the requested url means the bib number was accepted
        if browser.url == url:
            try:
                soup = bs(browser.html, "html.parser")
                runners.append(finisher_scraper(soup))
                scraping_log[bib_number] = "success"
            except Exception as error:
                # Catch Exception rather than everything so Ctrl-C still stops
                # the run, and keep the cause so a failure can be diagnosed.
                scraping_log[bib_number] = "scraping error"
                scraping_errors[bib_number] = repr(error)
        else:
            scraping_log[bib_number] = "DNF"
finally:
    browser.quit()
print('Loop complete')

Loop complete


In [23]:
len(runners)

16

In [24]:
runners

[{'name': 'Shura Kitata',
  'geo_subregion': 'Addis Ababa',
  'country': 'ETH',
  'gender': 'M',
  'age': '23',
  'bib': '2',
  'team': 'NIKE',
  'official_time': '2:10:39',
  'pace_per_mile': '04:59',
  'place_overall': '5',
  'place_gender': '5',
  'age_group': '20-24',
  'place_age-group': '1',
  'place_country': '3',
  'place_age‐graded': '7',
  'time_age‐graded': '2:10:39',
  'percentage_age‐graded': '94.11%',
  'gun_time': '2:10:39',
  'gun_place': '5',
  '3m': '0:14:48',
  '5k': '0:15:18',
  '4m': '0:19:43',
  '5m': '0:24:36',
  '6m': '0:29:30',
  '10k': '0:30:33',
  '7m': '0:34:22',
  '8m': '0:39:12',
  '9m': '0:44:11',
  '15k': '0:45:44',
  '10m': '0:49:06',
  '11m': '0:54:10',
  '12m': '0:59:05',
  '20k': '1:01:20',
  '13m': '1:04:15',
  'half': '1:04:51',
  '14m': '1:09:17',
  '15m': '1:14:22',
  '25k': '1:17:05',
  '16m': '1:19:17',
  '17m': '1:24:16',
  '18m': '1:29:10',
  '30k': '1:32:26',
  '19m': '1:34:13',
  '20m': '1:39:01',
  '21m': '1:44:08',
  '35k': '1:47:37',
  '

In [25]:
scraping_log

{0: 'DNF',
 1: 'DNF',
 2: 'success',
 3: 'success',
 4: 'success',
 5: 'success',
 6: 'success',
 7: 'success',
 8: 'success',
 9: 'success',
 10: 'success',
 11: 'success',
 12: 'success',
 13: 'DNF',
 14: 'success',
 15: 'success',
 16: 'success',
 17: 'DNF',
 18: 'success',
 19: 'success'}

In [26]:
df = pd.DataFrame(runners)

In [27]:
df.info(verbose = True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16 entries, 0 to 15
Data columns (total 53 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   name                   16 non-null     object
 1   geo_subregion          16 non-null     object
 2   country                16 non-null     object
 3   gender                 16 non-null     object
 4   age                    16 non-null     object
 5   bib                    16 non-null     object
 6   team                   15 non-null     object
 7   official_time          16 non-null     object
 8   pace_per_mile          16 non-null     object
 9   place_overall          16 non-null     object
 10  place_gender           16 non-null     object
 11  age_group              16 non-null     object
 12  place_age-group        16 non-null     object
 13  place_country          16 non-null     object
 14  place_age‐graded       16 non-null     object
 15  time_age‐graded        16

In [28]:
df

Unnamed: 0,name,geo_subregion,country,gender,age,bib,team,official_time,pace_per_mile,place_overall,...,20m,21m,35k,22m,23m,24m,40k,25m,26m,mar
0,Shura Kitata,Addis Ababa,ETH,M,23,2,NIKE,2:10:39,04:59,5,...,1:39:01,1:44:08,1:47:37,1:48:53,1:53:58,1:59:21,2:03:37,2:04:24,2:09:33,2:10:39
1,Geoffrey Kamworor,Kapchorwa District,KEN,M,26,3,NIKE,2:08:13,04:54,1,...,1:38:59,1:44:07,1:47:34,1:48:44,1:53:20,1:57:59,2:01:48,2:02:30,2:07:11,2:08:13
2,Tamirat Tola,Addis Ababa,ETH,M,28,4,adidas,2:09:20,04:56,4,...,1:39:00,1:44:07,1:47:35,1:48:43,1:53:21,1:58:25,2:02:29,2:03:15,2:08:13,2:09:20
3,Albert Korir,Kapkitony,KEN,M,25,5,adidas,2:08:36,04:55,2,...,1:39:00,1:44:08,1:47:35,1:48:44,1:53:20,1:58:06,2:02:00,2:02:44,2:07:32,2:08:36
4,Jared Ward,"Mapleton, UT",USA,M,31,6,Saucony,2:10:45,05:00,6,...,1:39:13,1:44:15,1:47:57,1:49:12,1:54:16,1:59:35,2:03:47,2:04:34,2:09:37,2:10:45
5,Michel Butter,Castricum,NLD,M,33,7,New Balance,2:25:06,05:33,37,...,1:43:56,1:50:22,1:55:04,1:56:42,2:03:26,2:10:32,2:16:06,2:17:07,2:23:41,2:25:06
6,Brett Robinson,Melbourne,AUS,M,28,8,NIKE,2:17:50,05:16,23,...,1:39:52,1:45:24,1:49:30,1:50:56,1:56:46,2:02:54,2:08:26,2:09:27,2:16:21,2:17:50
7,Jack Rayner,Melbourne,AUS,M,23,9,NIKE,2:16:58,05:14,22,...,1:40:59,1:46:40,1:50:55,1:52:22,1:58:10,2:04:07,2:08:58,2:09:51,2:15:39,2:16:58
8,Stephen Sambu,"Tucson, AZ",USA,M,31,10,NIKE,2:11:11,05:01,7,...,1:39:12,1:44:15,1:47:58,1:49:13,1:54:16,1:59:39,2:03:59,2:04:46,2:09:58,2:11:11
9,Yoshiki Takenouchi,Kyoto,JPN,M,27,11,NTT West,2:11:18,05:01,8,...,1:39:13,1:44:15,1:48:00,1:49:18,1:54:27,1:59:50,2:04:09,2:04:56,2:10:11,2:11:18
