In [13]:
import pandas as pd
import yaml
import time
import os
import re
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup

In [14]:
# A result row is read up to cell index 10, so it needs at least 11 cells.
_MIN_RESULT_CELLS = 11


def _extract_race_date(text):
    """Return the race start date found in ``text`` as "Month D, YYYY".

    A date range is preferred over a single weekday-prefixed date; when
    neither pattern is present the string 'Unknown' is returned.
    """
    # Match date range like "February 8 - 9, 2025" (the first day is kept)
    match = re.search(r'([A-Za-z]+)\s+(\d{1,2})\s*[-–—]\s*\d{1,2},\s*(\d{4})', text)
    if match is None:
        # Match single date like "Saturday, June 24, 2023"
        match = re.search(r'\b(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday),\s+([A-Za-z]+)\s+(\d{1,2}),\s+(\d{4})\b', text)
    if match is None:
        return 'Unknown'

    # Both patterns expose exactly three groups: month, day, year.
    month, day, year_str = match.groups()
    return f"{month} {day}, {year_str}"


def _parse_result_rows(rows, race_date, year):
    """Convert the grid's ``<tr>`` elements into a list of per-runner dicts."""
    records = []
    current_status = 'Finisher'

    for row in rows:
        row_classes = row.get('class', [])

        # Group header rows switch the status applied to the rows that follow.
        if 'jqgroup' in row_classes:
            header_text = row.get_text(strip=True).lower()
            if 'did not finish' in header_text:
                current_status = 'DNF'
            elif 'did not start' in header_text:
                current_status = 'DNS'
            elif 'finisher' in header_text:
                current_status = 'Finisher'
            continue

        # Only data rows are of interest; DNS entries are dropped entirely.
        if 'jqgrow' not in row_classes or current_status == 'DNS':
            continue

        cells = row.find_all('td')
        if len(cells) < _MIN_RESULT_CELLS:
            # Too short to hold every column read below; skip instead of
            # failing with IndexError.
            continue

        values = [cell.get_text(strip=True) for cell in cells[:_MIN_RESULT_CELLS]]
        first_name, last_name = values[2], values[3]

        # Cell 0 is not used.
        records.append({
            'Date': race_date,
            'Year': year,
            'Rank': values[1],
            'First_Name': first_name,
            'Last_Name': last_name,
            'Name': f"{first_name} {last_name}",
            'City': values[4],
            'State': values[5],
            'Age': values[6],
            'Gender_Div': values[7],
            'Div_Place': values[8],
            'Time': values[9],
            'Ultra_Rank': values[10],
            'Status': current_status
        })

    return records


def scrape_ultrasignup_results(url, year, wait_seconds=3):
    """Scrape one results page into a DataFrame of finishers and DNFs.

    The page is rendered in headless Chrome, the ``<table id="list">`` grid is
    parsed, and one row is produced per runner listed under a "Finisher" or
    "Did Not Finish" group. Runners listed under "Did Not Start" are omitted.

    Parameters
    ----------
    url : str
        Address of the results page to load.
    year : Any
        Value stored unchanged in the 'Year' column of every row.
    wait_seconds : float, default 3
        Seconds to sleep after loading so the page's JavaScript can populate
        the table.

    Returns
    -------
    pandas.DataFrame
        Columns Date, Year, Rank, First_Name, Last_Name, Name, City, State,
        Age, Gender_Div, Div_Place, Time, Ultra_Rank and Status, all scraped
        values being strings. The frame is empty (no columns) when the table
        is missing or contains no usable rows.
    """
    print(f"Scraping: {url}")

    # Set up headless browser
    options = Options()
    options.add_argument("--headless")
    driver = webdriver.Chrome(options=options)

    try:
        driver.get(url)
        time.sleep(wait_seconds)  # Wait for JavaScript to load
        page_source = driver.page_source
    finally:
        # Always shut the browser down, even if loading the page failed,
        # so no Chrome process is left running.
        driver.quit()

    soup = BeautifulSoup(page_source, 'html.parser')

    table = soup.find('table', id='list')
    if table is None:
        print(f"No results table found at {url}; returning an empty DataFrame.")
        return pd.DataFrame([])

    rows = table.find_all('tr')
    print(f"Total rows found: {len(rows)}")

    race_date = _extract_race_date(soup.get_text())
    return pd.DataFrame(_parse_result_rows(rows, race_date, year))

In [24]:
def load_race_config(race_file, folder="ultrasignup_yaml", config_root=os.path.join("..", "config")):
    """Load one race's YAML configuration file.

    Parameters
    ----------
    race_file : str
        File name of the YAML config, e.g. "javelina_jundred_100m.yaml".
    folder : str, default "ultrasignup_yaml"
        Sub-directory of ``config_root`` that holds the race files.
    config_root : str, default "../config"
        Directory containing ``folder``. A relative value is resolved against
        the current working directory.

    Returns
    -------
    The object produced by ``yaml.safe_load`` for the file's contents.

    Raises
    ------
    OSError
        If the file cannot be opened (for example, it does not exist).
    """
    path = os.path.join(config_root, folder, race_file)

    # Decode explicitly as UTF-8 rather than with the platform's default
    # encoding, so non-ASCII text in the config reads the same everywhere.
    with open(path, "r", encoding="utf-8") as f:
        return yaml.safe_load(f)

In [25]:
# Select the race to scrape. This is the only cell that needs updating:
# replace the file name below with the desired YAML config.
# The loaded config is read later through its "race_ids", "race_name",
# "race_loc" and "race_dist" keys.
race = load_race_config("javelina_jundred_100m.yaml")

In [18]:
# Scrape every configured edition of the race, tag each per-year frame with
# the race metadata, then stack the frames into a single table.
all_results = []

for year, race_id in race["race_ids"].items():
    url = f"https://ultrasignup.com/results_event.aspx?did={race_id}"
    df = scrape_ultrasignup_results(url, year).assign(
        Race=race["race_name"],
        Race_Loc=race["race_loc"],
        Race_Dist=race["race_dist"],
        Year=int(year),  # replaces the scraped 'Year' value with an integer
    )
    all_results.append(df)

race_df = pd.concat(all_results, ignore_index=True)

Scraping: https://ultrasignup.com/results_event.aspx?did=12159
Total rows found: 398
Scraping: https://ultrasignup.com/results_event.aspx?did=15839
Total rows found: 408
Scraping: https://ultrasignup.com/results_event.aspx?did=19185
Total rows found: 415
Scraping: https://ultrasignup.com/results_event.aspx?did=27560
Total rows found: 569
Scraping: https://ultrasignup.com/results_event.aspx?did=31274
Total rows found: 519
Scraping: https://ultrasignup.com/results_event.aspx?did=35377
Total rows found: 579
Scraping: https://ultrasignup.com/results_event.aspx?did=43813
Total rows found: 540
Scraping: https://ultrasignup.com/results_event.aspx?did=53326
Total rows found: 566
Scraping: https://ultrasignup.com/results_event.aspx?did=63015
Total rows found: 728
Scraping: https://ultrasignup.com/results_event.aspx?did=74613
Total rows found: 258
Scraping: https://ultrasignup.com/results_event.aspx?did=81876
Total rows found: 591
Scraping: https://ultrasignup.com/results_event.aspx?did=90708
To

In [23]:
# Sanity check: compare the per-year DNF/Finisher counts below with the totals published on UltraSignup.
# status_counts = race_df.groupby(['Year', 'Status']).size().unstack(fill_value=0)
# status_counts

Status,DNF,Finisher
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
2011,201,174
2012,225,159
2013,242,156
2014,218,290
2015,216,281
2016,271,285
2017,185,348
2018,195,368
2019,242,425
2020,86,160


In [None]:
# race_name = race_df
# race_name.to_csv('../data/file_name.csv', index = False, encoding = 'utf-8')