In [123]:
import pandas as pd
import yaml
import time
import os
import re
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from datetime import datetime

In [140]:
def _extract_race_date(page_text):
    """Locate the race date in the page text and return it as mm/dd/yyyy.

    A date range ("February 8 - 9, 2025") is tried first, using the first day
    of the range; otherwise a weekday-prefixed single date
    ("Saturday, June 24, 2023") is used. Returns 'Unknown' (after printing a
    notice) when no date is found or the month name cannot be parsed.
    """
    # Match date range like "February 8 - 9, 2025"
    match = re.search(r'([A-Za-z]+)\s+(\d{1,2})\s*[-–—]\s*\d{1,2},\s*(\d{4})', page_text)
    if match is None:
        # Match single date like "Saturday, June 24, 2023"
        match = re.search(r'\b(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday),\s+([A-Za-z]+)\s+(\d{1,2}),\s+(\d{4})\b', page_text)

    if match is None:
        raw_date = 'Unknown'
    else:
        # Both patterns capture (month, day, year) in that order.
        month, day, year_str = match.groups()
        raw_date = f"{month} {day}, {year_str}"

    # Accept both full ("%B") and abbreviated ("%b") month names.
    for date_format in ("%B %d, %Y", "%b %d, %Y"):
        try:
            return datetime.strptime(raw_date, date_format).strftime("%m/%d/%Y")
        except ValueError:
            continue

    print(f"Date parse failed for: {raw_date}")
    return 'Unknown'


def scrape_ultrasignup_results(url, year, race_id, series_id, wait_seconds=3):
    """Scrape one results page into a DataFrame of finisher and DNF rows.

    Parameters
    ----------
    url : str
        Results page to load in a headless Chrome session.
    year, race_id, series_id
        Values copied unchanged into the 'Year', 'Race_ID' and 'Series_ID'
        columns of every row.
    wait_seconds : float, default 3
        Seconds to sleep after loading the page so JavaScript can populate
        the results table.

    Returns
    -------
    pandas.DataFrame
        One row per result with columns Series_ID, Race_ID, Race_Date, Year,
        Rank, Status, Name, City, State, Gender, Gender_Rank, Age and Time.
        Rows under a "did not start" group header are skipped. The frame has
        no columns when no result rows are found.

    Raises
    ------
    ValueError
        If the page contains no <table id="list"> element.
    """
    # Set up headless browser
    options = Options()
    options.add_argument("--headless")
    driver = webdriver.Chrome(options=options)

    try:
        driver.get(url)
        time.sleep(wait_seconds)  # Wait for JavaScript to load
        page_source = driver.page_source
    finally:
        # Always release the browser, even when loading the page fails.
        driver.quit()

    soup = BeautifulSoup(page_source, 'html.parser')

    table = soup.find('table', id='list')
    if table is None:
        raise ValueError(f"Results table (id='list') not found at {url}")
    rows = table.find_all('tr')

    race_date = _extract_race_date(soup.get_text())

    print(f"Scraping: {url}")
    print(f"Total rows found: {len(rows)}")

    # The highest cell index read below is 9, so a usable row needs 10 cells.
    min_cells = 10

    data = []
    current_status = 'Finisher'

    for row in rows:
        row_classes = row.get('class', [])

        # Group header rows switch the status applied to the rows that follow.
        if 'jqgroup' in row_classes:
            header_text = row.get_text(strip=True).lower()
            if 'did not finish' in header_text:
                current_status = 'DNF'
            elif 'did not start' in header_text:
                current_status = 'DNS'
            elif 'finisher' in header_text:
                current_status = 'Finisher'
            continue

        if 'jqgrow' not in row_classes:
            continue

        cells = row.find_all('td')
        if len(cells) < min_cells or current_status == 'DNS':
            continue

        first_name = cells[2].get_text(strip=True)
        last_name = cells[3].get_text(strip=True)

        data.append({
            'Series_ID': series_id,
            'Race_ID': race_id,
            'Race_Date': race_date,
            'Year': year,
            'Rank': cells[1].get_text(strip=True),
            'Status': current_status,
            # First and last name are combined into one lower-cased field.
            'Name': f"{first_name} {last_name}".lower(),
            'City': cells[4].get_text(strip=True),
            'State': cells[5].get_text(strip=True),
            'Gender': cells[7].get_text(strip=True),
            'Gender_Rank': cells[8].get_text(strip=True),
            'Age': cells[6].get_text(strip=True),
            'Time': cells[9].get_text(strip=True)
        })

    return pd.DataFrame(data)

In [141]:
def load_race_config(race_file, folder="ultrasignup_yaml"):
    """Load a race configuration YAML file and return its parsed contents.

    Parameters
    ----------
    race_file : str
        File name of the race YAML (e.g. "gorge_waterfalls_100k.yaml").
    folder : str, default "ultrasignup_yaml"
        Sub-folder of ``../../config`` that holds the file.

    Returns
    -------
    The object produced by ``yaml.safe_load`` for the file.

    Notes
    -----
    The path is relative, so it is resolved against the current working
    directory.
    """
    path = os.path.join("../..", "config", folder, race_file)

    # Pin the encoding so parsing does not depend on the platform default.
    with open(path, "r", encoding="utf-8") as f:
        return yaml.safe_load(f)

#### Update the `load_race_config` call below with the file name of the desired race YAML.

In [142]:
# This is the only cell that needs updating. Replace the argument with the file
# name of the desired race YAML; load_race_config looks it up under
# ../../config/ultrasignup_yaml/ by default.
race = load_race_config("gorge_waterfalls_100k.yaml")

In [143]:
# Scrape each configured year, tag its rows with the race-level metadata,
# and stack the per-year frames into one table.
all_results = []

for year, race_id in race["race_ids"].items():
    results_url = f"https://ultrasignup.com/results_event.aspx?did={race_id}"
    year_results = scrape_ultrasignup_results(results_url, year, race_id, race["series_id"])
    all_results.append(
        year_results.assign(
            Race_Name=race["race_name"],
            Race_Loc=race["race_loc"],
            Race_Dist=race["race_dist"],
            Year=int(year),
            Race_ID=race_id,
            Series_ID=race["series_id"],
        )
    )

race_df = pd.concat(all_results, ignore_index=True)

Scraping: https://ultrasignup.com/results_event.aspx?did=91882
Total rows found: 221
Scraping: https://ultrasignup.com/results_event.aspx?did=91883
Total rows found: 304


In [145]:
# Preview the first rows of the combined, still-unprocessed results.
race_df.head()

Unnamed: 0,Series_ID,Race_ID,Race_Date,Year,Rank,Status,Name,City,State,Gender,Gender_Rank,Age,Time,Race_Name,Race_Loc,Race_Dist
0,74589,91882,04/09/2016,2016,1,Finisher,rui ueda,Tokyo,,M,1,22,9:09:37,Gorge Waterfalls 100K,"Cascade Locks, OR",100K
1,74589,91882,04/09/2016,2016,2,Finisher,chris mocko,San Francisco,CA,M,2,30,9:10:04,Gorge Waterfalls 100K,"Cascade Locks, OR",100K
2,74589,91882,04/09/2016,2016,3,Finisher,jeremy humphrey,McCall,ID,M,3,36,9:34:11,Gorge Waterfalls 100K,"Cascade Locks, OR",100K
3,74589,91882,04/09/2016,2016,4,Finisher,ryan ghelfi,Ashland,OR,M,4,27,9:36:08,Gorge Waterfalls 100K,"Cascade Locks, OR",100K
4,74589,91882,04/09/2016,2016,5,Finisher,rob russell,Redmond,OR,M,5,38,9:38:11,Gorge Waterfalls 100K,"Cascade Locks, OR",100K


In [146]:
# Sanity check: rows per Status for each Year, to compare by hand against
# the totals shown on Ultrasignup.
status_counts = (
    race_df
    .groupby(['Year', 'Status'])
    .size()
    .unstack(fill_value=0)
)
status_counts

Status,DNF,Finisher
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
2016,0,219
2017,58,206


#### The following cells define an "Age_Category" column, which replaces "Age" and aligns with UTMB results and officially recognized age groups.

In [147]:
def map_utmb_age_category(age):
    """Return the age-category label for a numeric age.

    Parameters
    ----------
    age : int or float
        Runner age. Missing values (None / NaN) are accepted.

    Returns
    -------
    str
        'U18' below 18, 'U20' below 20, then '20-34', five-year bands from
        '35-39' through '75-79', and '80' for anything above 79. A missing
        age returns '' rather than a category.
    """
    # NaN fails every comparison below, so without this guard a missing age
    # would fall through to the final branch and be labelled '80'.
    if pd.isna(age):
        return ''

    if age < 18:
        return 'U18'
    if age < 20:
        return 'U20'

    # (inclusive upper bound, label), checked in ascending order
    upper_bounds = (
        (34, '20-34'),
        (39, '35-39'),
        (44, '40-44'),
        (49, '45-49'),
        (54, '50-54'),
        (59, '55-59'),
        (64, '60-64'),
        (69, '65-69'),
        (74, '70-74'),
        (79, '75-79'),
    )
    for upper, label in upper_bounds:
        if age <= upper:
            return label

    return '80'


In [148]:
# Convert Age to numbers (unparseable values become NaN), swap each value for
# its age-category label, and rename the column where it sits so the column
# order is unchanged.
race_df = (
    race_df
    .assign(Age=pd.to_numeric(race_df['Age'], errors='coerce').apply(map_utmb_age_category))
    .rename(columns={'Age': 'Age_Category'})
)

In [149]:
# Preview the results after Age has been replaced by Age_Category.
race_df.head()

Unnamed: 0,Series_ID,Race_ID,Race_Date,Year,Rank,Status,Name,City,State,Gender,Gender_Rank,Age_Category,Time,Race_Name,Race_Loc,Race_Dist
0,74589,91882,04/09/2016,2016,1,Finisher,rui ueda,Tokyo,,M,1,20-34,9:09:37,Gorge Waterfalls 100K,"Cascade Locks, OR",100K
1,74589,91882,04/09/2016,2016,2,Finisher,chris mocko,San Francisco,CA,M,2,20-34,9:10:04,Gorge Waterfalls 100K,"Cascade Locks, OR",100K
2,74589,91882,04/09/2016,2016,3,Finisher,jeremy humphrey,McCall,ID,M,3,35-39,9:34:11,Gorge Waterfalls 100K,"Cascade Locks, OR",100K
3,74589,91882,04/09/2016,2016,4,Finisher,ryan ghelfi,Ashland,OR,M,4,20-34,9:36:08,Gorge Waterfalls 100K,"Cascade Locks, OR",100K
4,74589,91882,04/09/2016,2016,5,Finisher,rob russell,Redmond,OR,M,5,35-39,9:38:11,Gorge Waterfalls 100K,"Cascade Locks, OR",100K


#### Replace City, State with Nationality to align with information available from UTMB races. In the future, compiling a dataset with city, state, and country would be interesting for mapping purposes, but is likely too time-consuming for this project.

In [150]:
# Normalize city names: trim surrounding whitespace and title-case them.
# NOTE(review): 'City' is dropped in the nationality-mapping cell below, so this
# normalization does not reach the exported CSV.
race_df['City'] = race_df['City'].str.strip().str.title()

In [151]:
# Lookup from the code scraped into the "State" column to a three-letter
# country code. Built from grouped code lists: every code in a group maps to
# that group's country, and the stand-alone country codes map to themselves.

# United States
_USA_CODES = [
    'CO', 'WY', 'WA', 'CA', 'UT', 'NY', 'AK', 'AZ',
    'MA', 'MI', 'TX', 'PA', 'MN', 'MT', 'WI', 'DC',
    'MO', 'OR', 'NE', 'FL', 'NM', 'NC', 'GA', 'OH',
    'IL', 'IN', 'AR', 'IA', 'VA', 'AL', 'OK', 'KS',
    'NJ', 'CT', 'LA', 'TN', 'WV', 'VT', 'NV', 'SD',
    'ID', 'MD', 'SC', 'MS', 'NH', 'ND', 'KY', 'ME',
    'DE', 'RI', 'HI', 'PRI', 'VIR',
]

# Canada
_CANADA_CODES = ['QC', 'AB', 'BC', 'MB', 'ON', 'SK', 'NB', 'NS', 'YT']

# Mexico
_MEXICO_CODES = ['MEX', 'DIF', 'JAL', 'PUE', 'AGU', 'DUR']

# Country codes that map to themselves
_SELF_MAPPED_CODES = [
    'CRI', 'AUS', 'CHN', 'EST', 'JPN',  # Costa Rica, Australia, China, Estonia, Japan
    'BRA', 'ARG', 'SVK', 'NLD', 'BGR',  # Brazil, Argentina, Slovakia, Netherlands, Bulgaria
    'GBR', 'DEU', 'CHL', 'NOR', 'FRA',  # United Kingdom, Germany, Chile, Norway, France
    'ESP', 'BMU', 'ITA', 'CZE', 'POL',  # Spain, Bermuda, Italy, Czech Republic, Poland
    'CHE', 'ABW', 'FIN', 'SWE', 'PAN',  # Switzerland, Aruba, Finland, Sweden, Panama
    'DNK', 'ECU', 'IRL', 'AUT', 'IRN',  # Denmark, Ecuador, Ireland, Austria, Iran
    'SGP', 'DOM', 'NZL', 'TWN', 'ZAF',  # Singapore, Dominican Republic, New Zealand, Taiwan, South Africa
    'PER', 'IND', 'GRC', 'ZWE', 'TZA',  # Peru, India, Greece, Zimbabwe, Tanzania
    'GTM', 'HKG', 'THA', 'SAU', 'PHL',  # Guatemala, Hong Kong, Thailand, Saudi Arabia, Philippines
    'MCO', 'ISL', 'BEL', 'RUS', 'HRV',  # Monaco, Iceland, Belgium, Russia, Croatia
    'CYM', 'COL', 'HUN', 'KEN', 'ARE',  # Cayman Islands, Colombia, Hungary, Kenya, United Arab Emirates
    'LTU', 'PRT', 'SVN', 'IDN', 'SLV',  # Lithuania, Portugal, Slovenia, Indonesia, El Salvador
    'ROU', 'EGY', 'NCL', 'ISR', 'LVA',  # Romania, Egypt, New Caledonia, Israel, Latvia
    'MYS', 'LUX', 'VNM', 'AND', 'KOR',  # Malaysia, Luxembourg, Vietnam, Andorra, South Korea
]

code_to_country = {
    **dict.fromkeys(_USA_CODES, 'USA'),
    **dict.fromkeys(_CANADA_CODES, 'CAN'),
    **dict.fromkeys(_MEXICO_CODES, 'MEX'),
    **{code: code for code in _SELF_MAPPED_CODES},
    # Two-letter aliases
    'NL': 'NLD',  # Netherlands
    'AE': 'ARE',  # United Arab Emirates
}

In [152]:
# Translate each State code to a country code (codes missing from
# code_to_country become blank), drop City, and rename State to Nationality
# where it sits so the column order is otherwise unchanged.
race_df = (
    race_df
    .assign(State=race_df['State'].map(code_to_country).fillna(''))
    .drop(columns=['City'])
    .rename(columns={'State': 'Nationality'})
)

In [153]:
# Preview the results after City/State have been replaced by Nationality.
race_df.head()

Unnamed: 0,Series_ID,Race_ID,Race_Date,Year,Rank,Status,Name,Nationality,Gender,Gender_Rank,Age_Category,Time,Race_Name,Race_Loc,Race_Dist
0,74589,91882,04/09/2016,2016,1,Finisher,rui ueda,,M,1,20-34,9:09:37,Gorge Waterfalls 100K,"Cascade Locks, OR",100K
1,74589,91882,04/09/2016,2016,2,Finisher,chris mocko,USA,M,2,20-34,9:10:04,Gorge Waterfalls 100K,"Cascade Locks, OR",100K
2,74589,91882,04/09/2016,2016,3,Finisher,jeremy humphrey,USA,M,3,35-39,9:34:11,Gorge Waterfalls 100K,"Cascade Locks, OR",100K
3,74589,91882,04/09/2016,2016,4,Finisher,ryan ghelfi,USA,M,4,20-34,9:36:08,Gorge Waterfalls 100K,"Cascade Locks, OR",100K
4,74589,91882,04/09/2016,2016,5,Finisher,rob russell,USA,M,5,35-39,9:38:11,Gorge Waterfalls 100K,"Cascade Locks, OR",100K


In [154]:
# Build the output file name from the loaded race config so that switching to
# another race YAML cannot overwrite a different race's CSV.
# "Gorge Waterfalls 100K" -> "gorge_waterfalls_100k"
race_slug = re.sub(r'[^a-z0-9]+', '_', str(race['race_name']).lower()).strip('_')
output_path = f'../../data/raw_data/ultrasignup_format/{race_slug}_df_clean.csv'
race_df.to_csv(output_path, index = False, encoding = 'utf-8')