In [1]:
import time

import pandas as pd
import yaml
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

#### Race YAML structure
> Update to match data for each individual race

In [2]:
# URI prefix of the results page for each edition, keyed by year.
# There is no entry for 2020.
utmb_uris = {
    2017: "142.utmb-utmb-",
    2018: "142.utmb-utmb-",
    2019: "142.utmb-utmb-",
    2021: "142.utmb-utmb-",
    2022: "142.utmb-montblancutmb-",
    2023: "142.daciautmb-montblancutmb",
    2024: "142.hokautmbmont-blancutmb",
    2025: "142.hokautmbmont-blancutmb",
}

# Human-readable race date for each edition, keyed by the same years.
utmb_dates = {
    2017: "August 31, 2017",
    2018: "August 31, 2018",
    2019: "August 30, 2019",
    2021: "August 27, 2021",
    2022: "August 26, 2022",
    2023: "September 1, 2023",
    2024: "August 30, 2024",
    2025: "August 29, 2025",
}

# Full race description that gets serialized to the YAML config file.
utmb_race = dict(
    race_name="Ultra Trail du Mont Blanc",
    race_loc="Chamonix, France",
    race_dist="100M",
    race_uris=utmb_uris,
    dates=utmb_dates,
)

# Overwrites the config file on every run of this cell.
with open("../config/utmb_100m.yaml", "w") as config_file:
    yaml.dump(utmb_race, config_file)

#### Run Once 

In [3]:
def scrape_all_pages(driver, year, race_date, race_name, race_loc, race_dist):
    """Scrape every results page reachable from the page ``driver`` is on.

    Parses the result rows out of the current page source, then clicks the
    link whose text is the next page number, repeating until no such link
    shows up within the wait timeout.

    Parameters
    ----------
    driver
        Selenium WebDriver that has already loaded the first results page.
    year
        Race year; stored as ``int(year)`` in the ``Year`` column.
    race_date, race_name, race_loc, race_dist
        Values copied unchanged into the ``Date``, ``Race``, ``Race_Loc`` and
        ``Race_Dist`` columns of every row.

    Returns
    -------
    pandas.DataFrame
        One row per parsed result with the columns ``Date``, ``Year``,
        ``Rank``, ``Status``, ``Name``, ``Nationality``, ``Gender``,
        ``Age_Category``, ``Time``, ``Race``, ``Race_Loc`` and ``Race_Dist``.
        The frame is empty (no columns) when no rows were parsed.

    Notes
    -----
    Pagination stops on the first failure to reach the next page. A wait
    timeout is treated as the normal end of the results; any other error is
    reported before stopping so truncated scrapes are visible.
    ``KeyboardInterrupt``/``SystemExit`` are not swallowed, so interrupting
    the kernel aborts the scrape instead of silently ending the current year.
    """
    data = []
    page = 1

    while True:
        print(f"Scraping page {page} for {race_name} {year}")
        # Fixed pause before reading the page source (presumably lets the
        # results table finish rendering after navigation/click).
        time.sleep(2)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        rows = soup.find_all('div', class_='my-table_row__nlm_j')
        print(f"Total rows found on page {page}: {len(rows)}")

        # Debug: print all visible pagination links
        pagination_links = driver.find_elements(By.XPATH, "//a[contains(@class, 'pagination_paginate_link')]")
        print("Found page links:", [link.text for link in pagination_links])

        for row in rows:
            cells = row.find_all('div', class_='my-table_cell__z__zN')
            # Rows with fewer than six cells cannot supply every field below.
            if len(cells) < 6:
                continue

            rank = cells[0].get_text(strip=True)
            # A rank cell reading "DNF" (any case) marks a non-finisher.
            status = 'DNF' if rank.upper() == 'DNF' else 'Finisher'
            name = cells[1].get_text(strip=True).lower()
            nationality_raw = cells[2].get_text(strip=True).lower()
            nationality = 'USA' if 'united states of america' in nationality_raw else nationality_raw.upper()
            gender_raw = cells[3].get_text(strip=True)
            # Unrecognized gender labels are kept as scraped.
            gender = 'M' if 'Men' in gender_raw else 'F' if 'Women' in gender_raw else gender_raw
            age_category = cells[4].get_text(strip=True)
            time_str = cells[5].get_text(strip=True)

            data.append({
                'Date': race_date,
                'Year': int(year),
                'Rank': rank,
                'Status': status,
                'Name': name,
                'Nationality': nationality,
                'Gender': gender,
                'Age_Category': age_category,
                'Time': time_str,
                'Race': race_name,
                'Race_Loc': race_loc,
                'Race_Dist': race_dist
            })

        # Try to find the next page button by its text label
        next_page = str(page + 1)
        try:
            next_link = WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.XPATH, f"//a[normalize-space(text())='{next_page}']"))
            )
            # Click through JavaScript rather than WebElement.click().
            driver.execute_script("arguments[0].click();", next_link)
        except TimeoutException:
            # No link labelled with the next page number appeared in time:
            # this is the expected way for pagination to end.
            print("No more pages found.")
            break
        except Exception as exc:
            # Stay best-effort (keep what was scraped so far) but make the
            # reason visible instead of reporting a normal end of results.
            print(f"Stopping after page {page} due to {type(exc).__name__}: {exc}")
            break

        page += 1
        time.sleep(3)  # Delay to avoid rate limits

    return pd.DataFrame(data)


def scrape_utmb_race(year, race_date, race_name, race_loc, race_dist, race_uri):
    """Scrape all result pages of one race edition with headless Chrome.

    Builds the results URL from ``year`` and ``race_uri`` (the year is
    appended to the URI as ``.<year>``), opens it in a new headless Chrome
    session and hands the driver to ``scrape_all_pages``.

    Parameters
    ----------
    year
        Race year; used in the URL and passed on to ``scrape_all_pages``.
    race_date, race_name, race_loc, race_dist
        Passed through unchanged to ``scrape_all_pages``.
    race_uri
        URI prefix of the race for that year, without the ``.<year>`` suffix.

    Returns
    -------
    pandas.DataFrame
        Whatever ``scrape_all_pages`` returns for this edition.

    The browser session is always shut down, even when loading or scraping
    raises, so failed runs do not leave Chrome processes behind.
    """
    url = f"https://montblanc.utmb.world/results?year={year}&raceUri={race_uri}.{year}"

    options = Options()
    options.add_argument("--headless")
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        time.sleep(3)  # fixed pause after navigation, before the first parse
        return scrape_all_pages(driver, year, race_date, race_name, race_loc, race_dist)
    finally:
        driver.quit()

## Insert the race YAML for the desired query and run the next two cells
#### Update the path to "../config/{race}.yaml"

In [4]:
# Config file describing the race to scrape; change this path to query another race.
race_yaml = "../config/utmb_100m.yaml"

# safe_load builds only plain Python objects from the YAML document.
with open(race_yaml) as config_file:
    race = yaml.safe_load(config_file)

In [5]:
# Scrape each configured year and keep one DataFrame per year.
all_results = []

for year, date in race["dates"].items():
    uri = race["race_uris"].get(year)
    if uri:
        # Fields that are identical for every edition of the race.
        race_fields = {
            field: race[field]
            for field in ("race_name", "race_loc", "race_dist")
        }
        df = scrape_utmb_race(year=year, race_date=date, race_uri=uri, **race_fields)
        all_results.append(df)
    else:
        # A year with a date but no (or an empty) URI cannot be queried.
        print(f"No URI found for {year}, skipping.")

# Stack the per-year frames into one DataFrame with a fresh 0..n-1 index.
race_df = pd.concat(all_results, ignore_index=True)

Scraping page 1 for Ultra Trail du Mont Blanc 2017
Total rows found on page 1: 50
Found page links: ['', '1', '2', '...', '50', '51', '']
Scraping page 2 for Ultra Trail du Mont Blanc 2017
Total rows found on page 2: 50
Found page links: ['', '1', '2', '3', '...', '50', '51', '']
Scraping page 3 for Ultra Trail du Mont Blanc 2017
Total rows found on page 3: 50
Found page links: ['', '1', '2', '3', '4', '...', '50', '51', '']
Scraping page 4 for Ultra Trail du Mont Blanc 2017
Total rows found on page 4: 50
Found page links: ['', '1', '2', '3', '4', '5', '...', '50', '51', '']
Scraping page 5 for Ultra Trail du Mont Blanc 2017
Total rows found on page 5: 50
Found page links: ['', '1', '2', '3', '4', '5', '6', '...', '50', '51', '']
Scraping page 6 for Ultra Trail du Mont Blanc 2017
Total rows found on page 6: 50
Found page links: ['', '1', '2', '...', '5', '6', '7', '...', '50', '51', '']
Scraping page 7 for Ultra Trail du Mont Blanc 2017
Total rows found on page 7: 50
Found page links: 

In [8]:
*** = race_df

In [11]:
***.to_csv('../data/ultra_raw_csv/***', index = False, encoding = 'utf-8')