In [1]:
import time

import pandas as pd
import yaml
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

#### Race YAML structure
> Update to match data for each individual race

In [2]:
# utmb_race = {
#     'race_name': 'Ultra Trail du Mont Blanc',
#     'race_loc': 'Chamonix, France',
#     'race_dist': '100M',
#     'race_uris': {
#         2017: '142.utmb-utmb-',
#         2018: '142.utmb-utmb-',
#         2019: '142.utmb-utmb-',
#         2021: '142.utmb-utmb-',
#         2022: '142.utmb-montblancutmb-',
#         2023: '142.daciautmb-montblancutmb',
#         2024: '142.hokautmbmont-blancutmb',
#         2025: '142.hokautmbmont-blancutmb'
        
#     },
#     'dates': {
#         2017: 'August 31, 2017',
#         2018: 'August 31, 2018',
#         2019: 'August 30, 2019',
#         2021: 'August 27, 2021',
#         2022: 'August 26, 2022',
#         2023: 'September 1, 2023',
#         2024: 'August 30, 2024',
#         2025: 'August 29, 2025'
#     }
# }

# with open("../config/utmb_100m.yaml", "w") as f:
#     yaml.dump(utmb_race, f)

#### Run Once 

In [3]:
def _parse_results_row(cells):
    """Turn the cells of one results-table row into a dict of runner fields.

    The cells are read positionally: rank, name, nationality, gender,
    age category, finish time. The caller guarantees at least six cells.
    """
    rank = cells[0].get_text(strip=True)
    nationality_raw = cells[2].get_text(strip=True).lower()
    gender_raw = cells[3].get_text(strip=True)
    return {
        'Rank': rank,
        # Only a literal "DNF" rank is treated as a non-finisher.
        'Status': 'DNF' if rank.upper() == 'DNF' else 'Finisher',
        'Name': cells[1].get_text(strip=True).lower(),
        'Nationality': 'USA' if 'united states of america' in nationality_raw else nationality_raw.upper(),
        'Gender': 'M' if 'Men' in gender_raw else 'F' if 'Women' in gender_raw else gender_raw,
        'Age_Category': cells[4].get_text(strip=True),
        'Time': cells[5].get_text(strip=True),
    }


def scrape_all_pages(driver, year, race_date, race_name, race_loc, race_dist):
    """Scrape every paginated results page currently loaded in ``driver``.

    Starting from the page the driver is already showing, parse the result
    rows, then click the pagination link labelled with the next page number
    and repeat until no such link appears within the wait timeout.

    Parameters
    ----------
    driver : selenium WebDriver
        Browser session already pointed at the first results page.
    year : int or str
        Edition year; stored as ``int(year)`` in the ``Year`` column.
    race_date, race_name, race_loc, race_dist
        Race metadata copied unchanged onto every row.

    Returns
    -------
    pandas.DataFrame
        One row per runner with the columns Date, Year, Rank, Status, Name,
        Nationality, Gender, Age_Category, Time, Race, Race_Loc, Race_Dist.
        The columns are present even when no rows were scraped.

    Notes
    -----
    Only a ``TimeoutException`` while waiting for the next page link is
    interpreted as "last page reached". Any other error propagates instead
    of silently truncating the results.
    """
    columns = [
        'Date', 'Year', 'Rank', 'Status', 'Name', 'Nationality', 'Gender',
        'Age_Category', 'Time', 'Race', 'Race_Loc', 'Race_Dist',
    ]
    data = []
    page = 1

    while True:
        print(f"Scraping page {page} for {race_name} {year}")
        time.sleep(2)  # give the client-side table time to render
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        rows = soup.find_all('div', class_='my-table_row__nlm_j')
        print(f"Total rows found on page {page}: {len(rows)}")

        # Debug: print all visible pagination links
        pagination_links = driver.find_elements(By.XPATH, "//a[contains(@class, 'pagination_paginate_link')]")
        print("Found page links:", [link.text for link in pagination_links])

        for row in rows:
            cells = row.find_all('div', class_='my-table_cell__z__zN')
            if len(cells) < 6:
                # Rows with fewer than six cells cannot be a runner entry; skip them.
                continue

            data.append({
                'Date': race_date,
                'Year': int(year),
                **_parse_results_row(cells),
                'Race': race_name,
                'Race_Loc': race_loc,
                'Race_Dist': race_dist,
            })

        # Try to find the next page button by its text label
        next_page = str(page + 1)
        try:
            next_link = WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.XPATH, f"//a[normalize-space(text())='{next_page}']"))
            )
        except TimeoutException:
            print("No more pages found.")
            break

        # Click through JavaScript rather than a native click.
        driver.execute_script("arguments[0].click();", next_link)
        page += 1
        time.sleep(3)  # Delay to avoid rate limits

    return pd.DataFrame(data, columns=columns)


def scrape_utmb_race(year, race_date, race_name, race_loc, race_dist, race_uri):
    """Scrape one edition of a race from the montblanc.utmb.world results page.

    Opens a headless Chrome session on the results URL built from ``year``
    and ``race_uri`` (the URL uses ``{race_uri}.{year}`` as the ``raceUri``
    query value), delegates the page-by-page scraping to
    ``scrape_all_pages`` and returns its DataFrame.

    The browser is always closed, even when loading or scraping fails, so a
    failed run does not leave a Chrome process behind.
    """
    url = f"https://montblanc.utmb.world/results?year={year}&raceUri={race_uri}.{year}"

    options = Options()
    options.add_argument("--headless")
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        time.sleep(3)  # let the first results page finish loading
        return scrape_all_pages(driver, year, race_date, race_name, race_loc, race_dist)
    finally:
        driver.quit()

## Insert race yaml for desired query and run the next two cell blocks
#### Update "../config/{race.yaml}"

In [18]:
# Race configuration to scrape. The cells below read the keys "dates",
# "race_uris", "race_name", "race_loc" and "race_dist" from it.
race_yaml = "../config/utmb_yaml/nice_cote_dazur_100k.yaml"

# Read as UTF-8 explicitly so the result does not depend on the platform's
# default locale encoding.
with open(race_yaml, "r", encoding="utf-8") as f:
    race = yaml.safe_load(f)

In [19]:
all_results = []

# NOTE: `scrape_utmb_index_race` is defined in a later cell of this notebook;
# on a fresh kernel that cell has to be executed before this one.
# NOTE(review): this config lives under utmb_yaml/ while the scraper used is the
# utmb-index one -- confirm this is the intended scraper for this race.
for year, date in race["dates"].items():
    uri = race["race_uris"].get(year)
    if not uri:
        print(f"No URI found for {year}, skipping.")
        continue

    df = scrape_utmb_index_race(
        year=year,
        race_date=date,
        race_name=race["race_name"],
        race_loc=race["race_loc"],
        race_dist=race["race_dist"],
        race_id_slug=uri,
        # The scraper takes a race_id argument; configs without that key pass None.
        race_id=race.get("race_id"),
    )
    all_results.append(df)

# Combine all results into a single DataFrame
race_df = pd.concat(all_results, ignore_index=True)

Scraping page 1 for Nice Cote D'Azur 2023
Total rows found on page 1: 50
Found page links: ['', '1', '2', '...', '19', '20', '']
Scraping page 2 for Nice Cote D'Azur 2023
Total rows found on page 2: 50
Found page links: ['', '1', '2', '3', '...', '19', '20', '']
Scraping page 3 for Nice Cote D'Azur 2023
Total rows found on page 3: 50
Found page links: ['', '1', '2', '3', '4', '...', '19', '20', '']
Scraping page 4 for Nice Cote D'Azur 2023
Total rows found on page 4: 50
Found page links: ['', '1', '2', '3', '4', '5', '...', '19', '20', '']
Scraping page 5 for Nice Cote D'Azur 2023
Total rows found on page 5: 50
Found page links: ['', '1', '2', '3', '4', '5', '6', '...', '19', '20', '']
Scraping page 6 for Nice Cote D'Azur 2023
Total rows found on page 6: 50
Found page links: ['', '1', '2', '...', '5', '6', '7', '...', '19', '20', '']
Scraping page 7 for Nice Cote D'Azur 2023
Total rows found on page 7: 50
Found page links: ['', '1', '2', '...', '6', '7', '8', '...', '19', '20', '']
Scr

In [14]:
# Sanity check: finisher / DNF totals per year, to be compared against the
# Ultrasignup results.
status_counts = (
    race_df
    .groupby(['Year', 'Status'])
    .size()
    .unstack(fill_value=0)
)
status_counts

Status,DNF,Finisher
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
2023,99,234
2024,94,212
2025,106,157


In [20]:
# Display the combined results frame built by the scraping loop for a visual check.
race_df

Unnamed: 0,Date,Year,Rank,Status,Name,Nationality,Gender,Age_Category,Time,Race,Race_Loc,Race_Dist
0,"September 30, 2023",2023,1,Finisher,jim walmsley,USA,M,20-34,11:21:05,Nice Cote D'Azur,"Roubion, France",100K
1,"September 30, 2023",2023,2,Finisher,simon gosselin,FRANCE,M,20-34,11:46:29,Nice Cote D'Azur,"Roubion, France",100K
2,"September 30, 2023",2023,3,Finisher,philipp ausserhofer,ITALY,M,20-34,11:55:13,Nice Cote D'Azur,"Roubion, France",100K
3,"September 30, 2023",2023,4,Finisher,thibaut baronian,FRANCE,M,20-34,12:24:58,Nice Cote D'Azur,"Roubion, France",100K
4,"September 30, 2023",2023,5,Finisher,sylvain camus,FRANCE,M,40-44,12:35:07,Nice Cote D'Azur,"Roubion, France",100K
...,...,...,...,...,...,...,...,...,...,...,...,...
3200,"September 27, 2025",2025,DNF,DNF,france cote,CANADA,F,65-69,-,Nice Cote D'Azur,"Roubion, France",100K
3201,"September 27, 2025",2025,DNF,DNF,selina mehta,UNITED KINGDOM,F,20-34,-,Nice Cote D'Azur,"Roubion, France",100K
3202,"September 27, 2025",2025,DNF,DNF,sacha di rienzo,FRANCE,M,20-34,-,Nice Cote D'Azur,"Roubion, France",100K
3203,"September 27, 2025",2025,DNF,DNF,hugo bekaert,FRANCE,M,20-34,-,Nice Cote D'Azur,"Roubion, France",100K


In [21]:
# nice_cote_d_azur_100k_df = race_df

In [23]:
# nice_cote_d_azur_100k_df.to_csv('../data/ultra_raw_csv/100 K/nice_cote_d_azur_100k_df_raw.csv', index = False, encoding = 'utf-8')

# Some races, such as Doi Inthanon, use an updated results-page structure. The function below has been adapted to scrape results from that updated format.

In [36]:
# Race configuration for the utmb-index scraper. The cells below read the keys
# "dates", "race_uris", "race_name", "race_loc", "race_dist" and "race_id" from it.
race_yaml = "../config/utmb_index_yaml/doi_inthanon_100m.yaml"

# Read as UTF-8 explicitly so the result does not depend on the platform's
# default locale encoding.
with open(race_yaml, "r", encoding="utf-8") as f:
    race = yaml.safe_load(f)

In [37]:
def _parse_index_row(cells):
    """Turn the cells of one utmb-index results row into a dict of runner fields.

    The cells are read positionally: rank, finish time, name (taken from the
    link inside the cell), nationality, gender, age category. The caller
    guarantees at least six cells.
    """
    rank = cells[0].get_text(strip=True)
    name_tag = cells[2].find('a')
    # Only the last whitespace-separated token of the nationality cell is kept.
    # An empty cell yields '' instead of raising IndexError.
    nationality_words = cells[3].get_text(strip=True).split()
    gender_raw = cells[4].get_text(strip=True)
    return {
        'Rank': rank,
        # Only a literal "DNF" rank is treated as a non-finisher.
        'Status': 'DNF' if rank.upper() == 'DNF' else 'Finisher',
        'Name': name_tag.get_text(strip=True).lower() if name_tag else '',
        'Nationality': nationality_words[-1].upper() if nationality_words else '',
        'Gender': 'M' if 'Men' in gender_raw else 'F' if 'Women' in gender_raw else gender_raw,
        'Age_Category': cells[5].get_text(strip=True),
        'Time': cells[1].get_text(strip=True),
    }


def scrape_utmb_index_race(year, race_date, race_name, race_loc, race_dist, race_id_slug, race_id=None):
    """Scrape one edition of a race from the utmb.world UTMB-index results pages.

    Opens a headless Chrome session on
    ``https://utmb.world/utmb-index/races/{race_id_slug}.{year}?page=1``,
    parses the result rows of each page and follows the numbered pagination
    links until the next page number cannot be found within the wait timeout.

    Parameters
    ----------
    year : int or str
        Edition year; stored as ``int(year)`` in the ``Year`` column.
    race_date, race_name, race_loc, race_dist
        Race metadata copied unchanged onto every row.
    race_id_slug : str
        Race slug used to build the URL (the year is appended to it).
    race_id : optional
        Identifier copied into the ``Race_ID`` column. Defaults to ``None``
        so callers that have no race id can omit it.

    Returns
    -------
    pandas.DataFrame
        One row per runner with the columns Date, Year, Rank, Status, Name,
        Nationality, Gender, Age_Category, Time, Race, Race_Loc, Race_Dist,
        Race_ID. The columns are present even when no rows were scraped.

    Notes
    -----
    The browser is closed in all cases, including failures. Only a
    ``TimeoutException`` while waiting for the next page link is treated as
    the end of the results; other errors propagate to the caller.
    """
    url = f"https://utmb.world/utmb-index/races/{race_id_slug}.{year}?page=1"
    columns = [
        'Date', 'Year', 'Rank', 'Status', 'Name', 'Nationality', 'Gender',
        'Age_Category', 'Time', 'Race', 'Race_Loc', 'Race_Dist', 'Race_ID',
    ]

    options = Options()
    options.add_argument("--headless")
    driver = webdriver.Chrome(options=options)

    data = []
    try:
        driver.get(url)
        time.sleep(3)  # let the first results page finish loading

        page = 1
        while True:
            print(f"Scraping page {page} for {race_name} {year}")
            time.sleep(2)  # give the client-side table time to render
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            rows = soup.find_all('div', class_='my-table_row__nlm_j')
            print(f"Total rows found on page {page}: {len(rows)}")

            # Debug: print all visible pagination links
            pagination_links = driver.find_elements(By.XPATH, "//a[contains(@class, 'pagination_paginate_link')]")
            print("Found page links:", [link.text for link in pagination_links])

            for row in rows:
                cells = row.find_all('div', class_='my-table_cell__z__zN')
                if len(cells) < 6:
                    # Rows with fewer than six cells cannot be a runner entry; skip them.
                    continue

                data.append({
                    'Date': race_date,
                    'Year': int(year),
                    **_parse_index_row(cells),
                    'Race': race_name,
                    'Race_Loc': race_loc,
                    'Race_Dist': race_dist,
                    'Race_ID': race_id,
                })

            # Try to find the next page number
            next_page = str(page + 1)
            try:
                next_link = WebDriverWait(driver, 5).until(
                    EC.presence_of_element_located((By.XPATH, f"//a[normalize-space(text())='{next_page}']"))
                )
            except TimeoutException:
                print("No more pages found.")
                break

            # Click through JavaScript rather than a native click.
            driver.execute_script("arguments[0].click();", next_link)
            page += 1
            time.sleep(3)
    finally:
        # Always release the browser, even if loading or parsing failed.
        driver.quit()

    return pd.DataFrame(data, columns=columns)

In [38]:
all_results = []

# Walk through every edition listed in the config. Editions that have no
# entry in "race_uris" are reported and left out.
for year, date in race["dates"].items():
    uri = race["race_uris"].get(year)
    if uri:
        df = scrape_utmb_index_race(
            year=year,
            race_date=date,
            race_name=race["race_name"],
            race_loc=race["race_loc"],
            race_dist=race["race_dist"],
            race_id_slug=uri,
            race_id=race["race_id"],
        )
        all_results.append(df)
    else:
        print(f"No URI found for {year}, skipping.")

# Stack the per-year frames into one frame with a fresh 0..n-1 index.
race_index_df = pd.concat(all_results, ignore_index=True)

Scraping page 1 for Doi Inthanon Thailand 100M 2022
Total rows found on page 1: 25
Found page links: ['', '1', '2', '...', '20', '21', '']
Scraping page 2 for Doi Inthanon Thailand 100M 2022
Total rows found on page 2: 25
Found page links: ['', '1', '2', '3', '...', '20', '21', '']
Scraping page 3 for Doi Inthanon Thailand 100M 2022
Total rows found on page 3: 25
Found page links: ['', '1', '2', '3', '4', '...', '20', '21', '']
Scraping page 4 for Doi Inthanon Thailand 100M 2022
Total rows found on page 4: 25
Found page links: ['', '1', '2', '3', '4', '5', '...', '20', '21', '']
Scraping page 5 for Doi Inthanon Thailand 100M 2022
Total rows found on page 5: 25
Found page links: ['', '1', '2', '3', '4', '5', '6', '...', '20', '21', '']
Scraping page 6 for Doi Inthanon Thailand 100M 2022
Total rows found on page 6: 25
Found page links: ['', '1', '2', '...', '5', '6', '7', '...', '20', '21', '']
Scraping page 7 for Doi Inthanon Thailand 100M 2022
Total rows found on page 7: 25
Found page 

In [41]:
# Finisher / DNF totals per year for the utmb-index scrape.
status_counts = (
    race_index_df
    .groupby(['Year', 'Status'])
    .size()
    .unstack(fill_value=0)
)
status_counts

Status,DNF,Finisher
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
2022,227,287
2023,193,208
2024,126,233


In [44]:
# doi_inthanon_100m_df_raw = race_index_df

In [45]:
# doi_inthanon_100m_df_raw.to_csv('../data/ultra_raw_csv/100 Mile/doi_inthanon_100m_df_raw_2.csv', index = False, encoding = 'utf-8')