# Developing a way to webscrape data from duv and ultrasignup

In [19]:
import pandas as pd
import requests
from bs4 import BeautifulSoup as BS

In [12]:
URL = 'https://statistik.d-u-v.org/getresulteventalltime.php?event=115254&cat=%23NA&year=all&country=all&Submit.x=13&Submit.y=3'

headers = {'User-Agent': 'capstoneproject (xcswann@gmail.com)'
          }

response = requests.get(URL, headers = headers)

In [13]:
response.status_code

200

In [122]:
def scrape_table(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table', attrs={
        'width': '99%',
        'border': '1',
        'cellpadding': '0',
        'cellspacing': '0'
    })
    rows = table.find_all('tr')
    data = []
    for row in rows:
        cells = row.find_all(['td', 'th'])
        cell_data = [cell.get_text(strip=True) for cell in cells]
        data.append(cell_data)
    return data

# URLs for both pages
url_page1 = 'https://statistik.d-u-v.org/getresulteventalltime.php?event=115254&cat=%23NA&year=all&country=all&Submit.x=13&Submit.y=3'
url_page2 = 'https://statistik.d-u-v.org/getresulteventalltime.php?event=115254&cat=%23NA&year=all&country=all&page=2'

# Scrape both pages
data1 = scrape_table(url_page1)
data2 = scrape_table(url_page2)

# Combine and convert to DataFrame
combined_data = data1 + data2
df = pd.DataFrame(combined_data)

# Optional: Use first row as header
df.columns = df.iloc[0]
df = df.drop(index = 0).reset_index(drop = True)

In [None]:
df['Performance'] = df['Performance'].str.replace(' h', '', regex=False)
df['Performance'] = pd.to_timedelta(df['Performance'], errors='coerce')

In [140]:
# Extract the first date from strings like '25.-26.10.2025'
df['Date'] = pd.to_datetime(
    df['Date'].str.extract(r'(\d{2})\.-\d{2}\.(\d{2})\.(\d{4})').apply(
        lambda x: f"{x[0]}.{x[1]}.{x[2]}" if pd.notnull(x[0]) else None,
        axis=1
    ),
    format='%d.%m.%Y',
    errors='coerce'  # Avoid crashing on bad rows
)

In [96]:
df = df[df['YOB'].apply(lambda x: str(x).isnumeric())]
df = df[df['Rank M/F'].apply(lambda x: str(x).strip().isdigit())]

In [97]:
df['Rank M/F'] = df['Rank M/F'].astype(int)

In [98]:
df['YOB'] = df['YOB'].astype(int)

In [144]:
df.columns.values[0] = 'Overall Rank'

In [154]:
df['Year'] = df['Date'].dt.year.fillna(0).astype(int)

In [155]:
df.head(25)

Unnamed: 0,Overall Rank,Performance,Date,Name,Nat.,YOB,Rank,M/F,Rank M/F,Cat,Cat. Rank,Year
0,1,0 days 12:10:12,2025-10-25,"Murray, William",USA,1995,1,M,1,M30,1,2025
1,2,0 days 12:18:06,2025-10-25,"Roche, David",USA,1988,2,M,2,M35,1,2025
2,3,0 days 12:19:59,2025-10-25,"Woodward, Canyon",USA,1993,3,M,3,M30,2,2025
3,4,0 days 12:43:10,2023-10-28,"Rea, Jonathan",USA,1992,1,M,1,M30,1,2023
4,5,0 days 12:45:04,2024-10-26,"Roche, David",USA,1988,1,M,1,M35,1,2024
5,6,0 days 12:48:39,2025-10-25,"Andrews, Chris",USA,1999,4,M,4,M20,1,2025
6,7,0 days 12:54:31,2024-10-26,"Mogavero, Jeff",USA,1993,2,M,2,M30,1,2024
7,8,0 days 12:58:02,2022-10-29,"Jones, Dakota",USA,1990,1,M,1,M30,1,2022
8,9,0 days 12:58:04,2024-10-26,"Green, Dan",USA,1997,3,M,3,M20,1,2024
9,10,0 days 12:58:07,2023-10-28,"Slattengren, Blake",USA,1995,2,M,2,M20,1,2023


In [156]:
df['Year'] = pd.to_numeric(df['Year'], errors='coerce')

In [157]:
df = df.sort_values(['Year', 'Performance']).reset_index(drop=True)

In [158]:
df.head()

Unnamed: 0,Overall Rank,Performance,Date,Name,Nat.,YOB,Rank,M/F,Rank M/F,Cat,Cat. Rank,Year
0,Rank,NaT,NaT,Name,Nat.,YOB,Rank,M/F,Rank M/F,Cat,Cat. Rank,0
1,205,0 days 17:38:51,2003-11-08,"Ehret, Stephanie",USA,1963,1,F,1,W40,1,2003
2,237,0 days 18:05:15,2003-11-08,"Poolheco, Dennis",USA,1960,2,M,1,M40,1,2003
3,409,0 days 19:34:52,2003-11-08,"Smith, Carolyn",USA,1965,3,F,2,W35,1,2003
4,545,0 days 20:26:32,2003-11-08,"Riddick, Jerry",USA,1951,4,M,2,M50,1,2003


In [159]:
# Removing strange header row with NAs showing up as index [0]
df = df[df['Performance'].notna() & df['Year'].notna()].reset_index(drop=True)

In [163]:
df.tail()

Unnamed: 0,Overall Rank,Performance,Date,Name,Nat.,YOB,Rank,M/F,Rank M/F,Cat,Cat. Rank,Year
5424,5355,1 days 05:42:00,2025-10-25,"Dougherty, Thomas",USA,2001,487,M,318,M20,39,2025
5425,5374,1 days 05:45:28,2025-10-25,"Mihalakellis, George",AUS,1977,488,M,319,M45,57,2025
5426,5385,1 days 05:47:36,2025-10-25,"Torio, Alec",USA,1997,489,M,320,M20,40,2025
5427,5388,1 days 05:48:11,2025-10-25,"Lee, Andre",USA,1970,490,M,321,M55,24,2025
5428,5403,1 days 05:51:49,2025-10-25,"Berry, Stanley",USA,1959,491,M,322,M65,4,2025


In [162]:
df.shape

(5429, 12)

In [165]:
df['Est_Age'] = pd.to_numeric(df['Year'], errors='coerce') - pd.to_numeric(df['YOB'], errors='coerce')

In [166]:
df.head()

Unnamed: 0,Overall Rank,Performance,Date,Name,Nat.,YOB,Rank,M/F,Rank M/F,Cat,Cat. Rank,Year,Est_Age
0,205,0 days 17:38:51,2003-11-08,"Ehret, Stephanie",USA,1963,1,F,1,W40,1,2003,40
1,237,0 days 18:05:15,2003-11-08,"Poolheco, Dennis",USA,1960,2,M,1,M40,1,2003,43
2,409,0 days 19:34:52,2003-11-08,"Smith, Carolyn",USA,1965,3,F,2,W35,1,2003,38
3,545,0 days 20:26:32,2003-11-08,"Riddick, Jerry",USA,1951,4,M,2,M50,1,2003,52
4,586,0 days 20:38:39,2003-11-08,"Williams, Karen Lee",USA,1961,5,F,3,W40,2,2003,42


In [167]:
javelina_cleaned = df

In [168]:
javelina_cleaned.to_csv('javelina_cleaned.csv')

# Parsing result pages from each year to include club(athlete home state) and avg speed

In [211]:
import time

def scrape_club_info(event_ids):
    all_data = []
    header_saved = False  # Track if header has been added
    for year, event_id in event_ids.items():
        page = 1
        last_first_row = None
        while True:
            url = f"https://statistik.d-u-v.org/getresultevent.php?event={event_id}&year={year}&page={page}"
            print(f"Scraping {year}, page {page}")
            try:
                response = requests.get(url, timeout=10)
                response.raise_for_status()
            except requests.RequestException as e:
                print(f"Request failed for {year} page {page}: {e}")
                break

            soup = BeautifulSoup(response.content, 'html.parser')
            table = soup.find('table', attrs={'width': '99%'})
            if not table:
                break

            rows = table.find_all('tr')
            page_data = []
            for row in rows:
                cells = row.find_all(['td', 'th'])
                if not cells or all(cell.get_text(strip=True) == '' for cell in cells):
                    continue

                cell_text = [cell.get_text(strip=True).replace('\xa0', ' ') for cell in cells]

                # Save header only once
                if not header_saved and all(cell.name == 'th' for cell in cells):
                    all_data.append(cell_text + ['Year'])
                    header_saved = True
                    continue  # skip adding header again

                # Skip header rows on other pages
                if all(cell.name == 'th' for cell in cells):
                    continue

                page_data.append(cell_text + [year])

            if not page_data:
                break
            first_row_key = tuple(page_data[0])
            if first_row_key == last_first_row:
                break
            last_first_row = first_row_key

            all_data.extend(page_data)
            page += 1
            time.sleep(1)

        print(f"Year {year} yielded {len(page_data)} rows")
            
    return all_data



In [212]:
event_ids = {
    2003:5334,
    2004:5335,
    2005:5336,
    2006:5337,
    2007:5338,
    2008:1784,
    2009:3712,
    2010:6443,
    2011:9678,
    2012:12323,
    2013:17218,
    2014:21294,
    2015:27027,
    2016:32389,
    2017:39360,
    2018:50640,
    2019:58013,
    2020:65613,
    2021:81905,
    2022:81906,
    2023:92928,
    2024:113764,
    2025:115254
}

club_data = scrape_club_info(event_ids)

Scraping 2003, page 1
Scraping 2003, page 2
Year 2003 yielded 81 rows
Scraping 2004, page 1
Scraping 2004, page 2
Year 2004 yielded 71 rows
Scraping 2005, page 1
Scraping 2005, page 2
Year 2005 yielded 48 rows
Scraping 2006, page 1
Scraping 2006, page 2
Year 2006 yielded 71 rows
Scraping 2007, page 1
Scraping 2007, page 2
Year 2007 yielded 66 rows
Scraping 2008, page 1
Scraping 2008, page 2
Year 2008 yielded 72 rows
Scraping 2009, page 1
Scraping 2009, page 2
Year 2009 yielded 125 rows
Scraping 2010, page 1
Scraping 2010, page 2
Year 2010 yielded 137 rows
Scraping 2011, page 1
Scraping 2011, page 2
Year 2011 yielded 174 rows
Scraping 2012, page 1
Scraping 2012, page 2
Year 2012 yielded 160 rows
Scraping 2013, page 1
Scraping 2013, page 2
Year 2013 yielded 157 rows
Scraping 2014, page 1
Scraping 2014, page 2
Year 2014 yielded 290 rows
Scraping 2015, page 1
Scraping 2015, page 2
Year 2015 yielded 281 rows
Scraping 2016, page 1
Scraping 2016, page 2
Year 2016 yielded 280 rows
Scraping 201

In [213]:
club_df

Unnamed: 0,Rank,Performance,"Surname, first name",Club,Nat.,YOB,M/F,Rank M/F,Cat,Cat. Rank,Avg.Speed km/h,Age graded performance,2003
0,1,17:38:51 h,"Ehret, Stephanie",*CO,USA,1963,F,1,W35,1,9.119,17:19:16 h,2003
1,2,18:05:15 h,"Poolheco, Dennis",*AZ,USA,1960,M,1,M40,1,8.898,17:15:53 h,2003
2,3,19:34:52 h,"Smith, Carolyn",*WI,USA,1965,F,2,W35,2,8.219,19:18:40 h,2003
3,4,20:26:32 h,"Riddick, Jerry","*Tucson, AZ",USA,1951,M,2,M50,1,7.873,18:04:53 h,2003
4,5,20:38:39 h,"Williams, Karen Lee",*AK,USA,1961,F,3,W40,1,7.796,19:53:27 h,2003
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5309,487,29:42:00 h,"Dougherty, Thomas","*New York, NY",USA,2001,M,318,M23,83,5.419,29:42:00 h,2025
5310,488,29:45:28 h,"Mihalakellis, George","*Mt Waverley, AUS",AUS,1977,M,319,M45,53,5.408,27:00:51 h,2025
5311,489,29:47:36 h,"Torio, Alec","*Scottsdale, AZ",USA,1997,M,320,M23,84,5.402,29:47:36 h,2025
5312,490,29:48:11 h,"Lee, Andre","*Tempe, AZ",USA,1970,M,321,M55,22,5.400,25:25:52 h,2025


In [216]:
club_df = pd.DataFrame(club_data)
club_df.columns = club_df.iloc[0]
club_df = club_df.drop(index=0).reset_index(drop=True)

In [217]:
club_df

Unnamed: 0,Rank,Performance,"Surname, first name",Club,Nat.,YOB,M/F,Rank M/F,Cat,Cat. Rank,Avg.Speed km/h,Age graded performance,Year
0,1,17:38:51 h,"Ehret, Stephanie",*CO,USA,1963,F,1,W35,1,9.119,17:19:16 h,2003
1,2,18:05:15 h,"Poolheco, Dennis",*AZ,USA,1960,M,1,M40,1,8.898,17:15:53 h,2003
2,3,19:34:52 h,"Smith, Carolyn",*WI,USA,1965,F,2,W35,2,8.219,19:18:40 h,2003
3,4,20:26:32 h,"Riddick, Jerry","*Tucson, AZ",USA,1951,M,2,M50,1,7.873,18:04:53 h,2003
4,5,20:38:39 h,"Williams, Karen Lee",*AK,USA,1961,F,3,W40,1,7.796,19:53:27 h,2003
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5424,487,29:42:00 h,"Dougherty, Thomas","*New York, NY",USA,2001,M,318,M23,83,5.419,29:42:00 h,2025
5425,488,29:45:28 h,"Mihalakellis, George","*Mt Waverley, AUS",AUS,1977,M,319,M45,53,5.408,27:00:51 h,2025
5426,489,29:47:36 h,"Torio, Alec","*Scottsdale, AZ",USA,1997,M,320,M23,84,5.402,29:47:36 h,2025
5427,490,29:48:11 h,"Lee, Andre","*Tempe, AZ",USA,1970,M,321,M55,22,5.400,25:25:52 h,2025


In [222]:
club_df.columns = club_df.columns.to_list()
club_df.columns.values[0] = "Overall Rank"
club_df.columns.values[2] = "Name"

In [223]:
club_df.head()

Unnamed: 0,Overall Rank,Performance,Name,Club,Nat.,YOB,M/F,Rank M/F,Cat,Cat. Rank,Avg.Speed km/h,Age graded performance,Year
0,1,17:38:51 h,"Ehret, Stephanie",*CO,USA,1963,F,1,W35,1,9.119,17:19:16 h,2003
1,2,18:05:15 h,"Poolheco, Dennis",*AZ,USA,1960,M,1,M40,1,8.898,17:15:53 h,2003
2,3,19:34:52 h,"Smith, Carolyn",*WI,USA,1965,F,2,W35,2,8.219,19:18:40 h,2003
3,4,20:26:32 h,"Riddick, Jerry","*Tucson, AZ",USA,1951,M,2,M50,1,7.873,18:04:53 h,2003
4,5,20:38:39 h,"Williams, Karen Lee",*AK,USA,1961,F,3,W40,1,7.796,19:53:27 h,2003


In [224]:
javelina_cleaned['Name'] = javelina_cleaned['Name'].str.strip().str.lower()
club_df['Name'] = club_df['Name'].str.strip().str.lower()

In [225]:
javelina_cleaned['Year'] = javelina_cleaned['Year'].astype(int)
club_df['Year'] = club_df['Year'].astype(int)

In [226]:
club_subset = club_df[['Name', 'Year', 'Club', 'Avg.Speed km/h']]

In [227]:
merged_df = javelina_cleaned.merge(
    club_subset,
    left_on=['Name', 'Year'],
    right_on=['Name', 'Year'],
    how='left'
)

In [228]:
merged_df.head()

Unnamed: 0,Overall Rank,Performance,Date,Name,Nat.,YOB,Rank,M/F,Rank M/F,Cat,Cat. Rank,Year,Est_Age,Club,Avg.Speed km/h
0,205,0 days 17:38:51,2003-11-08,"ehret, stephanie",USA,1963,1,F,1,W40,1,2003,40,*CO,9.119
1,237,0 days 18:05:15,2003-11-08,"poolheco, dennis",USA,1960,2,M,1,M40,1,2003,43,*AZ,8.898
2,409,0 days 19:34:52,2003-11-08,"smith, carolyn",USA,1965,3,F,2,W35,1,2003,38,*WI,8.219
3,545,0 days 20:26:32,2003-11-08,"riddick, jerry",USA,1951,4,M,2,M50,1,2003,52,"*Tucson, AZ",7.873
4,586,0 days 20:38:39,2003-11-08,"williams, karen lee",USA,1961,5,F,3,W40,2,2003,42,*AK,7.796


In [229]:
merged_df['Avg.Speed km/h'] = pd.to_numeric(merged_df['Avg.Speed km/h'], errors='coerce')

merged_df['Pace (min/mile)'] = 60 / (merged_df['Avg.Speed km/h'] * 0.621371)


In [230]:
merged_df.head()

Unnamed: 0,Overall Rank,Performance,Date,Name,Nat.,YOB,Rank,M/F,Rank M/F,Cat,Cat. Rank,Year,Est_Age,Club,Avg.Speed km/h,Pace (min/mile)
0,205,0 days 17:38:51,2003-11-08,"ehret, stephanie",USA,1963,1,F,1,W40,1,2003,40,*CO,9.119,10.588954
1,237,0 days 18:05:15,2003-11-08,"poolheco, dennis",USA,1960,2,M,1,M40,1,2003,43,*AZ,8.898,10.851952
2,409,0 days 19:34:52,2003-11-08,"smith, carolyn",USA,1965,3,F,2,W35,1,2003,38,*WI,8.219,11.748469
3,545,0 days 20:26:32,2003-11-08,"riddick, jerry",USA,1951,4,M,2,M50,1,2003,52,"*Tucson, AZ",7.873,12.264787
4,586,0 days 20:38:39,2003-11-08,"williams, karen lee",USA,1961,5,F,3,W40,2,2003,42,*AK,7.796,12.385925


In [232]:
merged_df['Pace (mm:ss/mile)'] = merged_df['Pace (min/mile)'].apply(
    lambda x: f"{int(x)}:{int((x % 1) * 60):02d}" if pd.notnull(x) else None
)

In [233]:
merged_df.head()

Unnamed: 0,Overall Rank,Performance,Date,Name,Nat.,YOB,Rank,M/F,Rank M/F,Cat,Cat. Rank,Year,Est_Age,Club,Avg.Speed km/h,Pace (min/mile),Pace (mm:ss/mile)
0,205,0 days 17:38:51,2003-11-08,"ehret, stephanie",USA,1963,1,F,1,W40,1,2003,40,*CO,9.119,10.588954,10:35
1,237,0 days 18:05:15,2003-11-08,"poolheco, dennis",USA,1960,2,M,1,M40,1,2003,43,*AZ,8.898,10.851952,10:51
2,409,0 days 19:34:52,2003-11-08,"smith, carolyn",USA,1965,3,F,2,W35,1,2003,38,*WI,8.219,11.748469,11:44
3,545,0 days 20:26:32,2003-11-08,"riddick, jerry",USA,1951,4,M,2,M50,1,2003,52,"*Tucson, AZ",7.873,12.264787,12:15
4,586,0 days 20:38:39,2003-11-08,"williams, karen lee",USA,1961,5,F,3,W40,2,2003,42,*AK,7.796,12.385925,12:23


In [234]:
merged_df.drop(columns=["Pace (min/mile)"], inplace=True)

In [235]:
merged_df.head()

Unnamed: 0,Overall Rank,Performance,Date,Name,Nat.,YOB,Rank,M/F,Rank M/F,Cat,Cat. Rank,Year,Est_Age,Club,Avg.Speed km/h,Pace (mm:ss/mile)
0,205,0 days 17:38:51,2003-11-08,"ehret, stephanie",USA,1963,1,F,1,W40,1,2003,40,*CO,9.119,10:35
1,237,0 days 18:05:15,2003-11-08,"poolheco, dennis",USA,1960,2,M,1,M40,1,2003,43,*AZ,8.898,10:51
2,409,0 days 19:34:52,2003-11-08,"smith, carolyn",USA,1965,3,F,2,W35,1,2003,38,*WI,8.219,11:44
3,545,0 days 20:26:32,2003-11-08,"riddick, jerry",USA,1951,4,M,2,M50,1,2003,52,"*Tucson, AZ",7.873,12:15
4,586,0 days 20:38:39,2003-11-08,"williams, karen lee",USA,1961,5,F,3,W40,2,2003,42,*AK,7.796,12:23


In [236]:
merged_df['Avg.Speed mi/h'] = merged_df['Avg.Speed km/h'] * 0.621371

In [237]:
merged_df.head()

Unnamed: 0,Overall Rank,Performance,Date,Name,Nat.,YOB,Rank,M/F,Rank M/F,Cat,Cat. Rank,Year,Est_Age,Club,Avg.Speed km/h,Pace (mm:ss/mile),Avg.Speed mi/h
0,205,0 days 17:38:51,2003-11-08,"ehret, stephanie",USA,1963,1,F,1,W40,1,2003,40,*CO,9.119,10:35,5.666282
1,237,0 days 18:05:15,2003-11-08,"poolheco, dennis",USA,1960,2,M,1,M40,1,2003,43,*AZ,8.898,10:51,5.528959
2,409,0 days 19:34:52,2003-11-08,"smith, carolyn",USA,1965,3,F,2,W35,1,2003,38,*WI,8.219,11:44,5.107048
3,545,0 days 20:26:32,2003-11-08,"riddick, jerry",USA,1951,4,M,2,M50,1,2003,52,"*Tucson, AZ",7.873,12:15,4.892054
4,586,0 days 20:38:39,2003-11-08,"williams, karen lee",USA,1961,5,F,3,W40,2,2003,42,*AK,7.796,12:23,4.844208


# Method for Ultrasignup
### Requires Selenium

In [370]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd
import time

def scrape_ultrasignup_results(url):
    # Set up headless browser
    options = Options()
    options.add_argument("--headless")
    driver = webdriver.Chrome(options=options)

    # Load page
    driver.get(url)
    time.sleep(3)  # Wait for JavaScript to load

    # Get page source and parse
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    driver.quit()

    table = soup.find('table', id='list')
    rows = table.find_all('tr')

    # Extract race date from the page
    text = soup.get_text()

    # Match date range like "February 8 - 9, 2025"
    match_range = re.search(r'([A-Za-z]+)\s+(\d{1,2})\s*[-–—]\s*\d{1,2},\s*(\d{4})', text)

    # Match single date like "Saturday, June 24, 2023"
    match_single = re.search(r'\b(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday),\s+([A-Za-z]+)\s+(\d{1,2}),\s+(\d{4})\b', text)

    if match_range:
        month = match_range.group(1)
        day = match_range.group(2)
        year_str = match_range.group(3)
        race_date = f"{month} {day}, {year_str}"
    elif match_single:
        month = match_single.group(1)
        day = match_single.group(2)
        year_str = match_single.group(3)
        race_date = f"{month} {day}, {year_str}"
    else:
        race_date = 'Unknown'


    print(f"Scraping: {url}")
    print(f"Total rows found: {len(rows)}")

    data = []
    current_status = 'Finisher'

    for row in rows:
        row_classes = row.get('class', [])

        if 'jqgroup' in row_classes:
            header_text = row.get_text(strip=True).lower()
            if 'did not finish' in header_text:
                current_status = 'DNF'
            elif 'did not start' in header_text:
                current_status = 'DNS'
            elif 'finisher' in header_text:
                current_status = 'Finisher'
            continue

        if 'jqgrow' not in row_classes:
            continue

        cells = row.find_all('td')
        if len(cells) < 8 or current_status == 'DNS':
            continue    

        data.append({
            'Date': race_date,
            'Year': year,
            'Rank': cells[1].get_text(strip=True),
            'First_Name': cells[2].get_text(strip=True),
            'Last_Name': cells[3].get_text(strip=True),
            'Name': f"{cells[2].get_text(strip=True)} {cells[3].get_text(strip=True)}",
            'City': cells[4].get_text(strip=True),
            'State': cells[5].get_text(strip=True),
            'Age': cells[6].get_text(strip=True),
            'Gender_Div': cells[7].get_text(strip=True),
            'Div_Place': cells[8].get_text(strip=True),
            'Time': cells[9].get_text(strip=True),
            'Ultra_Rank': cells[10].get_text(strip=True),
            'Status': current_status
        })

    return pd.DataFrame(data)

In [363]:
# Javelina Jundred Ultrasignup race id's
javelina_jundred_race_ids = [12159, 15839, 19185, 27560, 31274, 35377, 43813, 53326, 63015, 74613, 81876, 90708, 99186, 112165, 122877]
start_year = 2011

all_results = []

for i, race_id in enumerate(javelina_jundred_race_ids):
    year = start_year + i
    race_url = f"https://ultrasignup.com/results_event.aspx?did={race_id}"
    df = scrape_ultrasignup_results(race_url)
    df['Race'] = 'Javelina Jundred'
    df['Race_Loc'] = 'McDowell, AZ'
    df['Race_Dist'] = '100M'
    all_results.append(df)

javelina_jundred_df = pd.concat(all_results, ignore_index=True)

Scraping: https://ultrasignup.com/results_event.aspx?did=12159
Total rows found: 398
Scraping: https://ultrasignup.com/results_event.aspx?did=15839
Total rows found: 408
Scraping: https://ultrasignup.com/results_event.aspx?did=19185
Total rows found: 415
Scraping: https://ultrasignup.com/results_event.aspx?did=27560
Total rows found: 569
Scraping: https://ultrasignup.com/results_event.aspx?did=31274
Total rows found: 519
Scraping: https://ultrasignup.com/results_event.aspx?did=35377
Total rows found: 579
Scraping: https://ultrasignup.com/results_event.aspx?did=43813
Total rows found: 540
Scraping: https://ultrasignup.com/results_event.aspx?did=53326
Total rows found: 566
Scraping: https://ultrasignup.com/results_event.aspx?did=63015
Total rows found: 728
Scraping: https://ultrasignup.com/results_event.aspx?did=74613
Total rows found: 258
Scraping: https://ultrasignup.com/results_event.aspx?did=81876
Total rows found: 591
Scraping: https://ultrasignup.com/results_event.aspx?did=90708
To

In [364]:
javelina_jundred_df.loc[javelina_jundred_df['Year'] == 2019]

Unnamed: 0,Date,Year,Rank,First_Name,Last_Name,Name,City,State,Age,Gender_Div,Div_Place,Time,Ultra_Rank,Status,Race,Race_Loc,Race_Dist
3814,"Saturday, Oct 26, 2019",2019,1,Patrick,Reagan,Patrick Reagan,Savannah,GA,32,M,1,13:11:48,96.29,Finisher,Javelina Jundred,"McDowell, AZ",100M
3815,"Saturday, Oct 26, 2019",2019,2,Tyler,Green,Tyler Green,Portland,OR,35,M,2,14:02:41,95.41,Finisher,Javelina Jundred,"McDowell, AZ",100M
3816,"Saturday, Oct 26, 2019",2019,3,Ryan,Shephard,Ryan Shephard,Abbotsford,BC,36,M,3,15:28:32,85.09,Finisher,Javelina Jundred,"McDowell, AZ",100M
3817,"Saturday, Oct 26, 2019",2019,4,Kaci,Lickteig,Kaci Lickteig,Omaha,NE,33,F,1,15:32:31,96.63,Finisher,Javelina Jundred,"McDowell, AZ",100M
3818,"Saturday, Oct 26, 2019",2019,5,Nathan,Moody,Nathan Moody,Los Alamos,NM,43,M,4,15:50:17,72.19,Finisher,Javelina Jundred,"McDowell, AZ",100M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4476,"Saturday, Oct 26, 2019",2019,0,Carissa,Liebowitz,Carissa Liebowitz,Suwanee,GA,37,F,0,0,88.37,DNF,Javelina Jundred,"McDowell, AZ",100M
4477,"Saturday, Oct 26, 2019",2019,0,Riley,Nordmeier,Riley Nordmeier,Fullerton,CA,33,F,0,0,56.67,DNF,Javelina Jundred,"McDowell, AZ",100M
4478,"Saturday, Oct 26, 2019",2019,0,Carroll,Wright-Bennett,Carroll Wright-Bennett,Claremore,OK,51,F,0,0,65.74,DNF,Javelina Jundred,"McDowell, AZ",100M
4479,"Saturday, Oct 26, 2019",2019,0,Pam,Reed,Pam Reed,Jackson,WY,58,F,0,0,81.64,DNF,Javelina Jundred,"McDowell, AZ",100M


In [365]:
javelina_jundred_df.to_csv('javelina_jundred_df_raw.csv')

In [313]:
# summary = combined_df.groupby('Year')['Status'].value_counts().unstack(fill_value=0)
# summary['Total'] = summary.sum(axis=1)
# summary['% Finishers'] = (summary.get('Finisher', 0) / summary['Total']) * 100
# summary['% DNF'] = (summary.get('DNF', 0) / summary['Total']) * 100
# summary = summary.round(2)
# summary

In [314]:
# pivot.plot(kind='bar', stacked=True)

#### Western States 100M

In [360]:
# Western States Ultrasignup race id's
western_states_race_ids = [17746, 24962, 30033, 34773, 41765, 51243, 61359, 79446, 87878, 97204, 108752, 119682]
start_year = 2013

all_results = []

for i, race_id in enumerate(western_states_race_ids):
    year = start_year + i
    race_url = f"https://ultrasignup.com/results_event.aspx?did={race_id}"
    df = scrape_ultrasignup_results(race_url)
    df['Race'] = 'Western States'
    df['Race_Loc'] = 'Olympic Valley, CA'
    df['Race_Dist'] = '100M'
    all_results.append(df)

western_states_df = pd.concat(all_results, ignore_index=True)

Scraping: https://ultrasignup.com/results_event.aspx?did=17746
Total rows found: 416
Scraping: https://ultrasignup.com/results_event.aspx?did=24962
Total rows found: 403
Scraping: https://ultrasignup.com/results_event.aspx?did=30033
Total rows found: 391
Scraping: https://ultrasignup.com/results_event.aspx?did=34773
Total rows found: 385
Scraping: https://ultrasignup.com/results_event.aspx?did=41765
Total rows found: 372
Scraping: https://ultrasignup.com/results_event.aspx?did=51243
Total rows found: 372
Scraping: https://ultrasignup.com/results_event.aspx?did=61359
Total rows found: 372
Scraping: https://ultrasignup.com/results_event.aspx?did=79446
Total rows found: 319
Scraping: https://ultrasignup.com/results_event.aspx?did=87878
Total rows found: 386
Scraping: https://ultrasignup.com/results_event.aspx?did=97204
Total rows found: 382
Scraping: https://ultrasignup.com/results_event.aspx?did=108752
Total rows found: 378
Scraping: https://ultrasignup.com/results_event.aspx?did=119682


In [361]:
western_states_df.head()

Unnamed: 0,Date,Year,Rank,First_Name,Last_Name,Name,City,State,Age,Gender_Div,Div_Place,Time,Ultra_Rank,Status,Race,Race_Loc,Race_Dist
0,"Saturday, Jun 29, 2013",2013,1,Timothy,Olson,Timothy Olson,Ashland,OR,29,M,1,15:17:27,89.37,Finisher,Western States,"Olympic Valley, CA",100M
1,"Saturday, Jun 29, 2013",2013,2,Rob,Krar,Rob Krar,Flagstaff,AZ,36,M,2,15:22:05,97.67,Finisher,Western States,"Olympic Valley, CA",100M
2,"Saturday, Jun 29, 2013",2013,3,Mike,Morton,Mike Morton,Lithia,FL,41,M,3,15:45:21,95.61,Finisher,Western States,"Olympic Valley, CA",100M
3,"Saturday, Jun 29, 2013",2013,4,Ian,Sharman,Ian Sharman,Walnut Creek,CA,32,M,4,16:20:25,91.44,Finisher,Western States,"Olympic Valley, CA",100M
4,"Saturday, Jun 29, 2013",2013,5,Dylan,Bowman,Dylan Bowman,Emerald Hills,CA,27,M,5,16:32:18,94.19,Finisher,Western States,"Olympic Valley, CA",100M


In [362]:
western_states_df.to_csv('western_states_df_raw.csv')

#### Hard Rock 100M

In [358]:
# Hard Rock 100 Ultrasignup race id's 
# Race was cancelled in 2019 and 2020
hard_rock_race_ids = [36260, 43200, 51281, 79879, 88734, 98272, 108442, 119896]
start_year = 2016

all_results = []

for i, race_id in enumerate(hard_rock_race_ids):
    year = start_year + i
    race_url = f"https://ultrasignup.com/results_event.aspx?did={race_id}"
    df = scrape_ultrasignup_results(race_url)
    df['Race'] = 'Hard Rock 100'
    df['Race_Loc'] = 'Silverton, CO'
    df['Race_Dist'] = '100M'
    all_results.append(df)

hard_rock_df = pd.concat(all_results, ignore_index=True)

Scraping: https://ultrasignup.com/results_event.aspx?did=36260
Total rows found: 155
Scraping: https://ultrasignup.com/results_event.aspx?did=43200
Total rows found: 148
Scraping: https://ultrasignup.com/results_event.aspx?did=51281
Total rows found: 149
Scraping: https://ultrasignup.com/results_event.aspx?did=79879
Total rows found: 150
Scraping: https://ultrasignup.com/results_event.aspx?did=88734
Total rows found: 149
Scraping: https://ultrasignup.com/results_event.aspx?did=98272
Total rows found: 149
Scraping: https://ultrasignup.com/results_event.aspx?did=108442
Total rows found: 149
Scraping: https://ultrasignup.com/results_event.aspx?did=119896
Total rows found: 148


In [359]:
hard_rock_df.head()

Unnamed: 0,Date,Year,Rank,First_Name,Last_Name,Name,City,State,Age,Gender_Div,Div_Place,Time,Ultra_Rank,Status,Race,Race_Loc,Race_Dist
0,"Friday, Jul 15, 2016",2016,1,Jason,Schlarb,Jason Schlarb,Durango,CO,38,M,1,22:58:28,93.59,Finisher,Hard Rock 100,"Silverton, CO",100M
1,"Friday, Jul 15, 2016",2016,1,Kilian,Jornet,Kilian Jornet,Montellà,,28,M,1,22:58:28,98.08,Finisher,Hard Rock 100,"Silverton, CO",100M
2,"Friday, Jul 15, 2016",2016,3,Xavier,Thevenard,Xavier Thevenard,Jougnes,,28,M,3,23:57:10,96.89,Finisher,Hard Rock 100,"Silverton, CO",100M
3,"Friday, Jul 15, 2016",2016,4,Jeff,Browning,Jeff Browning,Bend,OR,44,M,4,25:42:03,91.52,Finisher,Hard Rock 100,"Silverton, CO",100M
4,"Friday, Jul 15, 2016",2016,5,Ryan,Kaiser,Ryan Kaiser,Bend,OR,37,M,5,27:39:16,92.99,Finisher,Hard Rock 100,"Silverton, CO",100M


In [366]:
hard_rock_df.to_csv('hard_rock_df_raw.csv')

In [371]:
# Hard Rock 100 Ultrasignup race id's 
# Race was cancelled 2020
black_canyon_race_ids = [24355, 29244, 34087, 38965, 48278, 57827, 67039, 77199, 84317, 93392, 104108, 115592]
start_year = 2014

all_results = []

for i, race_id in enumerate(black_canyon_race_ids):
    year = start_year + i
    race_url = f"https://ultrasignup.com/results_event.aspx?did={race_id}"
    df = scrape_ultrasignup_results(race_url)
    df['Race'] = 'Black Canyon 100K'
    df['Race_Loc'] = 'Mayer, AZ'
    df['Race_Dist'] = '100K'
    all_results.append(df)

black_canyon_df = pd.concat(all_results, ignore_index=True)

Scraping: https://ultrasignup.com/results_event.aspx?did=24355
Total rows found: 81
Scraping: https://ultrasignup.com/results_event.aspx?did=29244
Total rows found: 91
Scraping: https://ultrasignup.com/results_event.aspx?did=34087
Total rows found: 261
Scraping: https://ultrasignup.com/results_event.aspx?did=38965
Total rows found: 375
Scraping: https://ultrasignup.com/results_event.aspx?did=48278
Total rows found: 438
Scraping: https://ultrasignup.com/results_event.aspx?did=57827
Total rows found: 608
Scraping: https://ultrasignup.com/results_event.aspx?did=67039
Total rows found: 667
Scraping: https://ultrasignup.com/results_event.aspx?did=77199
Total rows found: 448
Scraping: https://ultrasignup.com/results_event.aspx?did=84317
Total rows found: 789
Scraping: https://ultrasignup.com/results_event.aspx?did=93392
Total rows found: 851
Scraping: https://ultrasignup.com/results_event.aspx?did=104108
Total rows found: 932
Scraping: https://ultrasignup.com/results_event.aspx?did=115592
To

In [372]:
black_canyon_df

Unnamed: 0,Date,Year,Rank,First_Name,Last_Name,Name,City,State,Age,Gender_Div,Div_Place,Time,Ultra_Rank,Status,Race,Race_Loc,Race_Dist
0,"Feb 15, 2014",2014,1,Michael,Carson,Michael Carson,Tempe,AZ,27,M,1,10:07:18,87.85,Finisher,Black Canyon 100K,"Mayer, AZ",100K
1,"Feb 15, 2014",2014,2,Michael,Versteeg,Michael Versteeg,Prescott,AZ,28,M,2,10:28:44,91.14,Finisher,Black Canyon 100K,"Mayer, AZ",100K
2,"Feb 15, 2014",2014,3,Justin,Faul,Justin Faul,Flagstaff,AZ,33,M,3,10:49:24,80.36,Finisher,Black Canyon 100K,"Mayer, AZ",100K
3,"Feb 15, 2014",2014,4,Brian,Tinder,Brian Tinder,Flagstaff,AZ,34,M,4,10:55:16,86.8,Finisher,Black Canyon 100K,"Mayer, AZ",100K
4,"Feb 15, 2014",2014,5,Andrew,Heard,Andrew Heard,Phoenix,AZ,47,M,5,11:08:56,80.94,Finisher,Black Canyon 100K,"Mayer, AZ",100K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6308,"February 8, 2025",2025,0,Kelsey,Gabriel,Kelsey Gabriel,Mesa,AZ,22,F,0,0,56.24,DNF,Black Canyon 100K,"Mayer, AZ",100K
6309,"February 8, 2025",2025,0,Jon,Wolfinger,Jon Wolfinger,Prescott,AZ,39,M,0,0,57.96,DNF,Black Canyon 100K,"Mayer, AZ",100K
6310,"February 8, 2025",2025,0,Christopher,Lopez,Christopher Lopez,Honeoye Falls,NY,55,M,0,0,81.13,DNF,Black Canyon 100K,"Mayer, AZ",100K
6311,"February 8, 2025",2025,0,Todd,Dill,Todd Dill,Castle Rock,CO,42,M,0,0,85.36,DNF,Black Canyon 100K,"Mayer, AZ",100K


In [374]:
black_canyon_df.to_csv('../data/ultra_raw_csv/black_canyon_df_raw.csv')