This script attempts to predict a Bob Graham Round finishing time based on the race results of other finishers. The membership list of the BGR is imported into the DUV ultramarathon database to obtain runner IDs. Each race result is then scraped from the DUV site, and the results of any Bob Graham finishers are stored.

The idea is then to input a specific race result that you've completed, or specific distance stats, to see how you compare with other finishers.

In [10]:
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

import pandas as pd

In [None]:
# URL pointing to the CSV data (the listing served by bobgrahamclub.org.uk)
CSV_URL = "http://bobgrahamclub.org.uk/api/data_bgr_listing.csv"

# Download the CSV straight into a DataFrame and preview its first rows
bg_results = pd.read_csv(CSV_URL)
print(bg_results.head()) 

In [2]:
# Load DUV bulk search
# Local path of the exported bulk-search file; it is handed to pd.read_html,
# i.e. it is parsed as HTML even though the extension is .xls.
DUV_PATH = "./data/DUV_Bulkexport_20240416.xls"

def load_html_to_dataframe(file_path):
    """Parse every HTML table in *file_path* into DataFrames.

    Returns the list produced by ``pd.read_html`` (decoded as UTF-8), or
    ``None`` after printing the error when parsing fails for any reason.
    """
    try:
        tables = pd.read_html(file_path, encoding='utf-8')
    except Exception as err:
        # Best-effort loader: report the problem and let the caller decide.
        print(f"Failed to load the HTML file: {err}")
        return None
    else:
        return tables

# Parse the DUV export and preview every table that was found in it.
# `df` remains bound to the last table once the loop has finished.
data_frames = load_html_to_dataframe(DUV_PATH)
if data_frames:  # skipped when loading failed (None) or no tables came back
    for i, df in enumerate(data_frames):
        print(f"Table {i+1}:")
        print(df.head())  # Display the first few rows of each DataFrame

Table 1:
  Unnamed: 0_level_0 DUV Ultra Marathon Statistics                       \
             Counter                       Surname First name Runner ID   
0                  1                        Heaton       Alan   1462093   
1                  2                      Bradshaw    Stanley    Search   
2                  3                        Heaton    Kenneth    Search   
3                  4                         Beard       Eric   1048701   
4                  5                        Naylor       Joss    Search   

                                                                
  Original name Nat.  M/F     YOB Date of birth Cat. internat.  
0           NaN  GBR    M     NaN    00.00.0000            NaN  
1           NaN  NaN  NaN     NaN           NaN            NaN  
2           NaN  NaN  NaN     NaN           NaN            NaN  
3           NaN  GBR    M  1931.0    00.00.0000            NaN  
4           NaN  NaN  NaN     NaN           NaN            NaN  


In [3]:
# Select the table explicitly instead of relying on a loop variable left over
# from the preview cell; the last table is the one that variable ended up on.
if not data_frames:
    raise RuntimeError("DUV export was not loaded; run the loading cell first.")
runner_table = data_frames[-1]

# Rows whose 'Runner ID' cell still reads 'Search' were not matched to a DUV
# runner, so only rows carrying a real ID are kept.
has_runner_id = runner_table[('DUV Ultra Marathon Statistics', 'Runner ID')] != 'Search'
valid_runners = runner_table[has_runner_id]

# Drop the outer header level so columns can be addressed by their plain name.
valid_runners.columns = valid_runners.columns.droplevel(0)
valid_runners.head()

Unnamed: 0,Counter,Surname,First name,Runner ID,Original name,Nat.,M/F,YOB,Date of birth,Cat. internat.
0,1,Heaton,Alan,1462093,,GBR,M,,00.00.0000,
3,4,Beard,Eric,1048701,,GBR,M,1931.0,00.00.0000,
6,7,Talbot,Donald,482320,,GBR,M,1932.0,00.00.0000,
12,13,Weir,Dennis,113489,,GBR,M,1938.0,25.03.1938,M85
13,14,Millen,Boyd,1051631,,GBR,M,1936.0,25.01.1936,M85


In [11]:
def fetch_html(url):
    """Fetch the content of a URL with retries and backoff.

    Args:
        url: Address of the page to download.

    Returns:
        A ``BeautifulSoup`` document parsed with ``html.parser``, or ``None``
        when the request fails (connection error, timeout, retries exhausted
        or a 4XX/5XX response). Failures are printed, never raised.
    """
    # Setup retry strategy
    retries = Retry(
        total=5,  # Total retries
        backoff_factor=1,  # Time between retries, exponential backoff factor
        status_forcelist=[500, 502, 503, 504, 429],  # Retry on these status codes
    )

    # The context manager closes the session (and its connection pools) on
    # exit; this function is called once per page, so an unclosed session
    # would leak a pool on every call.
    with requests.Session() as session:
        # Mount it for both http and https connections
        session.mount('http://', HTTPAdapter(max_retries=retries))
        session.mount('https://', HTTPAdapter(max_retries=retries))

        try:
            response = session.get(url, timeout=10)  # 10 seconds timeout for the request
            response.raise_for_status()  # Raises for 4XX/5XX responses
        except requests.RequestException as e:
            print(f"Error fetching the URL {url}: {e}")
            return None

        return BeautifulSoup(response.content, 'html.parser')

def extract_page_count(soup):
    """Extract the number of pages from the pagination element.

    Args:
        soup: Parsed page exposing ``find``; the pagination block is looked
            up as ``<div class="pagination">``.

    Returns:
        The page count as an int. ``1`` when there is no pagination block or
        when none of its links carries a number.
    """
    pagination = soup.find('div', class_='pagination')
    if not pagination:
        return 1

    anchors = pagination.find_all('a')
    try:
        # Primary rule: the second-to-last link holds the last page number.
        return int(anchors[-2].text)
    except (IndexError, ValueError):
        # Fewer than two links, or that link is not a number.
        pass

    # Fallback: the largest numeric link label, or a single page if none.
    labels = (anchor.text.strip() for anchor in anchors)
    page_numbers = [int(label) for label in labels if label.isdecimal()]
    return max(page_numbers, default=1)

def extract_event_links(base_url, soup):
    """Collect absolute URLs of every event-result link on the page.

    An anchor qualifies when its ``href`` contains
    ``getresultevent.php?event=``; the returned URL is ``base_url`` followed
    by that ``href``, in document order.
    """
    marker = 'getresultevent.php?event='
    hrefs = (anchor['href'] for anchor in soup.find_all('a', href=True))
    return [base_url + target for target in hrefs if marker in target]

def extract_event_details(soup):
    """Read the 'Event', 'Date' and 'Distance' fields out of a result page.

    Every table row is inspected: when its first cell contains a bold tag,
    that cell's text (minus trailing colons) is the label and the following
    sibling cell is the value. Only the three labels above are kept.

    Returns a dict with whichever of those labels were found. Any error
    while walking the page is printed and whatever was collected so far is
    returned.
    """
    wanted = ['Date', 'Event', 'Distance']
    found = {}
    try:
        for table_row in soup.find_all('tr'):
            first_cell = table_row.find('td')
            # Label cells are recognised by containing a <b> tag.
            if not first_cell or not first_cell.find('b'):
                continue
            name = first_cell.get_text(strip=True).rstrip(':')
            neighbour = first_cell.find_next_sibling('td')
            if name in wanted and neighbour:
                found[name] = neighbour.get_text(strip=True)
    except Exception as e:
        print(f"Error extracting event details: {e}")
    return found


def fetch_event_data(table_soup, event_details, valid_ids=None):
    """Extract result rows and runner IDs from an event table.

    This single definition replaces two same-named functions, of which only
    the later (three-argument, filtering) one was ever reachable. Making
    ``valid_ids`` optional keeps the filtering call working and makes the
    two-argument call valid again.

    Args:
        table_soup: The parsed results table; its ``th`` cells supply the
            column names and its first ``tr`` is skipped as the header row.
        event_details: Mapping that may hold 'Event', 'Date' and 'Distance';
            missing keys are filled with 'N/A'.
        valid_ids: Optional container of runner IDs. When given, only rows
            whose runner ID is in it are kept. When ``None``, every row with
            more than two cells is kept and rows without a runner link get
            the ID 'No ID'.

    Returns:
        A DataFrame with the table's own columns followed by 'Runner ID',
        'Event', 'Date' and 'Distance'. Empty when nothing matched.
    """
    headers = [th.text.strip() for th in table_soup.find_all('th')]
    headers.extend(['Runner ID', 'Event', 'Date', 'Distance'])

    # Same for every row of this event, so build it once.
    event_columns = [
        event_details.get('Event', 'N/A'),
        event_details.get('Date', 'N/A'),
        event_details.get('Distance', 'N/A'),
    ]

    data = []
    for row in table_soup.find_all('tr')[1:]:  # Skip header row
        cols = row.find_all('td')
        if len(cols) <= 2:
            # The runner link is read from the third cell; shorter rows have none.
            continue

        link = cols[2].find('a', href=True)
        if link and 'runner=' in link['href']:
            runner_id = link['href'].split('runner=')[-1]
        elif valid_ids is None:
            runner_id = 'No ID'
        else:
            # A row without an ID can never match the filter.
            continue

        if valid_ids is not None and runner_id not in valid_ids:
            continue

        row_data = [col.text.strip() for col in cols]
        row_data.append(runner_id)
        row_data.extend(event_columns)
        data.append(row_data)

    return pd.DataFrame(data, columns=headers)


def scrape_events(base_url, event_url, valid_ids=None):
    """Scrape one event page and return its results with event details.

    Args:
        base_url: Site root that ``event_url`` is appended to.
        event_url: Relative URL of the event result page.
        valid_ids: Optional container of runner IDs to keep; ``None`` keeps
            every row.

    Returns:
        The event's results as a DataFrame, or an empty DataFrame when the
        page could not be fetched or has no table with id 'Resultlist'.
    """
    soup = fetch_html(base_url + event_url)
    if soup is None:
        # fetch_html already reported the failure.
        return pd.DataFrame()

    event_details = extract_event_details(soup)
    table_soup = soup.find('table', {'id': 'Resultlist'})
    if table_soup:
        return fetch_event_data(table_soup, event_details, valid_ids)
    return pd.DataFrame()


# URL configuration
base_url = "https://statistik.d-u-v.org/"  # site root; relative links are appended to it
event_url = "getresultevent.php?event=102988"  # relative URL of one event result page

# Perform the scraping (left disabled: uncomment to scrape the single event above)
#event_data = scrape_events(base_url, event_url)

# Display or export the data (only meaningful once the call above is enabled)
# print(event_data)


In [14]:
def _scrape_event_results(event_link, valid_ids):
    """Return the filtered results of one event page, or None if unavailable."""
    event_page = fetch_html(event_link)
    if not event_page:
        return None
    table_soup = event_page.find('table', {'id': 'Resultlist'})
    if not table_soup:
        return None
    event_details = extract_event_details(event_page)
    return fetch_event_data(table_soup, event_details, valid_ids)


def main(start_year=2022, end_year=2023):
    """Scrape DUV event results for known runners, one CSV per year.

    For every year in ``start_year..end_year`` (inclusive) the event list is
    walked page by page, each linked event is fetched, and rows whose runner
    ID appears in the module-level ``valid_runners`` frame are collected.
    The rows are written to ``all_events_data_<year>.csv``.

    Args:
        start_year: First year to scrape.
        end_year: Last year to scrape (inclusive).
    """
    base_url = "https://statistik.d-u-v.org/"
    valid_ids = set(valid_runners["Runner ID"])

    for year in range(start_year, end_year + 1):
        list_url = f"{base_url}geteventlist.php?year={year}&dist=all&country=all&surface=all&sort=1&page="

        first_page = fetch_html(f"{list_url}1")
        if not first_page:
            continue
        num_pages = extract_page_count(first_page)

        all_data = []
        seen_links = set()  # an event linked more than once is scraped once

        for page in range(1, num_pages + 1):
            page_url = f"{list_url}{page}"
            print(page_url)
            # Page 1 is already in hand from the page-count lookup.
            page_soup = first_page if page == 1 else fetch_html(page_url)
            if not page_soup:
                continue

            for event_link in extract_event_links(base_url, page_soup):
                if event_link in seen_links:
                    continue
                seen_links.add(event_link)

                event_data = _scrape_event_results(event_link, valid_ids)
                if event_data is not None and not event_data.empty:
                    all_data.append(event_data)

        if all_data:
            final_df = pd.concat(all_data, ignore_index=True)
            final_df.to_csv(f'all_events_data_{year}.csv', index=False)
            print(f"Saved all event data for {year} to 'all_events_data_{year}.csv'.")
        else:
            print(f"No data was extracted for {year}.")


if __name__ == "__main__":
    main()


https://statistik.d-u-v.org/geteventlist.php?year=2022&dist=all&country=all&surface=all&sort=1&page=1
https://statistik.d-u-v.org/getresultevent.php?event=84606
https://statistik.d-u-v.org/getresultevent.php?event=82708
https://statistik.d-u-v.org/getresultevent.php?event=82709
https://statistik.d-u-v.org/getresultevent.php?event=93687
https://statistik.d-u-v.org/getresultevent.php?event=93763
https://statistik.d-u-v.org/getresultevent.php?event=93764
https://statistik.d-u-v.org/getresultevent.php?event=93765
https://statistik.d-u-v.org/getresultevent.php?event=93766
https://statistik.d-u-v.org/getresultevent.php?event=93767
https://statistik.d-u-v.org/getresultevent.php?event=84605
https://statistik.d-u-v.org/getresultevent.php?event=98311
https://statistik.d-u-v.org/getresultevent.php?event=98312
https://statistik.d-u-v.org/getresultevent.php?event=98313
https://statistik.d-u-v.org/getresultevent.php?event=88193
https://statistik.d-u-v.org/getresultevent.php?event=98694
https://stati