In [1]:
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import pandas as pd
import re

In [19]:
def fetch_html(url):
    """Fetch the content of a URL with retries and backoff."""
    session = requests.Session()
    # Setup retry strategy
    retries = Retry(
        total=5,  # Total retries
        backoff_factor=1,  # Time between retries, exponential backoff factor
        status_forcelist=[500, 502, 503, 504, 429],  # Retry on these status codes
    )
    # Mount it for both http and https connections
    session.mount('http://', HTTPAdapter(max_retries=retries))
    session.mount('https://', HTTPAdapter(max_retries=retries))

    try:
        response = session.get(url, timeout=10)  # 10 seconds timeout for the request
        if response.ok:
            return BeautifulSoup(response.content, 'html.parser')
        else:
            response.raise_for_status()  # This will raise an error for 4XX client errors
    except requests.RequestException as e:
        print(f"Error fetching the URL {url}: {e}")
        return None

def extract_page_count(soup):
    """Extract the number of pages from the pagination element."""
    pagination = soup.find('div', class_='pagination')
    return int(pagination.find_all('a')[-2].text) if pagination else 1

def extract_event_links(base_url, soup):
    """ Extract event links from the page. """
    links = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        if 'getresultevent.php?event=' in href:
            links.append(base_url + href)
    return links

def extract_event_details(soup):
    """Extract 'Event', 'Date', 'Finishers' and 'Distance' from the HTML content."""
    details = {}
    try:
        info_rows = soup.find_all('tr')  # Find all table rows in the page
        for row in info_rows:
            # Look for rows where the first cell contains the labels we're interested in
            header_cell = row.find('td')
            if header_cell and header_cell.find('b'):  # Check for bold tags which might contain labels
                label = header_cell.get_text(strip=True).rstrip(':')
                value_cell = header_cell.find_next_sibling('td')  # Get the next sibling cell for the value
                if label in ['Date', 'Event', 'Distance', 'Finishers'] and value_cell:
                    details[label] = value_cell.get_text(strip=True)
    except Exception as e:
        print(f"Error extracting event details: {e}")
    return details

def extract_elevation_gain(soup):
    """Extract elevation gain from the event detail page."""
    elevation_row = soup.find('b', string=re.compile('Elevation gain/loss: '))
    if elevation_row:
        elevation_data = elevation_row.find_next('td')
        if elevation_data:
            return elevation_data.text.strip()
    return 'N/A'

def extract_event_id(url):
    """Extract event ID from the URL."""
    match = re.search(r'event=(\d+)', url)
    return match.group(1) if match else 'N/A'

def fetch_event_all_data(table_soup, event_details, elevation_gain, event_id):
    """Extract data and runner IDs from the event table, including event details and elevation gain."""
    data = []
    headers = [th.text.strip() for th in table_soup.find_all('th')]
    # Append the event details headers
    headers.extend(['Runner ID', 'Event', 'Date', 'Distance', 'Finishers', 'Winner Time', 'Elevation Gain', 'Event ID'])
    rows = table_soup.find_all('tr')[1:]  # Skip header row
    winner_time = None

    # Extract winners time
    for row in rows:
        cols = row.find_all('td')
        rank = int(cols[0].text.strip()) if cols[0].text.strip().isdigit() else None

        if rank == 1:
            winner_time = cols[1].text.strip() 
            break

    for row in rows:
        cols = row.find_all('td')
        row_data = [col.text.strip() for col in cols]
        # Get runner ID
        link = cols[2].find('a', href=True)  # Assuming the third column has the link
        runner_id = link['href'].split('runner=')[-1] if link else 'No ID'
        # Include event details, runner ID, elevation gain, and event ID
        row_data.extend([
            runner_id, 
            event_details.get('Event', 'N/A'), 
            event_details.get('Date', 'N/A'), 
            event_details.get('Distance', 'N/A'), 
            event_details.get('Finishers', 'N/A'), 
            winner_time or 'N/A',
            elevation_gain,
            event_id
        ])
        data.append(row_data)

    return pd.DataFrame(data, columns=headers)

def extract_specific_links(soup, base_url, path_starts_with):
    """Extract specific links that start with a given path from the parsed HTML."""
    links = []
    if soup:
        # Find all 'a' tags with an 'href' attribute
        for a in soup.find_all('a', href=True):
            href = a['href']
            # Check if the href starts with the desired path
            if href.startswith(path_starts_with):
                full_link = base_url + href
                links.append(full_link)
    return links


In [3]:
base_url = "https://statistik.d-u-v.org/"
start_year = 2024
end_year = 2024

In [21]:
page_url = f"{base_url}geteventlist.php?year=2024&dist=all&country=all&surface=all&sort=1&page=1"
first_page = fetch_html(page_url)
if first_page:
    num_pages = extract_page_count(first_page)
    all_data = []
    for page in range(1, num_pages + 1):
        page_url = f"{base_url}geteventlist.php?year=2024&dist=all&country=all&surface=all&sort=1&page=1"
        print(page_url)
        page_soup = fetch_html(page_url)
        if page_soup:
            event_links = extract_event_links(base_url, page_soup)
            for event_link in event_links:
                event_page = fetch_html(event_link)
                if event_page:
                    event_details = extract_event_details(event_page)
                    table_soup = event_page.find('table', {'id': 'Resultlist'})                        
                    # Extract event ID
                    event_id = extract_event_id(event_link)                        
                    # Fetch elevation gain from event detail page
                    event_detail_url = f"{base_url}eventdetail.php?event={event_id}"
                    event_detail_page = fetch_html(event_detail_url)
                    elevation_gain = extract_elevation_gain(event_detail_page) if event_detail_page else 'N/A'                        
                    if table_soup:
                        event_data = fetch_event_all_data(table_soup, event_details, elevation_gain, event_id)
                        if not event_data.empty:
                            all_data.append(event_data)

https://statistik.d-u-v.org/geteventlist.php?year=2024&dist=all&country=all&surface=all&sort=1&page=1


KeyboardInterrupt: 

In [20]:
url = "https://statistik.d-u-v.org/eventdetail.php?event=109153"
event_detail_page = fetch_html(url)
elevation_gain = extract_elevation_gain(event_detail_page)

print(elevation_gain)

2000Hm


In [18]:
elevation_row = event_detail_page.find('b', string=re.compile('Elevation gain/loss: '))
elevation_value = elevation_row.find_next('td').text.strip()
print(elevation_value)

2000Hm


In [22]:
print(all_data)

[   Rank Performance          Surname, first name Club Nat.   YOB M/F Rank M/F  \
0     1  127.416 km                 Holm, Philip       DEN  1985   M        1   
1     2  120.710 km          Roed, Lars Vaedeled       DEN  1976   M        2   
2     3  114.004 km                 Roed, Julius       DEN  2002   M        3   
3     4  107.298 km          Hansen, Kasper Gabs       DEN  1984   M        4   
4     5  100.592 km         Jensen, Mads Kromann       DEN  1991   M        5   
5     6  100.592 km            Federspiel, Niels       DEN  1979   M        5   
6     7  100.592 km                  Flade, Mads       DEN  2004   M        5   
7     8  100.592 km           Scheel-Bech, Jacob       DEN  1993   M        5   
8     9  100.592 km         Jensen, Morten Steen       DEN  1994   M        5   
9    10  100.592 km           Jørgensen, Michael       DEN  1992   M        5   
10   11  100.592 km             Ljivoreka, Bessi       DEN  1997   M        5   
11   12  100.592 km        