In [1]:
import pandas as pd
from selenium import webdriver
from bs4 import BeautifulSoup
from itertools import chain
from selenium.webdriver.firefox.options import Options

In [2]:
# Results pages on www.coachcox.co.uk, keyed first by race name and then by year.
# Each page address is built from the numeric race id the site uses in its URLs.
urls = {
    race: {
        year: f'https://www.coachcox.co.uk/imstats/race/{race_id}/results/'
        for year, race_id in ids_by_year
    }
    for race, ids_by_year in (
        ('switzerland', (('2022', 1887), ('2021', 1793), ('2019', 485), ('2018', 437))),
        ('copenhagen', (('2022', 1880), ('2021', 1776), ('2019', 492), ('2018', 443))),
    )
}

In [3]:
def scrape(url):
    """
    Scrape the results table from a www.coachcox.co.uk race results page.

    Loads `url` in a headless Firefox session, parses the rendered HTML and
    collects the cells of the table whose id is 'imraceresultstable'.

    Parameters
    ----------
    url : str
        Address of the results page to load.

    Returns
    -------
    pandas.DataFrame
        One row per <tr> of the table (the text of its <td> cells), with
        columns named after the text of the table's <th> cells.

    Raises
    ------
    ValueError
        If the page contains no table with id 'imraceresultstable'.
    """
    # Set up webdriver and get the content
    options = Options()
    options.add_argument("--headless")
    driver = webdriver.Firefox(options=options)
    try:
        driver.get(url)
        html = driver.page_source
    finally:
        # Always shut the browser down, even if loading fails; otherwise every
        # call leaves a headless Firefox process running.
        driver.quit()

    # Parse the html and locate the results table
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table', id='imraceresultstable')
    if table is None:
        raise ValueError(
            f"No table with id 'imraceresultstable' found at {url}"
        )

    # Grab the results and column titles from the table
    data = []
    labels = []
    for row in table.find_all('tr'):
        # NOTE(review): a row holding only <th> cells (presumably the header
        # row) still appends an empty list here, which appears to surface as a
        # row of missing values in the DataFrame. Kept as-is so the returned
        # shape is unchanged -- confirm with downstream users before dropping it.
        data.append([cell.text for cell in row.find_all('td')])

        # Column titles are gathered into one flat list across all rows.
        labels.extend(cell.text for cell in row.find_all('th'))

    return pd.DataFrame(data, columns=labels)

In [5]:
def scrape_all_to_file(urls):
    """
    Scrape every results page listed in `urls` and pickle each table to disk.

    `urls` maps a race name to a mapping of year -> results URL. For each URL
    the scraped DataFrame is written with `to_pickle` to a file named
    "<race>_<year>.res" (a relative path).
    """
    # Flatten the nested mapping into (output filename, page url) pairs; the
    # generator is lazy, so pages are still scraped one at a time, in order.
    jobs = (
        (f"{country}_{date}.res", url)
        for country, dates in urls.items()
        for date, url in dates.items()
    )
    for filename, url in jobs:
        scrape(url).to_pickle(filename)


In [6]:
# Scrape every race/year page listed in `urls` and pickle each result to a
# "<race>_<year>.res" file. This launches a browser session per page.
scrape_all_to_file(urls)