In [1]:
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup
import time
import concurrent.futures
import multiprocessing

# Define helper classes and functions

In [2]:
class Scrapable:
    """One results page queued for scraping.

    Stores the three constructor arguments unchanged as the attributes
    ``gender``, ``year`` and ``url``.
    """

    def __init__(self, gender, year, url):
        # Keep the page's labels alongside its address so they travel together.
        self.gender, self.year, self.url = gender, year, url

In [3]:
def soup_reader(url):
    """Download ``url`` and return its body parsed as a BeautifulSoup tree.

    Args:
        url: address handed to ``urlopen``.

    Returns:
        A ``BeautifulSoup`` object built from the response body with the
        'html.parser' parser.

    Raises:
        Any exception raised by ``urlopen`` or ``read`` propagates to the caller.
    """
    # The context manager closes the connection even when read() raises,
    # which the previous explicit close() call did not guarantee.
    with urlopen(url) as client:
        page = client.read()

    #open the result with BS
    return BeautifulSoup(page, 'html.parser')

In [58]:
# Parse a page of marathon finishers, adding the results to the master list
def parse_page(scrapable):
    """Scrape one results page and append every finisher row to ``masterResults``.

    Args:
        scrapable: object exposing ``url``, ``gender`` and ``year``; the gender
            and year are copied onto each row taken from the page.

    Returns:
        None. Each table row becomes a dict of raw cell text appended to the
        module-level ``masterResults`` list.

    Raises:
        IndexError: if a data row has fewer than ten ``td`` cells.
        AttributeError: if a row's name cell contains no ``a`` element.
        Anything raised by ``soup_reader`` propagates as well.
    """
    url = scrapable.url

    print(f"Current Page: {url}\n")
    soup = soup_reader(url)

    results_table = soup.find("table", "list-table")
    if results_table is None:
        # find() returns None when nothing matches; say so instead of failing
        # with an AttributeError that a worker thread could swallow unnoticed.
        print(f"No results table found on page: {url}\n")
    else:
        for finisher in results_table.find_all("tr"):
            cells = finisher.find_all('td')

            #skip header row
            if not cells:
                continue

            # Use the span's title attribute for the club when a span is
            # present; otherwise fall back to the cell's text.
            club_cell = cells[5]
            club_span = club_cell.span
            club = club_span['title'] if club_span is not None else club_cell.get_text()

            masterResults.append({
                "place_overall" : cells[0].get_text(),
                "place_gender" : cells[1].get_text(),
                "place_category" : cells[2].get_text(),
                "runner_name" : cells[3].a.get_text(),
                "club" : club,
                "bib_number" : cells[6].get_text(),
                "division" : cells[7].get_text(),
                "time_half" : cells[8].get_text(),
                "time_full" : cells[9].get_text(),
                "gender" : scrapable.gender,
                "year" : scrapable.year
            })

    # Pause after every page -- presumably to avoid hammering the results server.
    time.sleep(0.25)

In [7]:
def find_pagination_limit(url):
    """Return the number of result pages advertised by the page at ``url``.

    Args:
        url: address of a results listing page, fetched with ``soup_reader``.

    Returns:
        int: the number shown by the second-to-last link inside the
        ``div`` with class ``pages``, or 1 when that div is missing or
        holds fewer than two links.

    Raises:
        ValueError: if the second-to-last link's text is not an integer.
        Anything raised by ``soup_reader`` propagates as well.
    """
    soup = soup_reader(url)

    #Get the pagination object
    pages_div = soup.find("div", "pages")
    page_links = pages_div.find_all("a") if pages_div is not None else []

    if len(page_links) < 2:
        # No usable pagination control. Assumes this means the listing fits
        # on a single page -- TODO confirm against the live site.
        return 1

    #The second to last item in the pagination object displays the final page number, that's what we want
    return int(page_links[-2].get_text())

In [10]:
def generate_urls(year):
    """Queue every results page for ``year`` on the module-level ``to_scrape`` list.

    Looks up the male and female page counts with ``find_pagination_limit``,
    then appends one ``Scrapable`` per page in this order: all male pages,
    all female pages, the female elite page, the male elite page.

    Args:
        year: race year interpolated into the URL path and stored on each
            ``Scrapable``.

    Returns:
        None. The generated entries are appended to ``to_scrape``.
    """
    site = f"https://results.virginmoneylondonmarathon.com/{year}/"
    # Query-string fragment shared by every URL; the sex code (M or W) is
    # appended directly after it.
    sex_filter = "pid=list&search%5Bage_class%5D=%25&search%5Bsex%5D="

    def mass_url(page, sex_code):
        # Paged listing, 1000 results per page.
        return f"{site}?page={page}&event=MAS&num_results=1000&{sex_filter}{sex_code}"

    def elite_url(sex_code):
        # Elite listing; requested as a single unpaged URL.
        return f"{site}?{sex_filter}{sex_code}&event=ELIT"

    #find the number of pages of Male finisher results
    male_pages = find_pagination_limit(mass_url(1, "M"))
    print(f"Number of male pages: {male_pages}")

    #find the number of pages of Female finisher results
    female_pages = find_pagination_limit(mass_url(1, "W"))
    print(f"Number of female pages: {female_pages}")

    for gender, sex_code, page_count in (("Male", "M", male_pages),
                                         ("Female", "W", female_pages)):
        for page in range(1, page_count + 1):
            to_scrape.append(Scrapable(gender, year, mass_url(page, sex_code)))

    #Add the elite results urls
    to_scrape.append(Scrapable('Female', year, elite_url("W")))
    to_scrape.append(Scrapable('Male', year, elite_url("M")))

In [15]:
def cleanup_and_save():
    """Write the rows collected in ``masterResults`` to ``results.csv``.

    Builds a DataFrame from the module-level ``masterResults`` list, adds a
    constant ``Race`` column, sorts by ``runner_name`` and saves the frame
    without its index. When no rows were collected, nothing is written and
    a message is printed instead.

    Returns:
        None.
    """
    if not masterResults:
        # An empty frame has no 'runner_name' column, so sorting would raise
        # KeyError; report the empty run instead.
        print("Scraping complete.  Gathered 0 results; results.csv was not written")
        return

    df_results = (
        pd.DataFrame(masterResults)
        #add the event name
        .assign(Race='London')
        #sort by finisher name for easier comparison to online results page
        .sort_values(by=['runner_name'])
    )

    #save to csv
    df_results.to_csv('results.csv', index=False)
    print(f"Scraping complete.  Gathered {df_results.shape[0]} results")

# Do the scraping

In [59]:
# Work queue of Scrapable pages and the shared list the workers append rows to.
to_scrape = []
masterResults = []

for year in range(2014, 2020):
    generate_urls(str(year))

#Use multithreading to speed up the scraping process significantly
with concurrent.futures.ThreadPoolExecutor(max_workers=multiprocessing.cpu_count()) as executor:
    # submit() + as_completed() lets each page's failure be reported; the
    # iterator returned by executor.map() was never consumed, so exceptions
    # raised inside parse_page were silently discarded.
    pending = {executor.submit(parse_page, page): page for page in to_scrape}
    for finished in concurrent.futures.as_completed(pending):
        error = finished.exception()
        if error is not None:
            print(f"Failed to scrape {pending[finished].url}: {error!r}")
cleanup_and_save()

Number of male pages: 23
Number of female pages: 14
Number of male pages: 24
Number of female pages: 15
Number of male pages: 24
Number of female pages: 16
Number of male pages: 24
Number of female pages: 16
Number of male pages: 24
Number of female pages: 17
Number of male pages: 25
Number of female pages: 18
Current Page: https://results.virginmoneylondonmarathon.com/2014/?page=1&event=MAS&num_results=1000&pid=list&search%5Bage_class%5D=%25&search%5Bsex%5D=M
Current Page: https://results.virginmoneylondonmarathon.com/2014/?page=2&event=MAS&num_results=1000&pid=list&search%5Bage_class%5D=%25&search%5Bsex%5D=M

Current Page: https://results.virginmoneylondonmarathon.com/2014/?page=3&event=MAS&num_results=1000&pid=list&search%5Bage_class%5D=%25&search%5Bsex%5D=M


Current Page: https://results.virginmoneylondonmarathon.com/2014/?page=4&event=MAS&num_results=1000&pid=list&search%5Bage_class%5D=%25&search%5Bsex%5D=M

Current Page: https://results.virginmoneylondonmarathon.com/2014/?page=5

Current Page: https://results.virginmoneylondonmarathon.com/2015/?page=14&event=MAS&num_results=1000&pid=list&search%5Bage_class%5D=%25&search%5Bsex%5D=M
Current Page: https://results.virginmoneylondonmarathon.com/2015/?page=15&event=MAS&num_results=1000&pid=list&search%5Bage_class%5D=%25&search%5Bsex%5D=M


Current Page: https://results.virginmoneylondonmarathon.com/2015/?page=16&event=MAS&num_results=1000&pid=list&search%5Bage_class%5D=%25&search%5Bsex%5D=M

Current Page: https://results.virginmoneylondonmarathon.com/2015/?page=17&event=MAS&num_results=1000&pid=list&search%5Bage_class%5D=%25&search%5Bsex%5D=M

Current Page: https://results.virginmoneylondonmarathon.com/2015/?page=18&event=MAS&num_results=1000&pid=list&search%5Bage_class%5D=%25&search%5Bsex%5D=M

Current Page: https://results.virginmoneylondonmarathon.com/2015/?page=19&event=MAS&num_results=1000&pid=list&search%5Bage_class%5D=%25&search%5Bsex%5D=M

Current Page: https://results.virginmoneylondonmarathon.com/2015/?page

Current Page: https://results.virginmoneylondonmarathon.com/2016/?page=3&event=MAS&num_results=1000&pid=list&search%5Bage_class%5D=%25&search%5Bsex%5D=W

Current Page: https://results.virginmoneylondonmarathon.com/2016/?page=4&event=MAS&num_results=1000&pid=list&search%5Bage_class%5D=%25&search%5Bsex%5D=W

Current Page: https://results.virginmoneylondonmarathon.com/2016/?page=5&event=MAS&num_results=1000&pid=list&search%5Bage_class%5D=%25&search%5Bsex%5D=W

Current Page: https://results.virginmoneylondonmarathon.com/2016/?page=6&event=MAS&num_results=1000&pid=list&search%5Bage_class%5D=%25&search%5Bsex%5D=W

Current Page: https://results.virginmoneylondonmarathon.com/2016/?page=7&event=MAS&num_results=1000&pid=list&search%5Bage_class%5D=%25&search%5Bsex%5D=W

Current Page: https://results.virginmoneylondonmarathon.com/2016/?page=8&event=MAS&num_results=1000&pid=list&search%5Bage_class%5D=%25&search%5Bsex%5D=W

Current Page: https://results.virginmoneylondonmarathon.com/2016/?page=9&eve

Current Page: https://results.virginmoneylondonmarathon.com/2017/?page=15&event=MAS&num_results=1000&pid=list&search%5Bage_class%5D=%25&search%5Bsex%5D=W

Current Page: https://results.virginmoneylondonmarathon.com/2017/?page=16&event=MAS&num_results=1000&pid=list&search%5Bage_class%5D=%25&search%5Bsex%5D=W

Current Page: https://results.virginmoneylondonmarathon.com/2017/?pid=list&search%5Bage_class%5D=%25&search%5Bsex%5D=W&event=ELIT
Current Page: https://results.virginmoneylondonmarathon.com/2017/?pid=list&search%5Bage_class%5D=%25&search%5Bsex%5D=M&event=ELIT


Current Page: https://results.virginmoneylondonmarathon.com/2018/?page=1&event=MAS&num_results=1000&pid=list&search%5Bage_class%5D=%25&search%5Bsex%5D=M

Current Page: https://results.virginmoneylondonmarathon.com/2018/?page=2&event=MAS&num_results=1000&pid=list&search%5Bage_class%5D=%25&search%5Bsex%5D=M

Current Page: https://results.virginmoneylondonmarathon.com/2018/?page=3&event=MAS&num_results=1000&pid=list&search%5Bag

Current Page: https://results.virginmoneylondonmarathon.com/2019/?page=8&event=MAS&num_results=1000&pid=list&search%5Bage_class%5D=%25&search%5Bsex%5D=M

Current Page: https://results.virginmoneylondonmarathon.com/2019/?page=9&event=MAS&num_results=1000&pid=list&search%5Bage_class%5D=%25&search%5Bsex%5D=M

Current Page: https://results.virginmoneylondonmarathon.com/2019/?page=10&event=MAS&num_results=1000&pid=list&search%5Bage_class%5D=%25&search%5Bsex%5D=M

Current Page: https://results.virginmoneylondonmarathon.com/2019/?page=11&event=MAS&num_results=1000&pid=list&search%5Bage_class%5D=%25&search%5Bsex%5D=M

Current Page: https://results.virginmoneylondonmarathon.com/2019/?page=12&event=MAS&num_results=1000&pid=list&search%5Bage_class%5D=%25&search%5Bsex%5D=M

Current Page: https://results.virginmoneylondonmarathon.com/2019/?page=13&event=MAS&num_results=1000&pid=list&search%5Bage_class%5D=%25&search%5Bsex%5D=M

Current Page: https://results.virginmoneylondonmarathon.com/2019/?page=1