# London Scraper

    Years: 2014-2018
    URL: https://results.virginmoneylondonmarathon.com/{year}/?pid=search (replace year as needed)
    
    TODO: 2010-2013 also use the same results format, but only contain Masses/Elite Women.
          Need to alter URL generation logic to support that.

In [1]:
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup
import time
import concurrent.futures
import multiprocessing

# Define helper methods

In [2]:
class Scrapable:
    def __init__(self, gender, year, url):
        self.gender = gender
        self.year = year
        self.url = url

In [3]:
def soup_reader(url):
    client = urlopen(url)
    page = client.read()
    page
    client.close()

    #open the result with BS
    soup = BeautifulSoup(page, 'html.parser')
    return soup

In [4]:
# Parse a page of marathon finishers, adding the results to the master list
def parse_page(scrapable):
    url = scrapable.url
    
    print(f"Current Page: {url}\n")
    soup = soup_reader(url)

    page_finishers = soup.find("table", "list-table").find_all("tr")

    resultList = []

    for finisher in page_finishers:
        
        cells = finisher.find_all('td')
        
        #skip header row
        if cells is None or len(cells) == 0:
            continue

        place_overall = cells[0].get_text()
        place_gender = cells[1].get_text()
        place_category = cells[2].get_text()
        
        runner_name = cells[3].a.get_text()
        details = cells[3].a['href']
        
        club_cell = cells[5]
        if club_cell is not None:
            club_span = club_cell.span
            if club_span is not None:
                club = club_span['title']
            else:
                club = club_cell.get_text()
        else:
            club = ""
        bib_number = cells[6].get_text()
        division = cells[7].get_text()
        half_time = cells[8].get_text()
        full_time = cells[9].get_text()

        result = {
            "place_overall" : place_overall,
            "place_gender" : place_gender,
            "place_category" : place_category,
            "runner_name" : runner_name,
            "details_url" : details,
            "club" : club,
            "bib_number" : bib_number,
            "division" : division,
            "time_half" : half_time,
            "time_full" : full_time,
            "gender" : scrapable.gender,
            "year" : scrapable.year
        }

        masterResults.append(result)

    time.sleep(0.25)

In [5]:
def find_pagination_limit(url):
    soup = soup_reader(url)
    
    #Get the pagination object
    pagination = soup.find("div", "pages").find_all("a")
    
    #The second to last item in the pagination object displays the final page number, that's what we want
    li_length = len(pagination)
    num_pages = int(pagination[li_length - 2].get_text())
    
    return num_pages

In [6]:
def generate_urls(year):
    #find the number of pages of Male finisher results
    male_pages = find_pagination_limit(f"https://results.virginmoneylondonmarathon.com/{year}/?page=1&event=MAS&num_results=1000&pid=list&search%5Bage_class%5D=%25&search%5Bsex%5D=M")
    print(f"Number of male pages: {male_pages}")
    
    #find the number of pages of Female finisher results
    female_pages = find_pagination_limit(f"https://results.virginmoneylondonmarathon.com/{year}/?page=1&event=MAS&num_results=1000&pid=list&search%5Bage_class%5D=%25&search%5Bsex%5D=W")
    print(f"Number of female pages: {female_pages}")
    
    for i in range(1, male_pages+1):
        s = Scrapable('Male', year, f"https://results.virginmoneylondonmarathon.com/{year}/?page={i}&event=MAS&num_results=1000&pid=list&search%5Bage_class%5D=%25&search%5Bsex%5D=M")
        to_scrape.append(s)
        
    for i in range(1, female_pages+1):
        s = Scrapable('Female', year, f"https://results.virginmoneylondonmarathon.com/{year}/?page={i}&event=MAS&num_results=1000&pid=list&search%5Bage_class%5D=%25&search%5Bsex%5D=W")
        to_scrape.append(s)
        
    #Add the elite results urls
    female_elite = Scrapable('Female', year, f"https://results.virginmoneylondonmarathon.com/{year}/?pid=list&search%5Bage_class%5D=%25&search%5Bsex%5D=W&event=ELIT")
    to_scrape.append(female_elite)
    male_elite = Scrapable('Male', year, f"https://results.virginmoneylondonmarathon.com/{year}/?pid=list&search%5Bage_class%5D=%25&search%5Bsex%5D=M&event=ELIT")
    to_scrape.append(male_elite)

In [11]:
def generate_urls_10_13(year):
    #find the number of pages of Male finisher results
    male_pages = find_pagination_limit(f"https://results.virginmoneylondonmarathon.com/{year}/index.php?pid=search&search[sex]=M&search[age_class]=%25&search_sort=place_nosex&num_results=1000")
    print(f"Number of male pages: {male_pages}")
    
    #find the number of pages of Female finisher results
    female_pages = find_pagination_limit(f"https://results.virginmoneylondonmarathon.com/{year}/index.php?pid=search&search[sex]=W&search[age_class]=%25&search_sort=place_nosex&num_results=1000")
    print(f"Number of female pages: {female_pages}")
    
    for i in range(1, male_pages+1):
        s = Scrapable('Male', year, f"https://results.virginmoneylondonmarathon.com/{year}/index.php?pid=search&search[sex]=M&search[age_class]=%25&search_sort=place_nosex&num_results=1000")
        to_scrape.append(s)
        
    for i in range(1, female_pages+1):
        s = Scrapable('Female', year, f"https://results.virginmoneylondonmarathon.com/{year}/index.php?pid=search&search[sex]=W&search[age_class]=%25&search_sort=place_nosex&num_results=1000")
        to_scrape.append(s)
        
    #Add the elite results urls
    female_elite = Scrapable('Female', year, f"https://results.virginmoneylondonmarathon.com/2010/index.php?event=ELIW&num_results=1000&pid=search&search%5Bsex%5D=%25&search%5Bage_class%5D=%25&search_sort=place_nosex&split=time_finish_netto")
    to_scrape.append(female_elite)

In [10]:
def cleanup_and_save():
    df_results = pd.DataFrame(masterResults)
    
    #add the event name
    df_results['Race'] = 'London'
    
    #sort by finisher name for easier comparison to online results page
    df_results.sort_values(by=['runner_name'], inplace=True)
    
    #save to csv
    df_results.to_csv(f'results-10-13.csv', index=False)
    print(f"Scraping complete.  Gathered {df_results.shape[0]} results")

# Do the scraping

In [12]:
to_scrape = list()
masterResults = [];

#for year in range(2014, 2019):
#    generate_urls(str(year))

for year in range(2010, 2014):
    generate_urls_10_13(str(year))

#Use multithreading to speed up the scraping process significantly
with concurrent.futures.ThreadPoolExecutor(max_workers=multiprocessing.cpu_count()) as executor:
    executor.map(parse_page, to_scrape)
cleanup_and_save()

Number of male pages: 25
Number of female pages: 13
Number of male pages: 23
Number of female pages: 13
Number of male pages: 24
Number of female pages: 14
Number of male pages: 23
Number of female pages: 13
Current Page: https://results.virginmoneylondonmarathon.com/2010/index.php?pid=search&search[sex]=M&search[age_class]=%25&search_sort=place_nosex&num_results=1000

Current Page: https://results.virginmoneylondonmarathon.com/2010/index.php?pid=search&search[sex]=M&search[age_class]=%25&search_sort=place_nosex&num_results=1000

Current Page: https://results.virginmoneylondonmarathon.com/2010/index.php?pid=search&search[sex]=M&search[age_class]=%25&search_sort=place_nosex&num_results=1000

Current Page: https://results.virginmoneylondonmarathon.com/2010/index.php?pid=search&search[sex]=M&search[age_class]=%25&search_sort=place_nosex&num_results=1000

Current Page: https://results.virginmoneylondonmarathon.com/2010/index.php?pid=search&search[sex]=M&search[age_class]=%25&search_sort=pl

Current Page: https://results.virginmoneylondonmarathon.com/2011/index.php?pid=search&search[sex]=M&search[age_class]=%25&search_sort=place_nosex&num_results=1000

Current Page: https://results.virginmoneylondonmarathon.com/2011/index.php?pid=search&search[sex]=M&search[age_class]=%25&search_sort=place_nosex&num_results=1000

Current Page: https://results.virginmoneylondonmarathon.com/2011/index.php?pid=search&search[sex]=M&search[age_class]=%25&search_sort=place_nosex&num_results=1000

Current Page: https://results.virginmoneylondonmarathon.com/2011/index.php?pid=search&search[sex]=M&search[age_class]=%25&search_sort=place_nosex&num_results=1000

Current Page: https://results.virginmoneylondonmarathon.com/2011/index.php?pid=search&search[sex]=M&search[age_class]=%25&search_sort=place_nosex&num_results=1000

Current Page: https://results.virginmoneylondonmarathon.com/2011/index.php?pid=search&search[sex]=M&search[age_class]=%25&search_sort=place_nosex&num_results=1000

Current Page: ht

Current Page: https://results.virginmoneylondonmarathon.com/2012/index.php?pid=search&search[sex]=W&search[age_class]=%25&search_sort=place_nosex&num_results=1000

Current Page: https://results.virginmoneylondonmarathon.com/2012/index.php?pid=search&search[sex]=W&search[age_class]=%25&search_sort=place_nosex&num_results=1000

Current Page: https://results.virginmoneylondonmarathon.com/2012/index.php?pid=search&search[sex]=W&search[age_class]=%25&search_sort=place_nosex&num_results=1000

Current Page: https://results.virginmoneylondonmarathon.com/2012/index.php?pid=search&search[sex]=W&search[age_class]=%25&search_sort=place_nosex&num_results=1000

Current Page: https://results.virginmoneylondonmarathon.com/2012/index.php?pid=search&search[sex]=W&search[age_class]=%25&search_sort=place_nosex&num_results=1000

Current Page: https://results.virginmoneylondonmarathon.com/2012/index.php?pid=search&search[sex]=W&search[age_class]=%25&search_sort=place_nosex&num_results=1000

Current Page: ht

Current Page: https://results.virginmoneylondonmarathon.com/2010/index.php?event=ELIW&num_results=1000&pid=search&search%5Bsex%5D=%25&search%5Bage_class%5D=%25&search_sort=place_nosex&split=time_finish_netto

Scraping complete.  Gathered 110000 results
