In [None]:
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup
import time
import concurrent.futures
import multiprocessing

# Define helper methods

In [None]:
class Scrapable:
    """One results page queued for scraping.

    Bundles the page ``url`` with the ``gender`` and ``year`` labels that
    ``parse_page`` copies onto every result row taken from that page.
    """

    def __init__(self, gender, year, url):
        # Plain attribute holder: the three values are stored unchanged.
        self.gender, self.year, self.url = gender, year, url

In [None]:
def soup_reader(url):
    """Download ``url`` and return its body parsed as a BeautifulSoup tree.

    Args:
        url: Address passed straight to ``urlopen``.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend.

    Raises:
        Whatever ``urlopen`` or ``read`` raise; nothing is caught here.
    """
    # The context manager closes the connection even when read() raises,
    # so a failed download does not leak the socket.
    with urlopen(url) as client:
        page = client.read()

    #open the result with BS
    return BeautifulSoup(page, 'html.parser')

In [None]:
# Parse a page of marathon finishers, adding the results to the master list
def parse_page(scrapable):
    """Scrape one results page and append each finisher to ``masterResults``.

    Args:
        scrapable: Object exposing ``url``, ``gender`` and ``year``. The URL
            is fetched with ``soup_reader``; gender and year are copied onto
            every result row.

    Side effects:
        Prints the page URL, appends one dict per finisher to the
        module-level ``masterResults`` list, then sleeps for 0.25 seconds.
        Rows with no overall place, or an overall place of '–', are skipped.
    """
    url = scrapable.url

    print(f"Current Page: {url}\n")
    soup = soup_reader(url)

    page_finishers = soup.find("ul", "list-group list-group-multicolumn").find_all("li", "row")

    for finisher in page_finishers:

        #skip the header row
        if 'list-group-header' in finisher.attrs['class']:
            continue

        # Rebind the place for every row. If it were only assigned when the
        # div exists, a row without it would inherit the preceding runner's
        # place (or hit an unbound local on the first row) instead of being
        # skipped by the check below.
        place_overall = None
        place_overall_search = finisher.find("div", "place-secondary")
        if place_overall_search is not None:
            place_overall = place_overall_search.get_text()

        #skip non-finishers
        if place_overall is None or place_overall == '–':
            continue

        # Same reasoning: never carry another runner's gender place over.
        place_gender = None
        place_gender_search = finisher.find("div", "place-primary")
        if place_gender_search is not None:
            place_gender = place_gender_search.get_text()

        runner_name = finisher.find("h4", "type-fullname")
        if runner_name is not None:
            runner_name = runner_name.get_text()

        # For the remaining fields the value is taken from contents[1] of the
        # matching div (presumably contents[0] is the field label -- confirm
        # against the page markup).
        #TODO: extract country
        city_state = finisher.find("div", "list-field type-eval").contents[1]

        bib_number = finisher.find("div", "type-field").contents[1]

        division = finisher.find("div", "type-age_class").contents[1]

        # Two time columns per row: half split first, finish time second.
        times = finisher.find_all("div", "list-field type-time")
        half_time = times[0].contents[1]
        full_time = times[1].contents[1]

        result = {
            "place_overall" : place_overall,
            "place_gender" : place_gender,
            "runner_name" : runner_name,
            "city_state" : city_state,
            "bib_number" : bib_number,
            "division" : division,
            "time_half" : half_time,
            "time_full" : full_time,
            "gender" : scrapable.gender,
            "year" : scrapable.year
        }

        masterResults.append(result)

    # Pause after each page (presumably to go easy on the results server).
    time.sleep(0.25)

In [None]:
def find_pagination_limit(url):
    """Return the last page number shown in the pagination bar at ``url``.

    The page is fetched with ``soup_reader``; the text of the second-to-last
    ``<li>`` inside the ``pagination`` list is converted to ``int``.
    """
    pagination_items = soup_reader(url).find("ul", "pagination").find_all("li")

    # The final page number is displayed one entry before the end of the list.
    final_page_item = pagination_items[len(pagination_items) - 2]
    return int(final_page_item.get_text())

In [None]:
def _results_url(event_code, sex, page):
    """Build the search URL for one page of one sex ('M' or 'W') of an event."""
    # NOTE(review): the path segment is fixed at 2015 regardless of the year
    # being scraped; presumably the archive is keyed by event code -- confirm.
    return (
        "https://chicago-history.r.mikatiming.com/2015/"
        f"?page={page}&event={event_code}&lang=EN_CAP&num_results=1000"
        "&pid=search&pidp=start&search%5Bage_class%5D=%25"
        f"&search%5Bsex%5D={sex}&search%5Bnation%5D=%25&search_sort=name"
    )


def generate_urls(event_code, year):
    """Queue every results page of one event onto ``to_scrape``.

    Args:
        event_code: Value placed in the ``event`` query parameter.
        year: Label stored on each queued ``Scrapable``.

    Side effects:
        Calls ``find_pagination_limit`` once per sex to learn the page count,
        prints both counts, then appends one ``Scrapable`` per page to the
        module-level ``to_scrape`` list (all male pages first, then female).
    """
    #find the number of pages of Male finisher results
    male_pages = find_pagination_limit(_results_url(event_code, 'M', 1))
    print(f"Number of male pages: {male_pages}")

    #find the number of pages of Female finisher results
    female_pages = find_pagination_limit(_results_url(event_code, 'W', 1))
    print(f"Number of female pages: {female_pages}")

    # (label stored on the results, value of the search[sex] parameter, pages)
    for gender, sex, num_pages in (('Male', 'M', male_pages), ('Female', 'W', female_pages)):
        for page in range(1, num_pages + 1):
            to_scrape.append(Scrapable(gender, year, _results_url(event_code, sex, page)))

In [None]:
def cleanup_and_save():
    """Write everything collected in ``masterResults`` to ``Chicago.csv``.

    Builds a DataFrame from the module-level ``masterResults`` list, tags each
    row with the race name, orders the rows by runner name, saves the file
    without the index, then prints how many rows were written.
    """
    results_frame = (
        pd.DataFrame(masterResults)
        # add the event name
        .assign(Race='Chicago')
        # sort by finisher name for easier comparison to online results page
        .sort_values(by=['runner_name'])
    )

    # save to csv
    results_frame.to_csv('Chicago.csv', index=False)
    print(f"Scraping complete.  Gathered {results_frame.shape[0]} results")

# Do the scraping

In [None]:
# Work queue of Scrapable pages and the shared list parse_page appends to.
# Both are reset here so re-running this cell starts from a clean slate.
to_scrape = list()
masterResults = []

# Year -> event code used in the results URL.  An empty string is a
# placeholder for a year whose code has not been filled in yet.
events_to_scrape = {
    '2000' : "",
    '2001' : "MAR_9999990E9A9236000000006A",
    '2002' : "",
    '2003' : "",
    '2004' : "",
    '2005' : "",
    '2006' : "",
    '2007' : "",
    '2008' : "",
    '2009' : "",
    '2010' : "",
    '2011' : "",
    '2012' : "",
    '2013' : "",
    '2014' : "",
    '2015' : "",
    '2016' : "",
    '2017' : "",
    '2018' : "",
    '2019' : "",
}

for year, event_code in events_to_scrape.items():
    # Without a code the request would carry an empty `event=` parameter, and
    # whatever the server returned would be labelled with this year.
    if not event_code:
        print(f"Skipping {year}: no event code configured")
        continue
    generate_urls(event_code, year)

#Use multithreading to speed up the scraping process significantly
with concurrent.futures.ThreadPoolExecutor(max_workers=multiprocessing.cpu_count()) as executor:
    # submit() + as_completed() instead of a bare map(): an exception raised
    # in a worker is only visible through its future, so each one is checked
    # and reported. A failed page is logged; the other pages still run.
    pending = {executor.submit(parse_page, page): page for page in to_scrape}
    for future in concurrent.futures.as_completed(pending):
        error = future.exception()
        if error is not None:
            print(f"Failed to scrape {pending[future].url}: {error!r}")
cleanup_and_save()