In [1]:
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup
import time
import concurrent.futures

In [7]:
# Module-level accumulators shared by the functions defined in later cells.
# Result-page URLs queued by generate_urls().
urls_to_scrape = []
# One dict per finisher, appended by parse_page().
masterResults = []

In [10]:
# Parse a page of marathon finishers, adding the results to the master list
def parse_page(url):
    """Download one results page and append each finisher row to ``masterResults``.

    Every ``<li class="row">`` inside the
    ``<ul class="list-group list-group-multicolumn">`` element is examined.
    Header rows and rows without an overall place (missing, or the ``'–'``
    placeholder) are skipped. Each remaining row is appended to the
    module-level ``masterResults`` list as a dict with the keys
    ``place_overall``, ``place_gender``, ``runner_name``, ``city_state``,
    ``bib_number``, ``division``, ``time_half`` and ``time_full``.

    Args:
        url: Address of the results page to fetch.

    Returns:
        None. Results are delivered through ``masterResults``.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
        AttributeError, IndexError: If a finisher row lacks one of the
            city/bib/division/time fields (the page layout is assumed fixed).
    """
    print(f"Current Page: {url}\n")

    # The context manager closes the connection even if read() raises.
    with urlopen(url) as client:
        page = client.read()

    #open the result with BS
    soup = BeautifulSoup(page, 'html.parser')

    results_list = soup.find("ul", "list-group list-group-multicolumn")
    if results_list is None:
        # A page without the results list has nothing to parse; report it
        # instead of failing on ``None.find_all``.
        print(f"No results list found on page: {url}\n")
        page_finishers = []
    else:
        page_finishers = results_list.find_all("li", "row")

    for finisher in page_finishers:

        #skip the header row
        if 'list-group-header' in finisher.attrs['class']:
            continue

        # Always (re)bind per row: otherwise a row without this <div> would
        # either raise UnboundLocalError or inherit the previous runner's place.
        place_overall_search = finisher.find("div", "place-secondary")
        place_overall = (
            place_overall_search.get_text()
            if place_overall_search is not None
            else None
        )

        #skip non-finishers
        if place_overall is None or place_overall == '–':
            continue

        # Same per-row rebinding as above; a missing gender place is stored as None.
        place_gender_search = finisher.find("div", "place-primary")
        place_gender = (
            place_gender_search.get_text()
            if place_gender_search is not None
            else None
        )

        runner_name = finisher.find("h4", "type-fullname")
        if runner_name is not None:
            runner_name = runner_name.get_text()

        # For the fields below, contents[1] is the node after the first child
        # of each <div> (presumably the value following a label -- confirm
        # against the page markup).
        #TODO: extract country
        city_state = finisher.find("div", "list-field type-eval").contents[1]

        bib_number = finisher.find("div", "type-field").contents[1]

        division = finisher.find("div", "type-age_class").contents[1]

        # The first two time fields are stored as the half and full times.
        times = finisher.find_all("div", "list-field type-time")
        half_time = times[0].contents[1]
        full_time = times[1].contents[1]

        result = {
            "place_overall" : place_overall,
            "place_gender" : place_gender,
            "runner_name" : runner_name,
            "city_state" : city_state,
            "bib_number" : bib_number,
            "division" : division,
            "time_half" : half_time,
            "time_full" : full_time
        }

        masterResults.append(result)

    # Short pause after each page (presumably to throttle requests to the server).
    time.sleep(0.25)
    


In [8]:
def generate_urls(event_code, num_pages, year="2018"):
    """Queue the result-page URLs for one event onto ``urls_to_scrape``.

    Args:
        event_code: Value of the ``event`` query parameter.
        num_pages: Number of result pages; pages ``1..num_pages`` are queued.
        year: Year segment of the URL path. Defaults to ``"2018"``, the value
            that used to be hard-coded, so existing two-argument calls build
            exactly the same URLs.

    Returns:
        None. The URLs are appended to the module-level ``urls_to_scrape`` list.
    """

    #TODO: Programmatically figure out the number of pages for each gender
    #Then build URLS for both
    #Use this base URL:
    # https://chicago-history.r.mikatiming.com/2015/?page=2&event=MAR_9999990E9A9236000000006A&lang=EN_CAP&num_results=1000&pid=search&pidp=start&search%5Bage_class%5D=%25&search%5Bsex%5D=W&search%5Bnation%5D=%25&search_sort=name
    # setting the 'sex' parameter to M for male and W for female

    for i in range(1, num_pages + 1):
        urls_to_scrape.append(f"https://chicago-history.r.mikatiming.com/{year}/?page={i}&event={event_code}&lang=EN_CAP&pid=search&num_results=1000")

In [9]:
def cleanup_and_save(year):
    """Write the scraped results in ``masterResults`` to ``Chicago-{year}.csv``.

    Builds a DataFrame from the module-level ``masterResults`` list, adds
    constant ``Year`` and ``Race`` columns, sorts the rows by ``runner_name``
    and saves them without the index column.

    Args:
        year: Value stored in the ``Year`` column and used in the file name.

    Returns:
        None. The output is the CSV file written to the working directory.
    """
    df_results = pd.DataFrame(masterResults)

    #add the year column
    df_results['Year'] = year

    #add the event name
    df_results['Race'] = 'Chicago'

    #sort by finisher name for easier comparison to online results page
    # sort_values returns a new frame; the result must be kept or the CSV
    # is written in scrape order.
    df_results = df_results.sort_values(by=['runner_name'])

    #save to csv
    df_results.to_csv(f'Chicago-{year}.csv', index=False)

In [11]:
# Run configuration for this scrape.
event_code = 'MAR_9999990E9A9236000000006A'
num_pages = 29
year = '2001'
# NOTE(review): generate_urls builds URLs under the /2018/ path while the rows
# are tagged and saved as year '2001' -- confirm the two are meant to differ.

# Start from empty accumulators so re-running this cell does not queue the
# same URLs again or duplicate rows in the saved CSV.
urls_to_scrape.clear()
masterResults.clear()

generate_urls(event_code, num_pages)

#Use multithreading to speed up the scraping process significantly
with concurrent.futures.ThreadPoolExecutor(max_workers=12) as executor:
    # submit() + result() surfaces worker exceptions; the results of a bare
    # executor.map() were never consumed, so failed pages vanished silently.
    future_to_url = {executor.submit(parse_page, url): url for url in urls_to_scrape}
    for future in concurrent.futures.as_completed(future_to_url):
        try:
            future.result()
        except Exception as exc:
            # Keep scraping the remaining pages, but report the failure.
            print(f"Failed to scrape {future_to_url[future]}: {exc!r}")
cleanup_and_save(year)

Current Page: https://chicago-history.r.mikatiming.com/2018/?page=1&event=MAR_9999990E9A9236000000006A&lang=EN_CAP&pid=search&num_results=1000

Current Page: https://chicago-history.r.mikatiming.com/2018/?page=2&event=MAR_9999990E9A9236000000006A&lang=EN_CAP&pid=search&num_results=1000

Current Page: https://chicago-history.r.mikatiming.com/2018/?page=3&event=MAR_9999990E9A9236000000006A&lang=EN_CAP&pid=search&num_results=1000

Current Page: https://chicago-history.r.mikatiming.com/2018/?page=4&event=MAR_9999990E9A9236000000006A&lang=EN_CAP&pid=search&num_results=1000

Current Page: https://chicago-history.r.mikatiming.com/2018/?page=5&event=MAR_9999990E9A9236000000006A&lang=EN_CAP&pid=search&num_results=1000

Current Page: https://chicago-history.r.mikatiming.com/2018/?page=6&event=MAR_9999990E9A9236000000006A&lang=EN_CAP&pid=search&num_results=1000

Current Page: https://chicago-history.r.mikatiming.com/2018/?page=7&event=MAR_9999990E9A9236000000006A&lang=EN_CAP&pid=search&num_result