In [1]:
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
def get_results_url(year, page):
    """Build the URL of one page of the 'MAS' event results list.

    The query string requests 1000 results per page sorted by name, with
    the sex, age-class and nation search fields all set to ``%25`` (a
    URL-encoded ``%``; presumably the site's match-all value).

    Parameters
    ----------
    year : int or str
        Results year; becomes the first path segment of the URL.
    page : int or str
        Value of the ``page`` query parameter.

    Returns
    -------
    str
        The results-list URL for that year and page.
    """
    query = (
        f'page={page}'
        '&event=MAS&num_results=1000&pid=search&pidp=start'
        '&search%5Bsex%5D=%25&search%5Bage_class%5D=%25&search%5Bnation%5D=%25'
        '&search_sort=name'
    )
    return f'https://results.virginmoneylondonmarathon.com/{year}/?{query}'

In [3]:
def get_results_page_soup(year, page, timeout=30):
    """Download one results-list page and parse it with BeautifulSoup.

    Parameters
    ----------
    year, page
        Forwarded to ``get_results_url`` to build the address to fetch.
    timeout : float, optional
        Seconds passed to ``requests.get`` as its ``timeout``. Without a
        timeout a stalled server would block the scrape indefinitely.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with the ``'html.parser'`` backend.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated unchanged from ``requests.get`` (connection errors,
        timeouts, ...).
    """
    url = get_results_url(year, page)
    # Keep the response body under its own name instead of rebinding the
    # `page` argument, so the page number stays available while debugging.
    html = requests.get(url, timeout=timeout).content
    return BeautifulSoup(html, 'html.parser')

In [4]:
def get_runner_urls_from_soup(year, soup):
    """Extract the per-runner detail URLs from a parsed results-list page.

    Two page layouts are handled: a list-group layout for ``year >= 2019``
    and a table layout for earlier years.

    Parameters
    ----------
    year : int
        Results year; selects the layout and is used as the URL path
        segment.
    soup : bs4.BeautifulSoup
        Parsed results-list page (anything exposing ``select`` works).

    Returns
    -------
    list of str
        One absolute URL per row that contains a link with an ``href``.
        Rows without such a link are skipped.
    """
    if year >= 2019:
        # The first list item is dropped -- presumably the header row of
        # the list (TODO confirm against the live page).
        runner_rows = soup.select('.list-group  li.list-group-item.row')[1:]
        link_selector = 'h4.list-field.type-fullname a'
    else:
        runner_rows = soup.select('div.cbox-content table.list-table tbody tr')
        link_selector = 'td a'

    base = 'https://results.virginmoneylondonmarathon.com/' + str(year) + '/'

    urls = []
    for row in runner_rows:
        # Skip rows that carry no usable link explicitly, rather than hiding
        # every possible error (including Ctrl-C) behind a bare `except`.
        links = row.select(link_selector)
        if not links:
            continue
        href = links[0].get('href')
        if href is None:
            continue
        urls.append(base + href)
    return urls

In [None]:
# Fetch and parse page 1 of the 2018 results list (performs a live HTTP request).
soup = get_results_page_soup(2018, 1)

In [None]:
# Collect the per-runner detail URLs found on the 2018 page held in `soup`.
runner_urls = get_runner_urls_from_soup(2018, soup)

In [5]:
def get_runner_times_from_url(runner_url, timeout=30):
    """Fetch one runner's detail page and return the 'Diff' column of its splits table.

    Parameters
    ----------
    runner_url : str
        Absolute URL of the runner's detail page.
    timeout : float, optional
        Seconds passed to ``requests.get`` as its ``timeout`` so a stalled
        request fails instead of blocking the whole scrape.

    Returns
    -------
    numpy.ndarray or list
        On success, the values of the ``Diff`` column of the first table
        under ``div.detail-box.box-splits``. On any failure (network error,
        missing table, missing column, ...) the URL is printed and a list
        of ten ``'-'`` placeholders is returned.
    """
    try:
        page = requests.get(runner_url, timeout=timeout).content
        soup2 = BeautifulSoup(page, 'html.parser')
        split_table = str(soup2.select('div.detail-box.box-splits table')[0])
        # pandas deprecated passing literal HTML to read_html; a file-like
        # wrapper is accepted by old and new versions alike.
        df_split = pd.read_html(StringIO(split_table))[0]
        return df_split.Diff.values
    except Exception:
        # `Exception` (not a bare except) keeps the best-effort behaviour
        # for scrape errors while letting KeyboardInterrupt / SystemExit
        # through, so a long-running scrape can still be stopped.
        print('Failed Runner', runner_url)
        return ['-'] * 10

In [None]:
# Spot check: scrape the splits for the first five runner URLs and display
# them as a table with one labelled column per split.
all_split_times = []
for url in runner_urls[:5]:
    all_split_times.append(get_runner_times_from_url(url))
# NOTE: `all_split_times` is rebound here from a list of rows to a DataFrame.
all_split_times = pd.DataFrame(all_split_times, columns=['5', '10', '15', '20', 'Half', '25', '30', '35', '40', 'Finish'])
all_split_times

In [6]:
def how_many_pages_for_year(year):
    """Return the number of results pages for ``year``.

    The count is read from the pagination links on page 1 of the results
    list: the last matching link for ``year >= 2019``, the second-to-last
    one for earlier years (presumably because the final link there is a
    "next" control -- TODO confirm against the live page).

    Parameters
    ----------
    year : int
        Results year.

    Returns
    -------
    int
        The text of the chosen pagination link converted with ``int``.
    """
    first_page = get_results_page_soup(year, 1)
    if year >= 2019:
        selector, position = 'ul.pagination li.hidden-xs.hidden-sm a', -1
    else:
        selector, position = 'div.pages a', -2
    return int(first_page.select(selector)[position].text)

In [7]:
def get_splits_one_page(year, page):
    """Scrape the split times of every runner listed on one results page.

    Fetches the results-list page, collects the runner detail URLs on it,
    and requests each runner's page in turn (one HTTP request per runner).

    Parameters
    ----------
    year : int
        Results year.
    page : int
        Results-list page number.

    Returns
    -------
    pandas.DataFrame
        One row per runner URL, in listing order, under the ten split
        column labels below. Each scraped row is expected to hold ten
        values to match those labels.
    """
    split_columns = ['5', '10', '15', '20', 'Half', '25', '30', '35', '40', 'Finish']
    listing = get_results_page_soup(year, page)
    rows = [
        get_runner_times_from_url(runner_url)
        for runner_url in get_runner_urls_from_soup(year, listing)
    ]
    return pd.DataFrame(rows, columns=split_columns)

In [8]:
def get_splits_for_year(year):
    """Scrape the split times of every runner on every results page of ``year``.

    The page count comes from ``how_many_pages_for_year``; pages are then
    visited from 1 upwards and each listed runner's detail page is
    requested in turn (one HTTP request per runner).

    Parameters
    ----------
    year : int
        Results year.

    Returns
    -------
    pandas.DataFrame
        One row per runner URL, in page then listing order, under the ten
        split column labels below. Each scraped row is expected to hold
        ten values to match those labels.
    """
    split_columns = ['5', '10', '15', '20', 'Half', '25', '30', '35', '40', 'Finish']
    rows = []
    for page_number in range(1, how_many_pages_for_year(year) + 1):
        listing = get_results_page_soup(year, page_number)
        # Each generated item is one runner's row; extend appends them in order.
        rows.extend(
            get_runner_times_from_url(runner_url)
            for runner_url in get_runner_urls_from_soup(year, listing)
        )
    return pd.DataFrame(rows, columns=split_columns)

In [9]:
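# Disabled: scrapes every 2014 results page, issuing one HTTP request per runner. Uncomment to run.
#splits_2014 = get_splits_for_year(2014)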

In [23]:
# Scrape the splits of every runner listed on page 6 of the 2019 results
# (one HTTP request per runner; failures are printed as "Failed Runner <url>").
df_2019_6 = get_splits_one_page(2019, 6)

Failed Runner https://results.virginmoneylondonmarathon.com/2019/?content=detail&fpid=search&pid=search&idp=9999990F5ECC9700002C4C49&lang=EN_CAP&event=MAS&num_results=1000&page=6&pidp=start&search%5Bsex%5D=%25&search%5Bage_class%5D=%25&search%5Bnation%5D=%25&search_sort=name&search_event=MAS
Failed Runner https://results.virginmoneylondonmarathon.com/2019/?content=detail&fpid=search&pid=search&idp=9999990F5ECC9700002CA491&lang=EN_CAP&event=MAS&num_results=1000&page=6&pidp=start&search%5Bsex%5D=%25&search%5Bage_class%5D=%25&search%5Bnation%5D=%25&search_sort=name&search_event=MAS
Failed Runner https://results.virginmoneylondonmarathon.com/2019/?content=detail&fpid=search&pid=search&idp=9999990F5ECC9700002C6DFE&lang=EN_CAP&event=MAS&num_results=1000&page=6&pidp=start&search%5Bsex%5D=%25&search%5Bage_class%5D=%25&search%5Bnation%5D=%25&search_sort=name&search_event=MAS
Failed Runner https://results.virginmoneylondonmarathon.com/2019/?content=detail&fpid=search&pid=search&idp=9999990F5ECC9

Failed Runner https://results.virginmoneylondonmarathon.com/2019/?content=detail&fpid=search&pid=search&idp=9999990F5ECC9700002CB7CE&lang=EN_CAP&event=MAS&num_results=1000&page=6&pidp=start&search%5Bsex%5D=%25&search%5Bage_class%5D=%25&search%5Bnation%5D=%25&search_sort=name&search_event=MAS
Failed Runner https://results.virginmoneylondonmarathon.com/2019/?content=detail&fpid=search&pid=search&idp=9999990F5ECC9700002C7E83&lang=EN_CAP&event=MAS&num_results=1000&page=6&pidp=start&search%5Bsex%5D=%25&search%5Bage_class%5D=%25&search%5Bnation%5D=%25&search_sort=name&search_event=MAS
Failed Runner https://results.virginmoneylondonmarathon.com/2019/?content=detail&fpid=search&pid=search&idp=9999990F5ECC9700002C4ABE&lang=EN_CAP&event=MAS&num_results=1000&page=6&pidp=start&search%5Bsex%5D=%25&search%5Bage_class%5D=%25&search%5Bnation%5D=%25&search_sort=name&search_event=MAS
Failed Runner https://results.virginmoneylondonmarathon.com/2019/?content=detail&fpid=search&pid=search&idp=9999990F5ECC9

Failed Runner https://results.virginmoneylondonmarathon.com/2019/?content=detail&fpid=search&pid=search&idp=9999990F5ECC9700002C902A&lang=EN_CAP&event=MAS&num_results=1000&page=6&pidp=start&search%5Bsex%5D=%25&search%5Bage_class%5D=%25&search%5Bnation%5D=%25&search_sort=name&search_event=MAS
Failed Runner https://results.virginmoneylondonmarathon.com/2019/?content=detail&fpid=search&pid=search&idp=9999990F5ECC9700002C1F28&lang=EN_CAP&event=MAS&num_results=1000&page=6&pidp=start&search%5Bsex%5D=%25&search%5Bage_class%5D=%25&search%5Bnation%5D=%25&search_sort=name&search_event=MAS
Failed Runner https://results.virginmoneylondonmarathon.com/2019/?content=detail&fpid=search&pid=search&idp=9999990F5ECC9700002C9B3E&lang=EN_CAP&event=MAS&num_results=1000&page=6&pidp=start&search%5Bsex%5D=%25&search%5Bage_class%5D=%25&search%5Bnation%5D=%25&search_sort=name&search_event=MAS
Failed Runner https://results.virginmoneylondonmarathon.com/2019/?content=detail&fpid=search&pid=search&idp=9999990F5ECC9

Failed Runner https://results.virginmoneylondonmarathon.com/2019/?content=detail&fpid=search&pid=search&idp=9999990F5ECC9700002C449C&lang=EN_CAP&event=MAS&num_results=1000&page=6&pidp=start&search%5Bsex%5D=%25&search%5Bage_class%5D=%25&search%5Bnation%5D=%25&search_sort=name&search_event=MAS
Failed Runner https://results.virginmoneylondonmarathon.com/2019/?content=detail&fpid=search&pid=search&idp=9999990F5ECC9700002C455C&lang=EN_CAP&event=MAS&num_results=1000&page=6&pidp=start&search%5Bsex%5D=%25&search%5Bage_class%5D=%25&search%5Bnation%5D=%25&search_sort=name&search_event=MAS
Failed Runner https://results.virginmoneylondonmarathon.com/2019/?content=detail&fpid=search&pid=search&idp=9999990F5ECC9700002C9EFA&lang=EN_CAP&event=MAS&num_results=1000&page=6&pidp=start&search%5Bsex%5D=%25&search%5Bage_class%5D=%25&search%5Bnation%5D=%25&search_sort=name&search_event=MAS
Failed Runner https://results.virginmoneylondonmarathon.com/2019/?content=detail&fpid=search&pid=search&idp=9999990F5ECC9

Failed Runner https://results.virginmoneylondonmarathon.com/2019/?content=detail&fpid=search&pid=search&idp=9999990F5ECC9700002CC63D&lang=EN_CAP&event=MAS&num_results=1000&page=6&pidp=start&search%5Bsex%5D=%25&search%5Bage_class%5D=%25&search%5Bnation%5D=%25&search_sort=name&search_event=MAS
Failed Runner https://results.virginmoneylondonmarathon.com/2019/?content=detail&fpid=search&pid=search&idp=9999990F5ECC9700002C2624&lang=EN_CAP&event=MAS&num_results=1000&page=6&pidp=start&search%5Bsex%5D=%25&search%5Bage_class%5D=%25&search%5Bnation%5D=%25&search_sort=name&search_event=MAS
Failed Runner https://results.virginmoneylondonmarathon.com/2019/?content=detail&fpid=search&pid=search&idp=9999990F5ECC9700002C4229&lang=EN_CAP&event=MAS&num_results=1000&page=6&pidp=start&search%5Bsex%5D=%25&search%5Bage_class%5D=%25&search%5Bnation%5D=%25&search_sort=name&search_event=MAS
Failed Runner https://results.virginmoneylondonmarathon.com/2019/?content=detail&fpid=search&pid=search&idp=9999990F5ECC9

In [24]:
# Save the 2019 page-6 splits as CSV; index=False leaves the row index out of the file.
df_2019_6.to_csv('../data/london_marathon_2019_6.csv', index=False)