In [1]:
import time
from datetime import datetime
from decimal import Decimal

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
def getSoup(page):
  """Parse a fetched page into a BeautifulSoup tree using the lxml parser.

  Args:
    page: response-like object exposing the HTML body as ``page.text``.

  Returns:
    The BeautifulSoup document built from that markup.
  """
  markup = page.text
  return BeautifulSoup(markup, 'lxml')

def getRunDate(soup):
  """Return the run date taken from the page's first <h4> heading.

  Args:
    soup: parsed page exposing ``find(tag_name)``.

  Returns:
    datetime.date parsed from the heading text, which must be in
    day/month/year form (``%d/%m/%Y``).

  Raises:
    ValueError: if the page has no <h4> heading, or its text is not a
      ``dd/mm/YYYY`` date.
  """
  heading = soup.find('h4')
  if heading is None:
    # Fail with a message that names the cause instead of an AttributeError
    # on None.
    raise ValueError('getRunDate: page has no <h4> date heading')
  # strptime rejects any leading/trailing whitespace or newlines around the
  # date, so trim the heading text before parsing.
  dateString = heading.text.strip()
  return datetime.strptime(dateString, '%d/%m/%Y').date()

def getCourse(soup):
  """Return the text of the page's first <h2> heading, used as the course name.

  Args:
    soup: parsed page exposing ``find(tag_name)``.

  Returns:
    The heading text unchanged, or the sentinel string 'getCourse Error'
    (after printing a notice) when the page has no <h2>.
  """
  # Look the heading up once and reuse it for both the check and the text.
  heading = soup.find('h2')
  if heading is None:
    print('error in getCourse')
    return 'getCourse Error'
  return heading.text

def getRunDetails(soup):
  """Extract the athlete result rows from one parsed results page.

  The first <table class="table"> is walked row by row. A row containing an
  <img> is treated as a section header: the image's first CSS class becomes
  the run class id and, when the header's whitespace-split text contains the
  word 'Run', the word before it becomes the run type and any word containing
  'km' supplies the distance. Every row then has those section values plus
  the course and run date appended to its <td> texts; only rows that end up
  with exactly one value per output column are kept.

  Args:
    soup: parsed page exposing ``find``; also passed to getCourse and
      getRunDate.

  Returns:
    pandas.DataFrame (object dtype) with the eleven columns listed below.
    It has no rows when the page has no results table or no complete rows.
  """
  columns = ['Athlete Place','Athlete Name', 'Athlete Gender','Run Time','Avg Pace','Points', 'Run ClassID','Run Length','Run Distance','Run Location','Run Date']
  course = getCourse(soup).replace('Results','')
  runDate = getRunDate(soup)

  table = soup.find('table', attrs={'class':'table'})
  if table is None:
    # No results table on this page: return an empty, correctly-shaped frame.
    return pd.DataFrame(columns = columns)

  # Section state inherited by the rows below each header. Initialised so a
  # row appearing before the first header (e.g. a title row) cannot hit an
  # unbound local.
  runClassID = None
  runType = None
  dist = None

  rows = []
  for record in table.find_all(['tr']):
    img = record.find('img')
    if img is not None:
      runClassID = img['class'][0]
      headerString = record.get_text().split()
      if 'Run' in headerString:
        runType = headerString[headerString.index('Run') - 1]
        for word in headerString:
          if 'km' in word:
            dist = Decimal(word.replace('km','').replace(')',''))
    row = [td.text for td in record.find_all(['td'])]
    row.extend([runClassID, runType, dist, course, runDate])
    # Header/title rows have too few cells and are dropped by this check.
    if len(row) == len(columns):
      rows.append(row)

  # Build the frame once instead of enlarging it row by row; object dtype
  # keeps the cell values (str, Decimal, date) exactly as collected.
  return pd.DataFrame(rows, columns = columns, dtype = object)

def get_pages(urlbase, startPageNo, pagesToCheck, timeout=30, delay=1):
  """Download consecutive result pages and combine their athlete rows.

  Pages ``startPageNo`` .. ``startPageNo + pagesToCheck - 1`` are requested
  from ``urlbase + str(pageNo)``. Pages whose request fails, or that do not
  answer with HTTP 200, are skipped.

  Args:
    urlbase: URL prefix to which the page number is appended.
    startPageNo: first page number to fetch (150 is the start of the 2022
      season).
    pagesToCheck: how many consecutive page numbers to try.
    timeout: seconds to wait for each HTTP response before giving up.
    delay: seconds to pause after each successfully fetched page.

  Returns:
    pandas.DataFrame holding the rows of every parsed page, each page keeping
    its own row index. When nothing was collected, an empty frame with the
    standard result columns is returned.
  """
  columns = ['Athlete Place','Athlete Name', 'Athlete Gender','Run Time','Avg Pace','Points', 'Run ClassID','Run Length','Run Distance','Run Location','Run Date']
  frames = []
  for i in range(startPageNo, startPageNo + pagesToCheck):
    url = urlbase + str(i)
    try:
      page = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
      # A failed request leaves no response to inspect, so report the error
      # itself and move on rather than touching a missing or stale `page`.
      print('Exception fetching ' + url + ': ' + str(exc))
      continue
    if page.status_code == 200:
      runDetails = getRunDetails(getSoup(page))
      if not runDetails.empty:
        frames.append(runDetails)
      time.sleep(delay)
  if not frames:
    return pd.DataFrame(columns = columns)
  # Concatenate once at the end instead of re-copying the result every page.
  return pd.concat(frames)

In [3]:
# Scrape 26 consecutive result pages starting at page 150, then drop rows
# that are exact duplicates of an earlier row.
urlbase = "https://theautomatedclub.com/3CRC/Result/Details/"
results = get_pages(urlbase, 150, 26).drop_duplicates()


In [5]:
# Write the scraped results next to the notebook, without the index column.
# (When running on Colab, point this path at a mounted Drive folder instead.)
outputCsv = './2022_23_Results.csv'
results.to_csv(outputCsv, index=False)

In [4]:
# Show every result for one athlete. regex=False matches the name literally,
# and na=False treats a missing name as "no match" so the mask stays purely
# boolean and can be used for row selection.
nameMask = results['Athlete Name'].str.contains("Chris Jardine", regex=False, na=False)
display(results[nameMask])


Unnamed: 0,Athlete Place,Athlete Name,Athlete Gender,Run Time,Avg Pace,Points,Run ClassID,Run Length,Run Distance,Run Location,Run Date
105,8,Chris Jardine,M,1:02:02,6:12,43,ig1-2,Long,10.0,Peace Park,2022-08-31
80,13,Chris Jardine,M,1:24:05,7:00,38,ig1-2,Long,12.0,Peace Park,2022-09-07
34,5,Chris Jardine,M,1:10:37,7:04,46,ig1-2,Long,10.0,Peace Park,2022-09-21
10,11,Chris Jardine,M,56:28,5:49,40,ig1-0,Long,9.7,Kitchener Poppet Head,2022-09-28
43,8,Chris Jardine,M,56:39,5:50,43,ig1-2,Long,9.7,Kitchener Poppet Head,2022-10-05
95,10,Chris Jardine,M,53:38,5:32,41,ig1-2,Long,9.7,Kitchener Poppet Head,2022-10-12
111,8,Chris Jardine,M,59:50,5:59,43,ig1-2,Long,10.0,Pelaw Main,2022-10-26
106,7,Chris Jardine,M,55:28,5:33,44,ig1-2,Long,10.0,Pelaw Main,2022-11-02
114,11,Chris Jardine,M,53:52,5:23,40,ig1-2,Long,10.0,Pelaw Main,2022-11-09
