In [127]:
import time
from datetime import datetime
from decimal import Decimal

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [128]:
 
def getSoup(page):
  """Parse a fetched page into a BeautifulSoup document.

  Args:
    page: response-like object whose ``text`` attribute holds the HTML.

  Returns:
    The BeautifulSoup tree built from ``page.text`` with the 'lxml' parser.
  """
  return BeautifulSoup(page.text, 'lxml')

def getRunDate(soup):
  """Return the run date taken from the page's ``<h4>`` heading.

  Args:
    soup: parsed results page; only ``soup.find('h4')`` is used.

  Returns:
    datetime.date parsed from the heading text in ``dd/mm/YYYY`` form.

  Raises:
    AttributeError: if the page has no ``<h4>`` element.
    ValueError: if the heading text is not a ``dd/mm/YYYY`` date.
  """
  # Text pulled out of HTML may carry surrounding whitespace or newlines,
  # which strptime rejects, so trim it before parsing.
  dateString = soup.find('h4').text.strip()
  return datetime.strptime(dateString, '%d/%m/%Y').date()

def getCourse(soup):
  """Return the course name taken from the page's ``<h2>`` heading.

  The word 'Results' is removed from the heading text. When the page has no
  ``<h2>`` element, an error line is printed and the placeholder
  'getCourse Error' is returned instead.
  """
  heading = soup.find('h2')
  if heading is None:
    print('error in getCourse')
    return 'getCourse Error'
  return heading.text.replace('Results','')

def get_initials(fullName):
  """Build space-separated initials from a full name.

  The name is split on single spaces. Tokens of one character or less
  (including the empty tokens produced by repeated spaces) are ignored, and
  the first character of every remaining token is kept.

  Args:
    fullName: the name as a string.

  Returns:
    The initials joined by single spaces, e.g. 'John Smith' -> 'J S'.
    An empty string when no token qualifies.
  """
  initials = []
  for token in fullName.split(' '):
    if len(token) <= 1:
      continue
    stripped = token.strip()
    # A token made only of whitespace (e.g. '\r\n') passes the length check
    # but is empty once stripped; indexing it would raise IndexError.
    if stripped:
      initials.append(stripped[0])
  return ' '.join(initials)


In [136]:

def getRunDetails(soup):
  """Extract the finisher rows from a parsed results page.

  The results table is walked row by row. A ``<tr>`` containing an ``<img>``
  is treated as a section header: the image's first CSS class becomes the
  'Run ClassID', and when the header text contains the word 'Run', the word
  before it becomes 'Run Length' and the last word containing 'km' is parsed
  as the 'Run Distance'. These section values carry forward to the rows that
  follow until the next header replaces them.

  Pages without a 'Handicap' column header get empty 'Clock Time' and
  'Handicap' cells inserted so every row lines up with the same 13 columns.
  Rows that do not end up with 13 cells, or whose name or run time is one
  character or shorter, are dropped.

  Args:
    soup: parsed results page.

  Returns:
    pandas.DataFrame with one row per accepted finisher. It is empty (but
    carries the 13 columns) when the page has no results table or no row
    qualifies.
  """
  columns = ['Athlete Place','Athlete Name', 'Athlete Gender','Clock Time','Handicap','Run Time','Avg Pace','Points', 'Run ClassID','Run Length','Run Distance','Run Location','Run Date']
  course = getCourse(soup)
  runDate = getRunDate(soup)
  hasHandicap = any(header.getText() == 'Handicap' for header in soup.find_all(['th']))

  table = soup.find('table', attrs={'class':'table'})
  if table is None:
    # Same reporting style as getCourse: note the problem and return an empty result.
    print('error in getRunDetails')
    return pd.DataFrame(columns = columns)

  # Section values start out empty so rows that appear before the first
  # section header no longer hit an unbound local variable.
  runClassID = None
  runType = None
  dist = None
  rows = []
  for record in table.find_all(['tr']):
    img = record.find('img')
    if img is not None:
      runClassID = img['class'][0]
      headerString = record.get_text().split()
      if 'Run' in headerString:
        runType = headerString[headerString.index('Run') - 1]
        for word in headerString:
          if 'km' in word:
            dist = Decimal(word.replace('km','').replace(')',''))
    row = [td.text for td in record.find_all(['td'])]
    if not hasHandicap:
      # Pad the missing 'Clock Time' and 'Handicap' cells.
      row.insert(3, None)
      row.insert(4, None)
    row.extend([runClassID, runType, dist, course, runDate])
    if len(row) == 13 and len(str(row[5])) > 1 and len(row[1]) > 1:
      rows.append(row)

  # Build the frame once instead of growing it one row at a time.
  return pd.DataFrame(rows, columns = columns)



In [130]:
def getVolDetails(soup):
  """Extract the rows of the 'g1-3' section from a parsed results page.

  The results table is walked row by row. A ``<tr>`` containing an ``<img>``
  is treated as a section header and the image's ``id`` becomes the current
  section id. Rows with exactly five cells that fall under the section whose
  id is 'g1-3' are kept (presumably the volunteers section, going by the
  function name - confirm against the page markup). Only the name, gender and
  points cells are copied; the place, time, pace, length and distance columns
  are left empty.

  Args:
    soup: parsed results page.

  Returns:
    pandas.DataFrame using the same 13 columns as getRunDetails. It is empty
    (but carries the columns) when the page has no results table or no row
    qualifies.
  """
  columns = ['Athlete Place','Athlete Name', 'Athlete Gender','Clock Time','Handicap','Run Time','Avg Pace','Points', 'Run ClassID','Run Length','Run Distance','Run Location','Run Date']
  targetSectionID = 'g1-3'
  course = getCourse(soup)
  runDate = getRunDate(soup)

  table = soup.find('table', attrs={'class':'table'})
  if table is None:
    # Same reporting style as getCourse: note the problem and return an empty result.
    print('error in getVolDetails')
    return pd.DataFrame(columns = columns)

  # Starts out empty so rows before the first section header are skipped
  # instead of raising on an unbound local variable.
  runClassID = None
  volunteers = []
  for record in table.find_all(['tr']):
    img = record.find('img')
    if img is not None:
      # .get() yields None for an image without an id rather than raising KeyError.
      runClassID = img.get('id')
    if runClassID != targetSectionID:
      continue
    row = [td.text for td in record.find_all(['td'])]
    if len(row) == 5:
      volunteers.append([
          None,        # Place
          row[1],      # Name
          row[2],      # Gender
          None,        # Clock Time
          None,        # Handicap
          None,        # Run Time
          None,        # Avg Pace
          row[4],      # Points
          runClassID,
          None,        # Length
          None,        # Distance
          course,
          runDate,
      ])

  # Build the frame once instead of growing it one row at a time.
  return pd.DataFrame(volunteers, columns = columns)


In [137]:
def get_pages(urlbase, startPageNo, pagesToCheck, timeout=30):
  """Download a run of consecutive result pages and combine their rows.

  For each page number in ``startPageNo .. startPageNo + pagesToCheck - 1``
  the URL ``urlbase + str(number)`` is fetched. Pages answering with HTTP 200
  are parsed with getRunDetails and getVolDetails; other status codes are
  skipped. A request that fails is reported and skipped, so one bad page does
  not stop the scrape or cause the previous page to be processed again.

  Args:
    urlbase: URL prefix to which the page number is appended.
    startPageNo: first page number to fetch (150 is start of 2022 season).
    pagesToCheck: how many consecutive page numbers to fetch.
    timeout: seconds to wait for each request before giving up.

  Returns:
    pandas.DataFrame holding the run rows followed by the 'g1-3' section rows
    of every page, in page order. Each page's frame keeps its own index, so
    index labels repeat. Empty (with the 13 columns) when nothing was fetched.
  """
  columns = ['Athlete Place','Athlete Name', 'Athlete Gender','Clock Time','Handicap','Run Time','Avg Pace','Points', 'Run ClassID','Run Length','Run Distance','Run Location','Run Date']
  frames = []
  for i in range(startPageNo, startPageNo + pagesToCheck):
    url = urlbase + str(i)
    try:
      page = requests.get(url, timeout=timeout)
    except requests.RequestException as err:
      # No response object exists here, so report the error itself and move on.
      print('Exception ' + str(err))
      continue
    if page.status_code == 200:
      soup = getSoup(page)
      frames.append(getRunDetails(soup))
      frames.append(getVolDetails(soup))
      # Pause between successful fetches to go easy on the server.
      time.sleep(1)
  if not frames:
    return pd.DataFrame(columns = columns)
  # Concatenate once at the end rather than re-copying the result on every page.
  return pd.concat(frames)

In [132]:
def calcElapsedTime(x):
    """Convert a clock-style duration string to whole seconds.

    Args:
        x: duration text in 'SS', 'MM:SS' or 'HH:MM:SS' form, or a null
            value (None / NaN).

    Returns:
        The duration as an int number of seconds, or '' when x is null or
        blank. With more than three ':'-separated parts only the first three
        are used.

    Raises:
        ValueError: if a component is not an integer.
    """
    if pd.isnull(x):
        return ''
    parts = x.split(':')
    if len(parts) == 1:
        # No ':' at all: either a blank cell or a bare seconds value. This
        # case used to fall into the three-part branch and raise.
        text = parts[0].strip()
        return int(text) if text else ''
    if len(parts) == 2:
        return int(parts[0]) * 60 + int(parts[1])
    return int(parts[0]) * 60 * 60 + int(parts[1]) * 60 + int(parts[2])

In [None]:
# Scrape 50 consecutive result pages, beginning with page number 150.
urlbase = "https://theautomatedclub.com/3CRC/Result/Details/"
results = get_pages(urlbase=urlbase, startPageNo=150, pagesToCheck=50)

# Disabled helpers: save the raw scrape to Google Drive / inspect one athlete's rows.
#results.to_csv('/content/drive/MyDrive/Colab Notebooks/Gingerbread_Man/2022_23_Results.csv', index=False) 
#display(results[results['Athlete Name'].str.contains("C J")])


In [138]:
# Post-process the scraped rows and write them to CSV.
#results = pd.read_csv('./2022_23_Results.csv')
# Blank the finishing place first so rows that differ only by place become
# exact duplicates and are removed by drop_duplicates().
results['Athlete Place'] = None
results = results.drop_duplicates()
# Convert each time column with Series.apply. DataFrame.apply(axis=1) hands
# back a DataFrame when the frame is empty, which cannot be assigned to a
# single column, so the row-wise form failed when nothing had been scraped.
for timeColumn in ['Run Time', 'Clock Time', 'Handicap', 'Avg Pace']:
  results[timeColumn + ' (s)'] = results[timeColumn].apply(calcElapsedTime)
# NOTE: the column name below is misspelt ('Intials'); it is kept as-is
# because it is part of the CSV written at the end of this cell.
results['Athlete Intials'] = results['Athlete Name'].apply(get_initials)
results['Points'] = pd.to_numeric(results['Points'], errors='coerce')
# Running points total per athlete, accumulated in the current row order.
results['Total Points'] = results.groupby(['Athlete Name'])['Points'].cumsum()
results.to_csv('./2022_23_Results.csv', index=False)
