In [13]:
import time
from datetime import datetime
from decimal import Decimal

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [50]:
 
def getSoup(page):
  """Parse an HTTP response into a BeautifulSoup tree.

  Args:
    page: a response object exposing the page markup as ``page.text``.

  Returns:
    The BeautifulSoup document built from ``page.text`` with the 'lxml' parser.
  """
  return BeautifulSoup(page.text, 'lxml')

def getRunDate(soup):
  """Read the run date from the first <h4> element of a results page.

  Args:
    soup: parsed page; its first <h4> is expected to hold a date written
      as day/month/year (e.g. '05/03/2022').

  Returns:
    The run date as a ``datetime.date``.

  Raises:
    AttributeError: if the page has no <h4> element.
    ValueError: if the heading text is not a valid dd/mm/YYYY date.
  """
  # Strip first: strptime rejects leading/trailing whitespace or newlines,
  # which markup often leaves around heading text.
  dateString = soup.find('h4').text.strip()
  return datetime.strptime(dateString, '%d/%m/%Y').date()

def getCourse(soup):
  """Return the course name taken from the page's first <h2> heading.

  The heading text is returned with every occurrence of 'Results' removed
  (surrounding whitespace is left as-is). When the page has no <h2>, an
  error message is printed and the sentinel 'getCourse Error' is returned.
  """
  heading = soup.find('h2')
  if heading is not None:
    return heading.text.replace('Results','')
  print('error in getCourse')
  return 'getCourse Error'

def get_initials(fullName):
  """Reduce a full name to its space-separated initials.

  The name is split on single spaces and the first character of each part
  is kept, e.g. 'Jane Mary Doe' -> 'J M D'. Parts of length 0 or 1 (empty
  strings produced by repeated spaces, and single-letter parts) are
  skipped, so 'A Smith' -> 'S'.

  Args:
    fullName: the name as a string.

  Returns:
    The initials joined by single spaces; '' if no part qualifies.
  """
  initials = []
  for part in fullName.split(' '):
    if len(part) <= 1:
      continue
    stripped = part.strip()
    # A part made only of tabs/newlines is longer than one character yet
    # strips to '', so guard before indexing to avoid an IndexError.
    if stripped:
      initials.append(stripped[0])
  return ' '.join(initials)

def getRunDetails(soup):
  """Extract the athlete result rows from one parsed results page.

  The first <table class="table"> is walked row by row. A row containing an
  <img> is treated as a section header: the image's first CSS class becomes
  the run class id and, when the header text contains the word 'Run', the
  word before it becomes the run length label and any word containing 'km'
  is parsed as the distance. Those values apply to every following row
  until the next section header.

  Args:
    soup: parsed results page.

  Returns:
    A DataFrame with the 13 columns listed in ``columns``. Only rows that
    end up with exactly 13 fields, a run time longer than one character and
    an athlete name longer than one character are kept; athlete names are
    reduced to initials. The frame is empty when nothing qualifies or when
    the results table is missing.

  Raises:
    decimal.InvalidOperation: if a 'km' word in a section header does not
      reduce to a number after removing 'km' and ')'.
  """
  columns = ['Athlete Place','Athlete Name', 'Athlete Gender','Clock Time','Handicap','Run Time','Avg Pace','Points', 'Run ClassID','Run Length','Run Distance','Run Location','Run Date']
  course = getCourse(soup)
  runDate = getRunDate(soup)
  # Pages without a 'Handicap' column header lack the clock-time and
  # handicap cells, so placeholders are inserted to keep columns aligned.
  hasHandicap = any(header.getText() == 'Handicap' for header in soup.find_all(['th']))

  table = soup.find('table', attrs={'class':'table'})
  if table is None:
    print('error in getRunDetails: results table not found')
    return pd.DataFrame(columns = columns)

  # Section values start as None so rows that appear before the first
  # section header do not hit unbound local variables.
  runClassID = None
  runType = None
  dist = None
  rows = []
  for record in table.find_all(['tr']):
    image = record.find('img')
    if image is not None:
      runClassID = image['class'][0]
      headerString = record.get_text().split()
      if 'Run' in headerString:
        runType = headerString[headerString.index('Run') - 1]
        for word in headerString:
          if 'km' in word:
            dist = Decimal(word.replace('km','').replace(')',''))
    row = [td.text for td in record.find_all(['td'])]
    if not hasHandicap:
      row.insert(3, None)  # Clock Time placeholder
      row.insert(4, None)  # Handicap placeholder
    row.extend([runClassID, runType, dist, course, runDate])
    # Header/separator rows have the wrong field count and are filtered here.
    if len(row) == 13 and len(str(row[5])) > 1 and len(row[1]) > 1:
      row[1] = get_initials(row[1])
      rows.append(row)
  # Build the frame once instead of growing it one .loc assignment at a time.
  return pd.DataFrame(rows, columns = columns)

def get_pages(urlbase, startPageNo, pagesToCheck, timeout=30):
  """Download consecutive result pages and combine their result rows.

  Pages ``startPageNo`` .. ``startPageNo + pagesToCheck - 1`` are requested
  from ``urlbase + str(pageNo)`` (150 is the start of the 2022 season).
  Pages that fail to download or do not answer with HTTP 200 are skipped.

  Args:
    urlbase: URL prefix to which the page number is appended.
    startPageNo: first page number to fetch.
    pagesToCheck: how many consecutive pages to fetch.
    timeout: seconds to wait for each request before giving up on that page.

  Returns:
    A DataFrame holding the rows of every successfully parsed page (each
    page keeps its own 0-based index), or an empty frame with the expected
    columns when no page produced any rows.
  """
  columns = ['Athlete Place','Athlete Name', 'Athlete Gender','Clock Time','Handicap','Run Time','Avg Pace','Points', 'Run ClassID','Run Length','Run Distance','Run Location','Run Date']
  pageFrames = []
  for i in range(startPageNo, startPageNo + pagesToCheck):
    url = urlbase + str(i)
    try:
      page = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
      # No response exists on failure, so report the error itself and move
      # on rather than re-processing the previous iteration's page.
      print('Exception fetching ' + url + ': ' + str(error))
      continue
    if page.status_code == 200:
      runDetails = getRunDetails(getSoup(page))
      if not runDetails.empty:
        pageFrames.append(runDetails)
      time.sleep(1)  # pause between successful fetches to go easy on the server
  if not pageFrames:
    return pd.DataFrame(columns = columns)
  # Concatenate once at the end instead of re-copying the frame per page.
  return pd.concat(pageFrames)

In [52]:
# Scrape 26 consecutive result pages starting at page 150 (start of the
# 2022 season), then discard rows that are exact duplicates.
urlbase = "https://theautomatedclub.com/3CRC/Result/Details/"
results = get_pages(urlbase, 150, 26).drop_duplicates()


In [55]:
# Save the scraped results beside the notebook, without the DataFrame index.
# Google Drive (Colab) destination, kept for reference:
#results.to_csv('/content/drive/MyDrive/Colab Notebooks/Gingerbread_Man/2022_23_Results.csv', index=False)
results.to_csv(path_or_buf='./2022_23_Results.csv', index=False)
# Optional spot check of rows whose initials contain "C J":
#display(results[results['Athlete Name'].str.contains("C J")])
