In [1]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import re
import pandas as pd
import os
import datetime

In [4]:
# Scrape the HKJC local race results for every (date, track) pair listed in
# DatesFull.csv and accumulate one row per horse per race in Races.csv.
# The CSV is rewritten after every successfully parsed race so that an
# interrupted or crashed run loses at most the race in progress.

# --- Configuration -----------------------------------------------------------
RESULTS_URL = 'http://racing.hkjc.com/racing/Info/Meeting/Results/English/Local/'
DATES_CSV = 'DatesFull.csv'
OUTPUT_CSV = 'Races.csv'
# The original author notes a meeting has at most 11 races and that HTML
# differences can make the derived count too high, hence the cap.
MAX_RACES_PER_DAY = 11
# Explicit-wait budgets (seconds) for the first page of a meeting and for each
# individual race page.
FIRST_PAGE_DELAY = 10
RACE_PAGE_DELAY = 5
# Implicit wait (seconds) applied to element look-ups on the driver.
# NOTE(review): Selenium's documentation advises against combining implicit and
# explicit waits because the effective timeout becomes unpredictable; the value
# is kept so that slow pages are not dropped earlier than before - confirm
# which timeout is actually intended.
IMPLICIT_WAIT = 70
# Number of values kept per horse row (first nine cells plus the last two).
VALUES_PER_HORSE = 11


def _waitForResults(driver, timeout):
    """Block until an element with class 'trBgBlue' is present on the page.

    Raises:
        TimeoutException: if the element does not appear in time.
    """
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'trBgBlue')))


def _splitClassDistance(classDistance):
    """Split the combined class/distance text at its hyphen separators.

    The text is assumed to use ' - ' as separator (one character is dropped on
    each side of the first hyphen). Everything before the first hyphen is the
    class; the distance runs from after the first hyphen to the second hyphen,
    or to the end of the text when there is only one hyphen.

    Returns:
        (hClass, distance). When the text contains no hyphen the whole text is
        returned as the class with an empty distance instead of raising.
    """
    firstHyphen = classDistance.find('-')
    if firstHyphen == -1:
        return classDistance, ''
    hClass = classDistance[0:firstHyphen - 1]
    secondHyphen = classDistance.find('-', firstHyphen + 1)
    if secondHyphen == -1:
        distance = classDistance[firstHyphen + 2:]
    else:
        distance = classDistance[firstHyphen + 2:secondHyphen]
    return hClass, distance


def _parseRacePage(soup, track):
    """Turn the soup of one race-result page into a DataFrame.

    Args:
        soup: BeautifulSoup of the loaded race page.
        track: track name already extracted from the meeting's first page.

    Returns:
        DataFrame with one row per horse; the race-level fields are repeated
        on every row.

    Raises:
        IndexError / AttributeError: if the elements that are looked up
            unconditionally (date, race header, class/distance block, results
            table) are missing from the page.
    """
    # Date: first token in the first 'tdAlignL' element that starts with digits
    dateText = soup.find_all(class_='tdAlignL')[0].text
    date = re.findall(r'(\d+\S+)', dateText)[0]
    # Race header text (race number for the day and overall, per the original notes)
    raceHeader = soup.find_all(class_='trBgBlue')[0].text
    # Block holding the class and distance; the track condition is two siblings on
    raceInfo = soup.find_all(class_='divWidth')[0]
    hClass, distance = _splitClassDistance(raceInfo.text)
    going = raceInfo.next_sibling.next_sibling.text
    # The rows following the class/distance row carry the remaining race details
    handicapRow = raceInfo.parent.next_sibling
    handicapper = handicapRow.find_all('td')[0].text
    prizeRow = handicapRow.next_sibling
    prizeCells = prizeRow.find_all('td')
    # Winning splits with their surrounding parentheses removed; zeros when absent
    try:
        cleanWinningSplits = [prizeCells[i].text.strip('(').strip(')') for i in (2, 3, 4)]
    except IndexError:
        cleanWinningSplits = [0, 0, 0]
    # Prize money, or 'Unknown' when the cell is missing
    try:
        prizeMoney = prizeCells[0].text
    except IndexError:
        prizeMoney = 'Unknown'
    # Section times; zeros when the row or its cells are absent
    try:
        sectionCells = prizeRow.next_sibling.find_all('td')
        sectionTime = [sectionCells[i].text for i in (2, 3, 4)]
    except (AttributeError, IndexError, TypeError):
        sectionTime = [0, 0, 0]
    # Horse results table: the fifth tbody when present, otherwise the fourth
    tables = soup.find_all('tbody')
    try:
        rows = tables[4].find_all('tr')
    except IndexError:
        rows = tables[3].find_all('tr')
    # Due to variation in formatting only specific cells of each row are kept:
    # the first nine and the last two.
    # NOTE(review): a row with exactly 10 cells yields only 10 values and would
    # shift every later value by one column - confirm such rows never occur.
    resultsPacked = []
    for row in rows:
        cells = row.find_all('td')
        tdCount = len(cells)
        if tdCount < 10:
            continue
        resultsPacked.extend(cell.text for position, cell in enumerate(cells)
                             if position < 9 or position > tdCount - 3)
    # De-interleave the flat list: value k belongs to column k % 11
    (place, horse_num, horse, jockey, trainer, act_wt, declar_wt, draw,
     lengs_behind, finish_time, win_odds) = [resultsPacked[offset::VALUES_PER_HORSE]
                                             for offset in range(VALUES_PER_HORSE)]
    # Repeat the race-level info once per horse in the race
    runners = len(horse)
    return pd.DataFrame({'Race': [raceHeader] * runners, 'Track': [track] * runners,
                         'Date': [date] * runners, 'Class': [hClass] * runners,
                         'Distance': [distance] * runners, 'Track_Status': [going] * runners,
                         'Handicapper': [handicapper] * runners,
                         'Winning_Splits': [cleanWinningSplits] * runners,
                         'Prize_Money': [prizeMoney] * runners,
                         'Section_Time': [sectionTime] * runners,
                         'Horse_Number': horse_num, 'Horse': horse, 'Jockey': jockey,
                         'Trainer': trainer, 'Act_wt': act_wt, 'Declar_wt': declar_wt,
                         'Draw': draw, 'Lengs_Behind': lengs_behind,
                         'Finish_Time': finish_time, 'Odds': win_odds, 'Place': place})


# Reading collected csv file of the dates and location for each race
dateList = pd.read_csv(DATES_CSV)
# One DataFrame per parsed race; fullDF is their concatenation
raceFrames = []
fullDF = pd.DataFrame()

# Headless so that no browser window is opened
options = webdriver.FirefoxOptions()
options.add_argument('-headless')

# Looping over each race date and corresponding track
for index, fullDate in dateList.iterrows():
    # Converting and selecting the date in string format
    day = str(fullDate['Date'])
    # Selecting the track location
    tLoc = fullDate['Track']
    meetingUrl = RESULTS_URL + day + '/' + tLoc + '/'
    # A fresh browser per meeting, always closed again in the finally below
    driver = webdriver.Firefox(options=options)
    try:
        driver.implicitly_wait(IMPLICIT_WAIT)
        # The first race page also tells how many races were run that day
        url = meetingUrl + '1'
        # Printing the URL to check for any errors
        print(url)
        driver.get(url)
        _waitForResults(driver, FIRST_PAGE_DELAY)
        print("Page is ready!")
        soup = BeautifulSoup(driver.page_source, 'lxml')
        # Element containing the race numbers for the day
        raceCountBar = soup.find_all(class_='raceNum')
        if len(raceCountBar) == 0:
            # No races found on the page: move on to the next date
            print('next date')
            continue
        raceCells = raceCountBar[0].find_all('td')
        # Full track name, without its colon
        track = raceCells[1].text.replace(':', '')
        # Number of races = cells minus the three unrelated ones, capped at the maximum
        raceCount = min(len(raceCells) - 3, MAX_RACES_PER_DAY)
        # Printing the race count for confirmation
        print(raceCount)
        for raceNumber in range(1, raceCount + 1):
            # Exact URL for the day, track, and race number
            url = meetingUrl + str(raceNumber)
            print(url)
            driver.get(url)
            try:
                _waitForResults(driver, RACE_PAGE_DELAY)
            except TimeoutException:
                # Skip only this race; nothing new to write
                print("Loading took too much time!")
                continue
            print("Page is ready!")
            raceDF = _parseRacePage(BeautifulSoup(driver.page_source, 'lxml'), track)
            raceFrames.append(raceDF)
            fullDF = pd.concat(raceFrames)
            print(fullDF.shape)
            # Checkpoint everything collected so far
            fullDF.to_csv(OUTPUT_CSV)
    # In the event a page does not load properly, report it and go to the next date
    except TimeoutException:
        print("Loading took too much time!")
    finally:
        driver.quit()


http://racing.hkjc.com/racing/Info/Meeting/Results/English/Local/20070909/ST/1


KeyboardInterrupt: 