In [44]:
from datetime import date, datetime, timedelta

def parse_date(input_date: date, strf=False):
    """Return ``input_date`` as is, or formatted as a ``YYYY/MM/DD`` string.

    :param input_date: datetime.date
        the date to pass through or format (anything with ``strftime`` works)
    :param strf: Bool
        when truthy, return ``input_date.strftime("%Y/%m/%d")``

    :return: the formatted string when ``strf`` is truthy, otherwise
        ``input_date`` itself, unchanged
    """
    # The annotation is ``date`` and not ``datetime.date``: this file imports the
    # ``datetime`` *class*, on which ``.date`` is a method rather than the date type.
    if strf:
        return input_date.strftime("%Y/%m/%d")
    return input_date

def init_date(from_date):
    """Normalise the ``from_date`` argument used by the date helpers.

    :param from_date: None, str or datetime.date
        ``None`` means "today"; a string must be in ``YYYY/MM/DD`` format;
        any other value is returned untouched

    :return: ``date.today()`` for ``None``, the parsed ``datetime.date`` for a
        string, otherwise ``from_date`` itself
    :raises ValueError: if a string does not match ``%Y/%m/%d``
    """
    if from_date is None:
        return date.today()

    if isinstance(from_date, str):
        # strptime yields a datetime at midnight; reduce it to a plain date so
        # the string and None branches return the same type.
        return datetime.strptime(from_date, "%Y/%m/%d").date()

    return from_date

def get_last_DoW(from_date=None, strf=False, date_of_interest='wednesday'):
    """ Compute the most recent Wednesday or Sunday on or before a date

    :param from_date: datetime.date, str
        the beginning date; ``None`` means today, a string must be ``YYYY/MM/DD``
    :param strf: Bool
        if we are interested in string format
    :param date_of_interest: str
        'wednesday' / 'wed' or 'sunday' / 'sun', case-insensitive

    :return: last date of interest (``from_date`` itself when it already falls
        on the requested weekday), as a ``YYYY/MM/DD`` string when ``strf`` is truthy
    """
    from calendar import WEDNESDAY, SUNDAY

    # Normalise once so that validation and weekday selection agree; previously
    # 'Wednesday' passed the assertion but was then treated as Sunday.
    day_name = date_of_interest.lower()
    assert day_name in ('wednesday', 'sunday', 'wed', 'sun')

    DoW = WEDNESDAY if day_name in ('wednesday', 'wed') else SUNDAY
    from_date = init_date(from_date)
    # Days elapsed since the requested weekday (0 when from_date is that weekday).
    offset = (from_date.weekday() - DoW) % 7
    last_dow = from_date - timedelta(days=offset)

    return parse_date(last_dow, strf)

def get_span_of_DoW(from_date, strf, span, date_of_interest='wednesday'):
    """List ``span`` consecutive weekly dates, newest first.

    The first entry is ``get_last_DoW(from_date, date_of_interest=...)``; each
    following entry is exactly seven days earlier than the previous one.

    :param from_date: datetime.date, str
        the date to start counting back from
    :param strf: Bool
        if the entries should be ``YYYY/MM/DD`` strings
    :param span: int
        number of dates to produce
    :param date_of_interest: str
        weekday selector forwarded to ``get_last_DoW``

    :return: list of ``span`` dates (or strings)
    """
    anchor = get_last_DoW(from_date, strf=False, date_of_interest=date_of_interest)
    one_week = timedelta(days=7)
    return [parse_date(anchor - weeks_back * one_week, strf) for weeks_back in range(span)]


# How many weekly dates to generate per course, and the date to count back from.
DATE_SPAN = 100
BEGIN_DATE = '2022/12/18'

# Course code -> list of 'YYYY/MM/DD' strings, newest first.
# 'ST' is paired with Sundays and 'HV' with Wednesdays.
horse_racing_dict = {
    course: get_span_of_DoW(BEGIN_DATE, strf=True, span=DATE_SPAN, date_of_interest=weekday)
    for course, weekday in (('ST', 'sunday'), ('HV', 'wednesday'))
}


In [45]:
# Sanity check: show the first five generated dates for the 'ST' course.
horse_racing_dict['ST'][:5]

['2022/12/18', '2022/12/11', '2022/12/04', '2022/11/27', '2022/11/20']

In [42]:
def get_performance_info(webdriver, race_date, race_no, race_course, base_url):
    """Scrape one race-result page and append its rows to the data files.

    The page URL is ``base_url.format(race_date, race_course, race_no)``. For
    every row of the results table one ``::``-separated line is appended to
    ``./data/performance.txt``. When the cell at column index 2, 3 or 4 of a row
    contains an ``href``, a ``<cell text>::<href>`` line is appended to
    ``./data/url_horse.txt``, ``./data/url_jockey.txt`` or
    ``./data/url_trainer.txt`` respectively.

    :param webdriver: selenium WebDriver
        an already started browser driver
    :param race_date: str
        race date as it should appear in the URL
    :param race_no: int
        race number as it should appear in the URL
    :param race_course: str
        race-course code as it should appear in the URL
    :param base_url: str
        URL template with ``{0}``, ``{1}``, ``{2}`` placeholders for date,
        course and race number

    :return: Bool
        True when the page was scraped. False (after printing ``'Err'``) when
        the results table cannot be located or any error occurs while reading
        the page; rows written before such an error stay in the files.
    """
    # Imported locally so the function does not depend on another notebook cell
    # having been executed first.
    import re
    from time import time

    from selenium.webdriver.common.by import By

    t0 = time()

    target_url = base_url.format(race_date, race_course, race_no)
    webdriver.get(target_url)
    webdriver.implicitly_wait(1)

    # try to get performance table: up to three attempts, stopping at the first hit
    performance = None
    for _ in range(3):
        try:
            performance = webdriver.find_element(by=By.XPATH, value='//*[@id="innerContent"]/div[2]/div[5]')
            break
        except Exception:
            webdriver.implicitly_wait(1)

    if performance is None:
        # No results table on this page; tell the caller to stop with this meeting.
        print('Err')
        return False

    info_table = '//*[@id="innerContent"]/div[2]/div[4]/table/tbody'

    def _info_text(row, col):
        # Text of one cell of the race-information table (XPath indices are 1-based).
        return webdriver.find_element(by=By.XPATH, value=f'{info_table}/tr[{row}]/td[{col}]').text

    # NOTE(review): no explicit encoding is passed, so the platform default is
    # used. drop_duplicate() in this notebook reads these files as 'gbk';
    # confirm that the two agree before pinning an encoding here.
    with open('./data/performance.txt', 'a') as performance_file, \
            open('./data/url_horse.txt', 'a') as horse_file, \
            open('./data/url_jockey.txt', 'a') as jockey_file, \
            open('./data/url_trainer.txt', 'a') as trainer_file:
        # Column index within a result row -> file collecting that column's link.
        link_files = {2: horse_file, 3: jockey_file, 4: trainer_file}

        # main body
        try:
            # race info
            distance = _info_text(2, 1)
            field_going = _info_text(2, 3)
            race_name = _info_text(3, 1)
            course_type_n_no = _info_text(3, 3)
            race_money = _info_text(4, 1)

            performance_ranks = performance \
                .find_element(by=By.TAG_NAME, value='tbody') \
                .find_elements(by=By.TAG_NAME, value='tr')

            for prank in performance_ranks:
                prank_data = prank.find_elements(by=By.TAG_NAME, value='td')
                performance_elem = [race_date, race_no, race_course, distance, field_going, race_name, course_type_n_no, race_money]
                for ix, data in enumerate(prank_data):
                    performance_elem.append(data.text)

                    # Only the link-bearing columns need their inner HTML fetched.
                    link_file = link_files.get(ix)
                    if link_file is None:
                        continue
                    href = re.findall(r'href="(.*?)"', data.get_attribute('innerHTML'))
                    if href:
                        link_file.write(f'{performance_elem[-1]}::{href[0]}\n')

                # Empty cells are written as a single space so every field is non-empty.
                performance_file.write('::'.join([str(i) if i != '' else ' ' for i in performance_elem]) + '\n')

        except Exception:
            # Best-effort scraping: report and let the caller move on. Unlike a
            # bare ``except``, this lets KeyboardInterrupt stop the whole run.
            print('Err')
            return False

    t1 = time()

    print(f'Scrapping: [Cost] - {round(t1-t0, 2)}s. Target - {target_url}')

    return True

In [46]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import re
import warnings
warnings.filterwarnings('ignore')

import os

options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
# Location of the chromedriver binary. Set the CHROMEDRIVER_PATH environment
# variable to override the original author's machine-specific default.
chromedriver = os.environ.get(
    'CHROMEDRIVER_PATH',
    r'E:\3,HKU\Courses\sem1 STAT7008-Programming for Data Science\HW-7008\assignment 3\chromedriver.exe',
)


# example: https://racing.hkjc.com/racing/information/Chinese/Racing/LocalResults.aspx?RaceDate=2022/12/18&Racecourse=ST&RaceNo=1
# date example: 2022/12/18
hkjc_root = 'https://racing.hkjc.com'
base_url = 'https://racing.hkjc.com/racing/information/Chinese/Racing/LocalResults.aspx?RaceDate={0}&Racecourse={1}&RaceNo={2}'

# ``options=`` replaces the deprecated ``chrome_options=`` keyword.
# NOTE(review): ``executable_path=`` is deprecated in Selenium 4 in favour of a
# ``Service`` object; switch once the installed Selenium version is confirmed.
driver = webdriver.Chrome(executable_path=chromedriver, options=options)

try:
    for race_course in horse_racing_dict:
        for race_date in horse_racing_dict[race_course]:
            # Try race numbers 1..10; the first failed race ends this meeting.
            for race_no in range(1, 11):
                continue_flag = get_performance_info(driver, race_date, race_no, race_course, base_url)
                if not continue_flag:
                    break
finally:
    # Always shut the browser down, including on error or manual interrupt.
    driver.quit()

Scrapping: [Cost] - 6.44s. Target - https://racing.hkjc.com/racing/information/Chinese/Racing/LocalResults.aspx?RaceDate=2022/12/18&Racecourse=ST&RaceNo=1
Scrapping: [Cost] - 4.88s. Target - https://racing.hkjc.com/racing/information/Chinese/Racing/LocalResults.aspx?RaceDate=2022/12/18&Racecourse=ST&RaceNo=2
Scrapping: [Cost] - 6.19s. Target - https://racing.hkjc.com/racing/information/Chinese/Racing/LocalResults.aspx?RaceDate=2022/12/18&Racecourse=ST&RaceNo=3
Scrapping: [Cost] - 8.02s. Target - https://racing.hkjc.com/racing/information/Chinese/Racing/LocalResults.aspx?RaceDate=2022/12/18&Racecourse=ST&RaceNo=4
Scrapping: [Cost] - 10.06s. Target - https://racing.hkjc.com/racing/information/Chinese/Racing/LocalResults.aspx?RaceDate=2022/12/18&Racecourse=ST&RaceNo=5
Scrapping: [Cost] - 8.75s. Target - https://racing.hkjc.com/racing/information/Chinese/Racing/LocalResults.aspx?RaceDate=2022/12/18&Racecourse=ST&RaceNo=6
Scrapping: [Cost] - 8.47s. Target - https://racing.hkjc.com/racing/in

In [65]:

def drop_duplicate(path, sep='\t', encoding='gbk'):
    """Remove duplicate rows from a delimited text file, rewriting it in place.

    The file is read with ``pandas.read_csv`` (first line taken as the header),
    exact duplicate rows are dropped, the shapes before and after are printed,
    and the result overwrites ``path``.

    Note that the output is always tab-separated and written with pandas'
    default encoding, regardless of ``sep`` and ``encoding``. A file that has
    already been processed once and holds non-ASCII text must therefore be
    re-read with that output encoding rather than the ``'gbk'`` default.

    :param path: str
        file to de-duplicate; it is overwritten
    :param sep: str
        field separator used when reading
    :param encoding: str
        text encoding used when reading (default ``'gbk'``, as before)

    :return: None
    """
    from pandas import read_csv

    df = read_csv(path, sep=sep, encoding=encoding)
    df_dedup = df.drop_duplicates()
    print(df.shape, df_dedup.shape)
    # Kept tab-separated on purpose: later cells load these files with sep='\t'.
    df_dedup.to_csv(path, sep='\t', index=False)

# De-duplicate every scraped data file in place, one after the other.
for data_path in (
    './data/performance.txt',
    './data/url_horse.txt',
    './data/url_jockey.txt',
    './data/url_trainer.txt',
):
    drop_duplicate(data_path)

(14431, 2) (67, 2)


In [8]:
import pandas as pd

# Load the four data files as tab-separated tables (the format that
# drop_duplicate() writes back), using pandas' default encoding.
horse = pd.read_csv('./data/url_horse.txt', sep='\t')
jockey = pd.read_csv('./data/url_jockey.txt', sep='\t')
trainer = pd.read_csv('./data/url_trainer.txt', sep='\t')
performance = pd.read_csv('./data/performance.txt', sep='\t')

# Quick look at the horse table: its shape, then its first rows.
print(horse.shape)
horse.head()

(1893, 2)


Unnamed: 0,horse,url
0,喜悅精靈(E388),/racing/information/Chinese/Horse/Horse.aspx?H...
1,勇敢動力(D153),/racing/information/Chinese/Horse/Horse.aspx?H...
2,合夥贛勁(D051),/racing/information/Chinese/Horse/Horse.aspx?H...
3,安寶(E407),/racing/information/Chinese/Horse/Horse.aspx?H...
4,創高峰(D136),/racing/information/Chinese/Horse/Horse.aspx?H...


In [9]:
# Quick look at the jockey table: its shape, then its first rows.
print(jockey.shape)
jockey.head()

(74, 2)


Unnamed: 0,jockey,url
0,潘頓,/racing/information/Chinese/Jockey/JockeyProfi...
1,布文,/racing/information/Chinese/Jockey/JockeyProfi...
2,何澤堯,/racing/information/Chinese/Jockey/JockeyProfi...
3,蘇兆輝,/racing/information/Chinese/Jockey/JockeyProfi...
4,希威森,/racing/information/Chinese/Jockey/JockeyProfi...


In [10]:
# Quick look at the trainer table: its shape, then its first rows.
print(trainer.shape)
trainer.head()

(67, 2)


Unnamed: 0,trainer,url
0,賀賢,/racing/information/Chinese/Trainers/TrainerPr...
1,告東尼,/racing/information/Chinese/Trainers/TrainerPr...
2,蘇偉賢,/racing/information/Chinese/Trainers/TrainerPr...
3,黎昭昇,/racing/information/Chinese/Trainers/TrainerPr...
4,苗禮德,/racing/information/Chinese/Trainers/TrainerPr...


In [11]:
# Quick look at the performance table: its shape, then its first rows.
print(performance.shape)
performance.head()

(14432, 20)


Unnamed: 0,Race Date,Race No.,Race Course,Distance,Field Going,Race Name,Course Type,Race Money,Pla.,Horse No.,Horse,Jockey,Trainer,Act. Wt.,Declare. Horse Wt.,Dr.,LBW,Running Position,Finish Time,Win Odds
0,2022/12/18,1,ST,第五班 - 1400米 - (40-0),好地,六福珠寶HEXICON讓賽,"草地 - ""C+3"" 賽道","HK$ 810,000",1,2,喜悅精靈(E388),潘頓,賀賢,135,1200,8,-,10 11 11 1,1:22.80,2.6
1,2022/12/18,1,ST,第五班 - 1400米 - (40-0),好地,六福珠寶HEXICON讓賽,"草地 - ""C+3"" 賽道","HK$ 810,000",2,5,勇敢動力(D153),布文,告東尼,129,1206,12,1-3/4,13 13 14 2,1:23.06,8.5
2,2022/12/18,1,ST,第五班 - 1400米 - (40-0),好地,六福珠寶HEXICON讓賽,"草地 - ""C+3"" 賽道","HK$ 810,000",3,3,合夥贛勁(D051),何澤堯,蘇偉賢,134,1230,6,2-1/2,9 10 10 3,1:23.20,7.4
3,2022/12/18,1,ST,第五班 - 1400米 - (40-0),好地,六福珠寶HEXICON讓賽,"草地 - ""C+3"" 賽道","HK$ 810,000",4,6,安寶(E407),蘇兆輝,黎昭昇,127,1093,10,3,12 12 12 4,1:23.26,42.0
4,2022/12/18,1,ST,第五班 - 1400米 - (40-0),好地,六福珠寶HEXICON讓賽,"草地 - ""C+3"" 賽道","HK$ 810,000",5,7,創高峰(D136),希威森,苗禮德,126,1225,3,3-1/4,2 3 3 5,1:23.33,6.0
