In [1]:
from bs4 import BeautifulSoup
from ipynb.fs.full.racing_tv_functions import *

import datetime
import json
import numpy as np
import pandas as pd
import re
import requests
import time

In [2]:
# Run the data-collection step. update_race_data is not defined in this
# notebook, so it presumably comes from the racing_tv_functions star import.
# The log it prints lists one "Collected race data" line per race, grouped by
# track, with a "Progress: k/n" line after each track.
# NOTE(review): `delay` looks like a pause between requests, in seconds —
# confirm against racing_tv_functions.
update_race_data(delay = 0.1)

Collecting data for 2025-08-24
Yarmouth
Collected race data for Yarmouth at 14:10
Collected race data for Yarmouth at 14:45
Collected race data for Yarmouth at 15:20
Collected race data for Yarmouth at 15:55
Collected race data for Yarmouth at 16:30
Collected race data for Yarmouth at 17:05
Progress: 1/4
Goodwood
Collected race data for Goodwood at 13:50
Collected race data for Goodwood at 14:25
Collected race data for Goodwood at 15:00
Collected race data for Goodwood at 15:35
Collected race data for Goodwood at 16:10
Collected race data for Goodwood at 16:45
Collected race data for Goodwood at 17:20
Progress: 2/4
Naas
Collected race data for Naas at 13:40
Collected race data for Naas at 14:15
Collected race data for Naas at 14:50
Collected race data for Naas at 15:25
Collected race data for Naas at 16:00
Collected race data for Naas at 16:35
Collected race data for Naas at 17:10
Progress: 3/4
Beverley
Collected race data for Beverley at 14:05
Collected race data for Beverley at 14:40

In [3]:
# Fetch today's race card. todays_data_df is not defined in this notebook, so
# it presumably comes from the racing_tv_functions star import.
# Later cells select the 'Off Time', 'Track' and 'Tips' columns from the result
# and expect reset_index() to expose a 'Horse' column, so the returned frame is
# presumably indexed by horse name — TODO confirm.
today_races = todays_data_df(delay = 0.1)

Collected race data for Ripon at 14:05
Collected race data for Ripon at 14:35
Collected race data for Ripon at 15:05
Collected race data for Ripon at 15:35
Collected race data for Ripon at 16:05
Collected race data for Ripon at 16:35
Collected race data for Musselburgh at 14:20
Collected race data for Musselburgh at 14:50
Collected race data for Musselburgh at 15:20
Collected race data for Musselburgh at 15:50
Collected race data for Musselburgh at 16:20
Collected race data for Musselburgh at 16:50
Collected race data for Bellewstown at 16:25
Collected race data for Bellewstown at 16:55
Collected race data for Bellewstown at 17:25
Collected race data for Bellewstown at 17:55
Collected race data for Bellewstown at 18:25
Collected race data for Bellewstown at 18:55
Collected race data for Bellewstown at 19:25
Collected race data for Bellewstown at 19:55
Collected race data for Lingfield at 16:10
Collected race data for Lingfield at 16:40
Collected race data for Lingfield at 17:10
Collect

In [4]:
# Load the stored historical results into a DataFrame (one row per run,
# presumably — TODO confirm against unpack_json_to_df).
# Columns read by later cells: 'Horse', 'Race Id', 'Position',
# 'Finishing Speed (%)' and 'Top Speed Value (mph)'.
horse_df = unpack_json_to_df('historical_horse_data.json')

In [5]:
# Clean the historical runs.
#
# The 'N/A' placeholder is converted to NaN *before* rows without a finishing
# position are dropped. Filtering first would let rows whose Position is the
# string 'N/A' slip through, and the per-horse 'Num Races' count computed later
# assumes every remaining row has a real position.
#
# The two speed columns are then cast to float so they can be compared and
# aggregated numerically.
cleaned_df = (
    horse_df
    .replace('N/A', np.nan)
    .dropna(subset=['Position'])
    .astype({'Finishing Speed (%)': float, 'Top Speed Value (mph)': float})
)

In [6]:
# Map every race id to the result of finish_condition for that race.
# The next cell reads element 0 as the mean and element 1 as the standard
# deviation, so finish_condition presumably returns a (mean, std) pair of the
# race's finishing speeds — TODO confirm against racing_tv_functions.
id_list = cleaned_df['Race Id'].unique().tolist()
conditions = {
    race_id: finish_condition(cleaned_df, race_id)
    for race_id in id_list
}

In [7]:
# Per-race finishing-speed thresholds at three strictness levels.
# Each entry of `conditions` is indexed as [0] = mean and [1] = std.

# mean + std
high_condition_dict = {
    race_id: stats[0] + stats[1]
    for race_id, stats in conditions.items()
}

# mean + 0.8 * std
med_condition_dict = {
    race_id: stats[0] + 0.8 * stats[1]
    for race_id, stats in conditions.items()
}

# mean + 0.5 * std
low_condition_dict = {
    race_id: stats[0] + 0.5 * stats[1]
    for race_id, stats in conditions.items()
}

In [8]:
# Row-aligned inputs, also used by the medium and low flagging cells.
# NOTE: this rebinds id_list from the unique race ids to one race id per row.
speed_list = cleaned_df['Finishing Speed (%)'].tolist()
id_list = cleaned_df['Race Id'].tolist()

# True where a run's finishing speed reaches its race's high threshold.
# Everything else (including a missing speed, whose comparison is False)
# becomes NaN, so that a 'count' aggregation tallies only the flagged runs.
high_fast_list = [
    True if speed >= high_condition_dict[race_id] else np.nan
    for speed, race_id in zip(speed_list, id_list)
]

In [9]:
# Flag runs whose finishing speed reaches the race's medium threshold
# (True when reached, NaN otherwise). Uses the row-aligned speed_list and
# id_list built in the high-threshold cell.
med_fast_list = [
    True if speed >= med_condition_dict[race_id] else np.nan
    for speed, race_id in zip(speed_list, id_list)
]

In [10]:
# Flag runs whose finishing speed reaches the race's low threshold
# (True when reached, NaN otherwise). Uses the row-aligned speed_list and
# id_list built in the high-threshold cell.
low_fast_list = [
    True if speed >= low_condition_dict[race_id] else np.nan
    for speed, race_id in zip(speed_list, id_list)
]

In [11]:
# Attach the three fast-finish flag columns at the front of cleaned_df.
# Each one is inserted at position 0 in turn, so the final leading order is
# 'High Fast Finish', 'Med Fast Finish', 'Low Fast Finish'.
fast_finish_flags = {
    'Low Fast Finish': low_fast_list,
    'Med Fast Finish': med_fast_list,
    'High Fast Finish': high_fast_list,
}

# DataFrame.insert raises ValueError if the column already exists, which made
# this cell fail when run a second time. Dropping any stale copies first keeps
# the cell re-runnable without changing the resulting frame.
cleaned_df = cleaned_df.drop(columns=list(fast_finish_flags), errors='ignore')
for column_name, flags in fast_finish_flags.items():
    cleaned_df.insert(0, column_name, flags)

In [12]:
# Per-horse summary of the cleaned historical runs.
# Each entry is output column -> (source column, aggregation); plain tuples are
# the shorthand form of pd.NamedAgg.
# Aggregates of 'Finishing Speed (%)' (max / mean / min / std / median) are
# currently disabled.
summary_spec = {
    'Best Speed (mph)': ('Top Speed Value (mph)', 'max'),
    'Best Pos': ('Position', 'min'),
    'Mean Pos': ('Position', 'mean'),
    'Worst Pos': ('Position', 'max'),
    'Std Pos': ('Position', 'std'),
    # 'count' ignores NaN, so these tally only the runs flagged True.
    'High Fast Finishes': ('High Fast Finish', 'count'),
    'Med Fast Finishes': ('Med Fast Finish', 'count'),
    'Low Fast Finishes': ('Low Fast Finish', 'count'),
    # Number of non-null position values (rows with a null position were removed).
    'Num Races': ('Position', 'count'),
}
grouped_df = cleaned_df.groupby(cleaned_df['Horse']).agg(**summary_spec)

# Share of each horse's races that were flagged at each level.
# NOTE: despite the '(%)' suffix these are fractions (count / count), not
# values multiplied by 100.
for share_column, count_column in (
    ('High Fast Races (%)', 'High Fast Finishes'),
    ('Med Fast Races (%)', 'Med Fast Finishes'),
    ('Low Fast Races (%)', 'Low Fast Finishes'),
):
    grouped_df[share_column] = grouped_df[count_column] / grouped_df['Num Races']

In [13]:
# Put the summary columns in report order and expose the 'Horse' group key as
# a regular column so the next cell can merge on it.
summary_columns = [
    'High Fast Races (%)', 'Med Fast Races (%)', 'Low Fast Races (%)',
    'Best Speed (mph)', 'Best Pos', 'Mean Pos', 'Worst Pos', 'Std Pos',
    'High Fast Finishes', 'Med Fast Finishes', 'Low Fast Finishes',
    'Num Races',
]

# Guarded so the cell can be re-run. Without the guard, a second run selects
# the summary columns from a frame where 'Horse' is already a column, silently
# dropping it, and then adds a stray 'index' column.
if grouped_df.index.name == 'Horse':
    grouped_df = grouped_df.reset_index()
grouped_df = grouped_df[['Horse'] + summary_columns]

In [14]:
# Keep only the race-card columns needed for the report. reset_index() turns
# the index into a regular column, which the merge below reads as 'Horse'.
todays_runners = today_races[['Off Time', 'Track', 'Tips']].reset_index()

# Some horse names have different cases on racingtv and racingpost, so the
# join keys are lower-cased copies of 'Horse' rather than the column itself.
# Joining on key arrays makes pandas add a 'key_0' column. The '' / '_y'
# suffixes leave the race-card 'Horse' untouched and rename the summary's copy
# to 'Horse_y'. The inner join keeps only runners that have historical data.
merged_runners = todays_runners.merge(
    grouped_df,
    left_on=todays_runners['Horse'].str.lower(),
    right_on=grouped_df['Horse'].str.lower(),
    how='inner',
    suffixes=('', '_y'),
)

# Remove the helper columns produced by the merge and index the report by
# off time.
indexed_runners = (
    merged_runners
    .drop(columns=['key_0', 'Horse_y'])
    .set_index('Off Time')
)

# Final report layout. The raw 'High/Med/Low Fast Finishes' counts are left out.
report_columns = [
    'Track', 'Horse',
    'High Fast Races (%)', 'Med Fast Races (%)', 'Low Fast Races (%)',
    'Best Speed (mph)', 'Best Pos', 'Mean Pos', 'Worst Pos', 'Std Pos',
    'Num Races', 'Tips',
]
today_races = indexed_runners[report_columns]

In [15]:
# Write today's report, rounded to 2 decimal places, to a CSV file whose name
# carries the date as day-month-year.
# NOTE(review): the './Horse Data/' directory presumably has to exist before
# this runs — confirm.
todays_date = pd.Timestamp.today().strftime('%d-%m-%Y')
csv_path = './Horse Data/horse_data_{:s}.csv'.format(todays_date)

rounded_report = today_races.round(2)
rounded_report.to_csv(csv_path)

In [16]:
# Display the merged report: one row per runner with historical data, indexed
# by 'Off Time'. The values shown are unrounded; only the CSV export is rounded.
today_races

Unnamed: 0_level_0,Track,Horse,High Fast Races (%),Med Fast Races (%),Low Fast Races (%),Best Speed (mph),Best Pos,Mean Pos,Worst Pos,Std Pos,Num Races,Tips
Off Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
14:05,Ripon,Time Tested,0.300000,0.500000,0.500000,41.82,1.0,4.000000,13.0,4.082483,10,4 tips
14:05,Ripon,Woodstock City,0.000000,0.000000,0.000000,40.12,1.0,4.777778,9.0,2.073802,18,2 tips
14:05,Ripon,Triple Force,0.055556,0.055556,0.111111,40.65,1.0,4.166667,19.0,3.988955,18,1 tip
14:05,Ripon,Poet's Dawn,0.000000,0.000000,0.037037,40.28,1.0,5.000000,19.0,3.616203,27,
14:05,Ripon,Bay Dream Believer,0.476190,0.523810,0.571429,40.53,1.0,3.571429,8.0,1.832251,21,3 tips
...,...,...,...,...,...,...,...,...,...,...,...,...
19:40,Lingfield,Mary Of Modena,0.000000,0.000000,0.000000,44.61,1.0,5.800000,13.0,3.326660,10,1 tip
19:40,Lingfield,Phoenix Moon,0.076923,0.076923,0.307692,43.05,1.0,4.538462,9.0,2.569546,13,4 tips
19:40,Lingfield,Commendation,0.076923,0.230769,0.230769,42.73,1.0,6.615385,15.0,3.884552,13,
19:40,Lingfield,Mammy,0.055556,0.055556,0.055556,43.56,1.0,4.722222,10.0,2.321398,18,3 tips
