In [4]:
track_abbreviation = "baq"
track_name = "Aqueduct"
race_date = "20241003"

In [5]:
race_quality_dict = {
    "TRL": 1,
    "MCL": 1,
    "WMC": 1,
    "MOC": 1.5,
    "MSA": 1.5,
    "MSW": 2.5,
    "WCL": 2,
    "CLM": 2,
    "MST": 2.5,
    "CLH": 2.5,
    "CST": 2.5,
    "SOC": 2.5,
    "OCL":	2.75,
    "SHP":	3,
    "STR":	2.75,
    "AOC": 3.25,
    "OCS": 3.5,
    "OCH":	3.25,
    "ALW":	4,
    "HCP":	4,
    "SIM":	2,
    "SST":	3.5,
    "STK": 5
}

race_types = {
    "AL": "ALW",
    "MS": "MSW",
    "CL": "CLM",
    "OC": "AOC",
    "MC": "MCL",
    "SO": "SOC",
    "MO": "MCL", # MO is Maiden Optional Claiming but the simulcast data contains no definition for it
    "ST": "STK",
}

In [7]:
import os
import re
import xml.etree.ElementTree as ET
import xmltodict
import pandas as pd
import numpy as np
from datetime import datetime

# Step 2: Load and Parse XML Data using pandas.read_xml
def load_performance_data(file_path):
    # Parse the XML file
    tree = ET.parse(file_path)
    root = tree.getroot()
    print("Loading PP data for: ", os.path.basename(file_path))
    
    # Extract each Race element within EntryRaceCard and convert to a dictionary
    races = []
    for race in root.findall('.//racedata'):
        race_dict = xmltodict.parse(ET.tostring(race))['racedata']
        race_date = race_dict['race_date']
        track_name = race_dict['track']

        race_dict = extract_general_race_info(race_dict, race_date, track_name)

        for entry in race.findall('.//horsedata'):
            entry_dict = extract_entry_info(entry, race_date)
            workout_dict = extract_workout_info(entry, race_date)
            races.append({**race_dict, **entry_dict, **workout_dict})
        
    # Convert the list of dictionaries into a DataFrame
    df = pd.DataFrame(races)

    return df

def extract_general_race_info(race_dict, race_date, track_name):
    race_id = f"{race_date}_{race_dict['race']}_{track_name}"
    return {
        "race_id": race_id,
        "course_type": str(race_dict['surface']),
        "distance": int(float(race_dict['distance'])),
        "race_type": race_types[str(race_dict['stkorclm'])],
        "restriction_type": "S" if "state" in str(race_dict['race_text']).lower() else "None",
        "purse": float(race_dict['purse']),
        "number_of_run": len(race_dict['horsedata']),
    }

def extract_entry_info(entry_root, race_date):
    entry_dict = xmltodict.parse(ET.tostring(entry_root))['horsedata']

    final_dict = {
        "horse_id": f"{entry_dict['horse_name']}_{entry_dict['program']}",
        "gender": str(entry_dict['sex']),
        "post_position": int(entry_dict['pp']),
        "weight": int(entry_dict['weight']),
        "equipment": str(entry_dict['equip']),
        "medication": str(entry_dict['med']),
        "jockey_win_percentage": float(entry_dict['jockey']['stats_data']['stat']['wins']) / float(entry_dict['jockey']['stats_data']['stat']['starts']) if float(entry_dict['jockey']['stats_data']['stat']['starts']) != 0 else 0,
        "trainer_win_percentage": float(entry_dict['trainer']['stats_data']['stat']['wins']) / float(entry_dict['trainer']['stats_data']['stat']['starts']) if float(entry_dict['trainer']['stats_data']['stat']['starts']) != 0 else 0,
        "trainer_jockey_win_percentage": float(entry_dict['stats_data']['stat'][22]['wins']) / float(entry_dict['stats_data']['stat'][22]['starts']) if float(entry_dict['stats_data']['stat'][22]['starts']) != 0 else 0,
    }

    summaries = entry_root.findall('.//stats_data')
    if summaries is list:
        summary_dict = xmltodict.parse(ET.tostring(summaries[0]))['stat']['THIS_YEAR']
        final_dict.update({
            "win_percentage_year": float(summary_dict['wins']) / float(summary_dict['starts']),
            "otb_percentage_year": (float(summary_dict['wins']) + float(summary_dict['places']) + float(summary_dict['shows'])) / float(summary_dict['starts']),
        })


    for i, pp in enumerate(entry_root.findall('.//ppdata')):
        if i > 5:
            break
        pp_dict = xmltodict.parse(ET.tostring(pp))['ppdata']

        race_type = str(pp_dict['racetype'])
        race_quality = race_quality_dict[race_type] if str(race_type) in race_quality_dict.keys() else 1

        if str(pp_dict['statebredr']) == 'S':
            race_quality -= 1
        if race_type == 'STK' and pp_dict['racegrade'] != 0:
            race_quality += 1 + int(pp_dict['racegrade'])

        if i == 0:
            final_dict.update({
                "pp_layoff": (datetime.strptime(race_date, '%Y%m%d') - datetime.strptime(pp_dict['racedate'][:10], '%Y%m%d')).days
            })

        bad_luck = False
        long_comment = str(pp_dict['longcommen']).lower()
        if long_comment is not None:
            if any(['bump' in long_comment, 'stumbled' in long_comment, 'checked' in long_comment, 'steadied' in long_comment, 'stopped' in long_comment, 'squeezed' in long_comment, 'steady' in long_comment or 'steadied' in long_comment, 'head turned' in long_comment, 'unprepared start' in long_comment, 'wd' in long_comment or 'wide' in long_comment, 'bled' in long_comment]):
                bad_luck = True

        final_dict.update({
            f"pp_track_{i}": str(pp_dict['trackcode']),
            f"pp_time_since_race_{i}": (datetime.strptime(race_date, '%Y%m%d') - datetime.strptime(pp_dict['racedate'][:10], '%Y%m%d')).days,
            f"pp_course_type_{i}": str(pp_dict['surface']),
            f"pp_distance_{i}": int(pp_dict['distance']),
            f"pp_quality_{i}": race_quality,
            f"pp_purse_{i}": float(pp_dict['purse']),
            f"pp_normalized_position_{i}": np.divide(float(pp_dict['positionfi']), float(pp_dict['fieldsize'])),
            f"pp_class_rating_{i}": int(pp_dict['classratin']),
            f"pp_speed_rating_{i}": int(pp_dict['speedfigur']),
            f"pp_pace_figure_{i}": int(pp_dict['pacefigur2']),
            f"pp_bad_luck_{i}": bad_luck,
        })

    return final_dict

def extract_workout_info(entry_root, race_date):
    final_dict = {}
    for i, workout in enumerate(entry_root.findall('.//workoutdata')):    
        if i > 3:
            return final_dict
        workout_dict = xmltodict.parse(ET.tostring(workout))['workoutdata']
        final_dict.update({
            f"workout_last_month_{i}": True if int(workout_dict['days_back']) < 30 else False,
            f"workout_distance_{i}": int(workout_dict['worktext'][0]) * 100,
            f"workout_course_type_{i}": str(workout_dict['worktext'][1]),
            f"workout_time_{i}": int(float(re.sub('\D', '', get_substring_from_char(workout_dict['worktext'], ':'))) * 10),
            f"workout_rank_{i}": int(workout_dict['ranking']) / int(workout_dict['rank_group']),
        })

    return final_dict

def get_substring_from_char(s, char):
    pos = s.find(char) + 1
    if pos != -1:
        return s[pos:]
    else:
        return ""

performance_path = "C:\\Users\\dylan\\OneDrive - Wayne State College\\Documents\\XML_PPs"
file_name = f'{track_abbreviation}{race_date}ppsXML.xml'  # Add your suffixes here

# Load all past performance files
performance_data = load_performance_data(performance_path + '\\' + file_name)

Loading PP data for:  baq20241003ppsXML.xml


  f"pp_normalized_position_{i}": np.divide(float(pp_dict['positionfi']), float(pp_dict['fieldsize'])),


In [8]:
# Go through each row and create new columns
for i, row in performance_data.iterrows():
    # Initialize new columns
    performance_data.at[i, "first_off_layoff"] = False
    performance_data.at[i, "second_off_layoff"] = False
    performance_data.at[i, "third_off_layoff"] = False
    
    # Set if horse is off layoff
    if row['pp_time_since_race_0'] > 45:
        performance_data.at[i, 'first_off_layoff'] = True
    elif row['pp_time_since_race_1'] - row['pp_time_since_race_0'] > 45:
        performance_data.at[i, 'second_off_layoff'] = True
    elif row['pp_time_since_race_2'] - row['pp_time_since_race_1'] > 45:
        performance_data.at[i, 'third_off_layoff'] = True

  performance_data.at[i, "first_off_layoff"] = False
  performance_data.at[i, "second_off_layoff"] = False
  performance_data.at[i, "third_off_layoff"] = False


In [9]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
import pickle

performance_data.reset_index()

# Drop unneeded columns
horse_ids = performance_data['horse_id']
performance_data = performance_data.drop(columns=['horse_id'])

# Identify columns with missing values
columns_with_missing = performance_data.columns[performance_data.isnull().any()]

# Impute only columns with missing values
imputer = SimpleImputer(strategy='most_frequent')
imputed_array = imputer.fit_transform(performance_data[columns_with_missing])

# Convert the imputed array back to a DataFrame with original column names
imputed_data = pd.DataFrame(imputed_array, columns=columns_with_missing)

# Combine the imputed columns with the rest of the data
data = performance_data.copy()
data[columns_with_missing] = imputed_data

# Get correct column types
data = data.infer_objects()

# Use LabelEncoder on string columns
label_encoders = pickle.load(open(f"Models\\{track_name}\\label_encoders.pkl", "rb"))
for col in data.columns:
    if col == "race_id" or col == "horse_id":
        data[col] = LabelEncoder().fit_transform(data[col])
    elif data[col].dtype == 'object':
        try:
            data[col] = label_encoders[col].transform(data[col])
        except:
            data[col] = LabelEncoder().fit_transform(data[col])
            print(col, " errored")
        


equipment  errored
medication  errored
pp_track_0  errored
pp_track_2  errored
workout_course_type_0  errored
workout_course_type_1  errored
workout_course_type_2  errored
workout_course_type_3  errored
pp_track_5  errored


In [10]:
model = pickle.load(open(f"Models\\{track_name}\\{track_name}_Model.pkl", "rb"))

feature_names = model.feature_names_in_

# Reorder the columns of merged_data_imputed to match the order of feature_names
data = data[feature_names]

# Now make the prediction
y_predict = model.predict(data)

In [11]:
predicted_normalized_position = model.predict(data)
predicted_finish_position = ((predicted_normalized_position * data['number_of_run']) / 100)

In [12]:
# Create a DataFrame by concatenating the series
results_df = pd.concat([performance_data['race_id'], horse_ids, predicted_finish_position], axis=1)