In [280]:
track_abbreviation = "cd"
track_name = "Churchill Downs"
race_date = "20250626"
scratches = []

In [281]:
race_quality_dict = {
    "TRL": 1,
    "MCL": 1,
    "WMC": 1,
    "MOC": 1.5,
    "MSA": 1.5,
    "MSW": 2.5,
    "WCL": 2,
    "CLM": 2,
    "MST": 2.5,
    "CLH": 2.5,
    "CST": 2.5,
    "SOC": 2.5,
    "OCL":	2.75,
    "SHP":	3,
    "STR":	2.75,
    "AOC": 3.25,
    "OCS": 3.5,
    "OCH":	3.25,
    "ALW":	4,
    "HCP":	4,
    "SIM":	2,
    "SST":	3.5,
    "STK": 5
}

race_types = {
    "AL": "ALW",
    "MS": "MSW",
    "CL": "CLM",
    "OC": "AOC",
    "MC": "MCL",
    "SO": "SOC",
    "MO": "MCL", # MO is Maiden Optional Claiming but the simulcast data contains no definition for it
    "ST": "STK",
    "SA": "STR"
}

equipment = {
    "B": "B",
    "F": "None",
    "BF": "B",
    "V": "V",
    "R": "R",
    "Y": "Y",

}

In [282]:
import os
import re
import xml.etree.ElementTree as ET
import xmltodict
import pandas as pd
import numpy as np
from datetime import datetime

# Step 2: Load and Parse XML Data using pandas.read_xml
def load_performance_data(file_path):
    # Parse the XML file
    tree = ET.parse(file_path)
    root = tree.getroot()
    print("Loading PP data for: ", os.path.basename(file_path))
    
    # Extract each Race element within EntryRaceCard and convert to a dictionary
    races = []
    for race in root.findall('.//racedata'):
        race_dict = xmltodict.parse(ET.tostring(race))['racedata']
        race_date = race_dict['race_date']
        track_name = race_dict['track']

        race_dict = extract_general_race_info(race_dict, race_date, track_name)

        num_scratches = 0
        for entry in race.findall('.//horsedata'):
            entry_dict = extract_entry_info(entry, race_date, num_scratches)
            workout_dict = extract_workout_info(entry, race_date)
            
            is_scratched = False
            for horse in scratches:
                if horse.lower() in entry_dict['horse_id'].split('_')[0].lower():
                    print(horse, entry_dict['horse_id'].split('_')[0])
                    is_scratched = True
                    num_scratches += 1
                    break
            if not is_scratched:
                races.append({**race_dict, **entry_dict, **workout_dict})
        
    # Convert the list of dictionaries into a DataFrame
    df = pd.DataFrame(races)

    return df

def extract_general_race_info(race_dict, race_date, track_name):
    race_id = f"{race_date}_{race_dict['race']}_{track_name}"
    return {
        "race_id": race_id,
        "course_type": str(race_dict['surface']),
        "distance": int(float(race_dict['distance'])),
        "race_type": race_types[str(race_dict['stkorclm'])],
        "restriction_type": "S" if "state" in str(race_dict['race_text']).lower() else "None",
        "purse": float(race_dict['purse']),
        "number_of_run": len(race_dict['horsedata']),
    }

def extract_entry_info(entry_root, race_date, scratches):
    entry_dict = xmltodict.parse(ET.tostring(entry_root))['horsedata']

    final_dict = {
        "horse_id": f"{entry_dict['horse_name']}_{entry_dict['program']}",
        "gender": str(entry_dict['sex']),
        "post_position": int(entry_dict['pp']) - scratches,
        "weight": int(entry_dict['weight']),
        "equipment": str(entry_dict['equip']),
        "medication": "L" if str(entry_dict['med']) == "1" else "None",
        "jockey_win_percentage": float(entry_dict['jockey']['stats_data']['stat']['wins']) / float(entry_dict['jockey']['stats_data']['stat']['starts']) if float(entry_dict['jockey']['stats_data']['stat']['starts']) != 0 else 0,
        "trainer_win_percentage": float(entry_dict['trainer']['stats_data']['stat']['wins']) / float(entry_dict['trainer']['stats_data']['stat']['starts']) if float(entry_dict['trainer']['stats_data']['stat']['starts']) != 0 else 0,
        "trainer_jockey_win_percentage": float(entry_dict['stats_data']['stat'][22]['wins']) / float(entry_dict['stats_data']['stat'][22]['starts']) if float(entry_dict['stats_data']['stat'][22]['starts']) != 0 else 0,
    }

    summaries = entry_root.findall('.//stats_data')
    if summaries is list:
        summary_dict = xmltodict.parse(ET.tostring(summaries[0]))['stat']['THIS_YEAR']
        final_dict.update({
            "win_percentage_year": float(summary_dict['wins']) / float(summary_dict['starts']),
            "otb_percentage_year": (float(summary_dict['wins']) + float(summary_dict['places']) + float(summary_dict['shows'])) / float(summary_dict['starts']),
        })

    ignored_scratches = 0
    for i, pp in enumerate(entry_root.findall('.//ppdata')):
        normalized_i = i - ignored_scratches
        if normalized_i > 5:
            break
        pp_dict = xmltodict.parse(ET.tostring(pp))['ppdata']

        # Get the actual equipment being worn
        if normalized_i == 0:
            if final_dict['equipment'] != "OFF":
                if str(pp_dict['equipment']) in equipment.keys():
                    final_dict.update({
                        "equipment": equipment[str(pp_dict['equipment'])]
                    })
                else:
                    final_dict.update({
                        "equipment": "None"
                    })
            else:
                final_dict.update({
                    "equipment": "O"
                })

        race_type = str(pp_dict['racetype'])
        if (race_type == 'SCR'):
            ignored_scratches += 1
            continue
        race_quality = race_quality_dict[race_type] if str(race_type) in race_quality_dict.keys() else 1

        if str(pp_dict['statebredr']) == 'S':
            race_quality -= 1
        if race_type == 'STK' and pp_dict['racegrade'] != 0:
            race_quality += 1 + int(pp_dict['racegrade'])

        if i == 0:
            final_dict.update({
                "pp_layoff": (datetime.strptime(race_date, '%Y%m%d') - datetime.strptime(pp_dict['racedate'][:10], '%Y%m%d')).days
            })

        bad_luck = False
        long_comment = str(pp_dict['longcommen']).lower()
        if long_comment is not None:
            if any(['bump' in long_comment, 'stumbled' in long_comment, 'checked' in long_comment, 'steadied' in long_comment, 'stopped' in long_comment, 'squeezed' in long_comment, 'steady' in long_comment or 'steadied' in long_comment, 'head turned' in long_comment, 'unprepared start' in long_comment, 'wd' in long_comment or 'wide' in long_comment, 'bled' in long_comment]):
                bad_luck = True

        final_dict.update({
            f"pp_track_{normalized_i}": str(pp_dict['trackcode']),
            f"pp_time_since_race_{normalized_i}": (datetime.strptime(race_date, '%Y%m%d') - datetime.strptime(pp_dict['racedate'][:10], '%Y%m%d')).days,
            f"pp_course_type_{normalized_i}": str(pp_dict['surface']),
            f"pp_distance_{normalized_i}": int(pp_dict['distance']),
            f"pp_quality_{normalized_i}": race_quality,
            f"pp_purse_{normalized_i}": float(pp_dict['purse']),
            f"pp_normalized_position_{normalized_i}": np.divide(float(pp_dict['positionfi']), float(pp_dict['fieldsize'])),
            f"pp_class_rating_{normalized_i}": int(pp_dict['classratin']),
            f"pp_speed_rating_{normalized_i}": int(pp_dict['speedfigur']),
            f"pp_pace_figure_{normalized_i}": int(pp_dict['pacefigur2']),
            f"pp_bad_luck_{normalized_i}": bad_luck,
        })

    return final_dict

def extract_workout_info(entry_root, race_date):
    final_dict = {}
    for i, workout in enumerate(entry_root.findall('.//workoutdata')):    
        if i > 3:
            return final_dict
        workout_dict = xmltodict.parse(ET.tostring(workout))['workoutdata']
        final_dict.update({
            f"workout_last_month_{i}": True if int(workout_dict['days_back']) < 30 else False,
            f"workout_distance_{i}": int(workout_dict['worktext'][0]) * 100,
            f"workout_course_type_{i}": "D", # No data available, usually dirt
            f"workout_time_{i}": int(float(re.sub('\D', '', get_substring_from_char(workout_dict['worktext'], ':'))) * 10),
            f"workout_rank_{i}": int(workout_dict['ranking']) / int(workout_dict['rank_group']),
        })

    return final_dict

def get_substring_from_char(s, char):
    pos = s.find(char) + 1
    if pos != -1:
        return s[pos:]
    else:
        return ""

performance_path = "C:\\Users\\dylan\\OneDrive - Wayne State College\\Documents\\XML_PPs"
file_name = f'{track_abbreviation}{race_date}ppsXML.xml'  # Add your suffixes here

# Load all past performance files
performance_data = load_performance_data(performance_path + '\\' + file_name)

Loading PP data for:  cd20250626ppsXML.xml


In [283]:
# Go through each row and create new columns
for i, row in performance_data.iterrows():
    # Initialize new columns
    performance_data.at[i, "first_off_layoff"] = False
    performance_data.at[i, "second_off_layoff"] = False
    performance_data.at[i, "third_off_layoff"] = False
    
    # Set if horse is off layoff
    if row['pp_time_since_race_0'] > 45:
        performance_data.at[i, 'first_off_layoff'] = True
    elif row['pp_time_since_race_1'] - row['pp_time_since_race_0'] > 45:
        performance_data.at[i, 'second_off_layoff'] = True
    elif row['pp_time_since_race_2'] - row['pp_time_since_race_1'] > 45:
        performance_data.at[i, 'third_off_layoff'] = True

  performance_data.at[i, "first_off_layoff"] = False
  performance_data.at[i, "second_off_layoff"] = False
  performance_data.at[i, "third_off_layoff"] = False


In [284]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
import pickle

performance_data.reset_index()

# Drop unneeded columns
horse_ids = performance_data['horse_id']
performance_data = performance_data.drop(columns=['horse_id'])

# Impute only columns with missing values
imputer: SimpleImputer = pickle.load(open(f"Models\\{track_name}\\imputer.pkl", "rb"))
columns_with_missing = imputer.feature_names_in_
imputed_array = imputer.transform(performance_data[columns_with_missing])

# Convert the imputed array back to a DataFrame with original column names
imputed_data = pd.DataFrame(imputed_array, columns=columns_with_missing)

# Combine the imputed columns with the rest of the data
data = performance_data.copy()
data[columns_with_missing] = imputed_data

# Get correct column types
data = data.infer_objects()

# Use LabelEncoder on string columns
label_encoders = pickle.load(open(f"Models\\{track_name}\\label_encoders.pkl", "rb"))
for col in data.columns:
    if col == "race_id" or col == "horse_id":
        data[col] = LabelEncoder().fit_transform(data[col])
    elif data[col].dtype == 'object':
        try:
            if ('pp_track' in col):
                # If the track was unknown in test, set it to be the same as the track we're running at
                data[col] = data[col].map(lambda s: track_abbreviation.upper() if s not in label_encoders[col].classes_ else s)
            data[col] = label_encoders[col].transform(data[col])
        except:
            data[col] = LabelEncoder().fit_transform(data[col])
            print(f"Error encoding column: {col}")
        


Error encoding column: restriction_type


In [285]:
model = pickle.load(open(f"Models\\{track_name}\\{track_name}_Model.pkl", "rb"))

feature_names = model.feature_names_in_

# Reorder the columns of merged_data_imputed to match the order of feature_names
data = data[feature_names]

# Try to load ensemble models first, fallback to single model
try:
    ensemble_models = pickle.load(open(f"Models\\{track_name}\\{track_name}_Ensemble.pkl", "rb"))
    print(f"Loaded ensemble of {len(ensemble_models)} models")
    use_ensemble = True
    model = ensemble_models[0]  # Keep for compatibility
except FileNotFoundError:
    print("Ensemble models not found, using single model")
    model = pickle.load(open(f"Models\\{track_name}\\{track_name}_Model.pkl", "rb"))
    use_ensemble = False

feature_names = model.feature_names_in_

# Reorder the columns of merged_data_imputed to match the order of feature_names
data = data[feature_names]

# Generate predictions
if use_ensemble:
    # Generate predictions from all models in the ensemble
    ensemble_predictions = []
    for i, ensemble_model in enumerate(ensemble_models):
        predictions = ensemble_model.predict(data)
        ensemble_predictions.append(predictions)
    
    # Convert to numpy array for easier manipulation
    ensemble_predictions = np.array(ensemble_predictions)
    
    # Calculate ensemble statistics
    y_predict = np.mean(ensemble_predictions, axis=0)  # Mean prediction
    prediction_std = np.std(ensemble_predictions, axis=0)  # Standard deviation across models
    
    # Calculate confidence scores based on model agreement (inverse of standard deviation)
    # Lower std = higher confidence, higher std = lower confidence
    max_std = np.max(prediction_std)
    min_std = np.min(prediction_std)
    
    # Normalize confidence to 0-100 scale (100 = highest confidence, 0 = lowest confidence)
    if max_std > min_std:
        confidence_scores = 100 * (1 - (prediction_std - min_std) / (max_std - min_std))
    else:
        confidence_scores = np.full_like(prediction_std, 100)  # All predictions have same confidence
    
    print(f"Ensemble predictions generated with confidence scores")
    print(f"Average confidence: {np.mean(confidence_scores):.2f}")
    print(f"Confidence range: {np.min(confidence_scores):.2f} - {np.max(confidence_scores):.2f}")
    
else:
    # Single model prediction
    y_predict = model.predict(data)
    confidence_scores = np.full_like(y_predict, 50)  # Default moderate confidence
    print("Single model predictions generated")

Loaded ensemble of 5 models
Ensemble predictions generated with confidence scores
Average confidence: 56.28
Confidence range: 0.00 - 100.00


In [286]:
# Use the ensemble predictions from the previous cell
predicted_normalized_position = y_predict
predicted_finish_position = ((predicted_normalized_position * data['number_of_run']) / 100)

# Add confidence scores to the results
print(f"Predictions generated for {len(predicted_finish_position)} horses")
print(f"Confidence scores range: {np.min(confidence_scores):.1f} to {np.max(confidence_scores):.1f}")

Predictions generated for 80 horses
Confidence scores range: 0.0 to 100.0


In [287]:
# Create a DataFrame by concatenating the series, now including confidence scores
results_df = pd.concat([performance_data['race_id'], horse_ids, predicted_finish_position, pd.Series(confidence_scores, index=performance_data.index)], axis=1)

In [288]:
# Display current race predictions with confidence scores
print(f"Predictions for {track_name} on {race_date}:")
print("=" * 60)

results_df.columns = ['race_id', 'horse_id', 'predicted_finish_position', 'confidence_score']
results_df['predicted_finish_position'] = results_df['predicted_finish_position'].round(2)
results_df['confidence_score'] = results_df['confidence_score'].round(1)

# Group by race and show top picks with confidence
for race_id in results_df['race_id'].unique():
    race_data = results_df[results_df['race_id'] == race_id].sort_values('predicted_finish_position')
    race_num = race_id.split('_')[1]
    print(f"\nRace {race_num}:")
    print("Top 4 Predictions (with confidence):")
    for i, (_, row) in enumerate(race_data.head(4).iterrows(), 1):
        horse_name = row['horse_id'].split('_')[0]
        prog_num = row['horse_id'].split('_')[1] 
        confidence = row['confidence_score']
        confidence_level = "HIGH" if confidence >= 75 else "MODERATE" if confidence >= 50 else "LOW"
        print(f"  {i}. #{prog_num} {horse_name} (Predicted: {row['predicted_finish_position']:.2f}, Confidence: {confidence:.1f} - {confidence_level})")
    
    # Calculate spread between top two picks
    if len(race_data) >= 2:
        top_two = race_data.head(2)
        spread = top_two.iloc[1]['predicted_finish_position'] - top_two.iloc[0]['predicted_finish_position']
        avg_confidence = top_two['confidence_score'].mean()
        print(f"  Spread: {spread:.2f}, Avg Top-2 Confidence: {avg_confidence:.1f}")
    print("-" * 40)

Predictions for Churchill Downs on 20250626:

Race 1:
Top 4 Predictions (with confidence):
  1. #4 PIERCE ELEVATED (Predicted: 3.15, Confidence: 52.3 - MODERATE)
  2. #6 GOLDEN PLATE (Predicted: 3.28, Confidence: 78.4 - HIGH)
  3. #3 GOLD SEARCH (Predicted: 3.31, Confidence: 59.3 - MODERATE)
  4. #2 GLOBAL LEGEND (Predicted: 3.42, Confidence: 48.7 - LOW)
  Spread: 0.13, Avg Top-2 Confidence: 65.3
----------------------------------------

Race 2:
Top 4 Predictions (with confidence):
  1. #3 BORN FLASHY (Predicted: 2.51, Confidence: 0.0 - LOW)
  2. #4 BOURBON OUTLAW (Predicted: 2.92, Confidence: 40.7 - LOW)
  3. #6 MUGATU (Predicted: 3.41, Confidence: 9.5 - LOW)
  4. #7 SHATTUCK (Predicted: 3.92, Confidence: 62.2 - MODERATE)
  Spread: 0.41, Avg Top-2 Confidence: 20.4
----------------------------------------

Race 3:
Top 4 Predictions (with confidence):
  1. #5 ASHES AND DIAMONDS (Predicted: 3.12, Confidence: 100.0 - HIGH)
  2. #6 ORDER RESTORED (Predicted: 3.52, Confidence: 53.2 - MODERA

## Prediction Variance Analysis & Multi-Race Betting Strategies

To develop effective multi-race betting strategies, we need to analyze the historical variance between our model's top predictions and actual race winners. This analysis will help us understand the reliability of our predictions and develop strategies that account for uncertainty.

In [289]:
# Load historical training data to analyze prediction variance
print("Loading historical data for variance analysis...")

# Load the imputed data that was used for training
historical_data = pd.read_csv(f'Imputed Data\\{track_name}.csv')
print(f"Loaded {len(historical_data)} historical race entries")

# Get unique race count
unique_races = historical_data['race_id'].nunique()
print(f"Covering {unique_races} historical races")

Loading historical data for variance analysis...
Loaded 4675 historical race entries
Covering 575 historical races


In [290]:
# Recreate predictions on historical data to analyze variance
print("Generating predictions on historical data...")

# Prepare the historical data similar to how we prepared the current race data
historical_features = historical_data.drop(columns=['normalized_position', 'Position', 'odds'])
historical_actual = historical_data['Position'].astype(int)

# Get predictions for historical data using ensemble if available
if use_ensemble:
    # Generate ensemble predictions on historical data
    historical_ensemble_predictions = []
    for ensemble_model in ensemble_models:
        predictions = ensemble_model.predict(historical_features[feature_names])
        historical_ensemble_predictions.append(predictions)
    
    # Convert to numpy array and calculate mean predictions
    historical_ensemble_predictions = np.array(historical_ensemble_predictions)
    historical_predicted_normalized = np.mean(historical_ensemble_predictions, axis=0)
else:
    # Single model prediction
    historical_predicted_normalized = model.predict(historical_features[feature_names])

# Convert normalized predictions to actual positions
historical_predicted_positions = (historical_predicted_normalized * historical_features['number_of_run']) / 100

# Create variance analysis DataFrame
variance_df = pd.DataFrame({
    'race_id': historical_data['race_id'],
    'actual_position': historical_actual,
    'predicted_position': historical_predicted_positions,
    'prediction_error': historical_predicted_positions - historical_actual,
    'number_of_run': historical_features['number_of_run']
})

print(f"Generated predictions for {len(variance_df)} historical entries")

Generating predictions on historical data...
Generated predictions for 4675 historical entries


In [291]:
# Analyze top pick variance
print("PREDICTION VARIANCE ANALYSIS")
print("=" * 50)

# For each race, get the top predicted horse and analyze its actual performance
top_picks_analysis = []

for race_id in variance_df['race_id'].unique():
    race_data = variance_df[variance_df['race_id'] == race_id]
    
    # Find the horse with the best (lowest) predicted position
    top_pick_idx = race_data['predicted_position'].idxmin()
    top_pick = race_data.loc[top_pick_idx]
    
    top_picks_analysis.append({
        'race_id': race_id,
        'predicted_position': top_pick['predicted_position'],
        'actual_position': top_pick['actual_position'],
        'prediction_error': top_pick['prediction_error'],
        'field_size': top_pick['number_of_run'],
        'won_race': top_pick['actual_position'] == 1,
        'in_top_3': top_pick['actual_position'] <= 3,
        'finished_worse_than_predicted': top_pick['actual_position'] > top_pick['predicted_position']
    })

top_picks_df = pd.DataFrame(top_picks_analysis)

# Calculate key statistics
win_rate = top_picks_df['won_race'].mean() * 100
top_3_rate = top_picks_df['in_top_3'].mean() * 100
avg_prediction_error = top_picks_df['prediction_error'].mean()
std_prediction_error = top_picks_df['prediction_error'].std()
avg_actual_position = top_picks_df['actual_position'].mean()

print(f"Top Pick Performance:")
print(f"  Win Rate: {win_rate:.1f}%")
print(f"  Top 3 Rate: {top_3_rate:.1f}%")
print(f"  Average Actual Finish: {avg_actual_position:.2f}")
print(f"  Average Prediction Error: {avg_prediction_error:.2f} positions")
print(f"  Standard Deviation of Error: {std_prediction_error:.2f} positions")

# Calculate confidence intervals
error_95th_percentile = np.percentile(top_picks_df['prediction_error'], 95)
error_5th_percentile = np.percentile(top_picks_df['prediction_error'], 5)

print(f"\nPrediction Error Distribution:")
print(f"  95% of top picks finish within {error_95th_percentile:.2f} positions of prediction")
print(f"  5% of top picks finish within {error_5th_percentile:.2f} positions of prediction")
print(f"  68% confidence interval: ±{std_prediction_error:.2f} positions")
print(f"  95% confidence interval: ±{1.96 * std_prediction_error:.2f} positions")

PREDICTION VARIANCE ANALYSIS
Top Pick Performance:
  Win Rate: 73.0%
  Top 3 Rate: 91.0%
  Average Actual Finish: 1.65
  Average Prediction Error: 1.31 positions
  Standard Deviation of Error: 1.47 positions

Prediction Error Distribution:
  95% of top picks finish within 3.19 positions of prediction
  5% of top picks finish within -1.18 positions of prediction
  68% confidence interval: ±1.47 positions
  95% confidence interval: ±2.88 positions


In [None]:
# Save prediction results to CSV for BetBuilder.ipynb
import os

# Create filename with track and date
csv_filename = f"Predictions\\predictions_{track_abbreviation}_{race_date}.csv"
csv_path = os.path.join(os.getcwd(), csv_filename)

# Save results_df to CSV
results_df.to_csv(csv_path, index=False)

print(f"✅ Predictions saved to: {csv_filename}")
print(f"📊 Data saved: {len(results_df)} entries for {len(results_df['race_id'].unique())} races")
print(f"🏁 Track: {track_name}")
print(f"📅 Date: {race_date}")
print(f"\n🎯 Next step: Open BetBuilder.ipynb to build betting strategies using this data.")

✅ Predictions saved to: predictions_cd_20250626.csv
📊 Data saved: 80 entries for 8 races
🏁 Track: Churchill Downs
📅 Date: 20250626

🎯 Next step: Open BetBuilder.ipynb to build betting strategies using this data.
