In [317]:
track_abbreviation = "asd"
track_name = "Assiniboia Downs"

In [318]:
import pandas as pd

data = pd.read_csv(f'Imputed Data\\{track_name}.csv')

In [319]:
from sklearn.model_selection import GroupShuffleSplit
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
import numpy as np

# Assuming 'race_type' is a column in your data
X = data.drop(columns=['normalized_position', 'Position'])
y = data['normalized_position']
groups = data['race_id']
stratify_col = data['race_type']

# Initialize GroupShuffleSplit
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# Create empty lists to store train and test indices
train_idx = []
test_idx = []

# Split each race_type group
for race_type in stratify_col.unique():
    race_type_mask = stratify_col == race_type
    X_race_type = X[race_type_mask]
    y_race_type = y[race_type_mask]
    groups_race_type = groups[race_type_mask]
    
    if len(X_race_type) < 2:
        continue
    
    if len(X_race_type) > 1 and len(y_race_type) > 1:  # Ensure there are enough samples to split
        gss_split = gss.split(X_race_type, y_race_type, groups=groups_race_type)
        try:
            train_idx_race_type, test_idx_race_type = next(gss_split)
        except ValueError:
            continue
    
        train_idx.extend(X_race_type.index[train_idx_race_type])
        test_idx.extend(X_race_type.index[test_idx_race_type])

# Convert lists to arrays
train_idx = pd.Index(train_idx)
test_idx = pd.Index(test_idx)

X_train, X_test = X.loc[train_idx], X.loc[test_idx]
y_train, y_test = y.loc[train_idx], y.loc[test_idx]

# Store the 'odds' column for later use
odds_train = X_train['odds']
odds_test = X_test['odds']

# Drop the 'odds' column from the training and testing datasets
X_train = X_train.drop(columns=['odds'])
X_test = X_test.drop(columns=['odds'])

# Create ensemble of random forest models with different configurations
ensemble_models = []
model_configs = [
    {'n_estimators': 300, 'max_depth': 20, 'max_features': 0.5, 'min_samples_leaf': 10, 'random_state': 42},
    {'n_estimators': 250, 'max_depth': 25, 'max_features': 0.6, 'min_samples_leaf': 8, 'random_state': 123},
    {'n_estimators': 350, 'max_depth': 15, 'max_features': 0.4, 'min_samples_leaf': 12, 'random_state': 456},
    {'n_estimators': 275, 'max_depth': 30, 'max_features': 0.7, 'min_samples_leaf': 15, 'random_state': 789},
    {'n_estimators': 320, 'max_depth': 18, 'max_features': 0.5, 'min_samples_leaf': 9, 'random_state': 999}
]

print(f"Training ensemble of {len(model_configs)} Random Forest models...")

for i, config in enumerate(model_configs):
    print(f"Training model {i+1}/{len(model_configs)}...")
    model = RandomForestRegressor(n_jobs=-1, **config)
    model.fit(X_train, y_train)
    ensemble_models.append(model)

print("Ensemble training complete!")

# Make predictions with each model in the ensemble
predictions = np.zeros((len(X_test), len(ensemble_models)))

for i, model in enumerate(ensemble_models):
    predictions[:, i] = model.predict(X_test)

# Calculate the mean prediction and the standard deviation (as a measure of uncertainty)
y_pred_mean = predictions.mean(axis=1)
y_pred_std = predictions.std(axis=1)

# Calculate confidence scores (e.g., using the inverse of the standard deviation)
confidence_scores = 1 / (1 + y_pred_std)

# The final predictions can be the mean predictions, and you can use the confidence scores as needed
final_predictions = y_pred_mean

Training ensemble of 5 Random Forest models...
Training model 1/5...
Training model 2/5...
Training model 2/5...
Training model 3/5...
Training model 3/5...
Training model 4/5...
Training model 4/5...
Training model 5/5...
Training model 5/5...
Ensemble training complete!
Ensemble training complete!


In [320]:
# Generate predictions from all models in the ensemble
ensemble_predictions = []
for i, model in enumerate(ensemble_models):
    predictions = model.predict(X_test)
    ensemble_predictions.append(predictions)
    print(f"Model {i+1} predictions generated")

# Convert to numpy array for easier manipulation
ensemble_predictions = np.array(ensemble_predictions)

# Calculate ensemble statistics
predicted_normalized_position = np.mean(ensemble_predictions, axis=0)  # Mean prediction
prediction_std = np.std(ensemble_predictions, axis=0)  # Standard deviation across models

# Calculate confidence scores based on model agreement (inverse of standard deviation)
# Lower std = higher confidence, higher std = lower confidence
max_std = np.max(prediction_std)
min_std = np.min(prediction_std)

# Normalize confidence to 0-100 scale (100 = highest confidence, 0 = lowest confidence)
if max_std > min_std:
    confidence_scores = 100 * (1 - (prediction_std - min_std) / (max_std - min_std))
else:
    confidence_scores = np.full_like(prediction_std, 100)  # All predictions have same confidence

# Convert to finish positions
predicted_finish_position = ((predicted_normalized_position * X_test['number_of_run']) / 100)

print(f"Ensemble predictions complete!")
print(f"Average confidence score: {np.mean(confidence_scores):.2f}")
print(f"Confidence score range: {np.min(confidence_scores):.2f} - {np.max(confidence_scores):.2f}")

Model 1 predictions generated
Model 2 predictions generated
Model 3 predictions generated
Model 4 predictions generated
Model 5 predictions generated
Ensemble predictions complete!
Average confidence score: 65.61
Confidence score range: 0.00 - 100.00
Model 5 predictions generated
Ensemble predictions complete!
Average confidence score: 65.61
Confidence score range: 0.00 - 100.00


In [321]:
# Extract actual finish positions
actual_finish_position = (y_test * X_test['number_of_run']) / 100
actual_finish_position = actual_finish_position.astype(int)

# Extract and normalize the Odds column
odds = odds_test
normalized_odds = (odds - odds.min()) / (odds.max() - odds.min()) * 100  # Scale to a range of 0 to 100

In [322]:
# Create a DataFrame by concatenating the series, now including confidence scores
results_df = pd.concat([X_test['race_id'], actual_finish_position, predicted_finish_position, odds, pd.Series(confidence_scores, index=X_test.index)], axis=1)

# Rename the columns for clarity
results_df.columns = ['race_id', 'actual_finish_position', 'predicted_finish_position', 'odds', 'confidence_score']

print(f"Results dataframe created with {len(results_df)} predictions")
print(f"Sample confidence scores: {results_df['confidence_score'].head().values}")


Results dataframe created with 474 predictions
Sample confidence scores: [75.35058077 68.97879056 50.50699063 77.39271927 66.67451125]


In [323]:
# Create another dataframe from results_df that selects the lowest predicted_finish_position for each race_id
best_predictions = results_df.groupby('race_id').agg({'predicted_finish_position': 'min'}).reset_index()
# Merge the best_predictions dataframe with results_df to get all other columns
best_predictions = pd.merge(best_predictions, results_df, on=["race_id", "predicted_finish_position"], how='inner')

In [324]:
# Calculate percent of horses that finished 1
num_firsts = best_predictions[best_predictions['actual_finish_position'] == 1].shape[0]
print("Win: {:.2f}%".format(num_firsts / best_predictions.shape[0] * 100))

# Calculate percent of horses that finished 1 or 2
num_seconds = best_predictions[best_predictions['actual_finish_position'] <= 2].shape[0]
print("Place: {:.2f}%".format(num_seconds / best_predictions.shape[0] * 100))

# Calculate percent of horses that finished 1, 2, or 3
num_thirds = best_predictions[best_predictions['actual_finish_position'] <= 3].shape[0]
print("Show: {:.2f}%".format(num_thirds / best_predictions.shape[0] * 100))

Win: 36.99%
Place: 60.27%
Show: 75.34%


In [325]:
# Generate win bet ROI per race
grouped = best_predictions.groupby('race_id')
roi_per_race = grouped.apply(lambda x: (x[x['actual_finish_position'] == 1]['odds'].sum() - len(x)) / len(x) * 100)

# Calculate average ROI per bet
average_roi_per_bet = roi_per_race.mean()

# Print average ROI per bet
print(f"Average ROI per bet: {average_roi_per_bet:.2f}%")

Average ROI per bet: 48.08%


In [326]:
import pickle

# Save the entire ensemble of models
ensemble_model_file = f"Models\\{track_name}\\{track_name}_Ensemble.pkl"
primary_model_file = f"Models\\{track_name}\\{track_name}_Model.pkl"

# Save ensemble models
with open(ensemble_model_file, 'wb') as file:  
    pickle.dump(ensemble_models, file)
    
print(f"Ensemble of {len(ensemble_models)} models saved to {ensemble_model_file}")

# Also save the primary model for backward compatibility
with open(primary_model_file, 'wb') as file:  
    pickle.dump(model, file)
    
print(f"Primary model saved to {primary_model_file}")

Ensemble of 5 models saved to Models\Assiniboia Downs\Assiniboia Downs_Ensemble.pkl
Primary model saved to Models\Assiniboia Downs\Assiniboia Downs_Model.pkl
Primary model saved to Models\Assiniboia Downs\Assiniboia Downs_Model.pkl


In [327]:
# Get the top two predictions for each race_id
top_two_predictions = results_df.groupby('race_id').apply(lambda x: x.nsmallest(2, 'predicted_finish_position')).reset_index(drop=True)

# Check if both top two predictions finished in the top two positions
top_two_grouped = top_two_predictions.groupby('race_id').filter(lambda x: set(x['actual_finish_position']) == {1, 2})

# Calculate the percentage of races where both top two predictions finished in the top two positions
percentage_top_two = (top_two_grouped['race_id'].nunique() / results_df['race_id'].nunique()) * 100

print("Percentage of times where the top 2 predictions both finished 1 and 2 in any order: {:.2f}%".format(percentage_top_two))

Percentage of times where the top 2 predictions both finished 1 and 2 in any order: 21.92%


In [328]:
# Find the top 3 horses in each race_id
top_three_predictions = results_df.groupby('race_id').apply(lambda x: x.nsmallest(3, 'predicted_finish_position')).reset_index(drop=True)

# Check if all top three predictions finished in the top three positions
top_three_grouped = top_three_predictions.groupby('race_id').filter(lambda x: set(x['actual_finish_position']) == {1, 2, 3})

# Calculate the percentage of races where both top three predictions finished in the top three positions
percentage_top_three = (top_three_grouped['race_id'].nunique() / results_df['race_id'].nunique()) * 100

print("Percentage of times where the top 3 predictions all finished 1, 2, and 3 in any order: {:.2f}%".format(percentage_top_three))

Percentage of times where the top 3 predictions all finished 1, 2, and 3 in any order: 2.74%


In [329]:
# Find the top 4 horses in each race_id
top_four_predictions = results_df.groupby('race_id').apply(lambda x: x.nsmallest(4, 'predicted_finish_position')).reset_index(drop=True)

# Check if three of the top four predictions finished in the top three positions
top_four_grouped = top_four_predictions.query('actual_finish_position <= 3')

# Find the number of race_ids that occur 3 times in top_four_grouped
race_id_counts = top_four_grouped['race_id'].value_counts()
race_ids_with_three_occurrences = len(race_id_counts[race_id_counts == 3])
percentage_top_four_trifecta = (race_ids_with_three_occurrences / top_four_predictions.groupby('race_id').ngroups) * 100

print("Percentage of times where the top 4 predictions all finished 1, 2, 3, and 4 in any order: {:.2f}%".format(percentage_top_four_trifecta))

Percentage of times where the top 4 predictions all finished 1, 2, 3, and 4 in any order: 53.42%


In [330]:
# Calculate optimal horse coverage for >80% success rate by confidence level
print("\n" + "="*70)
print("OPTIMAL HORSE COVERAGE ANALYSIS FOR >80% SUCCESS RATE")
print("="*70)

def calculate_optimal_coverage_by_confidence():
    """
    Calculate how many horses are needed to achieve >80% success rate for different confidence levels
    """
    # Create confidence bins
    confidence_bins = [
        (0, 40, "Low"),
        (40, 60, "Medium"), 
        (60, 75, "High"),
        (75, 90, "Very High"),
        (90, 100, "Extreme")
    ]
    
    coverage_analysis = {}
    
    for min_conf, max_conf, label in confidence_bins:
        # Filter races by confidence level
        conf_mask = (results_df['confidence_score'] >= min_conf) & (results_df['confidence_score'] < max_conf)
        conf_data = results_df[conf_mask]
        
        if len(conf_data) == 0:
            continue
            
        # For each race, test different horse coverage levels
        race_coverage_results = []
        
        for race_id in conf_data['race_id'].unique():
            race_data = conf_data[conf_data['race_id'] == race_id].sort_values('predicted_finish_position')
            
            if len(race_data) < 2:
                continue
                
            # Test coverage from 1 to min(field_size, 6) horses
            max_horses = min(len(race_data), 6)
            race_results = {'race_id': race_id, 'field_size': len(race_data)}
            
            for num_horses in range(1, max_horses + 1):
                top_horses = race_data.head(num_horses)
                winner_covered = (top_horses['actual_finish_position'] == 1).any()
                race_results[f'covered_{num_horses}'] = winner_covered
                
            race_coverage_results.append(race_results)
        
        if not race_coverage_results:
            continue
            
        # Calculate success rates for each coverage level
        coverage_df = pd.DataFrame(race_coverage_results)
        total_races = len(coverage_df)
        
        coverage_stats = {}
        optimal_horses = None
        
        for num_horses in range(1, 7):
            col_name = f'covered_{num_horses}'
            if col_name in coverage_df.columns:
                success_rate = coverage_df[col_name].mean() * 100
                coverage_stats[num_horses] = success_rate
                
                # Find minimum horses needed for >80% success rate
                if success_rate >= 80 and optimal_horses is None:
                    optimal_horses = num_horses
        
        coverage_analysis[label] = {
            'confidence_range': f"{min_conf}-{max_conf}",
            'total_races': total_races,
            'coverage_stats': coverage_stats,
            'optimal_horses': optimal_horses,
            'avg_confidence': conf_data['confidence_score'].mean()
        }
        
        # Display results
        print(f"\n{label} Confidence ({min_conf}-{max_conf}%): {total_races} races, Avg: {conf_data['confidence_score'].mean():.1f}")
        success_rates = []
        for h in range(1, 7):
            if h in coverage_stats:
                rate = coverage_stats[h]
                marker = " ✅" if rate >= 80 else ""
                success_rates.append(f"{h}H:{rate:.1f}%{marker}")
        print(f"  Success rates: {' | '.join(success_rates)}")
        
        if optimal_horses:
            print(f"  🎯 OPTIMAL: Use {optimal_horses} horses for >80% success rate")
        else:
            print(f"  ⚠️  No coverage level achieves 80% success rate")
    
    return coverage_analysis

# Run the analysis
optimal_coverage_by_confidence = calculate_optimal_coverage_by_confidence()

# Save the coverage analysis for use in BetBuilder
import pickle
coverage_file = f"Models\\{track_name}\\{track_name}_Coverage_Analysis.pkl"
with open(coverage_file, 'wb') as file:
    pickle.dump(optimal_coverage_by_confidence, file)
    
print(f"\n📊 Coverage analysis saved to {coverage_file}")
print("This data will be used in BetBuilder for dynamic horse selection")

# Create a summary lookup table
print(f"\n📋 CONFIDENCE-BASED HORSE SELECTION LOOKUP TABLE:")
print("-" * 60)
for label, data in optimal_coverage_by_confidence.items():
    optimal = data['optimal_horses']
    if optimal:
        print(f"{label:12} ({data['confidence_range']:>6}%): Use {optimal} horses")
    else:
        print(f"{label:12} ({data['confidence_range']:>6}%): No reliable strategy")
print("-" * 60)


OPTIMAL HORSE COVERAGE ANALYSIS FOR >80% SUCCESS RATE

Low Confidence (0-40%): 7 races, Avg: 29.9
  Success rates: 1H:28.6% | 2H:42.9% | 3H:0.0%
  ⚠️  No coverage level achieves 80% success rate



Medium Confidence (40-60%): 34 races, Avg: 51.6
  Success rates: 1H:23.5% | 2H:41.2% | 3H:63.2% | 4H:75.0% | 5H:66.7% | 6H:0.0%
  ⚠️  No coverage level achieves 80% success rate

High Confidence (60-75%): 44 races, Avg: 68.3
  Success rates: 1H:15.9% | 2H:31.8% | 3H:41.4% | 4H:47.1% | 5H:50.0% | 6H:33.3%
  ⚠️  No coverage level achieves 80% success rate

Very High Confidence (75-90%): 47 races, Avg: 81.6
Very High Confidence (75-90%): 47 races, Avg: 81.6
  Success rates: 1H:25.5% | 2H:29.8% | 3H:53.8% | 4H:42.9% | 5H:40.0%
  ⚠️  No coverage level achieves 80% success rate

Extreme Confidence (90-100%): 2 races, Avg: 93.6
  Success rates: 1H:0.0% | 2H:0.0%
  ⚠️  No coverage level achieves 80% success rate

📊 Coverage analysis saved to Models\Assiniboia Downs\Assiniboia Downs_Coverage_Analysis.pkl
This data will be used in BetBuilder for dynamic horse selection

📋 CONFIDENCE-BASED HORSE SELECTION LOOKUP TABLE:
------------------------------------------------------------
Low          (  

In [331]:
# Analyze confidence score performance
print("\n" + "="*60)
print("CONFIDENCE SCORE ANALYSIS")
print("="*60)

# Create confidence bins
results_df['confidence_bin'] = pd.cut(results_df['confidence_score'], 
                                    bins=[0, 25, 50, 75, 100], 
                                    labels=['Low (0-25)', 'Medium (25-50)', 'High (50-75)', 'Very High (75-100)'])

# Analyze accuracy by confidence level
confidence_analysis = results_df.groupby('confidence_bin').agg({
    'actual_finish_position': ['count', lambda x: (x == 1).sum()],
    'predicted_finish_position': 'mean'
}).round(2)

confidence_analysis.columns = ['Total_Predictions', 'Wins', 'Avg_Predicted_Position']
confidence_analysis['Win_Rate_%'] = (confidence_analysis['Wins'] / confidence_analysis['Total_Predictions'] * 100).round(2)

print(confidence_analysis)

# Calculate prediction error by confidence level
results_df['prediction_error'] = abs(results_df['actual_finish_position'] - results_df['predicted_finish_position'])
error_by_confidence = results_df.groupby('confidence_bin')['prediction_error'].mean().round(2)

print(f"\nAverage Prediction Error by Confidence Level:")
for bin_name, error in error_by_confidence.items():
    print(f"  {bin_name}: {error:.2f} positions")

# Show correlation between confidence and accuracy
correlation = results_df['confidence_score'].corr(-results_df['prediction_error'])
print(f"\nCorrelation between confidence and accuracy: {correlation:.3f}")
print("(Higher values indicate confidence scores are well-calibrated)")

# Best picks analysis with confidence
best_predictions_with_confidence = results_df.groupby('race_id').apply(
    lambda x: x.loc[x['predicted_finish_position'].idxmin()]
).reset_index(drop=True)

print(f"\nBest Picks Analysis with Confidence:")
print(f"Average confidence of best picks: {best_predictions_with_confidence['confidence_score'].mean():.2f}")

high_conf_wins = best_predictions_with_confidence[
    (best_predictions_with_confidence['confidence_score'] >= 75) & 
    (best_predictions_with_confidence['actual_finish_position'] == 1)
].shape[0]

high_conf_total = best_predictions_with_confidence[
    best_predictions_with_confidence['confidence_score'] >= 75
].shape[0]

if high_conf_total > 0:
    print(f"High confidence (≥75) best picks win rate: {high_conf_wins/high_conf_total*100:.2f}% ({high_conf_wins}/{high_conf_total})")
else:
    print("No high confidence best picks found")


CONFIDENCE SCORE ANALYSIS
                    Total_Predictions  Wins  Avg_Predicted_Position  \
confidence_bin                                                        
Low (0-25)                         10     2                    3.10   
Medium (25-50)                     70    13                    3.73   
High (50-75)                      244    35                    4.11   
Very High (75-100)                149    21                    4.23   

                    Win_Rate_%  
confidence_bin                  
Low (0-25)               20.00  
Medium (25-50)           18.57  
High (50-75)             14.34  
Very High (75-100)       14.09  

Average Prediction Error by Confidence Level:
  Low (0-25): 1.44 positions
  Medium (25-50): 1.86 positions
  High (50-75): 1.74 positions
  Very High (75-100): 1.63 positions

Correlation between confidence and accuracy: 0.040
(Higher values indicate confidence scores are well-calibrated)

Best Picks Analysis with Confidence:
Average confidence

  confidence_analysis = results_df.groupby('confidence_bin').agg({
  error_by_confidence = results_df.groupby('confidence_bin')['prediction_error'].mean().round(2)
