In [292]:
track_abbreviations = ["aqu", "baq"]
track_name = "Aqueduct"

In [293]:
import pandas as pd

data = pd.read_csv(f'Imputed Data\\{track_name}.csv')

In [294]:
from sklearn.model_selection import GroupShuffleSplit
from sklearn.ensemble import RandomForestRegressor

X = data.drop(columns=['normalized_position', 'Position'])
y = data['normalized_position']
groups = data['race_id']

# Split the data
gss = GroupShuffleSplit(test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(X, y, groups=groups))

X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

# Store the 'odds' column for later use
odds_train = X_train['odds']
odds_test = X_test['odds']

# Drop the 'odds' column from the training and testing datasets
X_train = X_train.drop(columns=['odds'])
X_test = X_test.drop(columns=['odds'])

model = RandomForestRegressor(n_jobs=-1, max_depth=20, max_features=.5, min_samples_leaf=10, n_estimators=300)

# Train the model
model_results = model.fit(X_train, y_train)

In [295]:
predicted_normalized_position = model.predict(X_test)
predicted_finish_position = ((predicted_normalized_position * X_test['number_of_run']) / 100)

In [296]:
# Extract actual finish positions
actual_finish_position = (y_test * X_test['number_of_run']) / 100
actual_finish_position = actual_finish_position.astype(int)

# Extract and normalize the Odds column
odds = odds_test
normalized_odds = (odds - odds.min()) / (odds.max() - odds.min()) * 100  # Scale to a range of 0 to 100

In [297]:
# Create a DataFrame by concatenating the series
results_df = pd.concat([X_test['race_id'], actual_finish_position, predicted_finish_position, odds], axis=1)

# Rename the columns for clarity
results_df.columns = ['race_id', 'actual_finish_position', 'predicted_finish_position', 'odds']


In [298]:
# Create another dataframe from results_df that selects the lowest predicted_finish_position for each race_id
best_predictions = results_df.groupby('race_id').agg({'predicted_finish_position': 'min'}).reset_index()
# Merge the best_predictions dataframe with results_df to get all other columns
best_predictions = pd.merge(best_predictions, results_df, on=["race_id", "predicted_finish_position"], how='inner')

In [299]:
# Calculate percent of horses that finished 1
num_firsts = best_predictions[best_predictions['actual_finish_position'] == 1].shape[0]
print("Win: {:.2f}%".format(num_firsts / best_predictions.shape[0] * 100))

# Calculate percent of horses that finished 1 or 2
num_firsts = best_predictions[best_predictions['actual_finish_position'] <= 2].shape[0]
print("Place: {:.2f}%".format(num_firsts / best_predictions.shape[0] * 100))

# Calculate percent of horses that finished 1, 2, or 3
num_firsts = best_predictions[best_predictions['actual_finish_position'] <= 3].shape[0]
print("Show: {:.2f}%".format(num_firsts / best_predictions.shape[0] * 100))

Win: 48.36%
Place: 74.65%
Show: 80.75%


In [47]:
import pickle

# model_file = f"Models\\{track_name}\\{track_name}_Model.pkl"

# with open(model_file, 'wb') as file:  
#     pickle.dump(model, file)

In [53]:
# Get the top two predictions for each race_id
top_two_predictions = results_df.groupby('race_id').apply(lambda x: x.nsmallest(2, 'predicted_finish_position')).reset_index(drop=True)

# Check if both top two predictions finished in the top two positions
top_two_grouped = top_two_predictions.groupby('race_id').filter(lambda x: set(x['actual_finish_position']) == {1, 2})

# Calculate the percentage of races where both top two predictions finished in the top two positions
percentage_top_two = (top_two_grouped['race_id'].nunique() / results_df['race_id'].nunique()) * 100

print("Percentage of times where the top 2 predictions both finished 1 and 2 in any order: {:.2f}%".format(percentage_top_two))

Percentage of times where the top 2 predictions both finished 1 and 2 in any order: 29.58%


In [54]:
# Find the top 3 horses in each race_id
top_three_predictions = results_df.groupby('race_id').apply(lambda x: x.nsmallest(3, 'predicted_finish_position')).reset_index(drop=True)

# Check if all top three predictions finished in the top three positions
top_three_grouped = top_three_predictions.groupby('race_id').filter(lambda x: set(x['actual_finish_position']) == {1, 2, 3})

# Calculate the percentage of races where both top three predictions finished in the top three positions
percentage_top_three = (top_three_grouped['race_id'].nunique() / results_df['race_id'].nunique()) * 100

print("Percentage of times where the top 3 predictions all finished 1, 2, and 3 in any order: {:.2f}%".format(percentage_top_three))

Percentage of times where the top 3 predictions all finished 1, 2, and 3 in any order: 3.76%


In [58]:
# Find the top 4 horses in each race_id
top_four_predictions = results_df.groupby('race_id').apply(lambda x: x.nsmallest(4, 'predicted_finish_position')).reset_index(drop=True)

# Check if three of the top four predictions finished in the top three positions
top_four_grouped = top_four_predictions.query('actual_finish_position <= 3')

# Find the number of race_ids that occur 3 times in top_four_grouped
race_id_counts = top_four_grouped['race_id'].value_counts()
race_ids_with_three_occurrences = len(race_id_counts[race_id_counts == 3])
percentage_top_four_trifecta = (race_ids_with_three_occurrences / top_four_predictions.groupby('race_id').ngroups) * 100

print("Percentage of times where the top 4 predictions all finished 1, 2, 3, and 4 in any order: {:.2f}%".format(percentage_top_four_trifecta))

Percentage of times where the top 4 predictions all finished 1, 2, 3, and 4 in any order: 35.21%
