# In [None]:
# Core
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# For clean visuals
sns.set(style="whitegrid")
%matplotlib inline

# Paths
# Raw strings are required for Windows paths: in a normal string literal
# "\U" starts a unicode escape (a SyntaxError here) and "\t" is a TAB,
# which would corrupt "\trainData.csv" / "\testData.csv".
# NOTE(review): these are absolute, user-specific locations; anyone else
# running the notebook has to edit them.
TRAIN_PATH = r"C:\Users\dylan\Documents\Projects\horse_model_project\trainData.csv"
TEST_PATH = r"C:\Users\dylan\Documents\Projects\horse_model_project\testData.csv"

# Load data
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

# Quick peek
print("Train shape:", train.shape)
print("Test shape:", test.shape)

# Column info
train.info()
# A bare expression only renders when it is the last statement of a notebook
# cell; everything here runs as one cell, so the tables are shown explicitly.
display(train.describe(include="all").T)

# Sample (capped at the frame length so tiny inputs do not raise)
display(train.sample(min(5, len(train))))

# Create target: 1 when the horse finished in position 1, otherwise 0
# (rows with a missing Position compare unequal to 1 and therefore get 0).
train["Winner"] = (train["Position"] == 1).astype(int)

# Verify class balance. The result is displayed explicitly: as a bare
# expression in the middle of a cell it would be computed and thrown away.
display(train["Winner"].value_counts(normalize=True))

print("Unique Races in Train:", train["Race_ID"].nunique())
print("Unique Horses in Train:", train["Horse"].nunique())

print("Avg runners per race (Train):", train.groupby("Race_ID").size().mean())

# Columns removed from the modelling frames below.
LEAK_COLS = [
    "Position",
    "betfairSP",
    "timeSecs",
    "pdsBeaten",
    "NMFP",
]

# Snapshot the full training frame (leaky columns and Winner included)
# for EDA before the columns are removed.
train_leaky = train.copy()

# Train must contain every leak column (a missing one raises KeyError);
# for test, only the columns that are actually present are dropped.
train = train.drop(columns=LEAK_COLS)
test = test.drop(columns=LEAK_COLS, errors="ignore")

def missing_report(df):
    """Summarise the columns of ``df`` that contain missing values.

    Returns a DataFrame indexed by column name with two columns:
    "Missing Count" (number of null cells) and "%" (that count as a
    percentage of the row count), sorted by "%" in descending order.
    Columns without any missing value are left out.
    """
    null_counts = df.isna().sum()
    null_counts = null_counts.loc[null_counts > 0]
    report = pd.DataFrame(
        {
            "Missing Count": null_counts,
            "%": null_counts / len(df) * 100,
        }
    )
    return report.sort_values(by="%", ascending=False)

# Show the missing-value summary for each dataset under its own heading.
for _heading, _frame in (("Train missing:", train), ("Test missing:", test)):
    print(_heading)
    display(missing_report(_frame))

# --- Missing-value imputation ---------------------------------------------
# Categorical columns are filled FIRST. The numeric step groups by
# Course/Going, and groupby drops rows whose key is NaN; filling the keys
# beforehand puts those rows in an explicit "Unknown" group instead of
# losing them.
cat_cols = train.select_dtypes(include=["object"]).columns

for col in cat_cols:
    train[col] = train[col].fillna("Unknown")
    # A train-only column must not raise on the test frame.
    if col in test.columns:
        test[col] = test[col].fillna("Unknown")

num_cols = train.select_dtypes(include=["float64", "int64"]).columns

# Grouping keys used for the per-group medians.
_group_keys = ["Course", "Going"]

for col in num_cols:
    _in_test = col in test.columns
    # Impute when EITHER frame has gaps, so a column that is complete in
    # train but incomplete in test is still handled.
    _needs_fill = train[col].isnull().any() or (
        _in_test and test[col].isnull().any()
    )
    if not _needs_fill:
        continue

    # Statistics are learned from train only and reused for test, so both
    # frames get the same fill value for the same (Course, Going) pair.
    _group_medians = (
        train.groupby(_group_keys)[col].median().rename("_group_median").reset_index()
    )
    _overall_median = train[col].median()

    for _frame in (train, test):
        if col not in _frame.columns:
            continue
        # Left merge keeps the row order of _frame; pairs unseen in train
        # (or groups that are entirely NaN) come back as NaN here.
        _per_row = _frame[_group_keys].merge(
            _group_medians, on=_group_keys, how="left"
        )["_group_median"].to_numpy()
        _fill = pd.Series(_per_row, index=_frame.index)
        # Group median first, then the train-wide median as a fallback.
        # A column that is entirely NaN in train stays NaN.
        _frame[col] = _frame[col].fillna(_fill).fillna(_overall_median)

# Field size per race, taken from the pre-drop copy of the training data.
race_sizes = train_leaky.groupby("Race_ID").size()
print("Min runners:", race_sizes.min())
print("Max runners:", race_sizes.max())

# Going categories as horizontal bars, most frequent category first.
_going_order = train["Going"].value_counts().index
sns.countplot(y=train["Going"], order=_going_order)
plt.title("Distribution of Going Types")
plt.show()

# Histogram of Speed_PreviousRun with a KDE overlay.
sns.histplot(train["Speed_PreviousRun"], bins=30, kde=True)
plt.title("Histogram of Speed_PreviousRun")
plt.show()

# daysSinceLastRun split by the Winner flag (0 / 1), with readable tick labels.
sns.boxplot(data=train_leaky, x="Winner", y="daysSinceLastRun")
plt.title("daysSinceLastRun by Winner")
plt.xticks([0, 1], ["Not Winner", "Winner"])
plt.show()

# Persist the cleaned frames as CSV without the index column.
# NOTE(review): the cleaned output is written into a "raw" directory, and the
# directory must already exist - confirm this is the intended location.
for _frame, _out_path in (
    (train, "../data/raw/train_cleaned.csv"),
    (test, "../data/raw/test_cleaned.csv"),
):
    _frame.to_csv(_out_path, index=False)
