## Brief glance at the main data frame

In [3]:
import pandas as pd
import seaborn as sns

# Read the test sample CSV; the bare name on the last line renders the
# frame as the cell output.
races_df = pd.read_csv(filepath_or_buffer="../data/samples/test_sample.csv")

races_df

In [4]:
# Show every row that belongs to race 6753590.
races_df.loc[races_df["race_id"] == 6753590]

In [6]:
import pandas as pd

# Replace races_df with the live leakage-detection file for 2024-01-05 and
# display it.
races_df = pd.read_csv(filepath_or_buffer="../data/leakage_detection/live/2024-01-05.csv")

races_df

In [8]:
import pickle
import seaborn as sns
from ModelTuning.simulate_conf import LEARNING_CURVE_PATH

# Load the pickled learning curve written to LEARNING_CURVE_PATH.
# NOTE: unpickling can execute arbitrary code, so only load files produced
# by a trusted run of this project.
with open(LEARNING_CURVE_PATH, mode="rb") as curve_file:
    learning_curve = pickle.load(curve_file)

# Raw values first, then the same data as a line chart.
print(learning_curve)

sns.lineplot(learning_curve)

## How many horses per race on average?

In [5]:
# Column whose distribution is inspected, and the derived time column used
# to bucket the rows.
COLUMN_NAME = "PreviousValueSource_subject_id_relative_distance_behind"
TIME_INTERVAL = "year-month"

# races_df = races_df[races_df["PreviousWinProbability"] != -1]

# Derive calendar features from the raw date_time column.
races_df["year-month"] = races_df["date_time"].astype(str).str[:7]
# No explicit date-only format here: the hour/minute columns below need the
# time-of-day part, and a '%Y-%m-%d' format is rejected by pandas >= 2.0 when
# the strings contain a time.
# NOTE(review): assumes date_time holds ISO-like timestamps - confirm.
races_df["date"] = pd.to_datetime(races_df["date_time"])
races_df["dayofweek"] = races_df["date"].dt.dayofweek.astype(str)
races_df["hour"] = races_df["date"].dt.hour.astype(str)
races_df["minute"] = races_df["date"].dt.minute.astype(str)
races_df["month"] = races_df["date"].dt.month.astype(str)

# Spread and central tendency of the column per time bucket. Each result
# column is named after the statistic it actually holds.
interval_groups = races_df.groupby([TIME_INTERVAL])
print(interval_groups.agg(std=(COLUMN_NAME, "std")))
print(interval_groups.agg(mean=(COLUMN_NAME, "mean")))

# races_df_no_rating = races_df[races_df["PreviousWinProbability"] == -1]
# print(races_df_no_rating.groupby([TIME_INTERVAL]).size())

# One horizontal box per time bucket, on a large canvas so the buckets stay
# readable.
sns.set(rc={'figure.figsize':(26,15)})
sns.boxplot(data=races_df, x=COLUMN_NAME, y=TIME_INTERVAL)

In [1]:
import pandas as pd

# Replace races_df with the latest live sample and display it.
races_df = pd.read_csv(filepath_or_buffer='../data/samples/latest_live_sample.csv')

races_df

In [2]:
# Show every row that belongs to race 6775532.
races_df.loc[races_df["race_id"] == 6775532]

In [6]:
# Number of rows per race_id, then the mean of those group sizes.
# NOTE(review): treats one row as one horse - confirm against the data schema.
rows_per_race = races_df.groupby("race_id").size()
avg_horses_per_race = rows_per_race.mean()

# The reciprocal of the average field size is the hit rate of a uniform
# random pick.
print(f"Average horses per race: {avg_horses_per_race}. Chance to randomly guess the winner: {1 / avg_horses_per_race}")

In [9]:
# For each race, find the index label of the row with the highest score and
# pull those rows out as the "predicted winner" per race.
grouped = races_df.groupby('race_id')
max_indexes = grouped['score'].idxmax()
result_df = races_df.loc[max_indexes]

# A prediction counts as correct when the selected row has place == 1.
n_predictions = len(result_df)
n_correct_predictions = int((result_df["place"] == 1).sum())

print(f"Number of predictions: {n_predictions}")
print(f"Number of correct predictions: {n_correct_predictions}")
print(f"Accuracy: {n_correct_predictions / n_predictions}")

print(result_df)

In [9]:
# Share of the selected rows in result_df whose label equals 1.
int((result_df["label"] == 1).sum()) / len(result_df)

In [38]:
# Track name is the race name with its last two characters removed.
races_df["track_name"] = races_df["race_name"].astype(str).str[:-2]

# Keep the Aintree subset in its own frame instead of overwriting races_df:
# the cell can then be re-run safely, and later cells still see every track.
aintree_df = races_df[races_df["track_name"] == "Aintree"].copy()

# The plot needs a "year-month" column; derive it here when the currently
# loaded frame does not carry it yet, so the cell works on a fresh kernel.
if "year-month" not in aintree_df.columns:
    aintree_df["year-month"] = aintree_df["date_time"].astype(str).str[:7]

sns.scatterplot(data=aintree_df, x="track_name", y="year-month")

## Distance Outlier Visualization

In [4]:
# Bivariate distribution plot of race distance against race class.
sns.displot(
    data=races_df,
    x="CurrentDistance",
    y="CurrentRaceClass",
)

In [11]:
# Plain list of all race distances in the currently loaded frame.
distances = races_df["CurrentDistance"].tolist()

# sns.distplot is deprecated and removed in newer seaborn releases; histplot
# with a KDE overlay on a density scale is the documented replacement.
sns.histplot(x=distances, kde=True, stat="density")

# Covariate Shift Detection

## 1.) Between a month and its predecessor

In [51]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import numpy as np

# Covariate shift check: train a classifier to tell rows of 2022-09 apart
# from rows of 2022-08. A test score close to 0.5 means the two months are
# hard to distinguish from their features; a high score indicates shift.
races_df = pd.read_csv("../data/races.csv")
races_df["year-month"] = races_df["date_time"].astype(str).str[:7]

# .copy() makes month_df an independent frame, so adding the label column
# below does not trigger pandas' SettingWithCopyWarning.
month_df = races_df[races_df["year-month"].isin(["2022-09", "2022-08"])].copy()
month_df["label"] = np.where(month_df["year-month"] == "2022-09", 1, 0)

month_df = month_df.fillna(value=-1)

# Drop identifiers and every column that encodes the date directly; they
# would let the classifier separate the months trivially.
month_df = month_df.drop(["date_time", "race_id", "horse_id", "year-month", "Month_Sin", "Month_Cos", "Unnamed: 0"], axis=1)

print(month_df)

features = [column for column in month_df.columns if column != "label"]
shift_X = month_df.loc[:, features]
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics influence the scaling - consider fitting on
# X_train only.
scaler = StandardScaler().fit(shift_X)
shift_X = scaler.transform(shift_X)

shift_y = month_df.loc[:, "label"]

X_train, X_test, y_train, y_test = train_test_split(shift_X, shift_y, test_size=0.33, random_state=42, stratify=shift_y)

log_regression_classifier = LogisticRegression(random_state=0).fit(X_train, y_train)
print(f"Classifier score:{log_regression_classifier.score(X_test, y_test)}")

# One coefficient per feature (binary problem, so coef_ has a single row).
# The features with the largest absolute coefficients contribute most to
# separating the two months.
coef_data = {
    "feature name": list(features),
    "coeff": list(log_regression_classifier.coef_[0]),
}

coeff_df = pd.DataFrame.from_dict(coef_data)
coeff_df.sort_values(by=["coeff"])

## Real time sample inspection

In [1]:
import pandas as pd

# Read the logged real-time sample file (no file extension in the path) as
# CSV and display it.
real_time_sample_df = pd.read_csv(filepath_or_buffer="../data/logs/samples/real_time_5686367")

real_time_sample_df

In [None]:
from tqdm import tqdm
from Persistence.RaceCardPersistence import RaceCardsPersistence

# Walk over every race card file known to the persistence layer and print
# the date of each race card it contains.
race_cards_persistence = RaceCardsPersistence("race_cards")

for file_name in tqdm(race_cards_persistence.race_card_file_names):
    # Files are loaded one at a time rather than all at once.
    # NOTE(review): presumably to keep memory use bounded - confirm.
    loaded_cards = race_cards_persistence.load_race_card_files_non_writable([file_name])
    for card in loaded_cards.values():
        print(card.date)