In [None]:
import pandas as pd 
import matplotlib.pyplot as plt 
import numpy as np 
from sklearn.tree import DecisionTreeClassifier, plot_tree 
from sklearn.metrics import accuracy_score 

In [None]:
# Read the raw match snapshots and replace every missing value with 0.
# NOTE(review): a zero-filled odds column would turn into inf once the odds
# are inverted further down -- confirm the odds columns have no gaps.
data_loc = "match data/data"
df = pd.read_csv(data_loc).fillna(0)

# Preview the first ten rows.
df.head(10)

In [None]:
# Remove every row that was recorded while the game was suspended or stopped,
# and report how many rows that costs.
rows_before = len(df)
print(f"Before suspended or stopped games are dropped: {rows_before}")
a_ = rows_before  # original name kept in case another cell still reads it

columns_to_check = ["suspended", "stopped"]

# A row is kept only when every interruption flag compares equal to False
# (after the earlier fillna(0), a 0 also counts as "not flagged").
mask = (df[columns_to_check] == False).all(axis=1)

# .copy() detaches the filtered frame from the original, so the columns that
# later cells add to `df` do not raise SettingWithCopyWarning.
df = df[mask].copy()
print(f"After suspended or stopped games are dropped: {len(df)}")

# Guard the percentage against an empty input frame.
dropped_pct = 100 * (rows_before - len(df)) / rows_before if rows_before else 0.0
print(f"% dropped: {dropped_pct:.2f}%")

In [None]:
# Keep the outcome label and the half indicator of every row so they can be
# re-attached to the probability tables built from the odds.
results = df["result"].copy()
half = df["halftime"].copy()

# Select the three odds columns by name, in a fixed order, instead of slicing
# "1":"X" by label: the following cell renames them positionally
# (home, away, draw), so the order must not depend on the CSV layout.
df_odds = df[["1", "2", "X"]].copy()

df_odds.head()

In [None]:
# Implied probability of an outcome = reciprocal of its odds.
# The columns of df_odds are renamed by position: home win, away win, draw.
prob_df = df_odds.rdiv(1).set_axis(
    ["Pr{Home Win}", "Pr{Away Win}", "Pr{Draw}"], axis=1
)

# Rescale each row so that its three probabilities add up to 1.
row_totals = prob_df.sum(axis=1)
normalized_prob_df = prob_df.divide(row_totals, axis=0)

# Attach the outcome and the half to both tables (aligned on the row index).
for table in (prob_df, normalized_prob_df):
    table["Result"] = results
    table["Half"] = half

prob_df.head()

In [None]:
# Spot check: sum of the three raw implied probabilities of one random row
# (these rows have not been rescaled, so the sum need not be exactly 1).
sampled_row = prob_df.iloc[np.random.randint(0, len(prob_df))]
sampled_row[["Pr{Home Win}", "Pr{Away Win}", "Pr{Draw}"]].sum()

In [None]:
# Preview the first five rows of the rescaled probability table.
normalized_prob_df.iloc[:5]

In [None]:
# Spot check: the three rescaled probabilities of one random row, summed.
sampled_row = normalized_prob_df.iloc[np.random.randint(0, len(normalized_prob_df))]
sampled_row[["Pr{Home Win}", "Pr{Away Win}", "Pr{Draw}"]].sum()

In [None]:
# Eight equal-width intervals covering home-minus-away differences in (-1, 1].
bins = [-1, -0.75, -0.5, -0.25, 0, 0.25, 0.5, 0.75, 1]

# Positive values mean the home side has the higher raw implied probability.
home_minus_away = prob_df["Pr{Home Win}"].sub(prob_df["Pr{Away Win}"])
prob_df["Pr{Home Win} - Pr{Away Win}"] = home_minus_away

# pd.cut uses right-closed intervals; values outside the edges become NaN.
prob_df["Bin"] = pd.cut(prob_df["Pr{Home Win} - Pr{Away Win}"], bins=bins)

In [None]:
# First-half rows: per bin of Pr{Home Win} - Pr{Away Win}, compare the observed
# share of draws (Result == "X") with the mean raw implied draw probability.
first_half_prob_df = prob_df[prob_df["Half"] == "1st-half"]

by_bin = first_half_prob_df.groupby("Bin", observed=False)
bin_totals = by_bin.size()
bin_draws = (
    first_half_prob_df[first_half_prob_df["Result"] == "X"]
    .groupby("Bin", observed=False)
    .size()
)

# Bins without any rows yield 0/0 -> NaN here.
estimated_draws = bin_draws / bin_totals

bookmarker_draws = by_bin["Pr{Draw}"].mean()

fig, ax = plt.subplots(figsize=(12, 6))

ax.bar(bin_totals.index.astype(str), estimated_draws, width=0.2, align="center",
       edgecolor="black", color="green", label="Estimated Draw Rate")

# Categorical bars sit at x = 0, 1, 2, ...; draw one dashed segment over each.
positions = np.arange(len(bookmarker_draws))
ax.hlines(
    y=bookmarker_draws,
    xmin=positions - 0.2,
    xmax=positions + 0.2,
    colors="red",
    linestyles="--",
    label="Bookmaker Draw Rate Lines")  # legend typo "Bookmarker" corrected

ax.legend(loc="upper right")
ax.set_xlabel("Home Win - Away Win")
ax.set_ylabel("Estimated Probability of Draws")
ax.set_title("Estimated Draw Rates for Different Home Win - Away Win Probabilities in the 1st Half")

plt.show()

In [None]:
# Second-half rows: per bin of Pr{Home Win} - Pr{Away Win}, compare the observed
# share of draws (Result == "X") with the mean raw implied draw probability.
second_half_prob_df = prob_df[prob_df["Half"] == "2nd-half"]

by_bin = second_half_prob_df.groupby("Bin", observed=False)
bin_totals = by_bin.size()
bin_draws = (
    second_half_prob_df[second_half_prob_df["Result"] == "X"]
    .groupby("Bin", observed=False)
    .size()
)

# Bins without any rows yield 0/0 -> NaN here.
estimated_draws = bin_draws / bin_totals

bookmarker_draws = by_bin["Pr{Draw}"].mean()

fig, ax = plt.subplots(figsize=(12, 6))

ax.bar(bin_totals.index.astype(str), estimated_draws, width=0.2, align="center",
       edgecolor="black", color="green", label="Estimated Draw Rate")

# Categorical bars sit at x = 0, 1, 2, ...; draw one dashed segment over each.
positions = np.arange(len(bookmarker_draws))
ax.hlines(
    y=bookmarker_draws,
    xmin=positions - 0.2,
    xmax=positions + 0.2,
    colors="red",
    linestyles="--",
    label="Bookmaker Draw Rate Lines")  # legend typo "Bookmarker" corrected

ax.legend(loc="upper right")
ax.set_xlabel("Home Win - Away Win")
ax.set_ylabel("Estimated Probability of Draws")
ax.set_title("Estimated Draw Rates for Different Home Win - Away Win Probabilities in the 2nd Half")

plt.show()

In [None]:
# Same binning as for the raw table, applied to the rescaled probabilities.
normalized_gap = normalized_prob_df["Pr{Home Win}"].sub(normalized_prob_df["Pr{Away Win}"])
normalized_prob_df["Pr{Home Win} - Pr{Away Win}"] = normalized_gap
normalized_prob_df["Bin"] = pd.cut(normalized_prob_df["Pr{Home Win} - Pr{Away Win}"], bins=bins)

In [None]:
# First-half rows of the rescaled table: observed share of draws per bin versus
# the mean rescaled draw probability in that bin.
first_half_normalized_prob_df = normalized_prob_df[normalized_prob_df["Half"] == "1st-half"]

by_bin = first_half_normalized_prob_df.groupby("Bin", observed=False)
bin_totals = by_bin.size()
bin_draws = (
    first_half_normalized_prob_df[first_half_normalized_prob_df["Result"] == "X"]
    .groupby("Bin", observed=False)
    .size()
)

# Bins without any rows yield 0/0 -> NaN here.
estimated_draws = bin_draws / bin_totals

bookmarker_draws = by_bin["Pr{Draw}"].mean()

fig, ax = plt.subplots(figsize=(12, 6))

ax.bar(bin_totals.index.astype(str), estimated_draws, width=0.2, align="center",
       edgecolor="black", color="green", label="Estimated Draw Rate")

# Categorical bars sit at x = 0, 1, 2, ...; draw one dashed segment over each.
positions = np.arange(len(bookmarker_draws))
ax.hlines(
    y=bookmarker_draws,
    xmin=positions - 0.2,
    xmax=positions + 0.2,
    colors="red",
    linestyles="--",
    label="Bookmaker Draw Rate Lines")  # legend typo "Bookmarker" corrected

ax.legend(loc="upper right")
ax.set_xlabel("Home Win - Away Win")
ax.set_ylabel("Estimated Probability of Draws")
ax.set_title("Estimated Draw Rates for Different Home Win - Away Win (Actual) Probabilities in the 1st Half")

plt.show()

In [None]:
# Second-half rows of the rescaled table: observed share of draws per bin versus
# the mean rescaled draw probability in that bin.
second_half_normalized_prob_df = normalized_prob_df[normalized_prob_df["Half"] == "2nd-half"]

by_bin = second_half_normalized_prob_df.groupby("Bin", observed=False)
bin_totals = by_bin.size()
bin_draws = (
    second_half_normalized_prob_df[second_half_normalized_prob_df["Result"] == "X"]
    .groupby("Bin", observed=False)
    .size()
)

# Bins without any rows yield 0/0 -> NaN here.
estimated_draws = bin_draws / bin_totals

bookmarker_draws = by_bin["Pr{Draw}"].mean()

fig, ax = plt.subplots(figsize=(12, 6))

ax.bar(bin_totals.index.astype(str), estimated_draws, width=0.2, align="center",
       edgecolor="black", color="green", label="Estimated Draw Rate")

# Categorical bars sit at x = 0, 1, 2, ...; draw one dashed segment over each.
positions = np.arange(len(bookmarker_draws))
ax.hlines(
    y=bookmarker_draws,
    xmin=positions - 0.2,
    xmax=positions + 0.2,
    colors="red",
    linestyles="--",
    label="Bookmaker Draw Rate Lines")  # legend typo "Bookmarker" corrected

ax.legend(loc="upper right")
ax.set_xlabel("Home Win - Away Win")
ax.set_ylabel("Estimated Probability of Draws")
ax.set_title("Estimated Draw Rates for Different Home Win - Away Win (Actual) Probabilities in the 2nd Half")

plt.show()

In [None]:
# Flag rows that show an unusual early or late event (goal, red card, yellow
# card, injury, substitution). All flags are computed with vectorised column
# operations instead of a Python lambda per row.
#
# NOTE(review): the thresholds treat `minute` as restarting in each half
# (2nd-half minute >= 40 is labelled "after 80") -- confirm against the data.
# NOTE(review): flags are per row, not per match; a match with an early goal
# only has its first-ten-minute rows flagged.


def _side_total(frame, *stats):
    """Return the row-wise sum of the home and away columns of each statistic.

    Columns are looked up as "<stat> - home" and "<stat> - away".
    """
    columns = [f"{stat} - {side}" for stat in stats for side in ("home", "away")]
    return sum(frame[column] for column in columns)


def _early_event(frame, totals, before_minute):
    """Mark first-half rows before `before_minute` whose total is at least 1."""
    return (
        (frame["halftime"] == "1st-half")
        & (frame["minute"] < before_minute)
        & (totals >= 1)
    )


def _late_increase(frame, totals, from_minute):
    """Mark second-half rows from `from_minute` on whose total exceeds the
    fixture's maximum total seen earlier in the second half.

    A fixture without any earlier second-half row has no baseline; its rows
    are left unflagged (the previous per-row lookup raised IndexError there).
    """
    second_half = frame["halftime"] == "2nd-half"
    earlier = second_half & (frame["minute"] < from_minute)
    # Per-row baseline: max of the totals over the fixture's earlier rows
    # (NaN when the fixture has none, which makes the comparison False).
    baseline = totals.where(earlier).groupby(frame["fixture_id"]).transform("max")
    return second_half & (frame["minute"] >= from_minute) & (totals > baseline)


goals = _side_total(df, "Goals")
# Straight red cards and second-yellow red cards are counted together.
red_cards = _side_total(df, "Redcards", "Yellowred Cards")
yellow_cards = _side_total(df, "Yellowcards")
injuries = _side_total(df, "Injuries")
substitutions = _side_total(df, "Substitutions")

# assign() builds a new frame, so this also works when `df` is a filtered
# slice of an earlier frame (no SettingWithCopyWarning).
df = df.assign(
    Goal_in_first_10_mins=_early_event(df, goals, 10),
    Goal_after_80_mins=_late_increase(df, goals, 40),
    Redcard_in_first_15_mins=_early_event(df, red_cards, 15),
    Red_cards_after_75_mins=_late_increase(df, red_cards, 35),
    Yellowcard_in_first_10_mins=_early_event(df, yellow_cards, 10),
    Injuries_in_first_30_mins=_early_event(df, injuries, 30),
    Injuries_after_75_mins=_late_increase(df, injuries, 35),
    Substutions_in_first_30_mins=_early_event(df, substitutions, 30),
)

In [None]:
# Report, for every event flag, how many distinct matches (fixture_id values)
# contain at least one flagged row. Summing the flag column alone counts rows,
# not matches, so the row count is shown separately.
event_flags = {
    "a goal before 10 minutes": "Goal_in_first_10_mins",
    "a goal after 80 minutes": "Goal_after_80_mins",
    "a red card before 15 minutes": "Redcard_in_first_15_mins",
    "a red card after 75 minutes": "Red_cards_after_75_mins",
    "a yellow card before 10 minutes": "Yellowcard_in_first_10_mins",
    "an injury before 30 minutes": "Injuries_in_first_30_mins",
    "an injury after 75 minutes": "Injuries_after_75_mins",
    "a substitution before 30 minutes": "Substutions_in_first_30_mins",
}

for description, flag in event_flags.items():
    flagged_rows = df[flag].sum()
    flagged_matches = df.loc[df[flag], "fixture_id"].nunique()
    print(f"Number of matches with {description}: {flagged_matches} (flagged rows: {flagged_rows})")

In [None]:
# Drop every row that carries at least one event flag, then discard the flag
# columns themselves.
# NOTE(review): only the flagged rows are removed; the other rows of the same
# fixture stay in `df`, although the message below says "matches" -- confirm
# that row-level filtering is what is wanted.
rows_before = len(df)
print(f"Current number of rows: {rows_before}")

columns_to_check = [
    "Goal_in_first_10_mins",
    "Redcard_in_first_15_mins",
    "Yellowcard_in_first_10_mins",
    "Injuries_in_first_30_mins",
    "Goal_after_80_mins",
    "Substutions_in_first_30_mins",
    "Red_cards_after_75_mins",
    "Injuries_after_75_mins",
]

# Keep a row only when all of its flags are False.
mask = df[columns_to_check].eq(False).all(axis=1)
df = df[mask].drop(columns=columns_to_check)

print(f"Number of rows after removing matches with events: {len(df)}")
print(f"Difference in number of rows: {rows_before - len(df)}")

In [None]:
# Rebuild both probability tables from the rows that survived the filtering.
results = df["result"].copy()
half = df["halftime"].copy()

# Select the odds columns by name in a fixed order rather than slicing
# "1":"X" by label, because the rename below is positional (home, away, draw).
df_odds = df[["1", "2", "X"]].copy()

# Implied probability = reciprocal of the odds.
prob_df = 1 / df_odds
prob_df.columns = ["Pr{Home Win}", "Pr{Away Win}", "Pr{Draw}"]

# Rescale each row so that its three probabilities sum to 1.
normalized_prob_df = prob_df.div(prob_df.sum(axis=1), axis=0)

# Re-attach the outcome and the half to both tables (aligned on the index).
for table in (prob_df, normalized_prob_df):
    table["Result"] = results
    table["Half"] = half

In [None]:
# Home-minus-away gap of the raw implied probabilities, cut into the shared
# `bins` edges defined earlier in the notebook.
raw_gap = prob_df["Pr{Home Win}"].sub(prob_df["Pr{Away Win}"])
prob_df["Pr{Home Win} - Pr{Away Win}"] = raw_gap
prob_df["Bin"] = pd.cut(prob_df["Pr{Home Win} - Pr{Away Win}"], bins=bins)

In [None]:
# First-half rows of the current prob_df: observed share of draws
# (Result == "X") per bin versus the mean raw implied draw probability.
first_half_prob_df = prob_df[prob_df["Half"] == "1st-half"]

by_bin = first_half_prob_df.groupby("Bin", observed=False)
bin_totals = by_bin.size()
bin_draws = (
    first_half_prob_df[first_half_prob_df["Result"] == "X"]
    .groupby("Bin", observed=False)
    .size()
)

# Bins without any rows yield 0/0 -> NaN here.
estimated_draws = bin_draws / bin_totals

bookmarker_draws = by_bin["Pr{Draw}"].mean()

fig, ax = plt.subplots(figsize=(12, 6))

ax.bar(bin_totals.index.astype(str), estimated_draws, width=0.2, align="center",
       edgecolor="black", color="green", label="Estimated Draw Rate")

# Categorical bars sit at x = 0, 1, 2, ...; draw one dashed segment over each.
positions = np.arange(len(bookmarker_draws))
ax.hlines(
    y=bookmarker_draws,
    xmin=positions - 0.2,
    xmax=positions + 0.2,
    colors="red",
    linestyles="--",
    label="Bookmaker Draw Rate Lines")  # legend typo "Bookmarker" corrected

ax.legend(loc="upper right")
ax.set_xlabel("Home Win - Away Win")
ax.set_ylabel("Estimated Probability of Draws")
ax.set_title("Estimated Draw Rates for Different Home Win - Away Win Probabilities in the 1st Half")

plt.show()

In [None]:
# Second-half rows of the current prob_df: observed share of draws
# (Result == "X") per bin versus the mean raw implied draw probability.
second_half_prob_df = prob_df[prob_df["Half"] == "2nd-half"]

by_bin = second_half_prob_df.groupby("Bin", observed=False)
bin_totals = by_bin.size()
bin_draws = (
    second_half_prob_df[second_half_prob_df["Result"] == "X"]
    .groupby("Bin", observed=False)
    .size()
)

# Bins without any rows yield 0/0 -> NaN here.
estimated_draws = bin_draws / bin_totals

bookmarker_draws = by_bin["Pr{Draw}"].mean()

fig, ax = plt.subplots(figsize=(12, 6))

ax.bar(bin_totals.index.astype(str), estimated_draws, width=0.2, align="center",
       edgecolor="black", color="green", label="Estimated Draw Rate")

# Categorical bars sit at x = 0, 1, 2, ...; draw one dashed segment over each.
positions = np.arange(len(bookmarker_draws))
ax.hlines(
    y=bookmarker_draws,
    xmin=positions - 0.2,
    xmax=positions + 0.2,
    colors="red",
    linestyles="--",
    label="Bookmaker Draw Rate Lines")  # legend typo "Bookmarker" corrected

ax.legend(loc="upper right")
ax.set_xlabel("Home Win - Away Win")
ax.set_ylabel("Estimated Probability of Draws")
ax.set_title("Estimated Draw Rates for Different Home Win - Away Win Probabilities in the 2nd Half")

plt.show()

In [None]:
# Home-minus-away gap of the rescaled probabilities, cut into the shared
# `bins` edges defined earlier in the notebook.
rescaled_gap = normalized_prob_df["Pr{Home Win}"].sub(normalized_prob_df["Pr{Away Win}"])
normalized_prob_df["Pr{Home Win} - Pr{Away Win}"] = rescaled_gap
normalized_prob_df["Bin"] = pd.cut(normalized_prob_df["Pr{Home Win} - Pr{Away Win}"], bins=bins)

In [None]:
# First-half rows of the current rescaled table: observed share of draws per
# bin versus the mean rescaled draw probability in that bin.
first_half_normalized_prob_df = normalized_prob_df[normalized_prob_df["Half"] == "1st-half"]

by_bin = first_half_normalized_prob_df.groupby("Bin", observed=False)
bin_totals = by_bin.size()
bin_draws = (
    first_half_normalized_prob_df[first_half_normalized_prob_df["Result"] == "X"]
    .groupby("Bin", observed=False)
    .size()
)

# Bins without any rows yield 0/0 -> NaN here.
estimated_draws = bin_draws / bin_totals

bookmarker_draws = by_bin["Pr{Draw}"].mean()

fig, ax = plt.subplots(figsize=(12, 6))

ax.bar(bin_totals.index.astype(str), estimated_draws, width=0.2, align="center",
       edgecolor="black", color="green", label="Estimated Draw Rate")

# Categorical bars sit at x = 0, 1, 2, ...; draw one dashed segment over each.
positions = np.arange(len(bookmarker_draws))
ax.hlines(
    y=bookmarker_draws,
    xmin=positions - 0.2,
    xmax=positions + 0.2,
    colors="red",
    linestyles="--",
    label="Bookmaker Draw Rate Lines")  # legend typo "Bookmarker" corrected

ax.legend(loc="upper right")
ax.set_xlabel("Home Win - Away Win")
ax.set_ylabel("Estimated Probability of Draws")
ax.set_title("Estimated Draw Rates for Different Home Win - Away Win (Actual) Probabilities in the 1st Half")

plt.show()

In [None]:
# Second-half rows of the current rescaled table: observed share of draws per
# bin versus the mean rescaled draw probability in that bin.
second_half_normalized_prob_df = normalized_prob_df[normalized_prob_df["Half"] == "2nd-half"]

by_bin = second_half_normalized_prob_df.groupby("Bin", observed=False)
bin_totals = by_bin.size()
bin_draws = (
    second_half_normalized_prob_df[second_half_normalized_prob_df["Result"] == "X"]
    .groupby("Bin", observed=False)
    .size()
)

# Bins without any rows yield 0/0 -> NaN here.
estimated_draws = bin_draws / bin_totals

bookmarker_draws = by_bin["Pr{Draw}"].mean()

fig, ax = plt.subplots(figsize=(12, 6))

ax.bar(bin_totals.index.astype(str), estimated_draws, width=0.2, align="center",
       edgecolor="black", color="green", label="Estimated Draw Rate")

# Categorical bars sit at x = 0, 1, 2, ...; draw one dashed segment over each.
positions = np.arange(len(bookmarker_draws))
ax.hlines(
    y=bookmarker_draws,
    xmin=positions - 0.2,
    xmax=positions + 0.2,
    colors="red",
    linestyles="--",
    label="Bookmaker Draw Rate Lines")  # legend typo "Bookmarker" corrected

ax.legend(loc="upper right")
ax.set_xlabel("Home Win - Away Win")
ax.set_ylabel("Estimated Probability of Draws")
ax.set_title("Estimated Draw Rates for Different Home Win - Away Win (Actual) Probabilities in the 2nd Half")

plt.show()

In [None]:
# Encode the outcome of every row as a class id:
# "1" -> 1, "2" -> 2, anything else -> 0.
result_to_class = {"1": 1, "2": 2}
target = [result_to_class.get(outcome, 0) for outcome in df["result"]]

In [None]:
# Build the model input table from the filtered snapshots.
# Column order matters: the classifier and the later feature-name listings
# both rely on feature_df.columns.
columns_to_drop = [
    "result", "fixture_id", "current_time", "half_start_datetime",
    "match_start_datetime", "latest_bookmaker_update", "suspended", "stopped",
    "1", "2", "X", "final_score", "Assists - away", "Assists - home", "name",
    "second", "ticking", "Penalties - home", "Penalties - away",
]

# minute_no_half: 1-based position of the row within its fixture.
feature_df = (
    df.assign(minute_no_half=df.groupby("fixture_id").cumcount() + 1)
    .drop(columns=columns_to_drop)
)

# Re-encode the remaining non-numeric columns in place (position unchanged).
state_codes = {"1": 1.0, "2": 2.0}
feature_df = feature_df.assign(
    minute=feature_df["minute"].astype(float),
    # "1st-half" -> 1.0, every other value -> 2.0
    halftime=np.where(feature_df["halftime"] == "1st-half", 1.0, 2.0),
    # "1" -> 1.0, "2" -> 2.0, anything else -> 0
    current_state=[state_codes.get(state, 0) for state in feature_df["current_state"]],
)

# Derived features, appended in this order.
derived = {"Total Goals": feature_df["Goals - away"] + feature_df["Goals - home"]}
for label, stat in (
    ("Goal Difference", "Goals"),
    ("Dangerous Attacks Difference", "Dangerous Attacks"),
    ("Ball Possession % Difference", "Ball Possession %"),
    ("Goal Attempts Difference", "Goal Attempts"),
):
    derived[label] = feature_df[f"{stat} - home"] - feature_df[f"{stat} - away"]
feature_df = feature_df.assign(**derived)

feature_df

In [None]:
# Small, readable tree: at most 15 leaves and depth 10, and a split is kept
# only when it lowers the impurity by at least 1e-5.
# random_state is fixed so that ties between equally good splits are broken
# the same way on every run and the fitted tree is reproducible.
classifier = DecisionTreeClassifier(
    min_impurity_decrease=1e-5,
    max_depth=10,
    max_leaf_nodes=15,
    random_state=42,
)

classifier.fit(feature_df, target)

In [None]:
# Score the tree on feature_df, the same rows it is fitted on elsewhere in
# this notebook, so this is an in-sample accuracy.
predictions = classifier.predict(feature_df)

accuracy = accuracy_score(target, predictions)

# Ask for the exact number of matching labels instead of rebuilding it as
# int(len(target) * accuracy), which can truncate to one below the true count
# because of floating-point rounding.
correct_guesses = int(accuracy_score(target, predictions, normalize=False))

print(f"Accuracy: {accuracy}")
print(f"Correct Guesses: {correct_guesses}")
print(f"Wrong Guesses: {len(target) - correct_guesses}")

In [None]:
# Draw the fitted tree. class_names follows the class ids used in `target`:
# 0 = draw, 1 = home win, 2 = away win.
plt.figure(figsize=(30, 15))

plot_tree(
    classifier,
    # A plain list is accepted by every scikit-learn release, whereas a
    # pandas Index is rejected by the parameter validation in 1.3.0.
    feature_names=list(feature_df.columns),
    class_names=["Draw", "Home Win", "Away Win"],
    filled=True,
    fontsize=10,
    rounded=True)

plt.show()

In [None]:
# List every feature the tree actually used (importance above zero), in
# column order.
print("Feature Importances:")

used_features = (
    (name, weight)
    for name, weight in zip(feature_df.columns, classifier.feature_importances_)
    if weight > 0
)
for name, weight in used_features:
    print(f"{name}: {weight:.5f}")

In [None]:
# Put the tree's class probabilities next to the rescaled implied
# probabilities, both in the order draw / home win / away win.
outcomes = ["Draw", "Home Win", "Away Win"]

predict_probs = classifier.predict_proba(feature_df)
predicted_prob_df = pd.DataFrame(
    predict_probs,
    columns=[f"Pred{{{outcome}}}" for outcome in outcomes])

# Reciprocal odds, reordered to match the class order, then rescaled per row.
df_odds = df.loc[:, ["X", "1", "2"]].copy()
prob_df = df_odds.rdiv(1)
prob_df.columns = [f"Pr{{{outcome}}}" for outcome in outcomes]
normalized_prob_df = prob_df.div(prob_df.sum(axis=1), axis=0)

# Both tables are re-indexed from 0 so the rows pair up by position.
side_by_side = [
    normalized_prob_df.reset_index(drop=True),
    predicted_prob_df.reset_index(drop=True),
]
compare_df = pd.concat(side_by_side, axis=1)

# Difference = tree prediction minus implied probability.
for outcome in outcomes:
    compare_df[f"Diff{{{outcome}}}"] = compare_df[f"Pred{{{outcome}}}"] - compare_df[f"Pr{{{outcome}}}"]

compare_df

In [None]:
# Overlay the three difference distributions (prediction minus implied).
fig, ax = plt.subplots(figsize=(10, 6))

for column, colour, legend_label in (
    ("Diff{Home Win}", "black", "Home Win"),
    ("Diff{Away Win}", "red", "Away Win"),
    ("Diff{Draw}", "yellow", "Draw"),
):
    compare_df[column].hist(ax=ax, bins=50, alpha=0.5, color=colour,
                            edgecolor="black", label=legend_label)

ax.set_title("Distribution of Differences for Predicted vs. Implied Probabilities", fontsize=14)
ax.set_xlabel("Difference (Predicted - Implied)", fontsize=12)
ax.set_ylabel("Frequency", fontsize=12)

ax.grid(alpha=0.3)
ax.legend(loc="upper right")
plt.show()

In [None]:
# Fit a tree without any size limits and score it on the rows it was fitted
# on (in-sample accuracy).
# random_state is fixed so that ties between equally good splits are broken
# the same way on every run.
classifier = DecisionTreeClassifier(random_state=42)

classifier.fit(feature_df, target)

predictions = classifier.predict(feature_df)
accuracy = accuracy_score(target, predictions)

# Exact count of matching labels; int(len(target) * accuracy) can truncate to
# one below the true count because of floating-point rounding.
correct_guesses = int(accuracy_score(target, predictions, normalize=False))

print(f"Accuracy: {accuracy}")
print(f"Correct Guesses: {correct_guesses}")
print(f"Wrong Guesses: {len(target) - correct_guesses}")

In [None]:
# Collect the features the tree actually used (importance above zero) and
# list them from most to least important.
importance_list = [
    (feature, importance)
    for feature, importance in zip(feature_df.columns, classifier.feature_importances_)
    if importance > 0
]
# Sort once after collecting; the sort previously ran on every loop iteration.
importance_list.sort(key=lambda item: item[1], reverse=True)

print("Feature Importances:")
for feature, importance in importance_list:
    print(f"{feature}: {importance:.5f}")

In [None]:
# Repeat the prediction-versus-odds comparison for the current classifier:
# build the comparison table, then plot the three difference distributions.
outcomes = ["Draw", "Home Win", "Away Win"]

predict_probs = classifier.predict_proba(feature_df)
predicted_prob_df = pd.DataFrame(
    predict_probs,
    columns=[f"Pred{{{outcome}}}" for outcome in outcomes])

# Reciprocal odds in class order (draw, home win, away win), rescaled per row.
df_odds = df.loc[:, ["X", "1", "2"]].copy()
prob_df = df_odds.rdiv(1)
prob_df.columns = [f"Pr{{{outcome}}}" for outcome in outcomes]
normalized_prob_df = prob_df.div(prob_df.sum(axis=1), axis=0)

# Both tables are re-indexed from 0 so the rows pair up by position.
side_by_side = [
    normalized_prob_df.reset_index(drop=True),
    predicted_prob_df.reset_index(drop=True),
]
compare_df = pd.concat(side_by_side, axis=1)

# Difference = tree prediction minus implied probability.
for outcome in outcomes:
    compare_df[f"Diff{{{outcome}}}"] = compare_df[f"Pred{{{outcome}}}"] - compare_df[f"Pr{{{outcome}}}"]

# (A bare `compare_df` in the middle of a cell displays nothing, so it is
# omitted; inspect the table from a cell of its own.)

fig, ax = plt.subplots(figsize=(10, 6))

for column, colour, legend_label in (
    ("Diff{Home Win}", "black", "Home Win"),
    ("Diff{Away Win}", "red", "Away Win"),
    ("Diff{Draw}", "yellow", "Draw"),
):
    compare_df[column].hist(ax=ax, bins=50, alpha=0.5, color=colour,
                            edgecolor="black", label=legend_label)

ax.set_title("Distribution of Differences for Predicted vs. Implied Probabilities", fontsize=14)
ax.set_xlabel("Difference (Predicted - Implied)", fontsize=12)
ax.set_ylabel("Frequency", fontsize=12)

ax.grid(alpha=0.3)
ax.legend(loc="upper right")
plt.show()