In [1]:
import pandas as pd # data processing, CSV file
import datetime # to split the data into train and test
import matplotlib.pyplot as plt # for plotting
import seaborn as sns # for plotting
import numpy as np # for mathematical operations
import warnings # to reduce warnings if needed
from sklearn.tree import DecisionTreeClassifier # for building decision tree model
from sklearn.ensemble import RandomForestClassifier # for building random forest model
from xgboost import XGBClassifier # for building XGBoost model
from sklearn.model_selection import GridSearchCV # for parameter tuning
from sklearn.metrics import accuracy_score, confusion_matrix # for model evaluation
from sklearn.model_selection import train_test_split # for splitting the data into two sets; validation and training
import random
# Seed both global random number generators with the same value so that
# repeated runs of the notebook draw the same random numbers.
SEED = 2512
for seed_generator in (random.seed, np.random.seed):
    seed_generator(SEED)

In [None]:
data_loc = "Project/Data/match_data"

# Read the raw match data.
df = pd.read_csv(data_loc)

# Keep only the rows in which BOTH status flags are False, then drop the two
# flag columns: after the filter they hold a single value and carry no signal.
status_flags = ["suspended", "stopped"]
mask = df[status_flags].eq(False).all(axis=1)
df = df.loc[mask].drop(columns=status_flags)

df.head(5)

In [None]:
# How often each value of the `result` column occurs (most frequent first).
result_counts = df.loc[:, "result"].value_counts()

result_counts

In [None]:
# Count the missing values once and report every column that has any.
missing_counts = df.isna().sum()
columns_with_gaps = missing_counts[missing_counts > 0]

for col, n_missing in columns_with_gaps.items():
    print("{col} has {n} missing values".format(col=col, n=n_missing))

# Fill numeric gaps with the mean of the same column *within the same fixture*
# (not the global column mean). Columns are skipped when:
#   * they are `current_state`, which is categorical and handled below;
#   * they are not numeric, because a mean is undefined for text and
#     `x.mean()` would raise on such a column.
for column in columns_with_gaps.index:
    if column == 'current_state' or not pd.api.types.is_numeric_dtype(df[column]):
        continue
    df[column] = df.groupby('fixture_id', observed=True)[column].transform(lambda x: x.fillna(x.mean()))

# Forward fill current_state inside each fixture. Rows that precede the first
# known state of a fixture stay missing after this step.
df['current_state'] = df.groupby('fixture_id')['current_state'].ffill()

In [None]:
# Re-check after imputation: list every column that still has missing values.
remaining_missing = df.isna().sum()
for col, n_missing in remaining_missing[remaining_missing > 0].items():
    print("{col} has {n} missing values".format(col=col, n=n_missing))

In [6]:
# Rows whose current_state is still unknown cannot be encoded, so drop them.
df = df.dropna(subset=["current_state"])

# Integer-encode current_state: "1" -> 1, "2" -> 2, every other value -> 0.
state_codes = {"1": 1, "2": 2}
df["current_state"] = df["current_state"].map(state_codes).fillna(0).astype("int64")

In [7]:
# Build the derived match statistics in one frame so they can be attached with
# a single concat: "Total ..." columns add the home and away values,
# "... Difference" columns are home minus away.
new_columns = pd.DataFrame({
    "Total Goals": df["Goals - home"] + df["Goals - away"],
    "Total Red Cards": df["Redcards - home"] + df["Redcards - away"],
    "Total Yellow Cards": df["Yellowcards - home"] + df["Yellowcards - away"],
    "Total Injuries": df["Injuries - home"] + df["Injuries - away"],
    "Total Substitutions": df["Substitutions - home"] + df["Substitutions - away"],
    "Goal Difference": df["Goals - home"] - df["Goals - away"],
    "Dangerous Attacks Difference": df["Dangerous Attacks - home"] - df["Dangerous Attacks - away"],
    "Ball Possession % Difference": df["Ball Possession % - home"] - df["Ball Possession % - away"],
    "Goal Attempts Difference": df["Goal Attempts - home"] - df["Goal Attempts - away"],
    "Successful Passes % Difference": df["Successful Passes Percentage - home"] - df["Successful Passes Percentage - away"]
})

# Concatenate the new columns with the original DataFrame
df = pd.concat([df, new_columns], axis=1)

# Turn the three odds columns into probabilities that sum to 1 per row.
# The columns are selected by name ("1" = home, "2" = away, "X" = draw) rather
# than with a label slice, so the probability names below cannot be attached
# to the wrong odds if the column order of the input file changes.
df_odds = df[["1", "2", "X"]]
prob_df = 1 / df_odds
prob_df.columns = ["Pr{Home Win}", "Pr{Away Win}", "Pr{Draw}"]
# Dividing by the row sum removes the bookmaker margin from the raw 1/odd values.
normalized_prob_df = prob_df.div(prob_df.sum(axis=1), axis=0)
df = pd.concat([df, normalized_prob_df], axis=1)

In [None]:
def _flag_late_events(frame, total_col, cutoff_minute, flag_col):
    """Return `frame` with a boolean `flag_col` marking late 2nd-half events.

    For every fixture, the largest value of `total_col` seen in the 2nd half
    before `cutoff_minute` is used as a baseline. A row is flagged when it is
    in the 2nd half at or after `cutoff_minute` and its `total_col` exceeds
    that baseline. Fixtures without a baseline row are never flagged.
    The returned frame comes out of a merge, so it has a fresh integer index.
    """
    earlier = frame[(frame["minute"] < cutoff_minute) & (frame["halftime"] == "2nd-half")]
    peak_rows = earlier.groupby("fixture_id")[total_col].idxmax()
    baseline = earlier.loc[peak_rows, ["fixture_id", total_col]]

    merged = frame.merge(baseline, on="fixture_id", how="left", suffixes=("", "_max"))
    baseline_col = f"{total_col}_max"
    merged[flag_col] = (
        (merged["minute"] >= cutoff_minute)
        & (merged["halftime"] == "2nd-half")
        & (merged[total_col] > merged[baseline_col])
    )
    return merged.drop(columns=[baseline_col])


# Late-match events.
# NOTE(review): the cutoffs 40 and 35 are applied to `minute` inside the 2nd
# half while the flags are named "after 80" / "after 75" - confirm how
# `minute` is measured.
df = _flag_late_events(df, "Total Goals", 40, "Goal_after_80_mins")
df = _flag_late_events(df, "Total Red Cards", 35, "Red_cards_after_75_mins")

# Early-match events: flag name -> (running-total column, minute limit).
# A row is flagged when it lies in the 1st half before the limit and the
# running total is already at least 1.
early_event_specs = {
    "Goal_in_first_10_mins": ("Total Goals", 10),
    "Redcard_in_first_15_mins": ("Total Red Cards", 15),
    "Yellowcard_in_first_10_mins": ("Total Yellow Cards", 10),
    "Injuries_in_first_15_mins": ("Total Injuries", 15),
    "Substutions_in_first_30_mins": ("Total Substitutions", 30),
}
in_first_half = df["halftime"] == "1st-half"
for flag_col, (total_col, minute_limit) in early_event_specs.items():
    df[flag_col] = in_first_half & (df["minute"] < minute_limit) & (df[total_col] >= 1)

# List of columns to check
columns_to_check = [
    "Goal_in_first_10_mins",
    "Redcard_in_first_15_mins",
    "Yellowcard_in_first_10_mins",
    "Injuries_in_first_15_mins",
    "Goal_after_80_mins",
    "Substutions_in_first_30_mins",
    "Red_cards_after_75_mins"
]

# Report how many rows carry each flag and the size of the frame before filtering.
for col in columns_to_check:
    print(f"Number of {col} events: {df[col].sum()}")
print("\n")
print(f"Current number of rows: {len(df)}")

# Drop every flagged row (individual rows, not whole fixtures), then remove
# the helper flag columns again.
has_event = df[columns_to_check].any(axis=1)
df = df.loc[~has_event].drop(columns=columns_to_check)

# Print updated row count
print(f"Number of rows after removing matches with events: {len(df)}")


In [None]:
# Preview the first five rows of the filtered frame.
df.head()

In [None]:
# Calendar date of each match, derived from its start timestamp.
df['match_date'] = pd.to_datetime(df['match_start_datetime']).dt.date

# Matches played on or after this date form the test set, earlier ones the training set.
test_train_split_date = datetime.date(2024, 11, 1)

on_or_after_split = df['match_date'] >= test_train_split_date
before_split = df['match_date'] < test_train_split_date

test_df = df.loc[on_or_after_split].copy()
train_df = df.loc[before_split].copy()

# Row count and number of distinct fixtures for each subset.
for position, (label, subset) in enumerate([("training", train_df), ("test", test_df)]):
    if position > 0:
        print("\n")
    print(f"Number of rows in the {label} set: {len(subset)}")
    print(f"Number of matches in the {label} set: {subset['fixture_id'].nunique()}")

In [None]:
# Identifier, timestamp, free-text and outcome columns that must not be used as features.
unrelated_columns = [
    "fixture_id", "current_time", "half_start_datetime", 
    "match_start_datetime", "latest_bookmaker_update", "name", 
    "result", "match_date", "final_score"]

# Feature matrix of the full training set: drop the non-feature columns and
# one-hot encode whatever categorical columns remain.
X_full_train = train_df.copy()
X_full_train = X_full_train.drop(columns=unrelated_columns)
X_full_train = pd.get_dummies(X_full_train, drop_first=True)

# Target encoding: result "1" -> 1, result "2" -> 2, anything else -> 0.
y_full_train = np.select(
    [train_df['result'] == "1", train_df['result'] == "2"],
    [1, 2],
    default=0
)

# Stratified 70/30 split of the full training set into training and validation parts.
X_train, X_val, y_train, y_val = train_test_split(
    X_full_train, y_full_train, test_size=0.3, stratify=y_full_train, random_state = 2512
)

# Feature matrix of the test set, encoded the same way.
X_test = test_df.copy()
X_test = X_test.drop(columns=unrelated_columns)
X_test = pd.get_dummies(X_test, drop_first=True)

# get_dummies only creates columns for the categories present in the frame it
# is given, so the test matrix can end up with different (or differently
# ordered) columns than the training matrix. Align it to the training columns:
# dummies missing from the test data become all-zero columns, and dummies
# never seen in training are dropped because the models cannot use them.
X_test = X_test.reindex(columns=X_full_train.columns, fill_value=0)

y_test = np.select(
    [test_df['result'] == "1", test_df['result'] == "2"],
    [1, 2],
    default=0)

# Print dataset sizes
print(f"Number of rows in the training set (split): {len(X_train)}")
print(f"Number of rows in the validation set: {len(X_val)}")
print(f"Number of rows in the test set: {len(X_test)}")

In [12]:
# Hyper-parameter values explored by the grid search for the Decision Tree.
param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 5],
    max_features=['sqrt', 'log2'],
    class_weight=['balanced', None],
)

In [None]:
# Exhaustive 10-fold cross-validated search over `param_grid` for a Decision Tree.
dt = DecisionTreeClassifier(random_state=2512)

search_settings = {
    "cv": 10,
    "n_jobs": -1,                 # use every available core
    "return_train_score": True,   # needed later for the train-vs-test score table
    "verbose": 1,
}
grid_search = GridSearchCV(dt, param_grid, **search_settings)

# NOTE(review): the search (and therefore the refitted best estimator) is
# fitted on the 30% validation split; X_train / y_train are not used here.
# Confirm that this is intentional.
grid_search.fit(X_val, y_val)

In [None]:
# Best estimator found by the search, together with its hyper-parameters.
best_dt = grid_search.best_estimator_
best_params = grid_search.best_params_

# Class predictions for the full training set and for the test set.
train_predictions, test_predictions = (
    best_dt.predict(features) for features in (X_full_train, X_test)
)

# Share of correctly predicted rows in each set. The "training" figure is
# measured on X_full_train, which contains the rows the model was fitted on.
train_accuracy_dt = accuracy_score(y_full_train, train_predictions)
test_accuracy_dt = accuracy_score(y_test, test_predictions)

print("\n".join([
    f"Best Parameters: {best_params}",
    f"Training Accuracy: {train_accuracy_dt:.4f}",
    f"Test Accuracy: {test_accuracy_dt:.4f}",
]))

In [None]:
# Cross-validation scores of every parameter combination tried by the search.
# "test" here means the held-out CV folds, not the final test set.
results = grid_search.cv_results_
params = results['params']
mean_test_accuracy = results['mean_test_score']
mean_train_accuracy = results['mean_train_score']

# One row per combination, ordered from highest to lowest mean training score.
results_df = (
    pd.DataFrame(params)
    .assign(
        mean_test_accuracy=mean_test_accuracy,
        mean_train_accuracy=mean_train_accuracy,
    )
    .sort_values(by='mean_train_accuracy', ascending=False)
    .reset_index(drop=True)
)

results_df

In [None]:
# Mean CV train and test accuracy of every candidate, in the row order of results_df.
fig, ax = plt.subplots(figsize=(8, 6))

for score_column, series_label, line_color in (
    ('mean_test_accuracy', 'Test Accuracy', 'orange'),
    ('mean_train_accuracy', 'Train Accuracy', 'blue'),
):
    ax.plot(results_df[score_column], label=series_label, color=line_color)

ax.set_ylabel('Accuracy')
ax.set_xlabel('Model')
ax.legend(loc='best')
plt.show()

In [None]:
# Importance score the fitted Decision Tree assigns to each input column.
feature_importances = best_dt.feature_importances_

# Pair every score with its column name and rank from most to least important.
feature_importance_df = (
    pd.DataFrame({'Feature': X_val.columns, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
    .reset_index(drop=True)
)

# The ten highest-ranked features.
top_features = feature_importance_df.head(10)

top_features

In [None]:
# Horizontal bar chart of the features held in `top_features`.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=top_features, x='Importance', y='Feature', color='blue', ax=ax)

ax.set_title('Top 10 Feature Importances')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
fig.tight_layout()
plt.show()

In [None]:
# Target codes and their display names, in matching order (0 = draw,
# 1 = home win, 2 = away win).
class_codes = [0, 1, 2]
class_names = ["Draw", "Home Win", "Away Win"]

# Pass the class order explicitly: without `labels` the matrix only contains
# the classes that occur in the data, so a class that is absent from both the
# targets and the predictions would shift the rows/columns relative to the
# fixed tick labels below.
cm = confusion_matrix(y_full_train, train_predictions, labels=class_codes)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix of Training Set')
plt.show()

In [None]:
# Target codes and their display names, in matching order (0 = draw,
# 1 = home win, 2 = away win).
class_codes = [0, 1, 2]
class_names = ["Draw", "Home Win", "Away Win"]

# Pass the class order explicitly: without `labels` the matrix only contains
# the classes that occur in the data, so a class that is absent from both the
# targets and the predictions would shift the rows/columns relative to the
# fixed tick labels below.
cm = confusion_matrix(y_test, test_predictions, labels=class_codes)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix of Test Set')
plt.show()

In [None]:
# Class probabilities predicted by the Decision Tree for every test row.
predict_proba_dt = best_dt.predict_proba(X_test)

# One row per test row: the class probabilities (columns "0", "1", "2"), the
# true and predicted result, the fixture id and the three odds.
betting_df = pd.DataFrame(
    predict_proba_dt,
    columns=[str(class_position) for class_position in range(predict_proba_dt.shape[1])],
).assign(
    real_result=y_test,
    predicted_result=test_predictions,
    fixture_id=test_df["fixture_id"].values,
    Odd_Home=test_df["1"].values,
    Odd_Away=test_df["2"].values,
    Odd_Draw=test_df["X"].values,
)

# (probability column, bet label, odds column), checked in this priority order.
outcome_checks = (
    ("0", "Draw", "Odd_Draw"),
    ("1", "Home Win", "Odd_Home"),
    ("2", "Away Win", "Odd_Away"),
)
min_probability = 0.33

# At most one bet per fixture: walk through the fixture's rows in order and
# take the first outcome whose probability exceeds the threshold, at the odds
# quoted on that row.
betted_rows_dt_1 = {}

for fixture_id, match in betting_df.groupby("fixture_id"):
    first_hit = next(
        (
            {"bet": bet_label, "odd": row[odd_column]}
            for _, row in match.iterrows()
            for probability_column, bet_label, odd_column in outcome_checks
            if row[probability_column] > min_probability
        ),
        None,
    )
    if first_hit is not None:
        betted_rows_dt_1[fixture_id] = first_hit

# A winning bet pays back its odd; a losing bet pays nothing.
result_code_of_bet = {"Draw": 0, "Home Win": 1, "Away Win": 2}
total_return_dt_1 = 0

for fixture_id, betting_detail in betted_rows_dt_1.items():
    fixture_results = betting_df.loc[betting_df["fixture_id"] == fixture_id, "real_result"]
    if fixture_results.values[0] == result_code_of_bet[betting_detail["bet"]]:
        total_return_dt_1 += betting_detail["odd"]

# Every bet costs one unit, so profit = total return - number of bets placed.
profit_dt1 = total_return_dt_1 - len(betted_rows_dt_1)
print(f"{total_return_dt_1:.2f} units of earn")
print(f"{profit_dt1:.2f} units of profit")


In [None]:
# Class probabilities predicted by the Decision Tree for every test row.
predict_proba_dt = best_dt.predict_proba(X_test)

# One row per test row: the class probabilities (columns "0", "1", "2"), the
# true and predicted result, the fixture id and the three odds.
betting_df = pd.DataFrame(
    predict_proba_dt,
    columns=[str(class_position) for class_position in range(predict_proba_dt.shape[1])],
).assign(
    real_result=y_test,
    predicted_result=test_predictions,
    fixture_id=test_df["fixture_id"].values,
    Odd_Home=test_df["1"].values,
    Odd_Away=test_df["2"].values,
    Odd_Draw=test_df["X"].values,
)

# (probability column, bet label, odds column), checked in this priority order.
outcome_checks = (
    ("0", "Draw", "Odd_Draw"),
    ("1", "Home Win", "Odd_Home"),
    ("2", "Away Win", "Odd_Away"),
)
min_probability = 0.5

# At most one bet per fixture: walk through the fixture's rows in order and
# take the first outcome whose probability exceeds the threshold, at the odds
# quoted on that row.
betted_rows_dt_2 = {}

for fixture_id, match in betting_df.groupby("fixture_id"):
    first_hit = next(
        (
            {"bet": bet_label, "odd": row[odd_column]}
            for _, row in match.iterrows()
            for probability_column, bet_label, odd_column in outcome_checks
            if row[probability_column] > min_probability
        ),
        None,
    )
    if first_hit is not None:
        betted_rows_dt_2[fixture_id] = first_hit

# A winning bet pays back its odd; a losing bet pays nothing.
result_code_of_bet = {"Draw": 0, "Home Win": 1, "Away Win": 2}
total_return_dt_2 = 0

for fixture_id, betting_detail in betted_rows_dt_2.items():
    fixture_results = betting_df.loc[betting_df["fixture_id"] == fixture_id, "real_result"]
    if fixture_results.values[0] == result_code_of_bet[betting_detail["bet"]]:
        total_return_dt_2 += betting_detail["odd"]

# Every bet costs one unit, so profit = total return - number of bets placed.
profit_dt2 = total_return_dt_2 - len(betted_rows_dt_2)
print(f"{total_return_dt_2:.2f} units of earn")
print(f"{profit_dt2:.2f} units of profit")

In [None]:
# Class probabilities predicted by the Decision Tree for every test row.
predict_proba_dt = best_dt.predict_proba(X_test)

# One row per test row: the class probabilities (columns "0", "1", "2"), the
# true and predicted result, the fixture id and the three odds.
betting_df = pd.DataFrame(
    predict_proba_dt,
    columns=[str(class_position) for class_position in range(predict_proba_dt.shape[1])],
).assign(
    real_result=y_test,
    predicted_result=test_predictions,
    fixture_id=test_df["fixture_id"].values,
    Odd_Home=test_df["1"].values,
    Odd_Away=test_df["2"].values,
    Odd_Draw=test_df["X"].values,
)

# (probability column, bet label, odds column), checked in this priority order.
outcome_checks = (
    ("0", "Draw", "Odd_Draw"),
    ("1", "Home Win", "Odd_Home"),
    ("2", "Away Win", "Odd_Away"),
)
min_probability = 0.9

# At most one bet per fixture: walk through the fixture's rows in order and
# take the first outcome whose probability exceeds the threshold, at the odds
# quoted on that row.
betted_rows_dt_3 = {}

for fixture_id, match in betting_df.groupby("fixture_id"):
    first_hit = next(
        (
            {"bet": bet_label, "odd": row[odd_column]}
            for _, row in match.iterrows()
            for probability_column, bet_label, odd_column in outcome_checks
            if row[probability_column] > min_probability
        ),
        None,
    )
    if first_hit is not None:
        betted_rows_dt_3[fixture_id] = first_hit

# A winning bet pays back its odd; a losing bet pays nothing.
result_code_of_bet = {"Draw": 0, "Home Win": 1, "Away Win": 2}
total_return_dt_3 = 0

for fixture_id, betting_detail in betted_rows_dt_3.items():
    fixture_results = betting_df.loc[betting_df["fixture_id"] == fixture_id, "real_result"]
    if fixture_results.values[0] == result_code_of_bet[betting_detail["bet"]]:
        total_return_dt_3 += betting_detail["odd"]

# Every bet costs one unit, so profit = total return - number of bets placed.
profit_dt3 = total_return_dt_3 - len(betted_rows_dt_3)
print(f"{total_return_dt_3:.2f} units of earn")
print(f"{profit_dt3:.2f} units of profit")


In [24]:
# Hyper-parameter values explored by the grid search for the Random Forest.
# `n_jobs` has a single value, so it only configures the forest; it is not searched.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 5],
    n_jobs=[-1],
    class_weight=['balanced', None],
)


In [None]:
# Exhaustive 10-fold cross-validated search over `param_grid` for a Random
# Forest, scored by accuracy.
rf = RandomForestClassifier(random_state=123)

search_settings = {
    "cv": 10,
    "scoring": 'accuracy',
    "return_train_score": True,   # keep the training-fold scores in cv_results_
    "n_jobs": -1,                 # the grid also sets n_jobs=-1 on the forest itself
    "verbose": 1,
}
grid_search = GridSearchCV(rf, param_grid, **search_settings)

# NOTE(review): as for the Decision Tree, the search is fitted on the 30%
# validation split rather than on X_train - confirm that this is intentional.
grid_search.fit(X_val, y_val)

In [None]:
# Best estimator found by the search, together with its hyper-parameters.
best_rf = grid_search.best_estimator_
best_params = grid_search.best_params_

# Class predictions for the full training set and for the test set.
train_predictions, test_predictions = (
    best_rf.predict(features) for features in (X_full_train, X_test)
)

# Share of correctly predicted rows in each set. The "training" figure is
# measured on X_full_train, which contains the rows the model was fitted on.
train_accuracy_rf = accuracy_score(y_full_train, train_predictions)
test_accuracy_rf = accuracy_score(y_test, test_predictions)

print("\n".join([
    f"Best Parameters: {best_params}",
    f"Training Accuracy: {train_accuracy_rf:.4f}",
    f"Test Accuracy: {test_accuracy_rf:.4f}",
]))

In [None]:
# Build the score table from the search that was just run. `results_df` was
# created from the Decision Tree search and is never refreshed, so plotting it
# here would show the Decision Tree curves again instead of the Random Forest.
rf_results = grid_search.cv_results_
rf_results_df = pd.DataFrame(rf_results['params'])
rf_results_df['mean_test_accuracy'] = rf_results['mean_test_score']
rf_results_df['mean_train_accuracy'] = rf_results['mean_train_score']

# Same ordering as the Decision Tree table: highest mean training score first.
rf_results_df = rf_results_df.sort_values(by='mean_train_accuracy', ascending=False)
rf_results_df = rf_results_df.reset_index(drop=True)

plt.figure(figsize=(8, 6))

# "Test" is the mean score on the held-out CV folds, not on the final test set.
plt.plot(rf_results_df['mean_test_accuracy'], label='Test Accuracy', color='orange')
plt.plot(rf_results_df['mean_train_accuracy'], label='Train Accuracy', color='blue')

plt.ylabel('Accuracy')
plt.xlabel('Model')
plt.legend(loc = 'best')
plt.show()

In [None]:
# Importance score the fitted Random Forest assigns to each input column.
feature_importances = best_rf.feature_importances_

# Pair every score with its column name and rank from most to least important.
feature_importance_df = (
    pd.DataFrame({'Feature': X_val.columns, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
    .reset_index(drop=True)
)

# The fifteen highest-ranked features.
top_features = feature_importance_df.head(15)

top_features

In [None]:
# Horizontal bar chart of the features held in `top_features`.
plt.figure(figsize=(10, 6))
sns.barplot(
    x='Importance', 
    y='Feature', 
    data=top_features, 
    hue=None,  
    legend=False,  
    color='blue'  
)

# Take the count from the data: the table plotted here holds 15 features, so
# the previously hard-coded "Top 10" title did not match the chart.
plt.title(f'Top {len(top_features)} Feature Importances')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

In [None]:
# Target codes and their display names, in matching order (0 = draw,
# 1 = home win, 2 = away win).
class_codes = [0, 1, 2]
class_names = ["Draw", "Home Win", "Away Win"]

# Pass the class order explicitly: without `labels` the matrix only contains
# the classes that occur in the data, so a class that is absent from both the
# targets and the predictions would shift the rows/columns relative to the
# fixed tick labels below.
cm = confusion_matrix(y_full_train, train_predictions, labels=class_codes)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix of Training Set')
plt.show()

In [None]:
# Target codes and their display names, in matching order (0 = draw,
# 1 = home win, 2 = away win).
class_codes = [0, 1, 2]
class_names = ["Draw", "Home Win", "Away Win"]

# Pass the class order explicitly: without `labels` the matrix only contains
# the classes that occur in the data, so a class that is absent from both the
# targets and the predictions would shift the rows/columns relative to the
# fixed tick labels below.
cm = confusion_matrix(y_test, test_predictions, labels=class_codes)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix of Test Set')
plt.show()

In [None]:
# Class probabilities predicted by the Random Forest for every test row.
predict_proba_rf = best_rf.predict_proba(X_test)

# One row per test row: the class probabilities (columns "0", "1", "2"), the
# true and predicted result, the fixture id and the three odds.
betting_df = pd.DataFrame(
    predict_proba_rf,
    columns=[str(class_position) for class_position in range(predict_proba_rf.shape[1])],
).assign(
    real_result=y_test,
    predicted_result=test_predictions,
    fixture_id=test_df["fixture_id"].values,
    Odd_Home=test_df["1"].values,
    Odd_Away=test_df["2"].values,
    Odd_Draw=test_df["X"].values,
)

# (probability column, bet label, odds column), checked in this priority order.
outcome_checks = (
    ("0", "Draw", "Odd_Draw"),
    ("1", "Home Win", "Odd_Home"),
    ("2", "Away Win", "Odd_Away"),
)
min_probability = 0.33

# At most one bet per fixture: walk through the fixture's rows in order and
# take the first outcome whose probability exceeds the threshold, at the odds
# quoted on that row.
betted_rows_rf_1 = {}

for fixture_id, match in betting_df.groupby("fixture_id"):
    first_hit = next(
        (
            {"bet": bet_label, "odd": row[odd_column]}
            for _, row in match.iterrows()
            for probability_column, bet_label, odd_column in outcome_checks
            if row[probability_column] > min_probability
        ),
        None,
    )
    if first_hit is not None:
        betted_rows_rf_1[fixture_id] = first_hit

# A winning bet pays back its odd; a losing bet pays nothing.
result_code_of_bet = {"Draw": 0, "Home Win": 1, "Away Win": 2}
total_return_rf_1 = 0

for fixture_id, betting_detail in betted_rows_rf_1.items():
    fixture_results = betting_df.loc[betting_df["fixture_id"] == fixture_id, "real_result"]
    if fixture_results.values[0] == result_code_of_bet[betting_detail["bet"]]:
        total_return_rf_1 += betting_detail["odd"]

# Every bet costs one unit, so profit = total return - number of bets placed.
profit_rf1 = total_return_rf_1 - len(betted_rows_rf_1)
print(f"{total_return_rf_1:.2f} units of earn")
print(f"{profit_rf1:.2f} units of profit")

In [None]:
# Class probabilities predicted by the Random Forest for every test row.
# Stored under an `_rf` name: the previous `predict_proba_dt` name silently
# replaced the Decision Tree probabilities with Random Forest ones.
predict_proba_rf = best_rf.predict_proba(X_test)

# One row per test row: the class probabilities (columns "0", "1", "2"), the
# true and predicted result, the fixture id and the three odds.
betting_df = pd.DataFrame(predict_proba_rf, columns=[f'{i}' for i in range(predict_proba_rf.shape[1])])
betting_df["real_result"] = y_test
betting_df["predicted_result"] = test_predictions
betting_df["fixture_id"] = test_df["fixture_id"].values
betting_df["Odd_Home"] = test_df["1"].values
betting_df["Odd_Away"] = test_df["2"].values
betting_df["Odd_Draw"] = test_df["X"].values

# At most one bet per fixture: walk through the fixture's rows in order and
# bet on the first outcome whose probability exceeds 0.5 (checked in the order
# draw, home win, away win), at the odds quoted on that row.
betted_rows_rf_2 = {}

for fixture_id, match in betting_df.groupby("fixture_id"):
    for id_, row in match.iterrows():
        
        if row["0"] > 0.5:
            betted_rows_rf_2[fixture_id] = {"bet": "Draw", "odd": row["Odd_Draw"]}
            break
        elif row["1"] > 0.5:
            betted_rows_rf_2[fixture_id] = {"bet": "Home Win", "odd": row["Odd_Home"]}
            break
        elif row["2"] > 0.5:
            betted_rows_rf_2[fixture_id] = {"bet": "Away Win", "odd": row["Odd_Away"]}
            break

# A winning bet pays back its odd; a losing bet pays nothing.
total_return_rf_2 = 0

for fixture_id, betting_detail in betted_rows_rf_2.items():

    # The real result is taken from the first row of the fixture.
    real_result = betting_df[betting_df["fixture_id"] == fixture_id]["real_result"].values[0]
    odd = betting_detail["odd"]
    bet = betting_detail["bet"]

    if bet == "Draw" and real_result == 0:
        total_return_rf_2 += odd
    elif bet == "Home Win" and real_result == 1:
        total_return_rf_2 += odd
    elif bet == "Away Win" and real_result == 2:
        total_return_rf_2 += odd
    else:
        continue

# Every bet costs one unit, so profit = total return - number of bets placed.
profit_rf2 = total_return_rf_2 - len(betted_rows_rf_2)
print(f"{total_return_rf_2:.2f} units of earn")
print(f"{profit_rf2:.2f} units of profit")

In [None]:
# Class probabilities predicted by the Random Forest for every test row.
# Stored under an `_rf` name: the previous `predict_proba_dt` name silently
# replaced the Decision Tree probabilities with Random Forest ones.
predict_proba_rf = best_rf.predict_proba(X_test)

# One row per test row: the class probabilities (columns "0", "1", "2"), the
# true and predicted result, the fixture id and the three odds.
betting_df = pd.DataFrame(predict_proba_rf, columns=[f'{i}' for i in range(predict_proba_rf.shape[1])])
betting_df["real_result"] = y_test
betting_df["predicted_result"] = test_predictions
betting_df["fixture_id"] = test_df["fixture_id"].values
betting_df["Odd_Home"] = test_df["1"].values
betting_df["Odd_Away"] = test_df["2"].values
betting_df["Odd_Draw"] = test_df["X"].values

# At most one bet per fixture: walk through the fixture's rows in order and
# bet on the first outcome whose probability exceeds 0.9 (checked in the order
# draw, home win, away win), at the odds quoted on that row.
betted_rows_rf_3 = {}

for fixture_id, match in betting_df.groupby("fixture_id"):
    for id_, row in match.iterrows():
        
        if row["0"] > 0.9:
            betted_rows_rf_3[fixture_id] = {"bet": "Draw", "odd": row["Odd_Draw"]}
            break
        elif row["1"] > 0.9:
            betted_rows_rf_3[fixture_id] = {"bet": "Home Win", "odd": row["Odd_Home"]}
            break
        elif row["2"] > 0.9:
            betted_rows_rf_3[fixture_id] = {"bet": "Away Win", "odd": row["Odd_Away"]}
            break

# A winning bet pays back its odd; a losing bet pays nothing.
total_return_rf_3 = 0

for fixture_id, betting_detail in betted_rows_rf_3.items():

    # The real result is taken from the first row of the fixture.
    real_result = betting_df[betting_df["fixture_id"] == fixture_id]["real_result"].values[0]
    odd = betting_detail["odd"]
    bet = betting_detail["bet"]

    if bet == "Draw" and real_result == 0:
        total_return_rf_3 += odd
    elif bet == "Home Win" and real_result == 1:
        total_return_rf_3 += odd
    elif bet == "Away Win" and real_result == 2:
        total_return_rf_3 += odd
    else:
        continue

# Every bet costs one unit, so profit = total return - number of bets placed.
profit_rf3 = total_return_rf_3 - len(betted_rows_rf_3)
print(f"{total_return_rf_3:.2f} units of earn")
print(f"{profit_rf3:.2f} units of profit")

In [None]:
# Find the best parameters for the XGBoost model
xgb = XGBClassifier(random_state=123)

# Dedicated grid for XGBoost. The `param_grid` still in scope at this point is
# the Random Forest grid; its `min_samples_split`, `min_samples_leaf` and
# `class_weight` keys are not XGBoost hyper-parameters, so searching over them
# only multiplies the number of fits without changing the fitted models.
# The XGBoost-relevant values are kept exactly as they were.
xgb_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10],
    'n_jobs': [-1],
}

# 10-fold cross-validated search scored by accuracy; training-fold scores are kept.
grid_search = GridSearchCV(
    estimator = xgb,
    param_grid = xgb_param_grid,
    scoring = 'accuracy',
    cv = 10,                
    verbose = 1,
    return_train_score = True)

# NOTE(review): as for the other models, the search is fitted on the 30%
# validation split rather than on X_train - confirm that this is intentional.
grid_search.fit(X_val, y_val)

In [None]:
# Best estimator found by the search, together with its hyper-parameters.
best_bt = grid_search.best_estimator_
best_params = grid_search.best_params_

# Class predictions for the full training set and for the test set.
train_predictions, test_predictions = (
    best_bt.predict(features) for features in (X_full_train, X_test)
)

# Share of correctly predicted rows in each set. The "training" figure is
# measured on X_full_train, which contains the rows the model was fitted on.
train_accuracy_bt = accuracy_score(y_full_train, train_predictions)
test_accuracy_bt = accuracy_score(y_test, test_predictions)

print("\n".join([
    f"Best Parameters: {best_params}",
    f"Training Accuracy: {train_accuracy_bt:.4f}",
    f"Test Accuracy: {test_accuracy_bt:.4f}",
]))

In [None]:
# Importance score the fitted boosted-tree model assigns to each input column.
feature_importances = best_bt.feature_importances_

# Pair every score with its column name and rank from most to least important.
feature_importance_df = (
    pd.DataFrame({'Feature': X_val.columns, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
    .reset_index(drop=True)
)

# The fifteen highest-ranked features.
top_features = feature_importance_df.head(15)

top_features

In [None]:
# Horizontal bar chart of the features held in `top_features`.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=top_features,
    x='Importance',
    y='Feature',
    color='blue',
    legend=False,
    ax=ax,
)

ax.set_title('Top 15 Feature Importances')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
fig.tight_layout()
plt.show()

In [None]:
# Target codes and their display names, in matching order (0 = draw,
# 1 = home win, 2 = away win).
class_codes = [0, 1, 2]
class_names = ["Draw", "Home Win", "Away Win"]

# Pass the class order explicitly: without `labels` the matrix only contains
# the classes that occur in the data, so a class that is absent from both the
# targets and the predictions would shift the rows/columns relative to the
# fixed tick labels below.
cm = confusion_matrix(y_test, test_predictions, labels=class_codes)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix of Test Set')
plt.show()

In [None]:
# Class probabilities predicted by the boosted-tree model for every test row.
predict_proba_xgboost = best_bt.predict_proba(X_test)

# One row per test row: the class probabilities (columns "0", "1", "2"), the
# true and predicted result, the fixture id and the three odds.
betting_df = pd.DataFrame(
    predict_proba_xgboost,
    columns=[str(class_position) for class_position in range(predict_proba_xgboost.shape[1])],
).assign(
    real_result=y_test,
    predicted_result=test_predictions,
    fixture_id=test_df["fixture_id"].values,
    Odd_Home=test_df["1"].values,
    Odd_Away=test_df["2"].values,
    Odd_Draw=test_df["X"].values,
)

# (probability column, bet label, odds column), checked in this priority order.
outcome_checks = (
    ("0", "Draw", "Odd_Draw"),
    ("1", "Home Win", "Odd_Home"),
    ("2", "Away Win", "Odd_Away"),
)
min_probability = 0.33

# At most one bet per fixture: walk through the fixture's rows in order and
# take the first outcome whose probability exceeds the threshold, at the odds
# quoted on that row.
betted_rows_bt_1 = {}

for fixture_id, match in betting_df.groupby("fixture_id"):
    first_hit = next(
        (
            {"bet": bet_label, "odd": row[odd_column]}
            for _, row in match.iterrows()
            for probability_column, bet_label, odd_column in outcome_checks
            if row[probability_column] > min_probability
        ),
        None,
    )
    if first_hit is not None:
        betted_rows_bt_1[fixture_id] = first_hit

# A winning bet pays back its odd; a losing bet pays nothing.
result_code_of_bet = {"Draw": 0, "Home Win": 1, "Away Win": 2}
total_return_bt_1 = 0

for fixture_id, betting_detail in betted_rows_bt_1.items():
    fixture_results = betting_df.loc[betting_df["fixture_id"] == fixture_id, "real_result"]
    if fixture_results.values[0] == result_code_of_bet[betting_detail["bet"]]:
        total_return_bt_1 += betting_detail["odd"]

# Every bet costs one unit, so profit = total return - number of bets placed.
profit_bt1 = total_return_bt_1 - len(betted_rows_bt_1)
print(f"{total_return_bt_1:.2f} units of earn")
print(f"{profit_bt1:.2f} units of profit")

In [None]:
# Class probabilities predicted by the boosted-tree model for every test row.
predict_proba_xgboost = best_bt.predict_proba(X_test)

# One row per test row: the class probabilities (columns "0", "1", "2"), the
# true and predicted result, the fixture id and the three odds.
betting_df = pd.DataFrame(
    predict_proba_xgboost,
    columns=[str(class_position) for class_position in range(predict_proba_xgboost.shape[1])],
).assign(
    real_result=y_test,
    predicted_result=test_predictions,
    fixture_id=test_df["fixture_id"].values,
    Odd_Home=test_df["1"].values,
    Odd_Away=test_df["2"].values,
    Odd_Draw=test_df["X"].values,
)

# (probability column, bet label, odds column), checked in this priority order.
outcome_checks = (
    ("0", "Draw", "Odd_Draw"),
    ("1", "Home Win", "Odd_Home"),
    ("2", "Away Win", "Odd_Away"),
)
min_probability = 0.5

# At most one bet per fixture: walk through the fixture's rows in order and
# take the first outcome whose probability exceeds the threshold, at the odds
# quoted on that row.
betted_rows_bt_2 = {}

for fixture_id, match in betting_df.groupby("fixture_id"):
    first_hit = next(
        (
            {"bet": bet_label, "odd": row[odd_column]}
            for _, row in match.iterrows()
            for probability_column, bet_label, odd_column in outcome_checks
            if row[probability_column] > min_probability
        ),
        None,
    )
    if first_hit is not None:
        betted_rows_bt_2[fixture_id] = first_hit

# A winning bet pays back its odd; a losing bet pays nothing.
result_code_of_bet = {"Draw": 0, "Home Win": 1, "Away Win": 2}
total_return_bt_2 = 0

for fixture_id, betting_detail in betted_rows_bt_2.items():
    fixture_results = betting_df.loc[betting_df["fixture_id"] == fixture_id, "real_result"]
    if fixture_results.values[0] == result_code_of_bet[betting_detail["bet"]]:
        total_return_bt_2 += betting_detail["odd"]

# Every bet costs one unit, so profit = total return - number of bets placed.
profit_bt2 = total_return_bt_2 - len(betted_rows_bt_2)
print(f"{total_return_bt_2:.2f} units of earn")
print(f"{profit_bt2:.2f} units of profit")

In [None]:
# Class probabilities predicted by the boosted-tree model for every test row.
predict_proba_xgboost = best_bt.predict_proba(X_test)

# One row per test row: the class probabilities (columns "0", "1", "2"), the
# true and predicted result, the fixture id and the three odds.
betting_df = pd.DataFrame(
    predict_proba_xgboost,
    columns=[str(class_position) for class_position in range(predict_proba_xgboost.shape[1])],
).assign(
    real_result=y_test,
    predicted_result=test_predictions,
    fixture_id=test_df["fixture_id"].values,
    Odd_Home=test_df["1"].values,
    Odd_Away=test_df["2"].values,
    Odd_Draw=test_df["X"].values,
)

# (probability column, bet label, odds column), checked in this priority order.
outcome_checks = (
    ("0", "Draw", "Odd_Draw"),
    ("1", "Home Win", "Odd_Home"),
    ("2", "Away Win", "Odd_Away"),
)
min_probability = 0.9

# At most one bet per fixture: walk through the fixture's rows in order and
# take the first outcome whose probability exceeds the threshold, at the odds
# quoted on that row.
betted_rows_bt_3 = {}

for fixture_id, match in betting_df.groupby("fixture_id"):
    first_hit = next(
        (
            {"bet": bet_label, "odd": row[odd_column]}
            for _, row in match.iterrows()
            for probability_column, bet_label, odd_column in outcome_checks
            if row[probability_column] > min_probability
        ),
        None,
    )
    if first_hit is not None:
        betted_rows_bt_3[fixture_id] = first_hit

# A winning bet pays back its odd; a losing bet pays nothing.
result_code_of_bet = {"Draw": 0, "Home Win": 1, "Away Win": 2}
total_return_bt_3 = 0

for fixture_id, betting_detail in betted_rows_bt_3.items():
    fixture_results = betting_df.loc[betting_df["fixture_id"] == fixture_id, "real_result"]
    if fixture_results.values[0] == result_code_of_bet[betting_detail["bet"]]:
        total_return_bt_3 += betting_detail["odd"]

# Every bet costs one unit, so profit = total return - number of bets placed.
profit_bt3 = total_return_bt_3 - len(betted_rows_bt_3)
print(f"{total_return_bt_3:.2f} units of earn")
print(f"{profit_bt3:.2f} units of profit")

In [None]:
# One entry per (model, threshold) betting run:
# (method name, probability threshold, total return, bets placed per fixture).
betting_runs = [
    ('Decision Tree', 0.33, total_return_dt_1, betted_rows_dt_1),
    ('Decision Tree', 0.5, total_return_dt_2, betted_rows_dt_2),
    ('Decision Tree', 0.9, total_return_dt_3, betted_rows_dt_3),
    ('Random Forest', 0.33, total_return_rf_1, betted_rows_rf_1),
    ('Random Forest', 0.5, total_return_rf_2, betted_rows_rf_2),
    ('Random Forest', 0.9, total_return_rf_3, betted_rows_rf_3),
    ('Gradient Boosted Decision Tree', 0.33, total_return_bt_1, betted_rows_bt_1),
    ('Gradient Boosted Decision Tree', 0.5, total_return_bt_2, betted_rows_bt_2),
    ('Gradient Boosted Decision Tree', 0.9, total_return_bt_3, betted_rows_bt_3),
]
methods, thresholds, total_returns, placed_bets = zip(*betting_runs)

# Column-wise view of the runs; "Total Played" is the number of fixtures bet on.
earn_results = {
    'Method': list(methods),
    'Treshold': list(thresholds),
    'Total Return': list(total_returns),
    'Total Played': [len(bets) for bets in placed_bets],
}

earn_df = pd.DataFrame(earn_results)
# One unit is staked per bet, so net profit = return - number of bets.
earn_df["Net Profit"] = earn_df["Total Return"] - earn_df["Total Played"]

earn_df