In [63]:
# ============================================================
# Notebook setup
# ============================================================

%load_ext autoreload
%autoreload 2

figsize=(14, 4)

random_state = 42   

from utils.data_visualizator import plot_confusion_matrix

from utils import DataAggregator

import tensorflow as tf
import tensorflow.keras as keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Input, Concatenate
from keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.losses import MeanSquaredError, SparseCategoricalCrossentropy

from sklearn.preprocessing import StandardScaler

import pandas as pd

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [33]:
data_aggregator = DataAggregator()

In [34]:
df = data_aggregator.get_data(["E0"])
df = data_aggregator.format_date(df, "Date")
df = data_aggregator.encode_result(df,
                        mapping={"H": 1, "D": 0, "A": -1}, 
                        result_column="FTR")
df = data_aggregator.create_form_data(df, form_window=5)
df.drop(columns=["Div", "FTHG", "FTAG", "HTHG", "HTAG", "HTR", "Referee", "HS", "AS", "HST", "AST", "HF", "AF", "HC","AC","HY","AY","HR","AR"], inplace=True)
df = data_aggregator.one_hot_encode_teams(df, "HomeTeam", "AwayTeam")

In [35]:
target = "FTR"

df_train = df[df["Date"] < "2023-07-01"]
df_test = df[df["Date"] >= "2023-07-01"]
df_test = df_test[df_test["Date"] < "2024-07-01"]

X_train = df_train.drop(columns=[target, "Date", "FTR"])
y_train = df_train[target]
X_test = df_test.drop(columns=[target, "Date", "FTR"])
y_test = df_test[target]

In [36]:
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
input_dim = X_train.shape[1]

## Classifier
First of all, we start by creating a neural network that will act as a classifier. We will use it to classify the matches into categorical "Win", "Draw" and "Loss".

In [80]:
def create_classifier(input_dim):
    model = Sequential([
        Dense(64, activation='relu', input_dim=input_dim),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dropout(0.2),
        Dense(3, activation='softmax')
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [81]:
y_train_cls = to_categorical(y_train.map({-1: 0, 0: 1, 1: 2}))  # Map to [0, 1, 2] classes
y_test_cls = to_categorical(y_test.map({-1: 0, 0: 1, 1: 2}))

In [82]:
classifier = create_classifier(input_dim)
print(classifier.summary())

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


None


In [83]:
history = classifier.fit(X_train, y_train_cls, validation_split=0.2, epochs=50, batch_size=32, verbose=1)

Epoch 1/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.4506 - loss: 1.1267 - val_accuracy: 0.5495 - val_loss: 0.9680
Epoch 2/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 681us/step - accuracy: 0.5185 - loss: 0.9803 - val_accuracy: 0.5714 - val_loss: 0.9376
Epoch 3/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 667us/step - accuracy: 0.5651 - loss: 0.9283 - val_accuracy: 0.5783 - val_loss: 0.9248
Epoch 4/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 664us/step - accuracy: 0.5744 - loss: 0.9087 - val_accuracy: 0.5783 - val_loss: 0.9213
Epoch 5/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 658us/step - accuracy: 0.5908 - loss: 0.8871 - val_accuracy: 0.5771 - val_loss: 0.9235
Epoch 6/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 655us/step - accuracy: 0.5880 - loss: 0.8910 - val_accuracy: 0.5808 - val_loss: 0.9199
Epoch 7/50
[1m200

In [84]:
y_pred_cls = classifier.predict(X_test)
mapped_predictions_cls = y_pred_cls.argmax(axis=1) - 1

[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 


In [85]:
cls_df = pd.concat([df_test[[target, "B365H", "B365D", "B365A"]].reset_index(drop=True), pd.Series(mapped_predictions_cls, name="PredictedOutcome")], axis=1)

cls_accuracy, cls_won = data_aggregator.calculate_accuracy(cls_df, target, "PredictedOutcome")

print(f"""The accuracy of the model is {cls_accuracy:.2%}
The accuracy of the model is {cls_accuracy*len(cls_df):.0f} out of {len(cls_df)} games.
""")
print(f"With this model, the expected return on value would be {cls_won:.2f}€")

The accuracy of the model is 62.63%
The accuracy of the model is 238 out of 380 games.

With this model, the expected return on value would be 7791.50€


## Regressor
Next up we will define a neural network as a regressor, and will use it to predict the goal difference of a match.

In [88]:
def create_regressor(input_dim):
    model = Sequential([
        Input(shape=(input_dim,)),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dropout(0.2),
        Dense(1)
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])
    return model

In [89]:
regressor = create_regressor(input_dim)
print(regressor.summary())

None


In [90]:
history = regressor.fit(X_train, y_train, validation_split=0.2, epochs=50, batch_size=32, verbose=1)

Epoch 1/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 1.3684 - mae: 0.9254 - val_loss: 0.6731 - val_mae: 0.7039
Epoch 2/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 744us/step - loss: 0.6622 - mae: 0.6912 - val_loss: 0.6220 - val_mae: 0.6820
Epoch 3/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 684us/step - loss: 0.5877 - mae: 0.6533 - val_loss: 0.5989 - val_mae: 0.6714
Epoch 4/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 663us/step - loss: 0.5568 - mae: 0.6334 - val_loss: 0.5993 - val_mae: 0.6603
Epoch 5/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 646us/step - loss: 0.5317 - mae: 0.6168 - val_loss: 0.5826 - val_mae: 0.6499
Epoch 6/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 650us/step - loss: 0.5357 - mae: 0.6198 - val_loss: 0.5745 - val_mae: 0.6397
Epoch 7/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7

In [91]:
def map_result(predicitions):
    return [1 if round(p) >= 1 else -1 if round(p) <= -1 else 0 for p in predicitions]

y_pred = regressor.predict(X_test).flatten()

mapped_predictions = map_result(y_pred)

[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 


In [92]:
regr_df = pd.concat([df_test[[target, "B365H", "B365D", "B365A"]].reset_index(drop=True), pd.Series(mapped_predictions, name="PredictedOutcome")], axis=1)

regr_accuracy, regr_won = data_aggregator.calculate_accuracy(regr_df, target, "PredictedOutcome")

print(f"""The accuracy of the model is {regr_accuracy:.2%}
The accuracy of the model is {regr_accuracy*len(regr_df):.0f} out of {len(regr_df)} games.
""")
print(f"With this model, the expected return on value would be {regr_won:.2f}€")

The accuracy of the model is 45.79%
The accuracy of the model is 174 out of 380 games.

With this model, the expected return on value would be 5861.80€


## Hybrid approach
Finally, we will implement a hybrid variant, where we will take the output of the regressor, i.e., the predicted goal difference, and then use it as a input to a classifier NN in order to predict the results using another NN.

In [93]:
def create_hybrid_model(input_dim):
    inputs = Input(shape=(input_dim,), name="main_input")
    
    regressor = Dense(64, activation="relu", name="regressor_hidden_1")(inputs)
    regressor = Dropout(0.2)(regressor)
    regressor = Dense(32, activation="relu", name="regressor_hidden_2")(regressor)
    gd_output = Dense(1, activation="linear", name="goal_difference_output")(regressor)  # Regression output

    concatenated = Concatenate(name="concat_layer")([inputs, gd_output])
    
    classifier = Dense(64, activation="relu", name="classifier_hidden_1")(concatenated)
    classifier = Dropout(0.2)(classifier)
    classifier = Dense(32, activation="relu", name="classifier_hidden_2")(classifier)
    outcome_output = Dense(3, activation="softmax", name="match_outcome_output")(classifier)  # Classification output

    model = Model(inputs=inputs, outputs=[gd_output, outcome_output])
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss={
            "goal_difference_output": MeanSquaredError(),
            "match_outcome_output": SparseCategoricalCrossentropy()
        },
        metrics={
            "goal_difference_output": "mae",
            "match_outcome_output": "accuracy"
        }
    )
    return model

In [94]:
hybrid_model = create_hybrid_model(input_dim)
print(hybrid_model.summary())

None


In [95]:
history = hybrid_model.fit(
    X_train,
    {"goal_difference_output": y_train, "match_outcome_output": y_train.map({-1: 0, 0: 1, 1: 2})},
    validation_split=0.2,
    epochs=50,
    batch_size=32,
    verbose=1
)

Epoch 1/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - goal_difference_output_loss: 0.9063 - goal_difference_output_mae: 0.7899 - loss: 1.9898 - match_outcome_output_accuracy: 0.4654 - match_outcome_output_loss: 1.0835 - val_goal_difference_output_loss: 0.6444 - val_goal_difference_output_mae: 0.6836 - val_loss: 1.6088 - val_match_outcome_output_accuracy: 0.5583 - val_match_outcome_output_loss: 0.9640
Epoch 2/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - goal_difference_output_loss: 0.5976 - goal_difference_output_mae: 0.6549 - loss: 1.5428 - match_outcome_output_accuracy: 0.5488 - match_outcome_output_loss: 0.9452 - val_goal_difference_output_loss: 0.6554 - val_goal_difference_output_mae: 0.6717 - val_loss: 1.6074 - val_match_outcome_output_accuracy: 0.5702 - val_match_outcome_output_loss: 0.9513
Epoch 3/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - goal_difference_output_loss: 0.5646 

In [96]:
y_pred_gd, y_pred_cls = hybrid_model.predict(X_test)

mapped_predictions_gd = map_result(y_pred_gd.flatten())
mapped_predictions_cls = y_pred_cls.argmax(axis=1) - 1

[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


In [97]:
regr_accuracy, regr_won = data_aggregator.calculate_accuracy(regr_df, target, "PredictedOutcome")
print("Regression predictions:")
print(f"""The accuracy of the model is {regr_accuracy:.2%}
The accuracy of the model is {regr_accuracy*len(regr_df):.0f} out of {len(regr_df)} games.
""")
print(f"With this model, the expected return on value would be {regr_won:.2f}€")

Regression predictions:
The accuracy of the model is 45.79%
The accuracy of the model is 174 out of 380 games.

With this model, the expected return on value would be 5861.80€


In [98]:
cls_accuracy, cls_won = data_aggregator.calculate_accuracy(cls_df, target, "PredictedOutcome")
print("Classification predictions:")
print(f"""The accuracy of the model is {cls_accuracy:.2%}
The accuracy of the model is {cls_accuracy*len(cls_df):.0f} out of {len(cls_df)} games.
""")
print(f"With this model, the expected return on value would be {cls_won:.2f}€")

Classification predictions:
The accuracy of the model is 62.63%
The accuracy of the model is 238 out of 380 games.

With this model, the expected return on value would be 7791.50€


Well, as we can see from the output above, the model actually just ignores the predicted goal difference feature. Meaning that the dataset itself is good enough to predict the matches with approx 62% accuracy. In order to achieve better result, we should append more features, and maybe include more form data such as 3 game form, 5 game form, 10 game form, the form against a similar opponent previous...

In [100]:
data_aggregator.save_metrics("nn_cls", cls_accuracy, cls_won)
data_aggregator.save_metrics("nn_regr", regr_accuracy, regr_won)