In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from tensorflow import keras as k

from utility import *

In [None]:
# Load the data set and split it into predictors and target using the
# project helpers pulled in from `utility`.
data = load_data()
X, y = create_XY(data)

# scikit-learn renamed `sparse` to `sparse_output` in 1.2 and removed the old
# keyword in 1.4. Try the current name first and fall back for old releases so
# the cell runs on either.
try:
    enc = OneHotEncoder(sparse_output=False)
except TypeError:
    enc = OneHotEncoder(sparse=False)
y = enc.fit_transform(y)

# NOTE(review): the imputer is fitted on all rows before the train/test split,
# so test rows influence the fill values - confirm this is acceptable.
X_imputed = SimpleImputer().fit_transform(X)

# LSTM

In [None]:
# Ordered (unshuffled) split, then bring every split into the layout expected
# by the recurrent models via the project helper `reshape_to_inputshape`.
x_train, x_test, y_train, y_test = train_test_split(
    X_imputed,
    y,
    test_size=test_size,
    shuffle=False,
)

x_train, y_train = (
    reshape_to_inputshape(split, trn_ssn) for split in (x_train, y_train)
)
x_test, y_test = (
    reshape_to_inputshape(split, tst_ssn) for split in (x_test, y_test)
)

In [None]:
# Baseline classifier: one LSTM emitting an output per time step, followed by
# a dense head with dropout and a 3-way softmax.
model = k.models.Sequential()
model.add(k.layers.LSTM(64, return_sequences=True))
model.add(k.layers.Dropout(0.4))
model.add(k.layers.Dense(1000, activation="relu"))
model.add(k.layers.Dropout(0.3))
model.add(k.layers.Dense(250, activation="relu"))
model.add(k.layers.Dropout(0.2))
model.add(k.layers.Dense(3, activation="softmax"))

# Calling the model once builds its weights, which summary() requires.
print(model(x_train).shape)
model.summary()

model.compile(
    optimizer=tf.keras.optimizers.Adam(0.01),
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

# Stop early when training accuracy has not improved for 5 epochs.
callback = tf.keras.callbacks.EarlyStopping(patience=5, monitor="accuracy")

model.fit(x_train, y_train, callbacks=[callback], epochs=10)

In [None]:
# Score the fitted model on the held-out split, then call the project helper
# `report2` once for the training split and once for the test split.
# NOTE(review): `report2` comes from `utility`; presumably it prints a
# classification report using `enc` to decode labels - confirm.
model.evaluate(x_test, y_test)
report2(model, x_train, y_train, "train", enc)
report2(model, x_test, y_test, "test", enc)

## Two concatenated LSTMs followed by dense and dropout layers in sequence

In [None]:
# Two LSTM branches of different width read the same input sequence. Their
# per-time-step outputs are concatenated and fed through a dense head with
# dropout that ends in a 3-way softmax.
#
# The input shape (time steps, features) is read from the training tensor
# instead of being hard-coded as (10, 94), so the model keeps matching the
# data if the window length or the feature set changes.
inputs = k.layers.Input(shape=tuple(x_train.shape[1:]))
lstm1 = k.layers.LSTM(100, return_sequences=True, activation="relu")(inputs)
lstm2 = k.layers.LSTM(50, return_sequences=True, activation="relu")(inputs)
concatenated = k.layers.Concatenate()([
    lstm1,
    lstm2
])

out = k.layers.Dropout(0.5)(concatenated)
out = k.layers.Dense(1000, activation="relu")(out)
out = k.layers.Dropout(0.5)(out)
out = k.layers.Dense(250, activation="relu")(out)
out = k.layers.Dropout(0.5)(out)
out = k.layers.Dense(3, activation="softmax")(out)

model = k.models.Model(inputs=inputs, outputs=out)

In [None]:
# Print the layer-by-layer architecture and parameter counts of the
# two-branch LSTM model.
model.summary()

In [None]:
# Train the two-branch model with Adam at a small learning rate; stop once the
# training loss has gone 5 epochs without improving.
model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-4),
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

callback = tf.keras.callbacks.EarlyStopping(patience=5, monitor="loss")

model.fit(x_train, y_train, callbacks=[callback], epochs=500)

In [None]:
# Score the two-branch model on the held-out split, then call the project
# helper `report2` for the training and the test split.
# NOTE(review): `report2` is defined in `utility`; its exact output is not
# visible here.
model.evaluate(x_test, y_test)
report2(model, x_train, y_train, "train", enc)
report2(model, x_test, y_test, "test", enc)

### Fit
loss: 0.9373 - accuracy: 0.5609
### Evaluate
loss: 0.9192 - accuracy: 0.5930
### Classification report

| | precision | recall | f1-score | support |
| - | ----------- | ------ | -------- | ------- |
| A | 0.609 | 0.614 | 0.612 | 345 |
| D | 0.321 | 0.201 | 0.247 | 254 |
| H | 0.652 | 0.763 | 0.704 | 541 |
| accuracy | |  | 0.593 | 1140 |
| macro avg | 0.527 | 0.526 | 0.521 | 1140 |
| weighted avg | 0.565 | 0.593 | 0.574 | 1140 |

## LSTMs in parallel

In [None]:
# Ten LSTM branches of increasing width (10, 20, ..., 100 units) read the same
# input sequence in parallel. Each branch gets its own dropout, the branch
# outputs are concatenated per time step, and a dense head ends in a 3-way
# softmax.
#
# The input shape (time steps, features) is read from the training tensor
# instead of being hard-coded as (10, 94), so the model keeps matching the
# data if the window length or the feature set changes.
inputs = k.layers.Input(shape=tuple(x_train.shape[1:]))
x = []
Tx = 10  # number of parallel LSTM branches (not the number of time steps)
for t in range(Tx):
    module = k.layers.LSTM((t+1)*10, return_sequences=True)(inputs)
    module = k.layers.Dropout(0.7)(module)
    x.append(module)
x = k.layers.Concatenate()(x)

out = k.layers.Dense(1000, activation="relu")(x)
out = k.layers.Dropout(0.7)(out)
out = k.layers.Dense(250, activation="relu")(out)
out = k.layers.Dense(3, activation="softmax")(out)

txLstm = k.models.Model(inputs=inputs, outputs=out)

In [None]:
# Print the layer-by-layer architecture and parameter counts of the
# parallel-LSTM model.
txLstm.summary()

In [None]:
# Train the parallel-LSTM model in mini-batches of 16; stop once training
# accuracy has gone 3 epochs without improving.
txLstm.compile(
    optimizer=tf.keras.optimizers.Adam(1e-4),
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

callback = tf.keras.callbacks.EarlyStopping(patience=3, monitor="accuracy")

txLstm.fit(
    x_train,
    y_train,
    batch_size=16,
    callbacks=[callback],
    epochs=500,
)

In [None]:
# Score the parallel-LSTM model on the held-out split, then call the project
# helper `report2` for the training and the test split.
# NOTE(review): `report2` is defined in `utility`; its exact output is not
# visible here.
txLstm.evaluate(x_test, y_test)
report2(txLstm, x_train, y_train, "train", enc)
report2(txLstm, x_test, y_test, "test", enc)

# GRU

In [None]:
# Same dense head as the baseline LSTM, but with a 100-unit ReLU GRU as the
# recurrent layer.
gru = k.models.Sequential()
gru.add(k.layers.GRU(100, return_sequences=True, activation="relu"))
gru.add(k.layers.Dropout(0.4))
gru.add(k.layers.Dense(1000, activation="relu"))
gru.add(k.layers.Dropout(0.3))
gru.add(k.layers.Dense(250, activation="relu"))
gru.add(k.layers.Dropout(0.2))
gru.add(k.layers.Dense(3, activation="softmax"))

In [None]:
# The Sequential model was created without an input shape, so call it once on
# the training data to build its weights before asking for the summary.
gru(x_train)
gru.summary()

In [None]:
# Train the GRU model with Adam at a small learning rate; stop once the
# training loss has gone 5 epochs without improving.
gru.compile(
    optimizer=tf.keras.optimizers.Adam(1e-4),
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

callback = tf.keras.callbacks.EarlyStopping(patience=5, monitor="loss")

gru.fit(x_train, y_train, callbacks=[callback], epochs=500)

In [None]:
# Score the GRU model on the held-out split, then call the project helper
# `report2` for the training and the test split.
# NOTE(review): `report2` is defined in `utility`; its exact output is not
# visible here.
gru.evaluate(x_test, y_test)
report2(gru, x_train, y_train, "train", enc)
report2(gru, x_test, y_test, "test", enc)

# Time distributed

In [None]:
def time_step(data, label, step=10):
    """Cut ``data`` into overlapping windows and pair each with the next label.

    Window ``i`` holds rows ``i .. i + step - 1`` of ``data``; its target is
    ``label[i + step]``, i.e. the label of the row right after the window.

    Parameters
    ----------
    data : row-sliceable object with a ``shape`` attribute (e.g. ndarray).
    label : indexable sequence aligned with the rows of ``data``.
    step : number of consecutive rows per window.

    Returns
    -------
    tuple of np.ndarray
        The stacked windows and the matching targets. Both are empty when
        ``data`` has no more than ``step`` rows.
    """
    n_windows = data.shape[0] - step
    windows = [data[start:start + step] for start in range(n_windows)]
    targets = [label[start + step] for start in range(n_windows)]
    return np.array(windows), np.array(targets)

In [None]:
def create_dataset(team_name):
    """Collect every match played by ``team_name`` as features and labels.

    Reads the notebook-level ``data`` frame and the ``features`` column list.

    Rows are selected with a single home-or-away mask so they keep the row
    order of ``data``. Concatenating the home subset and the away subset
    instead puts all home matches before all away matches, and the sliding
    windows built by ``time_step`` would then not span consecutive matches.
    NOTE(review): this relies on ``data`` itself being in chronological
    order - confirm against ``load_data``.

    Parameters
    ----------
    team_name : value compared with the ``HomeTeam`` and ``AwayTeam`` columns.

    Returns
    -------
    tuple
        ``(team_data_featured, team_data_label)``: ``pd.get_dummies`` applied
        to the ``features`` columns, and the ``FTR`` column, row-aligned.
    """
    played = (data['HomeTeam'] == team_name) | (data['AwayTeam'] == team_name)
    # Work on a copy so adding the 'Team' column never touches `data`.
    team_data = data.loc[played].copy()
    team_data['Team'] = team_name
    team_data_label = team_data['FTR']
    team_data_featured = pd.get_dummies(team_data[features])
    return team_data_featured, team_data_label

In [None]:
# Build sliding-window samples for a single team.
team_name = 'Chelsea'
data_f, data_l = create_dataset(team_name)
# Fill missing feature values (SimpleImputer defaults); the result is an array.
data_f = SimpleImputer().fit_transform(data_f)
# NOTE(review): the `_ars` suffix does not match the selected team ('Chelsea').
x_ars, y_ars = time_step(data_f, data_l.to_numpy())

print(x_ars.shape)
print(y_ars.shape)

# One-hot encode the targets; this re-fits the shared `enc` on this team's
# labels only.
y_ars = enc.fit_transform(y_ars.reshape(-1,1))
x_train, x_test, y_train, y_test = train_test_split(x_ars, y_ars, shuffle=False, test_size=test_size)

Tx = x_train.shape[1]  # window length: time steps per sample
Ty = y_train.shape[1]  # width of the one-hot target (number of classes), not time steps

In [None]:
# Window classifier: a GRU over the window, flattened across time steps, then
# a small funnel of dense layers ending in a 3-way softmax.
model = k.models.Sequential()
model.add(k.layers.GRU(128, dropout=0.5, return_sequences=True))
model.add(k.layers.Flatten())
model.add(k.layers.Dense(40, activation="relu"))
model.add(k.layers.Dropout(0.4))
model.add(k.layers.Dense(20, activation="relu"))
model.add(k.layers.Dropout(0.2))
model.add(k.layers.Dense(10, activation="relu"))
model.add(k.layers.Dense(3, activation="softmax"))

In [None]:
# Configure training, then call the model once so its weights are built
# before printing the summary.
model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-4),
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
model(x_train)
model.summary()

In [None]:
# Train for up to 1000 epochs; stop once the training loss has gone 5 epochs
# without improving.
callback = tf.keras.callbacks.EarlyStopping(patience=5, monitor="loss")

model.fit(x_train, y_train, callbacks=[callback], epochs=1000)

In [None]:
# Score the window classifier on the held-out split, then call the project
# helper `report1` for the training and the test split.
# NOTE(review): `report1` is defined in `utility`; how it differs from
# `report2` is not visible here.
model.evaluate(x_test, y_test)

report1(model, x_train, y_train, "train", enc)
report1(model, x_test, y_test, "test", enc)

In [None]:
# Build sliding-window samples for every team and stack them into one data set.
step = 10  # window length; also the time axis of the final reshape below

teams = set(data['HomeTeam'])

# Fit the label encoder once on all results so every team is encoded with the
# same one-hot columns, even a team whose windows never contain one outcome.
enc.fit(data[['FTR']].to_numpy())

x_parts, y_parts = [], []
# Iterate in sorted order: the split below is unshuffled, so set iteration
# order would otherwise change which teams end up in the test split.
for team in sorted(teams):
    data_f, data_l = create_dataset(team)
    x_team, y_team = time_step(data_f, data_l.to_numpy(), step)
    if len(x_team) == 0:
        # Not enough matches for a single window; nothing to add.
        continue
    y_team = enc.transform(y_team.reshape(-1, 1))
    x_parts.append(pd.DataFrame(x_team.reshape(x_team.shape[0], -1)))
    y_parts.append(pd.DataFrame(y_team))

# Concatenate once instead of growing the frames inside the loop.
# NOTE(review): if `create_dataset` yields a different number of dummy columns
# per team, the flattened windows differ in width and concat pads with NaN,
# which the reshape below would then misalign - confirm the column sets match.
df_x = pd.concat(x_parts)
df_y = pd.concat(y_parts)
df_x = SimpleImputer().fit_transform(pd.get_dummies(df_x))
df_x = df_x.reshape(df_x.shape[0], step, -1)
print(df_y.shape)
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, shuffle=False, test_size=test_size)

# LSTM timestep = features

In [None]:
# Predictor columns for the "one feature per time step" LSTM; going by the
# list name, these exclude the team identity columns.
features_not_teams = ['HTeamEloScore', 'ATeamEloScore', 
            'HTdaysSinceLastMatch', 'ATdaysSinceLastMatch', 
            'HTW_rate', 'ATW_rate', 'ATD_rate', 'HTD_rate', 
            '7_HTW_rate', '12_HTW_rate', '7_ATW_rate', '12_ATW_rate', 
            '7_HTD_rate', '12_HTD_rate', '7_ATD_rate', '12_ATD_rate',
            '7_HTL_rate', '12_HTL_rate', '7_ATL_rate', '12_ATL_rate',
            '5_HTHW_rate', '5_ATAW_rate']

X = pd.get_dummies(data[features_not_teams]).to_numpy()
# Selecting with a list keeps the column axis, so this is already
# (n_samples, 1) and needs no ravel/reshape round trip.
y = data[['FTR']].to_numpy()
# scikit-learn renamed `sparse` to `sparse_output` in 1.2 and removed the old
# keyword in 1.4. Try the current name first and fall back for old releases.
try:
    enc = OneHotEncoder(sparse_output=False)
except TypeError:
    enc = OneHotEncoder(sparse=False)
y = enc.fit_transform(y)
X_imputed = SimpleImputer().fit_transform(X)

In [None]:
def reshape_features(input: np.ndarray):
    """Append a trailing axis of length 1 to a 2-D array.

    An array of shape ``(rows, cols)`` becomes ``(rows, cols, 1)``.
    """
    target_shape = (input.shape[0], input.shape[1], 1)
    return np.reshape(input, target_shape)

In [None]:
# Ordered (unshuffled) split of the imputed features, then add the trailing
# axis so that each feature becomes one time step with a single value.
x_train, x_test, y_train, y_test = train_test_split(
    X_imputed,
    y,
    test_size=test_size,
    shuffle=False,
)

x_train, x_test = reshape_features(x_train), reshape_features(x_test)

# Shape check for all four splits.
print("x_train:", x_train.shape)
print("x_test:", x_test.shape)
print("y_train:", y_train.shape)
print("y_test:", y_test.shape)

In [None]:
# Minimal model: a single LSTM that returns only its last output, followed by
# a 3-way softmax.
model = k.models.Sequential()
model.add(k.layers.LSTM(64))
model.add(k.layers.Dense(3, activation="softmax"))

# Call once to build the weights so summary() can run.
model(x_train)
model.summary()

model.compile(
    optimizer=tf.keras.optimizers.Adam(0.01),
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

# Stop early when the training loss has gone 5 epochs without improving.
callback = tf.keras.callbacks.EarlyStopping(patience=5, monitor="loss")

model.fit(x_train, y_train, callbacks=[callback], epochs=10)

In [None]:
# Call the project helper `report1` for the training and the test split.
# NOTE(review): `report1` is defined in `utility`; its exact output is not
# visible here.
report1(model, x_train, y_train, 'train', enc)
report1(model, x_test, y_test, 'test', enc)