# Import e funzioni ausiliarie

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import SGDClassifier, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

from tensorflow import keras as k

import tensorflow as tf
import pandas as pd
import numpy as np

# %pip install d2l==1.0.0a1.post0
from d2l import tensorflow as d2l


In [2]:
data = pd.read_csv('./input/dataset.csv')
# dataImputed = pd.read_csv('./input/datasetImputed.csv')

In [3]:
def reshape_to_inputshape(a_prev,season):
    # e.g. (a_prev=x_train, season=trn_ssn)
    totalMatches = len(season)*38
    input_step = int(a_prev.shape[0]/totalMatches)
    prev_f = a_prev.shape[1]
    return np.reshape(a_prev, (totalMatches, input_step, prev_f))

In [4]:
def encode(x):
    if x == 0:
        v = [1,0,0]
    elif x == 1:
        v = [0,1,0]
    elif x == 2:
        v = [0,0,1]
    return v

def encode_list(l):
    r = []
    for i in l:
        r.append(encode(i))
    return r

In [5]:
def report(model, x_train, y_train, x_test, y_test, encoder):
    # TRAIN
    y_pred_train = model.predict(x_train)
    y_predm_train = np.asarray(y_pred_train)
    y_predm_train = np.argmax(y_predm_train, axis=1)
    y_predm_train = encode_list(y_predm_train.reshape(-1))
    y_predm_train = np.array(y_predm_train)

    y_trainm = np.argmax(y_train, axis=1)
    y_trainm = encode_list(y_trainm.reshape(-1))
    y_trainm = np.array(y_trainm)

    # Inverse One-hot transform - TRAIN
    y_predm_train = encoder.inverse_transform(y_predm_train)
    y_trainm = encoder.inverse_transform(y_trainm)

    # TEST
    y_pred = model.predict(x_test)
    y_predm = np.asarray(y_pred)
    y_predm = np.argmax(y_predm, axis=1)
    y_predm = encode_list(y_predm.reshape(-1))
    y_predm = np.array(y_predm)
    
    y_testm = np.argmax(y_test, axis=1)
    y_testm = encode_list(y_testm.reshape(-1))
    y_testm = np.array(y_testm)

    # Inverse One-hot transform - TEST
    y_predm = encoder.inverse_transform(y_predm)
    y_testm = encoder.inverse_transform(y_testm)

    #Model Metrics test
    print("TEST REPORT")
    print(classification_report(y_testm, y_predm, digits=3))
    print("--------------------------------------------------")

    #Model Metrics train
    print("TRAINING REPORT")
    print(classification_report(y_trainm, y_predm_train, digits=3))
    print("--------------------------------------------------")

# Preprocessamento dati

In [6]:
features = ['HomeTeam', 'AwayTeam', 
            'HTeamEloScore', 'ATeamEloScore', 
            'HTdaysSinceLastMatch', 'ATdaysSinceLastMatch', 
            'HTW_rate', 'ATW_rate', 'ATD_rate', 'HTD_rate', 
            '7_HTW_rate', '12_HTW_rate', '7_ATW_rate', '12_ATW_rate', 
            '7_HTD_rate', '12_HTD_rate', '7_ATD_rate', '12_ATD_rate',
            '7_HTL_rate', '12_HTL_rate', '7_ATL_rate', '12_ATL_rate',
            '5_HTHW_rate', '5_ATAW_rate']

X = pd.get_dummies(data[features])
print(X)
print(type(X))

# Se non cambiamo nulla il OneHotEncoder assegna:
# A -> 1 0 0
# D -> 0 1 0
# H -> 0 0 1
y = data[['FTR']].to_numpy().ravel().reshape(-1, 1)
enc = OneHotEncoder(sparse=False)
y = enc.fit_transform(y)
X_imputed = SimpleImputer().fit_transform(X)


      HTeamEloScore  ATeamEloScore  HTdaysSinceLastMatch  \
0         10.000000     -10.000000                   NaN   
1         10.000000     -10.000000                   NaN   
2        -10.000000      10.000000                   NaN   
3         10.000000     -10.000000                   NaN   
4         10.000000     -10.000000                   NaN   
...             ...            ...                   ...   
4175    -160.435965     -50.589179                   8.0   
4176     -19.797491     115.180619                   6.0   
4177     238.315260     -19.773266                   8.0   
4178      99.808261    -128.614330                   7.0   
4179     -70.454854    -214.666430                   8.0   

      ATdaysSinceLastMatch  HTW_rate  ATW_rate  ATD_rate  HTD_rate  \
0                      NaN       NaN       NaN       NaN       NaN   
1                      NaN       NaN       NaN       NaN       NaN   
2                      NaN       NaN       NaN       NaN       NaN   

In [7]:
trn_ssn = [2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015]
trn_ssn_len = len(trn_ssn)
tst_ssn = [2016,2017,2018] 
tst_ssn_len = len(tst_ssn)

test_size = float(tst_ssn_len)/(tst_ssn_len+trn_ssn_len)

#Split X and Y into training and Test Sets
x_train, x_test, y_train, y_test = train_test_split(X_imputed, y, shuffle=False, test_size=test_size)

# RandomForestClassifier

In [8]:
forest = RandomForestClassifier(n_estimators=2, random_state=2)
forest = forest.fit(x_train, y_train)


#Forest Model Metrics
print("Forest Classifier")
print("Train Score: ", forest.score(x_train, y_train))
print("Test Score: ", forest.score(x_test, y_test))

Forest Classifier
Train Score:  0.6302631578947369
Test Score:  0.22017543859649122


In [9]:
n = 10
m = 5
max_depth = 10
forests = []
grid = [{"n_estimators": list(range(1, n)), "random_state": list(range(0, m)), "max_depth": list(range(1, max_depth))}]
gridSearch = GridSearchCV(RandomForestClassifier(), param_grid=grid, n_jobs=10, return_train_score=True)
gridSearch.fit(x_train, y_train)

print("Forest Classifiers Best Score: ", gridSearch.best_score_)
print("Forest Classifiers Best Params: ", gridSearch.best_params_)
print("Forest Classifiers Best Params: ", gridSearch.best_estimator_)

KeyboardInterrupt: 

# Neural Network

In [10]:
y_nn = ((data[['ordinalHR']]).to_numpy()*2)
#Split X and Y into training and Test Sets
# y_nn = OneHotEncoder(sparse=True).fit_transform(y_nn)
x_train_nn, x_test_nn, y_train_nn, y_test_nn = train_test_split(X_imputed, y_nn, shuffle=True)

In [11]:
nn = k.models.Sequential([
    k.layers.Flatten(),
    k.layers.Dense(41, activation='relu'),
    k.layers.Dense(75, activation='relu'),
    k.layers.Dropout(0.3),
    k.layers.Dense(3, activation='softmax'),
])

Metal device set to: Apple M1 Pro

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB



2022-11-23 20:45:52.336060: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-11-23 20:45:52.336208: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [12]:
learning_rate=0.01

nn.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=k.optimizers.Adam(learning_rate),
    metrics=['accuracy']
)
nn(x_train_nn)
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten (Flatten)           (3135, 94)                0         
                                                                 
 dense (Dense)               (3135, 41)                3895      
                                                                 
 dense_1 (Dense)             (3135, 75)                3150      
                                                                 
 dropout (Dropout)           (3135, 75)                0         
                                                                 
 dense_2 (Dense)             (3135, 3)                 228       
                                                                 
Total params: 7,273
Trainable params: 7,273
Non-trainable params: 0
_________________________________________________________________


In [13]:
epochs=10
batch_size=50
nn.fit(x_train_nn, y_train_nn, epochs=epochs, batch_size=batch_size)

Epoch 1/10


2022-11-23 20:45:53.342600: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-11-23 20:45:53.621454: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2acfba910>

In [14]:
print(nn.evaluate(x_test_nn, y_test_nn))



2022-11-23 20:45:59.983298: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


[0.9098030924797058, 0.5933014154434204]


# Lstm preprocessing 😍

In [15]:
def time_step(data, label, step=10):
    x,y = [],[]
    for i in range(data.shape[0]-(step+1)):
        x.append(data[i:i+step])
        y.append(label[i+step+1])
    return np.array(x),np.array(y)

In [16]:
def create_dataset(team_name):
    ht = data.loc[data['HomeTeam']==team_name]
    at = data.loc[data['AwayTeam']==team_name]
    team_data = pd.concat([ht,at])
    team_data_label = team_data['FTR']
    team_data_featured = pd.get_dummies(team_data[features])
    return team_data_featured,team_data_label

In [17]:
team_name = 'Chelsea'
data_f, data_l = create_dataset(team_name)
data_f = SimpleImputer().fit_transform(data_f)
x_ars, y_ars = time_step(data_f, data_l.to_numpy())
print(x_ars.shape)
print(y_ars.shape)

(407, 10, 94)
(407, 11)


In [18]:
y_ars = enc.fit_transform(y_ars.reshape(-1,1))

In [19]:
x_train_lstm, x_test_lstm, y_train_lstm, y_test_lstm = train_test_split(x_ars, y_ars, shuffle=False, test_size=test_size)

ValueError: Found input variables with inconsistent numbers of samples: [407, 4477]

In [None]:
Tx = x_train_lstm.shape[1] #Time steps
Ty = y_train_lstm.shape[1] #Time Steps

# LSTM

In [None]:
model_path = './models/lstm/'
# Check se il modello è già salvato
model = k.models.load_model(model_path)
if model == None:
    model = k.models.Sequential(
        [
            k.layers.LSTM(64, return_sequences=True),
            k.layers.Dropout(0.4),
            k.layers.Dense(1000, activation="relu"),
            k.layers.Dropout(0.3),
            k.layers.Dense(250, activation="relu"),
            k.layers.Dropout(0.2),
            k.layers.Dense(3, activation="softmax")
        ]
    )

    model.compile(
        loss='categorical_crossentropy',
        optimizer=tf.keras.optimizers.Adam(0.0001),
        metrics=["accuracy"]
    )

    callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5)

    model.fit(x_train_lstm, y_train_lstm, epochs=10, callbacks=[callback])
    model.save('./models/lstm')

In [None]:
model.evaluate(x_test_lstm, y_test_lstm)
report(model, x_train_lstm, y_train_lstm, x_test_lstm, y_test_lstm, enc)

## 2 LSTM concatenate + dense e dropout in sequenza

In [None]:
inputs = k.layers.Input(shape=(10,94))
lstm1 = k.layers.LSTM(100, return_sequences=True, activation="relu")(inputs)
lstm2 = k.layers.LSTM(50, return_sequences=True, activation="relu")(inputs)
concateneted = k.layers.Concatenate()([
    lstm1,
    lstm2
])

out = k.layers.Dropout(0.5)(concateneted)
out = k.layers.Dense(1000, activation="relu")(out)
out = k.layers.Dropout(0.5)(out)
out = k.layers.Dense(250, activation="relu")(out)
out = k.layers.Dropout(0.5)(out)
out = k.layers.Dense(3, activation="softmax")(out)

model = k.models.Model(inputs=inputs, outputs=out)

In [None]:
model.summary()

In [None]:
model.compile(
    loss='categorical_crossentropy',
    optimizer=tf.keras.optimizers.Adam(0.0001),
    metrics=["accuracy"]
)

callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5)

model.fit(x_train_lstm, y_train_lstm, epochs=500, callbacks=[callback])

In [None]:
model.evaluate(x_test_lstm, y_test_lstm)
report(model, x_train_lstm, y_train_lstm, x_test_lstm, y_test_lstm, enc)

### Fit
loss: 0.9373 - accuracy: 0.5609
### Evaluate
loss: 0.9192 - accuracy: 0.5930
### Classification report

| | precision | recall | f1-score | support |
| - | ----------- | ------ | -------- | ------- |
| A | 0.609 | 0.614 | 0.612 | 345 |
| D | 0.321 | 0.201 | 0.247 | 254 |
| H | 0.652 | 0.763 | 0.704 | 541 |
| accuracy | |  | 0.593 | 1140 |
| macro avg | 0.527 | 0.526 | 0.521 | 1140 |
| weighted avg | 0.565 | 0.593 | 0.574 | 1140 |

## LSTM in parallelo

In [None]:
inputs = k.layers.Input(shape=(10,94))
x = []
for t in range(Tx):
    module = k.layers.LSTM((t+1)*10, return_sequences=True)(inputs)
    module = k.layers.Dropout(0.7)(module)
    x.append(module)
x = k.layers.Concatenate()(x)

out = k.layers.Dense(1000, activation="relu")(x)
out = k.layers.Dropout(0.7)(out)
out = k.layers.Dense(250, activation="relu")(out)
out = k.layers.Dense(3, activation="softmax")(out)

txLstm = k.models.Model(inputs=inputs, outputs=out)

In [None]:
txLstm.summary()

In [None]:
txLstm.compile(
    loss='categorical_crossentropy',
    optimizer=tf.keras.optimizers.Adam(0.0001),
    metrics=["accuracy"]
)

callback = tf.keras.callbacks.EarlyStopping(monitor='accuracy', patience=3)

txLstm.fit(x_train_lstm, y_train_lstm, epochs=500, callbacks=[callback], batch_size=16)

In [None]:
txLstm.evaluate(y_test_lstm, y_test_lstm)
report(txLstm, x_train_lstm, y_train_lstm, x_test_lstm, y_test_lstm, enc)

# GRU

In [None]:
gru = k.models.Sequential([
    k.layers.GRU(100, return_sequences=True, activation="relu"),
    k.layers.Dropout(0.4),
    k.layers.Dense(1000, activation="relu"),
    k.layers.Dropout(0.3),
    k.layers.Dense(250, activation="relu"),
    k.layers.Dropout(0.2),
    k.layers.Dense(3, activation="softmax")
])

In [None]:
gru(x_train_lstm)
gru.summary()

In [None]:
gru.compile(
    loss='categorical_crossentropy',
    optimizer=tf.keras.optimizers.Adam(0.0001),
    metrics=["accuracy"]
)

callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5)

gru.fit(x_train_lstm, y_train_lstm, epochs=500, callbacks=[callback])

In [None]:
gru.evaluate(x_test_lstm, y_test_lstm)
report(gru, x_train_lstm, y_train_lstm, x_test_lstm, y_test_lstm, enc)

# Time distributed

In [None]:
model = k.models.Sequential(
    [
        k.layers.GRU(128, dropout=0.5, return_sequences=True),
        k.layers.Flatten(),
        k.layers.Dense(40, activation="relu"),
        k.layers.Dropout(0.4),
        k.layers.Dense(20, activation="relu"),
        k.layers.Dropout(0.2),
        k.layers.Dense(10, activation="relu"),
        k.layers.Dense(3, activation="softmax")
    ]
)

In [None]:
model.compile(
    loss='categorical_crossentropy',
    optimizer=tf.keras.optimizers.Adam(0.0001),
    metrics=["accuracy"]
)
model(x_train_lstm)
model.summary()

Model: "sequential_24"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 gru_6 (GRU)                 (296, 10, 128)            86016     
                                                                 
 flatten_5 (Flatten)         (296, 1280)               0         
                                                                 
 dense_55 (Dense)            (296, 40)                 51240     
                                                                 
 dropout_24 (Dropout)        (296, 40)                 0         
                                                                 
 dense_56 (Dense)            (296, 20)                 820       
                                                                 
 dropout_25 (Dropout)        (296, 20)                 0         
                                                                 
 dense_57 (Dense)            (296, 10)               

In [None]:
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=10)

model.fit(x_train_lstm, y_train_lstm, epochs=1000, callbacks=[callback])

Epoch 1/1000


2022-11-23 19:28:06.185058: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-11-23 19:28:06.362577: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


 1/10 [==>...........................] - ETA: 14s - loss: 1.1049 - accuracy: 0.2812

2022-11-23 19:28:06.757084: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000


<keras.callbacks.History at 0x411e29160>

In [None]:
model.evaluate(x_test_lstm, y_test_lstm)
report(model, x_train_lstm, y_train_lstm, x_test_lstm, y_test_lstm, enc)



2022-11-23 19:28:12.045044: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-11-23 19:28:12.099787: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


 1/10 [==>...........................] - ETA: 2s

2022-11-23 19:28:12.513809: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-11-23 19:28:12.556108: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


TEST REPORT
              precision    recall  f1-score   support

           A      0.541     0.983     0.698        60
           D      0.000     0.000     0.000        21
           H      1.000     0.033     0.065        30

    accuracy                          0.541       111
   macro avg      0.514     0.339     0.254       111
weighted avg      0.563     0.541     0.395       111

--------------------------------------------------
TRAINING REPORT
              precision    recall  f1-score   support

           A      0.453     0.565     0.503        69
           D      0.500     0.016     0.030        64
           H      0.654     0.834     0.733       163

    accuracy                          0.595       296
   macro avg      0.536     0.472     0.422       296
weighted avg      0.574     0.595     0.528       296

--------------------------------------------------
