In [71]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import SGDClassifier, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

from tensorflow import keras as k

import tensorflow as tf
import pandas as pd
import numpy as np

# %pip install d2l==1.0.0a1.post0
from d2l import tensorflow as d2l


In [72]:
data = pd.read_csv('./input/dataset.csv')
# dataImputed = pd.read_csv('./input/datasetImputed.csv')

In [73]:
# è stata riscritta una funzione di reshape inutile 
# e.g. (a_prev=x_train, season=trn_ssn)
def reshape_to_inputshape(a_prev,season):
    totalMatches = len(season)*38
    input_step = int(a_prev.shape[0]/totalMatches)
    prev_f = a_prev.shape[1]
    return np.reshape(a_prev, (totalMatches, input_step, prev_f))

In [74]:
features = ['HomeTeam', 'AwayTeam', 
            'HTeamEloScore', 'ATeamEloScore', 
            'HTdaysSinceLastMatch', 'ATdaysSinceLastMatch', 
            'HTW_rate', 'ATW_rate', 'ATD_rate', 'HTD_rate', 
            '7_HTW_rate', '12_HTW_rate', '7_ATW_rate', '12_ATW_rate', 
            '7_HTD_rate', '12_HTD_rate', '7_ATD_rate', '12_ATD_rate',
            '7_HTL_rate', '12_HTL_rate', '7_ATL_rate', '12_ATL_rate',
            '5_HTHW_rate', '5_ATAW_rate']

X = pd.get_dummies(data[features])

# Se non cambiamo nulla il OneHotEncoder assegna:
# A -> 1 0 0
# D -> 0 1 0
# H -> 0 0 1
y = data[['FTR']].to_numpy().ravel().reshape(-1, 1)
y = OneHotEncoder(sparse=False).fit_transform(y)
X_imputed = SimpleImputer().fit_transform(X)



In [75]:
trn_ssn = [2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015]
trn_ssn_len = len(trn_ssn)
tst_ssn = [2016,2017,2018] 
tst_ssn_len = len(tst_ssn)

test_size = float(tst_ssn_len)/(tst_ssn_len+trn_ssn_len)

#Split X and Y into training and Test Sets
x_train, x_test, y_train, y_test = train_test_split(X_imputed, y, shuffle=False, test_size=test_size)

In [76]:
def time_step(a_prev,season):
    a_prev = a_prev[np.newaxis, ...]
    totalMatches = len(season)*38

    prev_f = a_prev.shape[2]
    input_step = int(a_prev.shape[1]/totalMatches)
    step = 0
    a_new = np.zeros((totalMatches, input_step, prev_f))
    for i in range(totalMatches):
        # rows divise in porzioni di totalMatches rows
        step += input_step
        
        # per tutte le righe nell'intervallo di righe che stiamo guardando ora 
        # va in ogni porzione di righe di volta in volta
        for j in range(step-input_step,step):

            # per ogni colonna
            for k in range(prev_f):
                a_new[i, j - input_step * i, k] = a_prev[:, j, k]
    
    return a_new

In [77]:
#Setup XY to have 10 game steps
lstm_x_train = reshape_to_inputshape(x_train,trn_ssn)
lstm_y_train = reshape_to_inputshape(y_train,trn_ssn)
# y_train = np.moveaxis(y_train, 0, 1)

lstm_x_test = reshape_to_inputshape(x_test,tst_ssn)
lstm_y_test = reshape_to_inputshape(y_test,tst_ssn)
# y_test = np.moveaxis(y_test, 0, 1)

Tx = lstm_x_train.shape[1] #Time steps
Ty = lstm_y_train.shape[0] #Time Steps

num_features = lstm_x_train.shape[2] #Features per step
inputs = tf.keras.Input(shape=(Tx, num_features))

In [78]:
print(lstm_x_train.shape)
print(lstm_y_train.shape)

(304, 10, 94)
(304, 10, 3)


# Prova di tensorflow

In [79]:
inputs = tf.random.normal((32, 10, 8))
lstm = tf.keras.layers.LSTM(18)
output = lstm(inputs)
print(output.shape)

(32, 18)


In [80]:
inputs = np.array([[[0, 1], [0, 1]], [[1, 2], [1, 2]], [[2, 3], [2, 3]]], dtype=np.float64)
lstm = k.models.Sequential([
    tf.keras.layers.LSTM(2)
])
out = lstm(inputs)
print(out)


tf.Tensor(
[[-0.04938694  0.20068401]
 [-0.01805884  0.34465146]
 [ 0.00431494  0.44729358]], shape=(3, 2), dtype=float32)


In [81]:
MLP = tf.keras.models.Sequential([
    k.layers.Dense(4000),
    k.layers.Dropout(.7),
    k.layers.Dense(100),
    k.layers.Dense(3, activation='softmax')
])

In [82]:
MLP.compile(
    loss='categorical_crossentropy',
    optimizer=tf.keras.optimizers.Adam(0.001),
    metrics=[tf.keras.metrics.Accuracy()]
)
MLP(x_train)
MLP.summary()

Model: "sequential_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_12 (Dense)            (3040, 4000)              380000    
                                                                 
 dropout_5 (Dropout)         (3040, 4000)              0         
                                                                 
 dense_13 (Dense)            (3040, 100)               400100    
                                                                 
 dense_14 (Dense)            (3040, 3)                 303       
                                                                 
Total params: 780,403
Trainable params: 780,403
Non-trainable params: 0
_________________________________________________________________


In [83]:
MLP.fit(lstm_x_train, lstm_y_train)



2022-11-05 12:58:04.792372: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




<keras.callbacks.History at 0x3175d8370>

# RandomForestClassifieri

In [84]:
forest = RandomForestClassifier(n_estimators=2, random_state=2)
forest = forest.fit(x_train, y_train)


#Forest Model Metrics
print("Forest Classifier")
print("Train Score: ", forest.score(x_train, y_train))
print("Test Score: ", forest.score(x_test, y_test))

Forest Classifier
Train Score:  0.6302631578947369
Test Score:  0.22017543859649122


In [85]:
n = 10
m = 5
max_depth = 10
forests = []
grid = [{"n_estimators": list(range(1, n)), "random_state": list(range(0, m)), "max_depth": list(range(1, max_depth))}]
gridSearch = GridSearchCV(RandomForestClassifier(), param_grid=grid, n_jobs=10, return_train_score=True)
gridSearch.fit(x_train, y_train)
# for i in range(1, n+1):
#     for j in range(m):
#         forest = RandomForestClassifier(n_estimators=i, random_state=j)
#         forest = forest.fit(x_train, y_train)
#         forests.append(forest)
#         print("Forest Classifier, n_estimators: ", i, "random_state: ", j)
#         print("Train Score: ", forest.score(x_train, y_train))
#         print("Test Score: ", forest.score(x_test, y_test))

# #Forest Model Metrics
# print("Forest Classifier")
# print("Train Score: ", forest.score(x_train, y_train))
# print("Test Score: ", forest.score(x_test, y_test))
print("Forest Classifiers Best Score: ", gridSearch.best_score_)
print("Forest Classifiers Best Params: ", gridSearch.best_params_)
print("Forest Classifiers Best Params: ", gridSearch.best_estimator_)

Forest Classifiers Best Score:  0.4305921052631579
Forest Classifiers Best Params:  {'max_depth': 8, 'n_estimators': 1, 'random_state': 1}
Forest Classifiers Best Params:  RandomForestClassifier(max_depth=8, n_estimators=1, random_state=1)


# LSTM

In [86]:
x_prova = reshape_to_inputshape(x_train, trn_ssn)
y_prova = reshape_to_inputshape(y_train, trn_ssn)
x_prova_test = reshape_to_inputshape(x_test, tst_ssn)
y_prova_test = reshape_to_inputshape(y_test, tst_ssn)

In [87]:
# 65-67% Accuracy no Overfit
model = k.models.Sequential(
    [
        k.layers.LSTM(30),
        k.layers.Dense(1000, activation="relu"),
        k.layers.Dropout(0.4),
        k.layers.Dense(250, activation="relu"),
        k.layers.Dropout(0.2),
        k.layers.Dense(30, activation="relu"),
        k.layers.Reshape((10, 3)),
    ]
)

model.compile(
    loss='categorical_crossentropy',
    optimizer=tf.keras.optimizers.Adam(0.001),
    metrics=[tf.keras.metrics.Accuracy()]
)

model.fit(x_prova, y_prova, epochs=1000)

Epoch 1/1000


2022-11-05 12:58:15.442809: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-11-05 12:58:15.629466: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


 1/10 [==>...........................] - ETA: 16s - loss: 6.6942 - accuracy: 0.3583

2022-11-05 12:58:15.922706: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
Epoch 73/1000


KeyboardInterrupt: 

In [88]:
model.evaluate(x_prova_test, y_prova_test)

2022-11-05 12:59:42.539701: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-11-05 12:59:42.613182: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




[2.2390284538269043, 0.07339181005954742]

In [89]:
print(x_prova_test.shape)
result = model.predict(x_prova_test)
print(result)

(114, 10, 94)

2022-11-05 12:59:44.341371: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-11-05 12:59:44.399684: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


[[[1.0756292  0.         1.6736791 ]
  [2.6071496  0.         3.9934971 ]
  [0.         3.5384612  0.        ]
  ...
  [0.         0.         2.1049345 ]
  [0.         2.8374145  0.        ]
  [0.4417551  0.9153567  0.7121923 ]]

 [[2.1978781  1.6746589  3.578195  ]
  [0.8946435  2.544868   5.1717315 ]
  [3.9018521  4.870901   4.477558  ]
  ...
  [2.9291863  3.654651   2.9990366 ]
  [2.2434068  4.6095796  3.3329449 ]
  [2.2950897  0.9802348  7.892979  ]]

 [[3.3220043  2.2513387  5.4257765 ]
  [5.5586476  3.5043566  6.027802  ]
  [2.2562926  5.2247515  4.6893115 ]
  ...
  [2.9763765  2.457679   4.2613654 ]
  [1.3316855  3.2381716  6.7113905 ]
  [4.5631447  4.075909   3.0562885 ]]

 ...

 [[0.         1.8861023  2.3555148 ]
  [1.3947667  2.0156822  6.7626586 ]
  [3.2876775  0.21468496 1.7362942 ]
  ...
  [0.64773494 3.5973024  2.7812085 ]
  [5.0609574  1.7696464  0.47302046]
  [0.7726325  0.         6.485748  ]]

 [[2.190314   3.2683988  2.021895  ]
  [1.789462   3.900584   4.0114694 ]


In [90]:
def revert_yoh(Y):
    Y_new = np.empty([Y.shape[0],Y.shape[1]], dtype="<U1")
    #Y_new = np.zeros((Y.shape[0],Y.shape[1]))
    for i in range(Y.shape[0]):
        for j in range(Y.shape[1]):
            if (Y[i, j] == 0):
                Y_new[i, j]= 'A'
            elif (Y[i, j] == 1):
                Y_new[i, j]= 'D'
            elif (Y[i, j] == 2):
                Y_new[i, j]='H'
    return Y_new

y_pred = model.predict(x_prova_test)
y_predm = np.asarray(y_pred)
y_predm = np.argmax(y_predm, axis=2)
y_testm = np.argmax(y_prova_test, axis=2)

y_pred_train = model.predict(x_prova)
y_pred_train = np.asarray(y_pred_train)
y_predm_train = np.argmax(y_pred_train, axis=2)
y_trainm = np.argmax(y_prova, axis = 2)

y_predm = revert_yoh(y_predm).ravel()
y_testm = revert_yoh(y_testm).ravel()

y_predm_train = revert_yoh(y_predm_train).ravel()
y_trainm = revert_yoh(y_trainm).ravel()

#Model Metrics
print(classification_report(y_testm, y_predm, digits=3))

              precision    recall  f1-score   support

           A      0.310     0.249     0.277       345
           D      0.185     0.110     0.138       254
           H      0.479     0.630     0.544       541

    accuracy                          0.399      1140
   macro avg      0.325     0.330     0.320      1140
weighted avg      0.363     0.399     0.373      1140



# Neural Network

In [105]:
# trasforma i valori delle vittorie come 2 e dei pareggi come 1
y_nn = (data[['ordinalHR']].to_numpy().ravel()*2).reshape(-1,1)

y_nn = OneHotEncoder(sparse=False).fit_transform(y_nn)
print(y_nn)
#Split X and Y into training and Test Sets
x_train, x_test, y_train, y_test = train_test_split(X_imputed, y_nn, shuffle=True)

[[0. 0. 1.]
 [0. 0. 1.]
 [1. 0. 0.]
 ...
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]]


In [106]:
nn = k.models.Sequential([
    k.layers.Dense(300, activation='relu'),
    k.layers.Dense(100, activation='relu'),
    k.layers.Dense(3, activation='softmax'),
])

In [107]:
learning_rate=0.001

nn.compile(
    loss='categorical_crossentropy',
    optimizer=k.optimizers.Adam(learning_rate),
    metrics=[k.metrics.Accuracy()]
)
nn(x_train)
nn.summary()

Model: "sequential_18"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_24 (Dense)            (3135, 300)               28500     
                                                                 
 dense_25 (Dense)            (3135, 100)               30100     
                                                                 
 dense_26 (Dense)            (3135, 3)                 303       
                                                                 
Total params: 58,903
Trainable params: 58,903
Non-trainable params: 0
_________________________________________________________________


In [109]:
epochs=3000
batch_size=50
nn.fit(x_train, y_train, epochs=epochs, batch_size=batch_size)

Epoch 1/3000
Epoch 2/3000
Epoch 3/3000
Epoch 4/3000
Epoch 5/3000
Epoch 6/3000
Epoch 7/3000
Epoch 8/3000
Epoch 9/3000
Epoch 10/3000
Epoch 11/3000
Epoch 12/3000
Epoch 13/3000
Epoch 14/3000
Epoch 15/3000
Epoch 16/3000
Epoch 17/3000
Epoch 18/3000
Epoch 19/3000
Epoch 20/3000
Epoch 21/3000
Epoch 22/3000
Epoch 23/3000
Epoch 24/3000
Epoch 25/3000
Epoch 26/3000
Epoch 27/3000
Epoch 28/3000
Epoch 29/3000
Epoch 30/3000
Epoch 31/3000
Epoch 32/3000
Epoch 33/3000
Epoch 34/3000
Epoch 35/3000
Epoch 36/3000
Epoch 37/3000
Epoch 38/3000
Epoch 39/3000
Epoch 40/3000
Epoch 41/3000
Epoch 42/3000
Epoch 43/3000
Epoch 44/3000
Epoch 45/3000
Epoch 46/3000
Epoch 47/3000
Epoch 48/3000
Epoch 49/3000
Epoch 50/3000
Epoch 51/3000
Epoch 52/3000
Epoch 53/3000
Epoch 54/3000
Epoch 55/3000
Epoch 56/3000
Epoch 57/3000
Epoch 58/3000
Epoch 59/3000
Epoch 60/3000
Epoch 61/3000
Epoch 62/3000
Epoch 63/3000
Epoch 64/3000
Epoch 65/3000
Epoch 66/3000
Epoch 67/3000
Epoch 68/3000
Epoch 69/3000
Epoch 70/3000
Epoch 71/3000
Epoch 72/3000
E

KeyboardInterrupt: 