In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import SGDClassifier, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

from tensorflow import keras as k

import tensorflow as tf
import pandas as pd
import numpy as np

# %pip install d2l==1.0.0a1.post0
from d2l import tensorflow as d2l


In [None]:
# Read the dataset CSV from the notebook's input folder into a DataFrame.
dataset_path = './input/dataset.csv'
data = pd.read_csv(dataset_path)
# dataImputed = pd.read_csv('./input/datasetImputed.csv')

In [None]:
# A needless hand-written reshape function was rewritten as this helper.
# e.g. (a_prev=x_train, season=trn_ssn)
def reshape_to_inputshape(a_prev, season, samples_per_season=38):
    """Group consecutive rows of a 2-D array into equally sized blocks.

    Args:
        a_prev: 2-D array-like of shape (rows, features).
        season: sequence of seasons; only its length is used.
        samples_per_season: number of blocks produced per season. Defaults
            to 38, the value that used to be hard-coded.

    Returns:
        numpy array of shape
        (len(season) * samples_per_season, rows_per_block, features),
        where rows_per_block = rows / (len(season) * samples_per_season).

    Raises:
        ValueError: if there are no blocks to build, or if the rows cannot
            be split evenly into the requested number of blocks.
    """
    a_prev = np.asarray(a_prev)
    total_samples = len(season) * samples_per_season
    if total_samples <= 0:
        raise ValueError(
            "season and samples_per_season must describe at least one block"
        )

    # Integer arithmetic avoids the float round-trip of int(a / b) and lets
    # us detect leftover rows up front instead of failing inside reshape.
    input_step, leftover = divmod(a_prev.shape[0], total_samples)
    if leftover:
        raise ValueError(
            "cannot split %d rows evenly into %d blocks"
            % (a_prev.shape[0], total_samples)
        )

    prev_f = a_prev.shape[1]
    return np.reshape(a_prev, (total_samples, input_step, prev_f))

In [None]:
# Columns of `data` that are fed to the models.
features = ['HomeTeam', 'AwayTeam', 
            'HTeamEloScore', 'ATeamEloScore', 
            'HTdaysSinceLastMatch', 'ATdaysSinceLastMatch', 
            'HTW_rate', 'ATW_rate', 'ATD_rate', 'HTD_rate', 
            '7_HTW_rate', '12_HTW_rate', '7_ATW_rate', '12_ATW_rate', 
            '7_HTD_rate', '12_HTD_rate', '7_ATD_rate', '12_ATD_rate',
            '7_HTL_rate', '12_HTL_rate', '7_ATL_rate', '12_ATL_rate',
            '5_HTHW_rate', '5_ATAW_rate']

# get_dummies one-hot encodes the non-numeric columns and keeps the numeric
# ones as they are.
X = pd.get_dummies(data[features])

# With default settings OneHotEncoder assigns:
# A -> 1 0 0
# D -> 0 1 0
# H -> 0 0 1
# data[['FTR']] is already a single-column 2-D selection, so no extra
# ravel/reshape is needed before encoding.
y = data[['FTR']].to_numpy()
# The `sparse=` keyword was deprecated in scikit-learn 1.2 and later removed
# (its replacement is `sparse_output=`). Encoding with the default sparse
# output and densifying afterwards works on every version.
y = OneHotEncoder().fit_transform(y).toarray()
# Fill missing feature values with SimpleImputer's default strategy.
X_imputed = SimpleImputer().fit_transform(X)



In [None]:
# Seasons assigned to the training and test splits.
# NOTE(review): the split below is positional (shuffle=False), so it assumes
# the rows of the dataset are ordered by season - confirm against the CSV.
trn_ssn = [2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015]
trn_ssn_len = len(trn_ssn)
tst_ssn = [2016,2017,2018] 
tst_ssn_len = len(tst_ssn)

# Fraction of seasons held out for testing (kept for reference).
test_size = float(tst_ssn_len)/(tst_ssn_len+trn_ssn_len)

# train_test_split turns a float test_size into a row count with ceil(), so
# floating-point error in the fraction can shift the train/test boundary by
# one row. Passing an explicit integer row count keeps the boundary exact.
n_test_rows = round(len(X_imputed) * tst_ssn_len / (tst_ssn_len + trn_ssn_len))

#Split X and Y into training and Test Sets
x_train, x_test, y_train, y_test = train_test_split(
    X_imputed, y, shuffle=False, test_size=n_test_rows
)

In [None]:
def time_step(a_prev,season):
    """Group consecutive rows of a 2-D array into equally sized blocks.

    Loop-free equivalent of the original element-by-element copy: block i
    holds rows [i * input_step, (i + 1) * input_step) of ``a_prev``.

    Args:
        a_prev: 2-D array-like of shape (rows, features).
        season: sequence of seasons; only its length is used
            (38 blocks are produced per season).

    Returns:
        New float64 array of shape (len(season) * 38, input_step, features)
        with input_step = rows // (len(season) * 38). Rows that do not fill
        a complete block are dropped, as in the original implementation.
    """
    rows = np.asarray(a_prev)
    totalMatches = len(season)*38

    prev_f = rows.shape[1]
    input_step = rows.shape[0] // totalMatches

    # Keep only the rows that fill whole blocks, copy them into a fresh
    # float64 buffer (the original filled an np.zeros array), then reshape.
    used_rows = rows[: totalMatches * input_step]
    a_new = np.array(used_rows, dtype=np.float64)
    return a_new.reshape(totalMatches, input_step, prev_f)

In [None]:
#Setup XY to have 10 game steps
# Each split is reshaped to (samples, time steps, features).
lstm_x_train = reshape_to_inputshape(x_train,trn_ssn)
lstm_y_train = reshape_to_inputshape(y_train,trn_ssn)
# y_train = np.moveaxis(y_train, 0, 1)

lstm_x_test = reshape_to_inputshape(x_test,tst_ssn)
lstm_y_test = reshape_to_inputshape(y_test,tst_ssn)
# y_test = np.moveaxis(y_test, 0, 1)

Tx = lstm_x_train.shape[1] #Time steps
# Axis 0 is the sample axis now that the moveaxis calls are disabled; the
# time-step axis is axis 1, the same as for Tx.
Ty = lstm_y_train.shape[1] #Time Steps

num_features = lstm_x_train.shape[2] #Features per step
inputs = tf.keras.Input(shape=(Tx, num_features))

In [None]:
# Show the reshaped training tensors' shapes, one per line.
print(lstm_x_train.shape, lstm_y_train.shape, sep="\n")

# Prova di tensorflow

In [None]:
# Smoke test: push a random (32, 10, 8) batch through an 18-unit LSTM layer
# and print the shape of what comes out.
# Note: this rebinds the notebook-level name `inputs`.
inputs = tf.random.normal((32, 10, 8))
lstm = tf.keras.layers.LSTM(18)
output = lstm(inputs)
print(output.shape)

In [None]:
# Smoke test on a tiny hand-written array of shape (3, 2, 1).
# The array is created as float32 explicitly: Keras only auto-casts
# floating-point inputs, so an integer array would not match the layer's
# float weights.
inputs = np.array([[[0], [0]], [[1], [1]], [[2], [2]]], dtype=np.float32)

lstm = tf.keras.layers.LSTM(10)
out = lstm(inputs)
print(out)


In [None]:
# Dense classifier: 4000 -> dropout(0.7) -> 100 -> 3-way softmax.
# NOTE(review): the two hidden Dense layers have no activation, so they are
# purely linear - confirm this is intended.
MLP = tf.keras.models.Sequential()
MLP.add(k.layers.Dense(4000))
MLP.add(k.layers.Dropout(.7))
MLP.add(k.layers.Dense(100))
MLP.add(k.layers.Dense(3, activation='softmax'))

In [None]:
# metrics.Accuracy() checks exact equality between predictions and labels,
# which is meaningless for softmax probabilities against one-hot targets.
# CategoricalAccuracy compares the arg-max class of each instead.
MLP.compile(
    loss='categorical_crossentropy',
    optimizer=tf.keras.optimizers.Adam(0.001),
    metrics=[tf.keras.metrics.CategoricalAccuracy()]
)
# Calling the model once builds its weights so summary() can report them.
MLP(x_train)
MLP.summary()

In [None]:
# Train the dense model on the block-shaped tensors (default epoch count).
MLP.fit(x=lstm_x_train, y=lstm_y_train)

In [None]:
# Baseline: a two-tree random forest fitted on the flat (un-reshaped) split.
forest = RandomForestClassifier(random_state=2, n_estimators=2).fit(x_train, y_train)


#Forest Model Metrics
print("Forest Classifier")
train_score = forest.score(x_train, y_train)
print("Train Score: ", train_score)
test_score = forest.score(x_test, y_test)
print("Test Score: ", test_score)

In [None]:
# Sweep over forest size and RNG seed, reporting scores for every combination.
# The previous loop could not run: it used undefined names (m, n, i, j),
# iterated over a nested list instead of (i, j) pairs, rebound the label
# array `y`, and would have started at n_estimators=0.
# NOTE(review): the grid bounds below are placeholders - tune as needed.
forest_sizes = range(1, 6)
forest_seeds = range(3)

for n_trees in forest_sizes:
    for seed in forest_seeds:
        forest = RandomForestClassifier(n_estimators=n_trees, random_state=seed)
        forest = forest.fit(x_train, y_train)

        #Forest Model Metrics
        print("Forest Classifier", "(n_estimators=%d, random_state=%d)" % (n_trees, seed))
        print("Train Score: ", forest.score(x_train, y_train))
        print("Test Score: ", forest.score(x_test, y_test))

# Prove

In [None]:
# Block-shaped copies of the train and test splits for the experiments below.
x_prova, y_prova = (
    reshape_to_inputshape(split, trn_ssn) for split in (x_train, y_train)
)
x_prova_test, y_prova_test = (
    reshape_to_inputshape(split, tst_ssn) for split in (x_test, y_test)
)

In [None]:
# LSTM encoder followed by a dense head that predicts 10 x 3 class
# probabilities per sample.
# NOTE(review): the earlier "65-67% Accuracy no Overfit" figure was measured
# with metrics.Accuracy(), which counts element-wise exact matches (ReLU
# zeros against one-hot zeros), so it is not a real classification accuracy.
# Re-measure with the metric below.
model = k.models.Sequential(
    [
        k.layers.LSTM(30),
        k.layers.Dense(1000, activation="relu"),
        k.layers.Dense(250, activation="relu"),
        # Linear scores, reshaped to (10, 3) and normalised per row below.
        k.layers.Dense(30),
        k.layers.Reshape((10, 3)),
        # categorical_crossentropy expects a probability distribution over
        # the last axis; the previous ReLU output was not one.
        k.layers.Softmax(),
    ]
)

model.compile(
    loss='categorical_crossentropy',
    optimizer=tf.keras.optimizers.Adam(0.001),
    metrics=[tf.keras.metrics.CategoricalAccuracy()]
)

model.fit(x_prova, y_prova, epochs=1000)

In [None]:
# Continue training the same model for another 100 epochs.
model.fit(x=x_prova, y=y_prova, epochs=100)

In [None]:
# Loss and metric values on the held-out seasons.
model.evaluate(x=x_prova_test, y=y_prova_test)

In [None]:
# Show the test tensor's shape, then the raw model output for it.
test_shape = x_prova_test.shape
print(test_shape)
result = model.predict(x=x_prova_test)
print(result)

In [None]:
def revert_yoh(Y):
    """Map integer class indices back to their result letters.

    Args:
        Y: array-like of class indices (0, 1 or 2).

    Returns:
        Array of dtype '<U1' with the same shape as ``Y`` where
        0 -> 'A', 1 -> 'D' and 2 -> 'H'. Any other value maps to the empty
        string (the loop version left such entries uninitialised).
    """
    Y = np.asarray(Y)
    # Start from a deterministic default instead of np.empty.
    Y_new = np.full(Y.shape, '', dtype="<U1")
    for class_index, letter in enumerate(('A', 'D', 'H')):
        # Boolean-mask assignment replaces the element-by-element loop.
        Y_new[Y == class_index] = letter
    return Y_new

def _labels_from_scores(scores):
    """Arg-max over the last (class) axis, mapped to letters and flattened."""
    return revert_yoh(np.argmax(scores, axis=2)).ravel()


# Test split: predicted versus true labels.
y_pred = model.predict(x_prova_test)
y_predm = _labels_from_scores(y_pred)
y_testm = _labels_from_scores(y_prova_test)

# Training split: predicted versus true labels.
y_pred_train = model.predict(x_prova)
y_predm_train = _labels_from_scores(y_pred_train)
y_trainm = _labels_from_scores(y_prova)

#Model Metrics
print(classification_report(y_testm, y_predm, digits=3))