### This notebook contains the Feature Engineering, Hyperparameter tuning and Modelling steps.

#### Notebook 1: Extract-Transform-Load
#### Notebook 2: Data Visualization
#### Notebook 3: Feature Engineering, Hyperparameter tuning and Modelling
#### Notebook 4: Result Evaluation

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import time as time
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from keras.utils import np_utils
from sklearn.tree import DecisionTreeClassifier

from keras.models import Sequential
from keras.layers import Dense, Dropout, LeakyReLU
from keras import regularizers
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils;

## Data Import

In [None]:
# data import - from ETL notebook
# NOTE(review): the path below is absolute and machine-specific; it has to be
# adjusted before the notebook can run on another machine. The file is read as
# tab-separated text.
Dataset = pd.read_csv(r"C:\Users\### LOCAL PATH ###\dataforcapstone.txt", sep = "\t")
# Preview of the first rows.
Dataset.head()

In [None]:
# (number of rows, number of columns) of the imported data.
Dataset.shape

In [None]:
# Distinct teams (taken from the home side) and distinct seasons, each in order
# of first appearance in the data.
Teams_list = [*Dataset["HomeTeam"].drop_duplicates()]
Seasons_list = [*Dataset["Season start"].drop_duplicates()]

## Feature Engineering

##### Adding the previous season's summary statistics as features

In [None]:
# One summary row per (team, season): league points plus home/away
# win-tie-loss counts and goals for/against.
# The rows are collected in a plain list and turned into a DataFrame once:
# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on
# every call.
# A team that did not play in a given season still gets a row (all zeros).
season_rows = []

for Team in Teams_list:
    home_games = Dataset.loc[Dataset["HomeTeam"] == Team]
    away_games = Dataset.loc[Dataset["AwayTeam"] == Team]

    for year in Seasons_list:
        home = home_games.loc[home_games["Season start"] == year]
        away = away_games.loc[away_games["Season start"] == year]

        # "Counting" wins, draws, losses. "Result" is H/D/A from the home
        # side's point of view, so the letters are mirrored for away games.
        Wins_H, Ties_H, Losses_H = (home["Result"].str.count(letter).sum() for letter in "HDA")
        Wins_A, Ties_A, Losses_A = (away["Result"].str.count(letter).sum() for letter in "ADH")

        # 3 points per win, 1 per draw.
        Total_points = Wins_H * 3 + Ties_H + Wins_A * 3 + Ties_A

        season_rows.append([
            Team, year, Total_points,
            Wins_H, Ties_H, Losses_H,
            home["Home Goals"].sum(), home["Away Goals"].sum(),
            Wins_A, Ties_A, Losses_A,
            away["Away Goals"].sum(), away["Home Goals"].sum(),
        ])

Season_data = pd.DataFrame(
    season_rows,
    columns=["Team","Season start","Total Points","Home W","Home T","Home L","Home Goals For", "Home Goals Against","Away W","Away T","Away L","Away Goals For", "Away Goals Against"],
)

In [None]:
# Combining the data - prev season summary for each Game.
# "prev season" is the season in which a summary may be used as a feature,
# i.e. the season after the one it was computed from.
Season_data["prev season"] = pd.to_numeric(Season_data["Season start"]) + 1
Season_data = Season_data.drop("Season start", axis=1)
# Two copies to merge: "T2 ..." statistics keyed by AwayTeam and "T1 ..."
# statistics keyed by HomeTeam. "prev season" keeps its name in both.
Season_data2 = Season_data.rename(columns={
    col: ("AwayTeam" if col == "Team" else "T2 " + col)
    for col in Season_data.columns if col != "prev season"
})
Season_data = Season_data.rename(columns={
    col: ("HomeTeam" if col == "Team" else "T1 " + col)
    for col in Season_data.columns if col != "prev season"
})


In [None]:
# Make the merge keys numeric.
# On the game side "prev season" is simply the game's own season start.
# Season_data stores "prev season" = summarised season + 1, so matching on this
# column attaches the summary of the season before the game.
Dataset["Season start"] = pd.to_numeric(Dataset["Season start"])
Dataset["prev season"] = pd.to_numeric(Dataset["Season start"])

##### Creating a test set to avoid overfitting when tuning the hyperparameters

In [None]:
# Impose randomly set Train-Val-Test sets.
# Every game gets two independent standard-normal draws; the modelling
# functions sort by these columns and cut off the leading rows as test and
# validation sets.
# The draws come from a locally seeded generator so that a restarted kernel
# reproduces the same splits, and they are assigned as whole columns instead of
# cell by cell (which was slow and only correct for a 0..n-1 row index).
_split_rng = np.random.RandomState(42)
Dataset["Test set identifier"] = _split_rng.randn(Dataset.shape[0])
Dataset["Val set identifier"] = _split_rng.randn(Dataset.shape[0])

In [None]:
# merging season data: previous-season summary of the home team (T1 columns),
# then of the away team (T2 columns). Games without a match keep NaN.
Dataset = (
    Dataset
    .merge(Season_data, how="left", on=["prev season", "HomeTeam"])
    .merge(Season_data2, how="left", on=["prev season", "AwayTeam"])
)

##### The weight for the previous season's summary statistics defined as a hyperparameter

In [None]:
def prev_season_weight(weight, Dataset, first_feature_col=13):
    """Down-weight the previous-season statistics as the season progresses.

    Every column from position ``first_feature_col`` onwards is multiplied by
    ``Gameday ** (weight / 100) / Gameday``. With ``weight == 100`` the factor
    is 1 (no change); smaller weights shrink the statistics on later gamedays.

    The input frame is NOT modified. The previous implementation scaled the
    caller's frame in place, so calling it repeatedly on the same frame (as the
    tuning loop does, once per weight) compounded the weights.

    Parameters
    ----------
    weight : number
        Exponent in percent.
    Dataset : pandas.DataFrame
        Needs the columns "Season start" and "Gameday"; the columns from
        ``first_feature_col`` onwards must be numeric.
    first_feature_col : int, default 13
        Position of the first column to weight.
        NOTE(review): 13 is tied to the column layout produced by the cells
        above - confirm if columns are added or removed there.

    Returns
    -------
    (Dataset2, Indicator)
        ``Dataset2`` is the weighted copy (without a "Weight" column);
        ``Indicator`` holds the distinct ("Season start", "Gameday") pairs in
        order of first appearance.
    """
    # A leftover "Weight" column (written by the older in-place version) is
    # discarded; drop() also returns a new frame, so the caller's data is safe.
    source = Dataset.drop(["Weight"], axis=1, errors="ignore")

    factor = (source["Gameday"] ** (weight / 100)) / source["Gameday"]

    # NaN statistics stay NaN under multiplication, so no explicit mask is needed.
    untouched = source.iloc[:, :first_feature_col]
    weighted = source.iloc[:, first_feature_col:].mul(factor, axis=0)
    Dataset2 = pd.concat([untouched, weighted], axis=1)

    Indicator = Dataset2[["Season start", "Gameday"]].drop_duplicates().reset_index(drop=True)

    return Dataset2, Indicator

##### Averaging statistics over the previous n games as additional features

In [None]:
def prev_games_avg_data(n, Indicator, Dataset2, teams=None):
    """Per-team statistics over every window of ``n`` consecutive gamedays.

    For each window of ``n`` consecutive rows of ``Indicator`` and each team,
    one row is produced, labelled with the window's last
    ("Season start", "Gameday"):

    * average points / goals for / goals against per gameday of the window
      (sums divided by ``n``),
    * home and away win/tie/loss rates (0 when the team had no such game),
    * mean home/away goals and mean betting odds (NaN when the team had no
      such game).

    Parameters
    ----------
    n : int
        Window length in gamedays (>= 1).
    Indicator : pandas.DataFrame
        Distinct ("Season start", "Gameday") pairs in chronological order;
        these must be its first two columns.
    Dataset2 : pandas.DataFrame
        Game table with the columns selected below.
    teams : iterable of str, optional
        Teams to summarise. Defaults to the notebook-level ``Teams_list``.

    Returns
    -------
    pandas.DataFrame
        One row per (window, team). It is empty, but with all columns, when
        ``Indicator`` has fewer than ``n`` rows.
    """
    if n < 1:
        raise ValueError("n must be at least 1")
    if teams is None:
        teams = Teams_list

    columns = ["Team","Season start","Gameday","Avg Total Points","Avg. Total Goals For", "Avg. Total Goals Against", "Avg Home W","Avg Home T","Avg Home L","Avg Home Goals For", "Avg Home Goals Against","Avg Away W","Avg Away T","Avg Away L","Avg Away Goals For", "Avg Away Goals Against","Avg H Odds when H","Avg D Odds when H","Avg A Odds when H","Avg H Odds when A","Avg D Odds when A","Avg A Odds when A"]
    games = Dataset2[["Season start", "Gameday", "HomeTeam", "AwayTeam", "Home Goals", "Away Goals", "Result", "Odds H","Odds D","Odds A"]]

    # Rows are gathered in a list and converted once; DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    rows = []
    for i in range(Indicator.shape[0] - n + 1):
        data = Indicator.iloc[i:(i + n), :].merge(games, how="left", on=["Season start", "Gameday"])
        last_season, last_gameday = data.iloc[-1, 0], data.iloc[-1, 1]

        for Team in teams:
            home = data.loc[data["HomeTeam"] == Team]
            away = data.loc[data["AwayTeam"] == Team]

            # "Result" is H/D/A from the home side's point of view, so the
            # letters are mirrored for away games.
            home_counts = [home["Result"].str.count(letter).sum() for letter in "HDA"]
            away_counts = [away["Result"].str.count(letter).sum() for letter in "ADH"]

            n_home = float(home.shape[0])
            n_away = float(away.shape[0])
            home_rates = [count / n_home for count in home_counts] if n_home else [0, 0, 0]
            away_rates = [count / n_away for count in away_counts] if n_away else [0, 0, 0]

            # Per-gameday averages over the window (divided by n, not by the
            # number of games the team actually played).
            Total_points = (home_counts[0] * 3 + home_counts[1] + away_counts[0] * 3 + away_counts[1]) / n
            Total_G_For = (home["Home Goals"].sum() + away["Away Goals"].sum()) / n
            Total_G_Against = (home["Away Goals"].sum() + away["Home Goals"].sum()) / n

            rows.append(
                [Team, last_season, last_gameday, Total_points, Total_G_For, Total_G_Against]
                + home_rates
                + [home["Home Goals"].mean(), home["Away Goals"].mean()]
                + away_rates
                + [away["Away Goals"].mean(), away["Home Goals"].mean()]
                + [home["Odds H"].mean(), home["Odds D"].mean(), home["Odds A"].mean()]
                + [away["Odds H"].mean(), away["Odds D"].mean(), away["Odds A"].mean()]
            )

    Prev_Games_avg_data = pd.DataFrame(rows, columns=columns)
    Prev_Games_avg_data = Prev_Games_avg_data.dropna(subset=["Avg Total Points"])

    return Prev_Games_avg_data

##### Ranking average statistics among all Teams over the previous m games

In [None]:
def prev_games_avg_data_for_rank(m, Indicator, Dataset2):
    """Per-team statistics over windows of ``m`` gamedays, prepared for ranking.

    The statistics are exactly the ones computed by ``prev_games_avg_data``
    (this function used to be a line-for-line copy of it), so the work is
    delegated instead of duplicated.

    Parameters
    ----------
    m : int
        Window length in gamedays.
    Indicator : pandas.DataFrame
        Distinct ("Season start", "Gameday") pairs in chronological order.
    Dataset2 : pandas.DataFrame
        Game table, as expected by ``prev_games_avg_data``.

    Returns
    -------
    (Prev_Games_avg_data_for_rank, Indicator2)
        The statistics frame, and the distinct ("Season start", "Gameday")
        pairs that occur in it, in order of first appearance.
    """
    Prev_Games_avg_data_for_rank = prev_games_avg_data(m, Indicator, Dataset2)
    Indicator2 = (
        Prev_Games_avg_data_for_rank[["Season start", "Gameday"]]
        .drop_duplicates()
        .reset_index(drop=True)
    )

    return Prev_Games_avg_data_for_rank, Indicator2

In [None]:
def prev_games_ranking(Indicator2, Prev_Games_avg_data_for_rank):
    """Rank every statistic among all teams, separately per gameday.

    For each ("Season start", "Gameday") pair in ``Indicator2`` the matching
    rows of ``Prev_Games_avg_data_for_rank`` are ranked column by column in
    descending order (largest value gets rank 1, ties share the average rank).

    Parameters
    ----------
    Indicator2 : pandas.DataFrame
        Season in the first column, gameday in the second.
    Prev_Games_avg_data_for_rank : pandas.DataFrame
        Needs "Team", "Season start" and "Gameday"; every other column is ranked.

    Returns
    -------
    pandas.DataFrame
        Rank columns named "Rank <original column>". The row labels of the
        input are kept so the result can be aligned with it by index.
    """
    key_columns = ["Team", "Season start", "Gameday"]
    seasons = Prev_Games_avg_data_for_rank["Season start"]
    gamedays = Prev_Games_avg_data_for_rank["Gameday"]

    # Collect the per-gameday rank frames and concatenate once;
    # DataFrame.append was removed in pandas 2.0.
    per_gameday = []
    for i in range(Indicator2.shape[0]):
        in_group = (seasons == Indicator2.iloc[i, 0]) & (gamedays == Indicator2.iloc[i, 1])
        stats = Prev_Games_avg_data_for_rank.loc[in_group].drop(key_columns, axis=1)
        per_gameday.append(stats.rank(axis=0, ascending=False))

    if per_gameday:
        Prev_Games_ranking = pd.concat(per_gameday)
    else:
        # Nothing to rank: return an empty frame that still has the rank columns.
        Prev_Games_ranking = Prev_Games_avg_data_for_rank.drop(key_columns, axis=1).iloc[0:0]

    return Prev_Games_ranking.add_prefix("Rank ")

##### Merging the Features

In [None]:
def _shift_to_next_gameday(frame, last_gameday):
    """Return a copy of ``frame`` keyed by the gameday FOLLOWING the one it describes."""
    shifted = frame.copy()
    season = pd.to_numeric(shifted["Season start"])
    gameday = pd.to_numeric(shifted["Gameday"])
    season_is_over = gameday == last_gameday
    # After the last gameday the statistics belong to gameday 1 of the next season.
    shifted["Season start"] = np.where(season_is_over, season + 1, season)
    shifted["Gameday"] = np.where(season_is_over, 1, gameday + 1)
    return shifted


def _side_features(frame, stat_columns, prefix, team_column):
    """Select the key columns plus ``stat_columns``; prefix the statistics for one side."""
    renames = {col: prefix + col for col in stat_columns}
    renames["Team"] = team_column
    return frame[["Team", "Season start", "Gameday"] + stat_columns].rename(columns=renames)


def data_merge(Prev_Games_avg_data, Prev_Games_avg_data_for_rank, Prev_Games_ranking, Dataset2, last_gameday=34):
    """Attach the rolling averages and their rankings to every game.

    Statistics computed up to gameday g are joined to the games of gameday
    g + 1 (or gameday 1 of the next season after ``last_gameday``), so a game
    only sees information from earlier gamedays. The home team receives the
    overall and home statistics ("Team 1 ..."), the away team the overall and
    away statistics ("Team 2 ...").

    None of the arguments is modified. The previous implementation shifted the
    keys of ``Prev_Games_avg_data`` in place, so each additional call with the
    same frame (the tuning loop calls once per ranking window) moved the
    statistics one more gameday forward.

    Parameters
    ----------
    Prev_Games_avg_data : pandas.DataFrame
        Output of ``prev_games_avg_data``.
    Prev_Games_avg_data_for_rank : pandas.DataFrame
        Frame the rankings were computed from; only "Team", "Season start" and
        "Gameday" are used, aligned with ``Prev_Games_ranking`` by index.
    Prev_Games_ranking : pandas.DataFrame
        Output of ``prev_games_ranking``.
    Dataset2 : pandas.DataFrame
        Game table including the previous-season "T1 ..." / "T2 ..." columns.
    last_gameday : int, default 34
        Number of gamedays per season.

    Returns
    -------
    pandas.DataFrame
        Game table with the added feature columns. The opposite-side
        previous-season columns and "prev season" are removed, and games where
        either team has a missing or zero previous-season point total are dropped.
    """
    overall = ["Avg Total Points", "Avg. Total Goals For", "Avg. Total Goals Against"]
    home_only = ["Avg Home W", "Avg Home T", "Avg Home L", "Avg Home Goals For", "Avg Home Goals Against"]
    away_only = ["Avg Away W", "Avg Away T", "Avg Away L", "Avg Away Goals For", "Avg Away Goals Against"]
    home_stats = overall + home_only
    away_stats = overall + away_only

    averages = _shift_to_next_gameday(Prev_Games_avg_data, last_gameday)
    rankings = _shift_to_next_gameday(
        pd.concat([Prev_Games_avg_data_for_rank[["Team", "Season start", "Gameday"]], Prev_Games_ranking], axis=1),
        last_gameday,
    )

    x1 = _side_features(averages, home_stats, "Team 1 ", "HomeTeam")
    x2 = _side_features(averages, away_stats, "Team 2 ", "AwayTeam")
    x3 = _side_features(rankings, ["Rank " + col for col in home_stats], "Team 1 ", "HomeTeam")
    x4 = _side_features(rankings, ["Rank " + col for col in away_stats], "Team 2 ", "AwayTeam")

    # Numeric keys on a new frame; the caller's Dataset2 is left as it is.
    games = Dataset2.assign(**{
        "Season start": pd.to_numeric(Dataset2["Season start"]),
        "Gameday": pd.to_numeric(Dataset2["Gameday"]),
    })

    Data = games.merge(x1, how="left", on=["HomeTeam", "Season start", "Gameday"])
    Data = Data.merge(x2, how="left", on=["AwayTeam", "Season start", "Gameday"])
    Data = Data.merge(x3, how="left", on=["HomeTeam", "Season start", "Gameday"])
    Data = Data.merge(x4, how="left", on=["AwayTeam", "Season start", "Gameday"])

    # getting rid of some irrelevant data
    Data = Data.drop(["T1 Away W","T1 Away T","T1 Away L","T1 Away Goals For","T1 Away Goals Against","T2 Home W","T2 Home T","T2 Home L","T2 Home Goals For","T2 Home Goals Against","prev season"], axis=1)
    Data = Data.dropna(subset=["T1 Total Points", "T2 Total Points"])
    Data = Data.loc[(Data["T1 Total Points"] != 0) & (Data["T2 Total Points"] != 0)]

    return Data.reset_index(drop=True)

##### Choice of the type of scaler as a hyperparameter

In [None]:
def scaleData(data):
    """Min-max scale ``data`` to the range [0, 1].

    The scaler is fitted on ``data`` itself and the transformed array is returned.
    """
    return MinMaxScaler(feature_range=(0, 1)).fit_transform(data)

In [None]:
def scaleData2(data):
    """Standardise ``data`` with a ``StandardScaler`` fitted on ``data`` itself."""
    standardizer = StandardScaler()
    standardizer.fit(data)
    return standardizer.transform(data)

## Modelling

##### Decision Tree

In [None]:
## Hyperparameters:
# g - type of scaler (0: MinMax / 1: Standard / any other value: no scaling)
# k - max depth of the tree

def Tree_Modelling(Data, g, k):
    """Fit a decision tree on the training split and score it.

    Split: rows are sorted by "Test set identifier" and the first 15% are held
    out as the test set (not used here). The rest is sorted by
    "Val set identifier"; its first 12% form the validation set and the
    remainder the training set.

    Parameters
    ----------
    Data : pandas.DataFrame
        Merged feature table with "Result", "Test set identifier" and
        "Val set identifier". Columns from position 7 onwards (after the
        identifier columns are removed) are used as features.
        NOTE(review): assumes the first seven columns are non-feature columns
        (including "Result") - confirm against the ETL output.
    g : int
        Scaler choice, see the header comment.
    k : int
        Maximum tree depth.

    Returns
    -------
    (score_train, score_test)
        Accuracy on the training split and on the validation split.
    """
    # Helper columns are removed when present (errors="ignore" replaces the bare
    # try/except). The dropna() runs before "prev season" is removed, as before,
    # so rows with a missing "prev season" are still discarded.
    Data = Data.drop(["Weight"], axis=1, errors="ignore").dropna()
    Data = Data.drop(["prev season"], axis=1, errors="ignore")

    Data = Data.sort_values(["Test set identifier"], ascending = True)
    n_test = round(0.15 * Data.shape[0])
    Train_and_Val_set = Data.iloc[n_test:, :].drop(["Test set identifier"], axis = 1)

    Train_and_Val_set = Train_and_Val_set.sort_values(["Val set identifier"], ascending = True)
    n_val = round(0.12 * Train_and_Val_set.shape[0])
    Val_set = Train_and_Val_set.iloc[:n_val, :].drop(["Val set identifier"], axis = 1)
    Train_set = Train_and_Val_set.iloc[n_val:, :].drop(["Val set identifier"], axis = 1)

    train_label = Train_set.loc[:, "Result"]
    val_label = Val_set.loc[:, "Result"]
    train_set = Train_set.iloc[:, 7:].values
    val_set = Val_set.iloc[:, 7:].values

    if g == 0:
        scaler = MinMaxScaler(feature_range=(0, 1))
    elif g == 1:
        scaler = StandardScaler()
    else:
        scaler = None

    if scaler is not None:
        # The scaler is fitted on the training data only and the SAME transform
        # is applied to the validation data. Fitting a second scaler on the
        # validation set would map equal raw values to different scaled values
        # and make the learned split thresholds meaningless there.
        train_set = scaler.fit_transform(train_set)
        val_set = scaler.transform(val_set)

    Result_Tree = DecisionTreeClassifier(criterion='entropy', max_depth = k)
    Result_Tree.fit(train_set, train_label)

    score_train = metrics.accuracy_score(train_label, Result_Tree.predict(train_set))
    score_test = metrics.accuracy_score(val_label, Result_Tree.predict(val_set))

    return score_train, score_test

##### Support-Vector-Machine Model

In [None]:
## Hyperparameters:
# g - type of scaler (0: MinMax [0,1] / 1: Standard / any other value: MinMax [-1,1])
# k - type of kernel (0: linear / 1: polynomial / any other value: gaussian)
# p - polynomial degree (only used by the polynomial kernel)

def SVM_Modelling(Data, g, k, p):
    """Fit a support-vector classifier on the training split and score it.

    Split: rows are sorted by "Test set identifier" and the first 15% are held
    out as the test set (not used here). The rest is sorted by
    "Val set identifier"; its first 12% form the validation set and the
    remainder the training set.

    Parameters
    ----------
    Data : pandas.DataFrame
        Merged feature table with "Result", "Test set identifier" and
        "Val set identifier". Columns from position 7 onwards (after the
        identifier columns are removed) are used as features.
        NOTE(review): assumes the first seven columns are non-feature columns
        (including "Result") - confirm against the ETL output.
    g : int
        Scaler choice, see the header comment.
    k : int
        Kernel choice, see the header comment.
    p : int
        Degree of the polynomial kernel.

    Returns
    -------
    (score_train, score_test)
        Accuracy on the training split and on the validation split.
    """
    # Helper columns are removed when present (errors="ignore" replaces the bare
    # try/except). The dropna() runs before "prev season" is removed, as before,
    # so rows with a missing "prev season" are still discarded.
    Data = Data.drop(["Weight"], axis=1, errors="ignore").dropna()
    Data = Data.drop(["prev season"], axis=1, errors="ignore")

    Data = Data.sort_values(["Test set identifier"], ascending = True)
    n_test = round(0.15 * Data.shape[0])
    Train_and_Val_set = Data.iloc[n_test:, :].drop(["Test set identifier"], axis = 1)

    Train_and_Val_set = Train_and_Val_set.sort_values(["Val set identifier"], ascending = True)
    n_val = round(0.12 * Train_and_Val_set.shape[0])
    Val_set = Train_and_Val_set.iloc[:n_val, :].drop(["Val set identifier"], axis = 1)
    Train_set = Train_and_Val_set.iloc[n_val:, :].drop(["Val set identifier"], axis = 1)

    train_label = Train_set.loc[:, "Result"]
    val_label = Val_set.loc[:, "Result"]
    Features = Train_set.iloc[:, 7:].values
    Val_Features = Val_set.iloc[:, 7:].values

    if g == 0:
        scaler = MinMaxScaler(feature_range=(0, 1))
    elif g == 1:
        scaler = StandardScaler()
    else:
        # Replaces the hand-written [-1, 1] loop, which overwrote the array cell
        # by cell while recomputing each column's min/max from the partly
        # rescaled column, so all rows after the first were scaled wrongly.
        scaler = MinMaxScaler(feature_range=(-1, 1))

    # Fit on the training data only and apply the same transform to the
    # validation data, so that both live on the same scale.
    train_set = scaler.fit_transform(Features)
    val_set = scaler.transform(Val_Features)

    if k == 0:
        clf = svm.SVC(kernel='linear', gamma='auto')
    elif k == 1:
        clf = svm.SVC(kernel='poly', degree=p, gamma='auto')
    else:
        clf = svm.SVC(kernel='rbf', gamma='auto')

    clf.fit(train_set, train_label)

    score_train = metrics.accuracy_score(train_label, clf.predict(train_set))
    score_test = metrics.accuracy_score(val_label, clf.predict(val_set))

    return score_train, score_test

##### Feed-Forward Neural Network

In [None]:
# Dense - Dropout - Dense - Dropout - Dense - Dense(softmax)

## Hyperparameters:
# a - 1. Dropoutlayer dropout rate (in percent)
# b - 2. Dropoutlayer dropout rate (in percent)
# c - 2. Dense layer number of nodes
# d - 3. Dense layer number of nodes
# e - Number of epochs - fixed to 40
# f - Batch size
# g - type of scaler (0: MinMax [0,1] / 1: Standard / any other value: MinMax [-1,1])
# h - type of optimizer (0: Adam / 1: Adadelta / 2: Adagrad / any other value: Stochastic Gradient Descent)
# z - type of activation function (0: relu / 1: leaky relu / any other value: tanh)

def NN_Modelling(Data, a, b, c, d, e, f, g, h,z):
    """Train the feed-forward network on the training split and score it.

    Split: rows are sorted by "Test set identifier" and the first 15% are held
    out as the test set (not used here). The rest is sorted by
    "Val set identifier"; its first 12% form the validation set and the
    remainder the training set.

    Parameters
    ----------
    Data : pandas.DataFrame
        Merged feature table with "Result", "Test set identifier" and
        "Val set identifier". Columns from position 7 onwards (after the
        identifier columns are removed) are used as features.
        NOTE(review): assumes the first seven columns are non-feature columns
        (including "Result") - confirm against the ETL output.
    a, b, c, d, e, f, g, h, z
        Hyperparameters, see the header comment.

    Returns
    -------
    (score_train, score_test)
        Accuracy on the training split and on the validation split.
    """
    # Helper columns are removed when present (errors="ignore" replaces the bare
    # try/except). The dropna() runs before "prev season" is removed, as before,
    # so rows with a missing "prev season" are still discarded.
    Data = Data.drop(["Weight"], axis=1, errors="ignore").dropna()
    Data = Data.drop(["prev season"], axis=1, errors="ignore")

    Data = Data.sort_values(["Test set identifier"], ascending = True)
    n_test = round(0.15 * Data.shape[0])
    Train_and_Val_set = Data.iloc[n_test:, :].drop(["Test set identifier"], axis = 1)

    Train_and_Val_set = Train_and_Val_set.sort_values(["Val set identifier"], ascending = True)
    n_val = round(0.12 * Train_and_Val_set.shape[0])
    Val_set = Train_and_Val_set.iloc[:n_val, :].drop(["Val set identifier"], axis = 1)
    Train_set = Train_and_Val_set.iloc[n_val:, :].drop(["Val set identifier"], axis = 1)

    Target = Train_set.loc[:, "Result"]
    Val_Target = Val_set.loc[:, "Result"]
    Features = Train_set.iloc[:, 7:].values
    Val_Features = Val_set.iloc[:, 7:].values

    # One-hot targets. num_classes is fixed from the training labels so that the
    # validation matrix has the same width even if a class is missing there.
    encoder = LabelEncoder()
    encoder.fit(Target)
    n_classes = len(encoder.classes_)
    train_label = np_utils.to_categorical(encoder.transform(Target), num_classes=n_classes)
    val_label = np_utils.to_categorical(encoder.transform(Val_Target), num_classes=n_classes)

    if g == 0:
        scaler = MinMaxScaler(feature_range=(0, 1))
    elif g == 1:
        scaler = StandardScaler()
    else:
        # Replaces the hand-written [-1, 1] loop, which overwrote the array cell
        # by cell while recomputing each column's min/max from the partly
        # rescaled column, so all rows after the first were scaled wrongly.
        scaler = MinMaxScaler(feature_range=(-1, 1))

    # Fit on the training data only and apply the same transform to the
    # validation data, so that both live on the same scale.
    train_set = scaler.fit_transform(Features)
    val_set = scaler.transform(Val_Features)

    if z == 0:
        activ_f = "relu"
    elif z == 1:
        activ_f = "leakyRelu"
    else:
        activ_f = "tanh"

    n_features = Features.shape[1]
    model = Sequential()

    def add_hidden(units):
        # Leaky ReLU is a separate layer in Keras, so it follows a linear Dense
        # layer. Previously the z == 1 path added only the LeakyReLU layers and
        # skipped Dense(c) / Dense(d), silently ignoring c and d.
        if z == 1:
            model.add(Dense(units))
            model.add(LeakyReLU(alpha = 0.1))
        else:
            model.add(Dense(units, activation=activ_f))

    # first one as regular relu when leaky relu is selected
    first_activation = "relu" if z == 1 else activ_f
    model.add(Dense(n_features, input_shape=(n_features, ), activation=first_activation, use_bias = True))
    model.add(Dropout(a/100))
    add_hidden(c)
    model.add(Dropout(b/100))
    add_hidden(d)
    # One output unit per class found in the training labels (3 for H / D / A).
    model.add(Dense(n_classes, activation='softmax'))

    if h == 0:
        optim = "adam"
    elif h == 1:
        optim = "adadelta"
    elif h == 2:
        optim = "adagrad"
    else:
        optim = "sgd"

    model.compile(loss='categorical_crossentropy', optimizer=optim, metrics=['accuracy'])
    model.fit(train_set, train_label, epochs=e, batch_size=f, validation_data = (val_set, val_label))

    # evaluate() returns [loss, accuracy]; only the accuracy is reported.
    score_test = model.evaluate(val_set, val_label)[1]
    score_train = model.evaluate(train_set, train_label)[1]

    return score_train, score_test

### The Model

#### Hyperparameter tuning

In [None]:
def The_Model(Dataset1):
    """Grid-search the feature-engineering settings and the model hyperparameters.

    For every combination of previous-season weight (``weight``), look-back
    window for the averaged features (``n``) and look-back window for the
    ranking features (``m``) the feature table is rebuilt, and a decision
    tree, an SVM and a neural network are each scored over their own
    hyperparameter grid. One line is printed per fitted model.

    Parameters
    ----------
    Dataset1 : pandas.DataFrame
        Data handed to ``prev_season_weight`` for every ``weight`` value.

    Returns
    -------
    Tree_Model_tuning, SVM_Model_tuning, NN_Model_tuning : pandas.DataFrame
        One row per evaluated configuration, holding a running counter, the
        settings used and the two scores returned by the modelling function
        (stored as ``score_train`` and ``score_val``).
    Dataset1 : pandas.DataFrame
        The object that was passed in.
    Data : pandas.DataFrame
        Feature table of the last ``(weight, n, m)`` combination evaluated.
    """
    # Local import so the cell does not rely on an edit to the import cell.
    from itertools import product

    tree_columns = ["Counter","prev season weight","prev games incl","prev games for rank","Scaler","Max depth","score_train","score_val"]
    svm_columns = ["Counter","prev season weight","prev games incl","prev games for rank","Scaler","Kernel","Polynomial","score_train","score_val"]
    nn_columns = ["Counter","prev season weight","prev games incl","prev games for rank","Dropout rate1","Dropout rate2","Nodes Dense2","Nodes Dense3","Epochs","Batch size","Scaler","Optimizer","Activation Function","score_train","score_val"]

    # Rows are collected in plain lists and turned into DataFrames once at the
    # end; growing a DataFrame with .append() per fit is quadratic and the
    # method no longer exists in pandas 2.x.
    tree_rows = []
    svm_rows = []
    nn_rows = []

    Counter = 0

    for weight in [75, 85, 95, 100]:

        Dataset2, Indicator = prev_season_weight(weight, Dataset1)

        for n in [34, 15, 5]:
            Prev_Games_avg_data = prev_games_avg_data(n, Indicator, Dataset2)

            for m in [34, 15, 5]:
                Prev_Games_avg_data_for_rank, Indicator2 = prev_games_avg_data_for_rank(m, Indicator, Dataset2)
                Prev_Games_ranking = prev_games_ranking(Indicator2, Prev_Games_avg_data_for_rank)

                Data = data_merge(Prev_Games_avg_data, Prev_Games_avg_data_for_rank, Prev_Games_ranking, Dataset2).dropna()

                # Decision tree: scaler option g x max depth k
                for g, k in product(range(3), range(2, 15)):
                    score_train, score_test = Tree_Modelling(Data, g, k)

                    tree_rows.append([Counter, weight, n, m, g, k, score_train, score_test])
                    print(Counter, weight, n, m, g, k, score_train, score_test)

                    Counter += 1

                # SVM: scaler option g x kernel option k x polynomial degree p
                for g, k, p in product(range(3), range(3), [3, 6, 12]):
                    score_train, score_test = SVM_Modelling(Data, g, k, p)

                    svm_rows.append([Counter, weight, n, m, g, k, p, score_train, score_test])
                    print(Counter, weight, n, m, g, k, p, score_train, score_test)

                    Counter += 1

                # NN: dropout rates a, b (percent) x layer widths c, d x epochs e
                # x batch size f x scaler g x optimizer h x activation z.
                # product() iterates in the same order as the equivalent nested loops.
                nn_grid = product(
                    [20, 30, 40],    # a
                    [15, 25, 30],    # b
                    [30, 40],        # c
                    [10, 20],        # d
                    [40, 75, 100],   # e
                    [32, 64, 128],   # f
                    range(3),        # g
                    range(4),        # h
                    range(3),        # z
                )
                for a, b, c, d, e, f, g, h, z in nn_grid:
                    score_train, score_test = NN_Modelling(Data, a, b, c, d, e, f, g, h, z)

                    nn_rows.append([Counter, weight, n, m, a, b, c, d, e, f, g, h, z, score_train, score_test])
                    print(Counter, weight, n, m, a, b, c, d, e, f, g, h, z, score_train, score_test)

                    Counter += 1

    Tree_Model_tuning = pd.DataFrame(tree_rows, columns=tree_columns)
    SVM_Model_tuning = pd.DataFrame(svm_rows, columns=svm_columns)
    NN_Model_tuning = pd.DataFrame(nn_rows, columns=nn_columns)

    return Tree_Model_tuning, SVM_Model_tuning, NN_Model_tuning, Dataset1, Data

##### Extracting the optimal hyperparameters and the dataset with the testset identifier

In [None]:
# Run the full grid search. This is the expensive cell: The_Model refits a model for every
# hyperparameter combination and prints one line per fit.
Tree_Model_tuning, SVM_Model_tuning, NN_Model_tuning, Train_Val_Test_dataset, Final_Dataset = The_Model(Dataset)

In [None]:
# Shapes of the two data frames returned by The_Model alongside the tuning tables.
print(Train_Val_Test_dataset.shape, Final_Dataset.shape)

In [None]:
# Columns handed over to the visualization notebook.
visualization_columns = [
    "Season start", "Gameday",
    "HomeTeam", "AwayTeam",
    "Home Goals", "Away Goals", "Result",
    "Odds H", "Odds D", "Odds A",
]
Visualization_Dataset = Final_Dataset.loc[:, visualization_columns]

# Write the selection as a tab-separated text file (header row, no index column).
Visualization_Dataset.to_csv(
    r'C:\Users\### LOCAL PATH ###\Data_for_visualization.txt',
    sep='\t',
    header=True,
    index=None,
)

##### Optimal hyperparameters:

In [None]:
# Rank the tree configurations: best validation score first, ties broken by training score.
Hyper_Parameters_Tree = (
    Tree_Model_tuning
    .sort_values(by=["score_val", "score_train"], ascending=False)
    .reset_index(drop=True)
)
Hyper_Parameters_Tree.head(3)

In [None]:
# Rank the SVM configurations: best validation score first, ties broken by training score.
Hyper_Parameters_SVM = (
    SVM_Model_tuning
    .sort_values(by=["score_val", "score_train"], ascending=False)
    .reset_index(drop=True)
)
Hyper_Parameters_SVM.head(3)

In [None]:
# Rank the NN configurations: best validation score first, ties broken by training score.
Hyper_Parameters_NN = (
    NN_Model_tuning
    .sort_values(by=["score_val", "score_train"], ascending=False)
    .reset_index(drop=True)
)
Hyper_Parameters_NN.head(3)

##### Extracting the best hyperparameters of each model into variables

In [None]:
# Row 0 of each ranked table is the best configuration; read its settings as scalars.

# Tree
Tree_weight = Hyper_Parameters_Tree.at[0, "prev season weight"]
Tree_n = Hyper_Parameters_Tree.at[0, "prev games incl"]
Tree_m = Hyper_Parameters_Tree.at[0, "prev games for rank"]
Tree_Scaler = Hyper_Parameters_Tree.at[0, "Scaler"]
Tree_depth = Hyper_Parameters_Tree.at[0, "Max depth"]

# SVM
SVM_weight = Hyper_Parameters_SVM.at[0, "prev season weight"]
SVM_n = Hyper_Parameters_SVM.at[0, "prev games incl"]
SVM_m = Hyper_Parameters_SVM.at[0, "prev games for rank"]
SVM_Scaler = Hyper_Parameters_SVM.at[0, "Scaler"]
SVM_Kernel = Hyper_Parameters_SVM.at[0, "Kernel"]
SVM_Polynomial = Hyper_Parameters_SVM.at[0, "Polynomial"]

# NN
NN_weight = Hyper_Parameters_NN.at[0, "prev season weight"]
NN_n = Hyper_Parameters_NN.at[0, "prev games incl"]
NN_m = Hyper_Parameters_NN.at[0, "prev games for rank"]
NN_Scaler = Hyper_Parameters_NN.at[0, "Scaler"]
NN_dropout_rate1 = Hyper_Parameters_NN.at[0, "Dropout rate1"]
NN_dropout_rate2 = Hyper_Parameters_NN.at[0, "Dropout rate2"]
NN_Nodes_dense2 = Hyper_Parameters_NN.at[0, "Nodes Dense2"]
NN_Nodes_dense3 = Hyper_Parameters_NN.at[0, "Nodes Dense3"]
NN_Epochs = Hyper_Parameters_NN.at[0, "Epochs"]
NN_Batch_size = Hyper_Parameters_NN.at[0, "Batch size"]
NN_Optimizer = Hyper_Parameters_NN.at[0, "Optimizer"]
NN_Activation_Function = Hyper_Parameters_NN.at[0, "Activation Function"]

### Model fitting and scoring Test data

In [None]:
# Fit the best decision tree: rebuild the feature table with the tuned settings, fit on the
# training split only, then score the train / validation / test splits.

# Tree
Dataset_step1, Indicator1 = prev_season_weight(Tree_weight, Train_Val_Test_dataset)
Prev_Games_avg_data = prev_games_avg_data(Tree_n, Indicator1, Dataset_step1)
Prev_Games_avg_data_for_rank, Indicator2 = prev_games_avg_data_for_rank(Tree_m, Indicator1, Dataset_step1)
Prev_Games_ranking = prev_games_ranking(Indicator2, Prev_Games_avg_data_for_rank)
Data_for_model_fit = data_merge(Prev_Games_avg_data, Prev_Games_avg_data_for_rank, Prev_Games_ranking, Dataset_step1)

# Sort on the test-set identifier, then drop that helper column and any incomplete rows.
Data_for_model_fit = Data_for_model_fit.sort_values(["Test set identifier"], ascending = True)
Data_for_model_fit = Data_for_model_fit.reset_index(drop=True)
Data_for_model_fit = Data_for_model_fit.drop(["Test set identifier"], axis=1).dropna()

# The first 15% of the sorted rows form the test set, the remainder is train + validation.
test_rows = round(0.15*Data_for_model_fit.shape[0])
Test_set = Data_for_model_fit.iloc[:test_rows,:].reset_index(drop=True)
Train_and_Val = Data_for_model_fit.iloc[test_rows:,:].reset_index(drop=True)

# Same procedure on the remainder: sort on the validation identifier, drop the helper column.
Train_and_Val = Train_and_Val.sort_values(["Val set identifier"], ascending = True)
Train_and_Val = Train_and_Val.reset_index(drop=True)
Train_and_Val = Train_and_Val.drop(["Val set identifier"], axis=1).dropna()
Test_set = Test_set.drop(["Val set identifier"], axis=1).dropna()

# The first 12% of the sorted remainder is the validation set, the rest is the training set.
val_rows = round(0.12*Train_and_Val.shape[0])
Val_set = Train_and_Val.iloc[:val_rows,:].reset_index(drop=True)
Train_set = Train_and_Val.iloc[val_rows:,:].reset_index(drop=True)

# Target is the "Result" column; the model features are every column from position 7 onwards.
# Trainset
Tree_train_target = Train_set.loc[:, "Result"]
Train_Features = Train_set.iloc[:, 7:].values

# Valset
Tree_val_target = Val_set.loc[:, "Result"]
Val_Features = Val_set.iloc[:, 7:].values

# Testset
Tree_test_target = Test_set.loc[:, "Result"]
Test_Features = Test_set.iloc[:, 7:].values

# scaling (each split is scaled separately)
if Tree_Scaler == 2:
    Train_Features = scaleData(Train_Features)
    Val_Features = scaleData(Val_Features)
    Test_Features = scaleData(Test_Features)
elif Tree_Scaler == 1:
    Train_Features = scaleData2(Train_Features)
    Val_Features = scaleData2(Val_Features)
    Test_Features = scaleData2(Test_Features)
# Any other scaler option: the features are used unscaled. The former else-branch only copied
# `Features`, a name this cell never defines, into two variables the model did not use.
# NOTE(review): confirm Tree_Modelling also leaves the features unscaled for scaler option 0.

Result_Tree = DecisionTreeClassifier(criterion='entropy', max_depth = Tree_depth)
Result_Tree.fit(Train_Features, Tree_train_target)

# Predict once per split and reuse the predictions for the accuracy scores.
Tree_train_prediction = Result_Tree.predict(Train_Features)
Tree_val_prediction = Result_Tree.predict(Val_Features)
Tree_test_prediction = Result_Tree.predict(Test_Features)

Tree_train_score = metrics.accuracy_score(Tree_train_target, Tree_train_prediction)
Tree_val_score = metrics.accuracy_score(Tree_val_target, Tree_val_prediction)
Tree_test_score = metrics.accuracy_score(Tree_test_target, Tree_test_prediction)


In [None]:
# Fit the best SVM: rebuild the feature table with the tuned settings, fit on the
# training split only, then score the train / validation / test splits.

# SVM
Dataset_step1, Indicator1 = prev_season_weight(SVM_weight, Train_Val_Test_dataset)
Prev_Games_avg_data = prev_games_avg_data(SVM_n, Indicator1, Dataset_step1)
Prev_Games_avg_data_for_rank, Indicator2 = prev_games_avg_data_for_rank(SVM_m, Indicator1, Dataset_step1)
Prev_Games_ranking = prev_games_ranking(Indicator2, Prev_Games_avg_data_for_rank)
Data_for_model_fit = data_merge(Prev_Games_avg_data, Prev_Games_avg_data_for_rank, Prev_Games_ranking, Dataset_step1)

# Sort on the test-set identifier, then drop that helper column and any incomplete rows.
Data_for_model_fit = Data_for_model_fit.sort_values(["Test set identifier"], ascending = True)
Data_for_model_fit = Data_for_model_fit.reset_index(drop=True)
Data_for_model_fit = Data_for_model_fit.drop(["Test set identifier"], axis=1).dropna()

# The first 15% of the sorted rows form the test set, the remainder is train + validation.
test_rows = round(0.15*Data_for_model_fit.shape[0])
Test_set = Data_for_model_fit.iloc[:test_rows,:].reset_index(drop=True)
Train_and_Val = Data_for_model_fit.iloc[test_rows:,:].reset_index(drop=True)

# Same procedure on the remainder: sort on the validation identifier, drop the helper column.
Train_and_Val = Train_and_Val.sort_values(["Val set identifier"], ascending = True)
Train_and_Val = Train_and_Val.reset_index(drop=True)
Train_and_Val = Train_and_Val.drop(["Val set identifier"], axis=1).dropna()
Test_set = Test_set.drop(["Val set identifier"], axis=1).dropna()

# The first 12% of the sorted remainder is the validation set, the rest is the training set.
val_rows = round(0.12*Train_and_Val.shape[0])
Val_set = Train_and_Val.iloc[:val_rows,:].reset_index(drop=True)
Train_set = Train_and_Val.iloc[val_rows:,:].reset_index(drop=True)

# Target is the "Result" column; the model features are every column from position 7 onwards.
# Trainset
SVM_train_target = Train_set.loc[:, "Result"]
Train_Features = Train_set.iloc[:, 7:].values

# Valset
SVM_val_target = Val_set.loc[:, "Result"]
Val_Features = Val_set.iloc[:, 7:].values

# Testset
SVM_test_target = Test_set.loc[:, "Result"]
Test_Features = Test_set.iloc[:, 7:].values

# scaling (each split is scaled separately)
if SVM_Scaler == 2:
    Train_Features = scaleData(Train_Features)
    Val_Features = scaleData(Val_Features)
    Test_Features = scaleData(Test_Features)
elif SVM_Scaler == 1:
    Train_Features = scaleData2(Train_Features)
    Val_Features = scaleData2(Val_Features)
    Test_Features = scaleData2(Test_Features)
else:
    # Min-max scale every column to [-1, 1]. The column minimum and maximum are taken from the
    # untouched data; the previous element-by-element loop overwrote the array while re-reading
    # min()/max(), so every value after the first was scaled against partly rescaled data.
    # NOTE(review): if SVM_Modelling still uses the in-place loop, update it the same way so
    # tuning and the final fit see identically scaled features.
    Train_Features = MinMaxScaler(feature_range=(-1, 1)).fit_transform(Train_Features)
    Val_Features = MinMaxScaler(feature_range=(-1, 1)).fit_transform(Val_Features)
    Test_Features = MinMaxScaler(feature_range=(-1, 1)).fit_transform(Test_Features)


# Kernel option: 0 = linear, 1 = polynomial of the tuned degree, anything else = RBF.
if SVM_Kernel == 0:
    SVM_clf = svm.SVC(kernel='linear',gamma='auto')
elif SVM_Kernel == 1:
    SVM_clf= svm.SVC(kernel ='poly', degree= SVM_Polynomial, gamma='auto')
else:
    SVM_clf = svm.SVC(kernel='rbf', gamma = 'auto')


SVM_clf.fit(Train_Features, SVM_train_target)

# Predict once per split and reuse the predictions for the accuracy scores.
SVM_train_prediction = SVM_clf.predict(Train_Features)
SVM_val_prediction = SVM_clf.predict(Val_Features)
SVM_test_prediction = SVM_clf.predict(Test_Features)

SVM_train_score = metrics.accuracy_score(SVM_train_target, SVM_train_prediction)
SVM_val_score = metrics.accuracy_score(SVM_val_target, SVM_val_prediction)
SVM_test_score = metrics.accuracy_score(SVM_test_target, SVM_test_prediction)

In [None]:
# Fit the best neural network: rebuild the feature table with the tuned settings, fit on the
# training split only, then score the train / validation / test splits.

# NN
Dataset_step1, Indicator1 = prev_season_weight(NN_weight, Train_Val_Test_dataset)
Prev_Games_avg_data = prev_games_avg_data(NN_n, Indicator1, Dataset_step1)
Prev_Games_avg_data_for_rank, Indicator2 = prev_games_avg_data_for_rank(NN_m, Indicator1, Dataset_step1)
Prev_Games_ranking = prev_games_ranking(Indicator2, Prev_Games_avg_data_for_rank)
Data_for_model_fit = data_merge(Prev_Games_avg_data, Prev_Games_avg_data_for_rank, Prev_Games_ranking, Dataset_step1)

# Sort on the test-set identifier, then drop that helper column and any incomplete rows.
Data_for_model_fit = Data_for_model_fit.sort_values(["Test set identifier"], ascending = True)
Data_for_model_fit = Data_for_model_fit.reset_index(drop=True)
Data_for_model_fit = Data_for_model_fit.drop(["Test set identifier"], axis=1).dropna()

# The first 15% of the sorted rows form the test set, the remainder is train + validation.
test_rows = round(0.15*Data_for_model_fit.shape[0])
Test_set = Data_for_model_fit.iloc[:test_rows,:].reset_index(drop=True)
Train_and_Val = Data_for_model_fit.iloc[test_rows:,:].reset_index(drop=True)

# Same procedure on the remainder: sort on the validation identifier, drop the helper column.
Train_and_Val = Train_and_Val.sort_values(["Val set identifier"], ascending = True)
Train_and_Val = Train_and_Val.reset_index(drop=True)
Train_and_Val = Train_and_Val.drop(["Val set identifier"], axis=1).dropna()
Test_set = Test_set.drop(["Val set identifier"], axis=1).dropna()

# The first 12% of the sorted remainder is the validation set, the rest is the training set.
val_rows = round(0.12*Train_and_Val.shape[0])
Val_set = Train_and_Val.iloc[:val_rows,:].reset_index(drop=True)
Train_set = Train_and_Val.iloc[val_rows:,:].reset_index(drop=True)

# Target is the "Result" column; the model features are every column from position 7 onwards.
# Trainset
Train_Target = Train_set.loc[:, "Result"]
Train_Features = Train_set.iloc[:, 7:].values

# Valset
Val_Target = Val_set.loc[:, "Result"]
Val_Features = Val_set.iloc[:, 7:].values

# Testset
Test_Target = Test_set.loc[:, "Result"]
Test_Features = Test_set.iloc[:, 7:].values

# One-hot encode the targets. The encoder is fitted on the training labels only, and the number
# of classes is passed explicitly so that every split gets the same number of columns even if
# one of them happens not to contain the last class.
encoder = LabelEncoder()
encoder.fit(Train_Target)
n_classes = len(encoder.classes_)

encoded_Y = encoder.transform(Train_Target)
NN_train_target = np_utils.to_categorical(encoded_Y, num_classes=n_classes)

encoded_Val = encoder.transform(Val_Target)
NN_val_target = np_utils.to_categorical(encoded_Val, num_classes=n_classes)

encoded_Y2 = encoder.transform(Test_Target)
NN_test_target = np_utils.to_categorical(encoded_Y2, num_classes=n_classes)

# scaling (each split is scaled separately)
if NN_Scaler == 2:
    Train_Features = scaleData(Train_Features)
    Val_Features = scaleData(Val_Features)
    Test_Features = scaleData(Test_Features)
elif NN_Scaler == 1:
    Train_Features = scaleData2(Train_Features)
    Val_Features = scaleData2(Val_Features)
    Test_Features = scaleData2(Test_Features)
else:
    # Min-max scale every column to [-1, 1]. The column minimum and maximum are taken from the
    # untouched data; the previous element-by-element loop overwrote the array while re-reading
    # min()/max(), so every value after the first was scaled against partly rescaled data.
    # NOTE(review): NN_Modelling contains the same in-place loop; update it the same way so
    # tuning and the final fit see identically scaled features.
    Train_Features = MinMaxScaler(feature_range=(-1, 1)).fit_transform(Train_Features)
    Val_Features = MinMaxScaler(feature_range=(-1, 1)).fit_transform(Val_Features)
    Test_Features = MinMaxScaler(feature_range=(-1, 1)).fit_transform(Test_Features)


# Activation option: 0 = relu, 1 = leaky relu (added as separate layers below), else tanh.
if NN_Activation_Function == 0:
    activ_f = "relu"
elif NN_Activation_Function == 1:
    activ_f = "leakyRelu"
else:
    activ_f = "tanh"

# Layer stack mirrors the one built during tuning. For the leaky-relu option the first layer is
# a plain relu Dense layer and the two hidden Dense layers are replaced by LeakyReLU layers.
NN_model = Sequential()
if NN_Activation_Function == 1:
    NN_model.add(Dense(Train_Features.shape[1],input_shape=(Train_Features.shape[1], ), activation="relu",
                    use_bias = True))
else:
    NN_model.add(Dense(Train_Features.shape[1],input_shape=(Train_Features.shape[1], ), activation=activ_f,
                    use_bias = True))
NN_model.add(Dropout(NN_dropout_rate1/100))  # dropout rates are stored as percentages
if NN_Activation_Function != 1 :
    NN_model.add(Dense(NN_Nodes_dense2, activation=activ_f))
else:
    NN_model.add(LeakyReLU(alpha = 0.1))
NN_model.add(Dropout(NN_dropout_rate2/100))
if NN_Activation_Function !=1:
    NN_model.add(Dense(NN_Nodes_dense3, activation=activ_f))
else:
    NN_model.add(LeakyReLU(alpha = 0.1))
NN_model.add(Dense(3, activation='softmax'))

# Optimizer option: 0 = adam, 1 = adadelta, 2 = adagrad, else sgd.
if NN_Optimizer == 0:
    optim = "adam"
elif NN_Optimizer == 1:
    optim = "adadelta"
elif NN_Optimizer == 2:
    optim = "adagrad"
else:
    optim ="sgd"


NN_model.compile(loss='categorical_crossentropy', optimizer=optim, metrics=['accuracy'])
NN_model.fit(Train_Features, NN_train_target, epochs=NN_Epochs, batch_size=NN_Batch_size)

# evaluate() returns [loss, accuracy]; only the accuracy is kept.
score = NN_model.evaluate(Test_Features, NN_test_target)
score_val = NN_model.evaluate(Val_Features, NN_val_target)
score2 = NN_model.evaluate(Train_Features, NN_train_target)

NN_train_score = score2[1]
NN_val_score = score_val[1]
NN_test_score = score[1]

# Class probabilities per match, one column per class.
NN_train_prediction = NN_model.predict(Train_Features)
NN_val_prediction = NN_model.predict(Val_Features)
NN_test_prediction = NN_model.predict(Test_Features)


## Model Accuracy

In [None]:
# Accuracy of the three fitted models (Tree, SVM, NN); one printed line per split,
# in the order test, validation, train.
for split_scores in (
    (Tree_test_score, SVM_test_score, NN_test_score),
    (Tree_val_score, SVM_val_score, NN_val_score),
    (Tree_train_score, SVM_train_score, NN_train_score),
):
    print(*split_scores)

## Preparing Strategy evaluation

##### The models 'like' and 'dislike' certain teams, which shows up as differences in prediction accuracy from team to team. The validation set is used here to determine, for each model, the optimal number of teams to pick.

In [None]:
# The NN outputs three class probabilities per match; turn them into a single label.
# Columns 0 / 1 / 2 are read as "A" / "D" / "H" (assumes "Result" only holds these three
# labels, so that LabelEncoder's sorted class order matches - TODO confirm).
# The number of rows comes from the NN predictions themselves rather than from the tree's
# test target, which can have a different length when the models use different feature settings.
nn_test_probs = np.asarray(NN_test_prediction)[:, :3]
nn_test_best = nn_test_probs.argmax(axis=1)

# A label is only assigned when one class is strictly more likely than both others;
# ties (and NaN rows) get NaN.
nn_test_top = nn_test_probs.max(axis=1, keepdims=True)
nn_test_clear = (nn_test_probs == nn_test_top).sum(axis=1) == 1

nn_test_labels = np.array(["A", "D", "H"], dtype=object)[nn_test_best]
nn_test_labels[~nn_test_clear] = np.nan

NN_test_Result_prediction = pd.DataFrame({"NN Prediction": nn_test_labels})

In [None]:
# Turn the NN's three validation-set class probabilities per match into a single label.
# Columns 0 / 1 / 2 are read as "A" / "D" / "H" (assumes "Result" only holds these three
# labels, so that LabelEncoder's sorted class order matches - TODO confirm).
# Built in one vectorised step instead of enlarging the frame one row at a time.
nn_val_probs = np.asarray(NN_val_prediction)[:, :3]
nn_val_best = nn_val_probs.argmax(axis=1)

# A label is only assigned when one class is strictly more likely than both others;
# ties (and NaN rows) get NaN.
nn_val_top = nn_val_probs.max(axis=1, keepdims=True)
nn_val_clear = (nn_val_probs == nn_val_top).sum(axis=1) == 1

nn_val_labels = np.array(["A", "D", "H"], dtype=object)[nn_val_best]
nn_val_labels[~nn_val_clear] = np.nan

NN_val_Result_prediction = pd.DataFrame({"NN Prediction": nn_val_labels})

In [None]:
# Turn the NN's three training-set class probabilities per match into a single label.
# Columns 0 / 1 / 2 are read as "A" / "D" / "H" (assumes "Result" only holds these three
# labels, so that LabelEncoder's sorted class order matches - TODO confirm).
# Built in one vectorised step instead of enlarging the frame one row at a time.
nn_train_probs = np.asarray(NN_train_prediction)[:, :3]
nn_train_best = nn_train_probs.argmax(axis=1)

# A label is only assigned when one class is strictly more likely than both others;
# ties (and NaN rows) get NaN.
nn_train_top = nn_train_probs.max(axis=1, keepdims=True)
nn_train_clear = (nn_train_probs == nn_train_top).sum(axis=1) == 1

nn_train_labels = np.array(["A", "D", "H"], dtype=object)[nn_train_best]
nn_train_labels[~nn_train_clear] = np.nan

NN_train_Result_prediction = pd.DataFrame({"NN Prediction": nn_train_labels})

##### The cells below calculate the accuracy of a strategy that, for a given model and its predicted winners / losers, only keeps the picks that involve backing a team (or picking against a team) for which the model worked well in the training set. The number of teams is determined using the validation set.

In [None]:
# Part 1: one row per validation match with the actual result, every model's pick and the
# combined strategies (majority vote, unanimous vote, bookmaker favourite, always-home).
# NOTE(review): Val_set is whichever validation split the most recently run model cell built;
# the rows only line up across models if all three use the same feature settings.
n_val_matches = Tree_val_target.shape[0]

val_actual = np.asarray(Tree_val_target, dtype=object)
val_tree = np.asarray(Tree_val_prediction, dtype=object)[:n_val_matches]
val_svm = np.asarray(SVM_val_prediction, dtype=object)[:n_val_matches]
val_nn = np.asarray(NN_val_Result_prediction["NN Prediction"], dtype=object)[:n_val_matches]

val_meta = Val_set.iloc[:n_val_matches]
val_odds_h = val_meta["Odds H"].values.astype(float)
val_odds_d = val_meta["Odds D"].values.astype(float)
val_odds_a = val_meta["Odds A"].values.astype(float)

# Unanimous: filled only when all three models agree.
val_all_agree = (val_tree == val_svm) & (val_tree == val_nn)
val_unanimous = np.full(n_val_matches, np.nan, dtype=object)
val_unanimous[val_all_agree] = val_tree[val_all_agree]

# Democracy: the label at least two models agree on; NaN when all three differ.
val_democracy = np.full(n_val_matches, np.nan, dtype=object)
val_svm_nn_agree = val_svm == val_nn
val_democracy[val_svm_nn_agree] = val_svm[val_svm_nn_agree]
val_tree_in_majority = (val_tree == val_svm) | (val_tree == val_nn)
val_democracy[val_tree_in_majority] = val_tree[val_tree_in_majority]

# Fav Strategy: the outcome with the strictly lowest odds; NaN when the lowest odds are tied.
# The away check compares against BOTH other odds (it used to test "Odds D" twice, which let
# an away pick overwrite a correct home favourite).
val_favourite = np.full(n_val_matches, np.nan, dtype=object)
val_favourite[(val_odds_h < val_odds_a) & (val_odds_h < val_odds_d)] = "H"
val_favourite[(val_odds_a < val_odds_h) & (val_odds_a < val_odds_d)] = "A"
val_favourite[(val_odds_d < val_odds_h) & (val_odds_d < val_odds_a)] = "D"

Validation_evaluation = pd.DataFrame(
    {
        "Home Team": val_meta["HomeTeam"].values,
        "Away Team": val_meta["AwayTeam"].values,
        "Actual": val_actual,
        "Tree Prediction": val_tree,
        "SVM Prediction": val_svm,
        "NN Prediction": val_nn,
        "Democracy": val_democracy,
        "Unanimous": val_unanimous,
        "Fav Strategy": val_favourite,
        "Odds H": val_odds_h,
        "Odds D": val_odds_d,
        "Odds A": val_odds_a,
    },
    columns=["Home Team","Away Team", "Actual", "Tree Prediction", "SVM Prediction","NN Prediction","Democracy", "Unanimous","Fav Strategy","Odds H","Odds D","Odds A"],
)
# Baseline strategy: always pick the home team.
Validation_evaluation["Pick Home"] = "H"

In [None]:
# Part 2: one row per training match with the actual result, every model's pick and the
# combined strategies (majority vote, unanimous vote, bookmaker favourite, always-home).
# NOTE(review): Train_set is whichever training split the most recently run model cell built;
# the rows only line up across models if all three use the same feature settings.
n_train_matches = Tree_train_target.shape[0]

train_actual = np.asarray(Tree_train_target, dtype=object)
train_tree = np.asarray(Tree_train_prediction, dtype=object)[:n_train_matches]
train_svm = np.asarray(SVM_train_prediction, dtype=object)[:n_train_matches]
train_nn = np.asarray(NN_train_Result_prediction["NN Prediction"], dtype=object)[:n_train_matches]

train_meta = Train_set.iloc[:n_train_matches]
train_odds_h = train_meta["Odds H"].values.astype(float)
train_odds_d = train_meta["Odds D"].values.astype(float)
train_odds_a = train_meta["Odds A"].values.astype(float)

# Unanimous: filled only when all three models agree.
train_all_agree = (train_tree == train_svm) & (train_tree == train_nn)
train_unanimous = np.full(n_train_matches, np.nan, dtype=object)
train_unanimous[train_all_agree] = train_tree[train_all_agree]

# Democracy: the label at least two models agree on; NaN when all three differ.
train_democracy = np.full(n_train_matches, np.nan, dtype=object)
train_svm_nn_agree = train_svm == train_nn
train_democracy[train_svm_nn_agree] = train_svm[train_svm_nn_agree]
train_tree_in_majority = (train_tree == train_svm) | (train_tree == train_nn)
train_democracy[train_tree_in_majority] = train_tree[train_tree_in_majority]

# Fav Strategy: the outcome with the strictly lowest odds; NaN when the lowest odds are tied.
# The away check compares against BOTH other odds (it used to test "Odds D" twice, which let
# an away pick overwrite a correct home favourite).
train_favourite = np.full(n_train_matches, np.nan, dtype=object)
train_favourite[(train_odds_h < train_odds_a) & (train_odds_h < train_odds_d)] = "H"
train_favourite[(train_odds_a < train_odds_h) & (train_odds_a < train_odds_d)] = "A"
train_favourite[(train_odds_d < train_odds_h) & (train_odds_d < train_odds_a)] = "D"

Accuracy_Ranker_by_Team_prep = pd.DataFrame(
    {
        "Home Team": train_meta["HomeTeam"].values,
        "Away Team": train_meta["AwayTeam"].values,
        "Actual": train_actual,
        "Tree Prediction": train_tree,
        "SVM Prediction": train_svm,
        "NN Prediction": train_nn,
        "Democracy": train_democracy,
        "Unanimous": train_unanimous,
        "Fav Strategy": train_favourite,
    },
    columns=["Home Team","Away Team", "Actual", "Tree Prediction", "SVM Prediction","NN Prediction","Democracy", "Unanimous","Fav Strategy"],
)
# Baseline strategy: always pick the home team.
Accuracy_Ranker_by_Team_prep["Pick Home"] = "H"

In [None]:
# Hit / miss table for the training matches: 1 where a strategy's pick equals the actual result,
# 0 where it does not, NaN where the strategy made no pick for that match.
# Computed column-wise in one step instead of filling the frame cell by cell.
strategy_columns = ["Tree Prediction","SVM Prediction","NN Prediction","Democracy","Unanimous","Fav Strategy","Pick Home"]

strategy_picks = Accuracy_Ranker_by_Team_prep[strategy_columns]

# "No pick" is either a real NaN or the literal string "nan".
has_pick = strategy_picks.notna() & (strategy_picks != "nan")
is_hit = strategy_picks.eq(Accuracy_Ranker_by_Team_prep["Actual"], axis=0)

Accuracy = is_hit.astype(float).where(has_pick)

In [None]:
# Give the hit/miss columns their own names so they can sit next to the picks they score.
accuracy_names = {
    "Tree Prediction": "Tree Acc",
    "SVM Prediction": "SVM Acc",
    "NN Prediction": "NN Acc",
    "Democracy": "Democracy Acc",
    "Unanimous": "Unanimous Acc",
    "Fav Strategy": "Fav Acc",
    "Pick Home": "HP Acc",
}
Accuracy = Accuracy.rename(columns=accuracy_names)

# Teams and picks on the left, the matching hit/miss flags on the right.
pick_part = Accuracy_Ranker_by_Team_prep[["Home Team","Away Team","Tree Prediction","SVM Prediction","NN Prediction","Democracy","Unanimous","Fav Strategy","Pick Home"]]
accuracy_part = Accuracy[["Tree Acc","SVM Acc","NN Acc", "Democracy Acc","Unanimous Acc","Fav Acc","HP Acc"]]
Data = pd.concat([pick_part, accuracy_part], axis=1)

# Every team that appears at least once as home or away side.
home_sides = list(Data["Home Team"])
away_sides = list(Data["Away Team"])
Teams_list = list(set(home_sides + away_sides))
Data.head()

##### Which teams were accurately predicted to win / not to win:

In [None]:
# Empty per-team summary table: a "Team" column followed by four counters for each strategy
# (picks for the team, correct picks for, picks against the team, correct picks against).
bias_strategies = ["Tree", "SVM", "NN", "Democracy", "Unanimous", "Favorite", "HP"]
bias_counters = ["for", "for correct", "against", "against correct"]

bias_columns = ["Team"]
for bias_strategy in bias_strategies:
    bias_columns.extend("{} {}".format(bias_strategy, bias_counter) for bias_counter in bias_counters)

Data_Team_Bias = pd.DataFrame(columns=bias_columns)

i = 0
for Team in Teams_list:
    df1 = Data.loc[Data["Home Team"] == Team]
    df2 = Data.loc[Data["Away Team"] == Team]
    
    for model in ["Tree","SVM","NN","Democracy","Unanimous","Favorite","HP"]:
        if model == "Tree": 
            T_H_for = df1.loc[df1["Tree Prediction"]=="H"].shape[0]
            T_H_against = df1.loc[df1["Tree Prediction"]!="H"].shape[0]
            
            T_H_for_correct = df1.loc[df1["Tree Prediction"]=="H"]["Tree Acc"].sum()
            T_H_against_correct = df1.loc[df1["Tree Prediction"]!="H"]["Tree Acc"].sum()
            
            ###
            T_A_for = df2.loc[df2["Tree Prediction"]=="A"].shape[0]
            T_A_against = df2.loc[df2["Tree Prediction"]!="A"].shape[0]
            
            T_A_for_correct = df2.loc[df2["Tree Prediction"]=="A"]["Tree Acc"].sum()
            T_A_against_correct = df2.loc[df2["Tree Prediction"]!="A"]["Tree Acc"].sum()
            
            T_for = T_H_for + T_A_for
            T_for_corr = T_H_for_correct + T_A_for_correct
            T_against = T_H_against + T_A_against
            T_against_corr = T_H_against_correct + T_A_against_correct
            
        elif model == "SVM": 
            S_H_for = df1.loc[df1["SVM Prediction"]=="H"].shape[0]
            S_H_against = df1.loc[df1["SVM Prediction"]!="H"].shape[0]
            
            S_H_for_correct = df1.loc[df1["SVM Prediction"]=="H"]["Tree Acc"].sum()
            S_H_against_correct = df1.loc[df1["SVM Prediction"]!="H"]["Tree Acc"].sum()
            
            ###
            S_A_for = df2.loc[df2["SVM Prediction"]=="A"].shape[0]
            S_A_against = df2.loc[df2["SVM Prediction"]!="A"].shape[0]
            
            S_A_for_correct = df2.loc[df2["SVM Prediction"]=="A"]["Tree Acc"].sum()
            S_A_against_correct = df2.loc[df2["SVM Prediction"]!="A"]["Tree Acc"].sum()
            
            S_for = S_H_for + S_A_for
            S_for_corr = S_H_for_correct + S_A_for_correct
            S_against = S_H_against + S_A_against
            S_against_corr = S_H_against_correct + S_A_against_correct
            
        elif model == "NN": 
            N_H_for = df1.loc[df1["NN Prediction"]=="H"].shape[0]
            N_H_against = df1.loc[df1["NN Prediction"]!="H"].shape[0]
            
            N_H_for_correct = df1.loc[df1["NN Prediction"]=="H"]["Tree Acc"].sum()
            N_H_against_correct = df1.loc[df1["NN Prediction"]!="H"]["Tree Acc"].sum()
            
            ###
            N_A_for = df2.loc[df2["NN Prediction"]=="A"].shape[0]
            N_A_against = df2.loc[df2["NN Prediction"]!="A"].shape[0]
            
            N_A_for_correct = df2.loc[df2["NN Prediction"]=="A"]["Tree Acc"].sum()
            N_A_against_correct = df2.loc[df2["NN Prediction"]!="A"]["Tree Acc"].sum()
            
            N_for = N_H_for + N_A_for
            N_for_corr = N_H_for_correct + N_A_for_correct
            N_against = N_H_against + N_A_against
            N_against_corr = N_H_against_correct + N_A_against_correct  
        
        elif model == "Democracy": 
            Democracy_H_for = df1.loc[df1["Democracy"]=="H"].shape[0]
            Democracy_H_against = df1.loc[df1["Democracy"]!="H"].shape[0]
            
            Democracy_H_for_correct = df1.loc[df1["Democracy"]=="H"]["Democracy Acc"].sum()
            Democracy_H_against_correct = df1.loc[df1["Democracy"]!="H"]["Democracy Acc"].sum()
            
            ###
            Democracy_A_for = df2.loc[df2["Democracy"]=="A"].shape[0]
            Democracy_A_against = df2.loc[df2["Democracy"]!="A"].shape[0]
            
            Democracy_A_for_correct = df2.loc[df2["Democracy"]=="A"]["Democracy Acc"].sum()
            Democracy_A_against_correct = df2.loc[df2["Democracy"]!="A"]["Democracy Acc"].sum()
            
            Democracy_for = Democracy_H_for + Democracy_A_for
            Democracy_for_corr = Democracy_H_for_correct + Democracy_A_for_correct
            Democracy_against = Democracy_H_against + Democracy_A_against
            Democracy_against_corr = Democracy_H_against_correct + Democracy_A_against_correct
        
        elif model == "Unanimous": 
            Unanimous_H_for = df1.loc[df1["Unanimous"]=="H"].shape[0]
            Unanimous_H_against = df1.loc[df1["Unanimous"]!="H"].shape[0]
            
            Unanimous_H_for_correct = df1.loc[df1["Unanimous"]=="H"]["Unanimous Acc"].sum()
            Unanimous_H_against_correct = df1.loc[df1["Unanimous"]!="H"]["Unanimous Acc"].sum()
            
            ###
            Unanimous_A_for = df2.loc[df2["Unanimous"]=="A"].shape[0]
            Unanimous_A_against = df2.loc[df2["Unanimous"]!="A"].shape[0]
            
            Unanimous_A_for_correct = df2.loc[df2["Unanimous"]=="A"]["Unanimous Acc"].sum()
            Unanimous_A_against_correct = df2.loc[df2["Unanimous"]!="A"]["Unanimous Acc"].sum()
            
            Unanimous_for = Unanimous_H_for + Unanimous_A_for
            Unanimous_for_corr = Unanimous_H_for_correct + Unanimous_A_for_correct
            Unanimous_against = Unanimous_H_against + Unanimous_A_against
            Unanimous_against_corr = Unanimous_H_against_correct + Unanimous_A_against_correct
        
        elif model == "Favorite": 
            Favorite_H_for = df1.loc[df1["Fav Strategy"]=="H"].shape[0]
            Favorite_H_against = df1.loc[df1["Fav Strategy"]!="H"].shape[0]
            
            Favorite_H_for_correct = df1.loc[df1["Fav Strategy"]=="H"]["Fav Acc"].sum()
            Favorite_H_against_correct = df1.loc[df1["Fav Strategy"]!="H"]["Fav Acc"].sum()
            
            ###
            Favorite_A_for = df2.loc[df2["Fav Strategy"]=="A"].shape[0]
            Favorite_A_against = df2.loc[df2["Fav Strategy"]!="A"].shape[0]
            
            Favorite_A_for_correct = df2.loc[df2["Fav Strategy"]=="A"]["Fav Acc"].sum()
            Favorite_A_against_correct = df2.loc[df2["Fav Strategy"]!="A"]["Fav Acc"].sum()
            
            Favorite_for = Favorite_H_for + Favorite_A_for
            Favorite_for_corr = Favorite_H_for_correct + Favorite_A_for_correct
            Favorite_against = Favorite_H_against + Favorite_A_against
            Favorite_against_corr = Favorite_H_against_correct + Favorite_A_against_correct
        
        elif model == "HP": 
            HP_H_for = df1.loc[df1["Pick Home"]=="H"].shape[0]
            HP_H_against = df1.loc[df1["Pick Home"]!="H"].shape[0]
            
            HP_H_for_correct = df1.loc[df1["Pick Home"]=="H"]["HP Acc"].sum()
            HP_H_against_correct = df1.loc[df1["Pick Home"]!="H"]["HP Acc"].sum()
            
            ###
            HP_A_for = df2.loc[df2["Pick Home"]=="A"].shape[0]
            HP_A_against = df2.loc[df2["Pick Home"]!="A"].shape[0]
            
            HP_A_for_correct = df2.loc[df2["Pick Home"]=="A"]["HP Acc"].sum()
            HP_A_against_correct = df2.loc[df2["Pick Home"]!="A"]["HP Acc"].sum()
            
            HP_for = HP_H_for + HP_A_for
            HP_for_corr = HP_H_for_correct + HP_A_for_correct
            HP_against = HP_H_against + HP_A_against
            HP_against_corr = HP_H_against_correct + HP_A_against_correct
        
    data = [Team, T_for, T_for_corr, T_against, T_against_corr, S_for, S_for_corr, S_against, S_against_corr,
            N_for, N_for_corr, N_against, N_against_corr,
           Democracy_for,Democracy_for_corr,Democracy_against,Democracy_against_corr,
            Unanimous_for, Unanimous_for_corr, Unanimous_against, Unanimous_against_corr,
           Favorite_for, Favorite_for_corr, Favorite_against, Favorite_against_corr,
           HP_for, HP_for_corr, HP_against, HP_against_corr]
    
    Data_Team_Bias.loc[i] = data
    i += 1
    
Data_Team_Bias = Data_Team_Bias.reset_index(drop=True)

In [None]:
# Only teams that actually play (home or away) in the validation set are
# considered when tuning the team-count hyperparameters.
val_home_teams = Val_set["HomeTeam"].tolist()
val_away_teams = Val_set["AwayTeam"].tolist()
Val_set_Teams = list(set(val_home_teams + val_away_teams))

In [None]:
# Tree
def Tree_val_eval_for(Data_Team_Bias, Teams_list):
    """Rank teams by the hit rate of the Tree picks made *for* them.

    Args:
        Data_Team_Bias: DataFrame with the columns "Team", "Tree for"
            (number of picks) and "Tree for correct" (number of correct picks).
        Teams_list: teams to keep; rows of all other teams are dropped.

    Returns:
        DataFrame with "Team", "Tree for", "Tree for correct" and "%"
        (correct / picks * 100, rounded to 2 decimals, NaN when there were
        no picks), sorted by "%" descending (NaN last) with a fresh index.
    """
    count_col, correct_col = "Tree for", "Tree for correct"
    keep = Data_Team_Bias["Team"].isin(Teams_list)
    # .copy() so that adding "%" never writes into the caller's frame.
    ranking = Data_Team_Bias.loc[keep, ["Team", count_col, correct_col]].copy()
    # The count columns may be object dtype, so coerce before dividing.
    picks = pd.to_numeric(ranking[count_col], errors="coerce")
    correct = pd.to_numeric(ranking[correct_col], errors="coerce")
    # Column-wise on purpose: after filtering, the row labels are no longer
    # 0..n-1, so a loop over .loc[i] for i in range(n) would miss teams.
    ranking["%"] = (correct / picks.where(picks != 0) * 100).round(2)
    # Stable sort: tied teams keep their input order, so the ranking is reproducible.
    return ranking.sort_values("%", ascending=False, kind="mergesort").reset_index(drop=True)

def Tree_val_eval_against(Data_Team_Bias,Teams_list):
    """Rank teams by the hit rate of the Tree picks made *against* them.

    Args:
        Data_Team_Bias: DataFrame with the columns "Team", "Tree against"
            (number of picks) and "Tree against correct" (number of correct picks).
        Teams_list: teams to keep; rows of all other teams are dropped.

    Returns:
        DataFrame with "Team", "Tree against", "Tree against correct" and "%"
        (correct / picks * 100, rounded to 2 decimals, NaN when there were
        no picks), sorted by "%" descending (NaN last) with a fresh index.
    """
    count_col, correct_col = "Tree against", "Tree against correct"
    keep = Data_Team_Bias["Team"].isin(Teams_list)
    # .copy() so that adding "%" never writes into the caller's frame.
    ranking = Data_Team_Bias.loc[keep, ["Team", count_col, correct_col]].copy()
    # The count columns may be object dtype, so coerce before dividing.
    picks = pd.to_numeric(ranking[count_col], errors="coerce")
    correct = pd.to_numeric(ranking[correct_col], errors="coerce")
    # Column-wise on purpose: after filtering, the row labels are no longer
    # 0..n-1, so a loop over .loc[i] for i in range(n) would miss teams.
    ranking["%"] = (correct / picks.where(picks != 0) * 100).round(2)
    # Stable sort: tied teams keep their input order, so the ranking is reproducible.
    return ranking.sort_values("%", ascending=False, kind="mergesort").reset_index(drop=True)

In [None]:
# SVM
def SVM_val_eval_for(Data_Team_Bias,Teams_list):
    """Rank teams by the hit rate of the SVM picks made *for* them.

    Args:
        Data_Team_Bias: DataFrame with the columns "Team", "SVM for"
            (number of picks) and "SVM for correct" (number of correct picks).
        Teams_list: teams to keep; rows of all other teams are dropped.

    Returns:
        DataFrame with "Team", "SVM for", "SVM for correct" and "%"
        (correct / picks * 100, rounded to 2 decimals, NaN when there were
        no picks), sorted by "%" descending (NaN last) with a fresh index.
    """
    count_col, correct_col = "SVM for", "SVM for correct"
    keep = Data_Team_Bias["Team"].isin(Teams_list)
    # .copy() so that adding "%" never writes into the caller's frame.
    ranking = Data_Team_Bias.loc[keep, ["Team", count_col, correct_col]].copy()
    # The count columns may be object dtype, so coerce before dividing.
    picks = pd.to_numeric(ranking[count_col], errors="coerce")
    correct = pd.to_numeric(ranking[correct_col], errors="coerce")
    # Column-wise on purpose: after filtering, the row labels are no longer
    # 0..n-1, so a loop over .loc[i] for i in range(n) would miss teams.
    ranking["%"] = (correct / picks.where(picks != 0) * 100).round(2)
    # Stable sort: tied teams keep their input order, so the ranking is reproducible.
    return ranking.sort_values("%", ascending=False, kind="mergesort").reset_index(drop=True)

def SVM_val_eval_against(Data_Team_Bias,Teams_list):
    """Rank teams by the hit rate of the SVM picks made *against* them.

    Args:
        Data_Team_Bias: DataFrame with the columns "Team", "SVM against"
            (number of picks) and "SVM against correct" (number of correct picks).
        Teams_list: teams to keep; rows of all other teams are dropped.

    Returns:
        DataFrame with "Team", "SVM against", "SVM against correct" and "%"
        (correct / picks * 100, rounded to 2 decimals, NaN when there were
        no picks), sorted by "%" descending (NaN last) with a fresh index.
    """
    count_col, correct_col = "SVM against", "SVM against correct"
    keep = Data_Team_Bias["Team"].isin(Teams_list)
    # .copy() so that adding "%" never writes into the caller's frame.
    ranking = Data_Team_Bias.loc[keep, ["Team", count_col, correct_col]].copy()
    # The count columns may be object dtype, so coerce before dividing.
    picks = pd.to_numeric(ranking[count_col], errors="coerce")
    correct = pd.to_numeric(ranking[correct_col], errors="coerce")
    # Column-wise on purpose: after filtering, the row labels are no longer
    # 0..n-1, so a loop over .loc[i] for i in range(n) would miss teams.
    ranking["%"] = (correct / picks.where(picks != 0) * 100).round(2)
    # Stable sort: tied teams keep their input order, so the ranking is reproducible.
    return ranking.sort_values("%", ascending=False, kind="mergesort").reset_index(drop=True)

In [None]:
# NN
def NN_val_eval_for(Data_Team_Bias,Teams_list):
    """Rank teams by the hit rate of the NN picks made *for* them.

    Args:
        Data_Team_Bias: DataFrame with the columns "Team", "NN for"
            (number of picks) and "NN for correct" (number of correct picks).
        Teams_list: teams to keep; rows of all other teams are dropped.

    Returns:
        DataFrame with "Team", "NN for", "NN for correct" and "%"
        (correct / picks * 100, rounded to 2 decimals, NaN when there were
        no picks), sorted by "%" descending (NaN last) with a fresh index.
    """
    count_col, correct_col = "NN for", "NN for correct"
    keep = Data_Team_Bias["Team"].isin(Teams_list)
    # .copy() so that adding "%" never writes into the caller's frame.
    ranking = Data_Team_Bias.loc[keep, ["Team", count_col, correct_col]].copy()
    # The count columns may be object dtype, so coerce before dividing.
    picks = pd.to_numeric(ranking[count_col], errors="coerce")
    correct = pd.to_numeric(ranking[correct_col], errors="coerce")
    # Column-wise on purpose: after filtering, the row labels are no longer
    # 0..n-1, so a loop over .loc[i] for i in range(n) would miss teams.
    ranking["%"] = (correct / picks.where(picks != 0) * 100).round(2)
    # Stable sort: tied teams keep their input order, so the ranking is reproducible.
    return ranking.sort_values("%", ascending=False, kind="mergesort").reset_index(drop=True)

def NN_val_eval_against(Data_Team_Bias,Teams_list):
    """Rank teams by the hit rate of the NN picks made *against* them.

    Args:
        Data_Team_Bias: DataFrame with the columns "Team", "NN against"
            (number of picks) and "NN against correct" (number of correct picks).
        Teams_list: teams to keep; rows of all other teams are dropped.

    Returns:
        DataFrame with "Team", "NN against", "NN against correct" and "%"
        (correct / picks * 100, rounded to 2 decimals, NaN when there were
        no picks), sorted by "%" descending (NaN last) with a fresh index.
    """
    count_col, correct_col = "NN against", "NN against correct"
    keep = Data_Team_Bias["Team"].isin(Teams_list)
    # .copy() so that adding "%" never writes into the caller's frame.
    ranking = Data_Team_Bias.loc[keep, ["Team", count_col, correct_col]].copy()
    # The count columns may be object dtype, so coerce before dividing.
    picks = pd.to_numeric(ranking[count_col], errors="coerce")
    correct = pd.to_numeric(ranking[correct_col], errors="coerce")
    # Column-wise on purpose: after filtering, the row labels are no longer
    # 0..n-1, so a loop over .loc[i] for i in range(n) would miss teams.
    ranking["%"] = (correct / picks.where(picks != 0) * 100).round(2)
    # Stable sort: tied teams keep their input order, so the ranking is reproducible.
    return ranking.sort_values("%", ascending=False, kind="mergesort").reset_index(drop=True)

In [None]:
# Democracy
def Democracy_val_eval_for(Data_Team_Bias,Teams_list):
    """Rank teams by the hit rate of the "Democracy" picks made *for* them.

    Args:
        Data_Team_Bias: DataFrame with the columns "Team", "Democracy for"
            (number of picks) and "Democracy for correct" (number of correct picks).
        Teams_list: teams to keep; rows of all other teams are dropped.

    Returns:
        DataFrame with "Team", "Democracy for", "Democracy for correct" and "%"
        (correct / picks * 100, rounded to 2 decimals, NaN when there were
        no picks), sorted by "%" descending (NaN last) with a fresh index.
    """
    count_col, correct_col = "Democracy for", "Democracy for correct"
    keep = Data_Team_Bias["Team"].isin(Teams_list)
    # .copy() so that adding "%" never writes into the caller's frame.
    ranking = Data_Team_Bias.loc[keep, ["Team", count_col, correct_col]].copy()
    # The count columns may be object dtype, so coerce before dividing.
    picks = pd.to_numeric(ranking[count_col], errors="coerce")
    correct = pd.to_numeric(ranking[correct_col], errors="coerce")
    # Column-wise on purpose: after filtering, the row labels are no longer
    # 0..n-1, so a loop over .loc[i] for i in range(n) would miss teams.
    ranking["%"] = (correct / picks.where(picks != 0) * 100).round(2)
    # Stable sort: tied teams keep their input order, so the ranking is reproducible.
    return ranking.sort_values("%", ascending=False, kind="mergesort").reset_index(drop=True)

def Democracy_val_eval_against(Data_Team_Bias,Teams_list):
    """Rank teams by the hit rate of the "Democracy" picks made *against* them.

    Args:
        Data_Team_Bias: DataFrame with the columns "Team", "Democracy against"
            (number of picks) and "Democracy against correct" (number of correct picks).
        Teams_list: teams to keep; rows of all other teams are dropped.

    Returns:
        DataFrame with "Team", "Democracy against", "Democracy against correct"
        and "%" (correct / picks * 100, rounded to 2 decimals, NaN when there
        were no picks), sorted by "%" descending (NaN last) with a fresh index.
    """
    count_col, correct_col = "Democracy against", "Democracy against correct"
    keep = Data_Team_Bias["Team"].isin(Teams_list)
    # .copy() so that adding "%" never writes into the caller's frame.
    ranking = Data_Team_Bias.loc[keep, ["Team", count_col, correct_col]].copy()
    # The count columns may be object dtype, so coerce before dividing.
    picks = pd.to_numeric(ranking[count_col], errors="coerce")
    correct = pd.to_numeric(ranking[correct_col], errors="coerce")
    # Column-wise on purpose: after filtering, the row labels are no longer
    # 0..n-1, so a loop over .loc[i] for i in range(n) would miss teams.
    ranking["%"] = (correct / picks.where(picks != 0) * 100).round(2)
    # Stable sort: tied teams keep their input order, so the ranking is reproducible.
    return ranking.sort_values("%", ascending=False, kind="mergesort").reset_index(drop=True)

In [None]:
# Unanimous
def Unanimous_val_eval_for(Data_Team_Bias,Teams_list):
    """Rank teams by the hit rate of the "Unanimous" picks made *for* them.

    Args:
        Data_Team_Bias: DataFrame with the columns "Team", "Unanimous for"
            (number of picks) and "Unanimous for correct" (number of correct picks).
        Teams_list: teams to keep; rows of all other teams are dropped.

    Returns:
        DataFrame with "Team", "Unanimous for", "Unanimous for correct" and "%"
        (correct / picks * 100, rounded to 2 decimals, NaN when there were
        no picks), sorted by "%" descending (NaN last) with a fresh index.
    """
    count_col, correct_col = "Unanimous for", "Unanimous for correct"
    keep = Data_Team_Bias["Team"].isin(Teams_list)
    # .copy() so that adding "%" never writes into the caller's frame.
    ranking = Data_Team_Bias.loc[keep, ["Team", count_col, correct_col]].copy()
    # The count columns may be object dtype, so coerce before dividing.
    picks = pd.to_numeric(ranking[count_col], errors="coerce")
    correct = pd.to_numeric(ranking[correct_col], errors="coerce")
    # Column-wise on purpose: after filtering, the row labels are no longer
    # 0..n-1, so a loop over .loc[i] for i in range(n) would miss teams.
    ranking["%"] = (correct / picks.where(picks != 0) * 100).round(2)
    # Stable sort: tied teams keep their input order, so the ranking is reproducible.
    return ranking.sort_values("%", ascending=False, kind="mergesort").reset_index(drop=True)

def Unanimous_val_eval_against(Data_Team_Bias,Teams_list):
    """Rank teams by the hit rate of the "Unanimous" picks made *against* them.

    Args:
        Data_Team_Bias: DataFrame with the columns "Team", "Unanimous against"
            (number of picks) and "Unanimous against correct" (number of correct picks).
        Teams_list: teams to keep; rows of all other teams are dropped.

    Returns:
        DataFrame with "Team", "Unanimous against", "Unanimous against correct"
        and "%" (correct / picks * 100, rounded to 2 decimals, NaN when there
        were no picks), sorted by "%" descending (NaN last) with a fresh index.
    """
    count_col, correct_col = "Unanimous against", "Unanimous against correct"
    keep = Data_Team_Bias["Team"].isin(Teams_list)
    # .copy() so that adding "%" never writes into the caller's frame.
    ranking = Data_Team_Bias.loc[keep, ["Team", count_col, correct_col]].copy()
    # The count columns may be object dtype, so coerce before dividing.
    picks = pd.to_numeric(ranking[count_col], errors="coerce")
    correct = pd.to_numeric(ranking[correct_col], errors="coerce")
    # Column-wise on purpose: after filtering, the row labels are no longer
    # 0..n-1, so a loop over .loc[i] for i in range(n) would miss teams.
    ranking["%"] = (correct / picks.where(picks != 0) * 100).round(2)
    # Stable sort: tied teams keep their input order, so the ranking is reproducible.
    return ranking.sort_values("%", ascending=False, kind="mergesort").reset_index(drop=True)

In [None]:
# Favorite
def Favorite_val_eval_for(Data_Team_Bias,Teams_list):
    """Rank teams by the hit rate of the "Favorite" picks made *for* them.

    Args:
        Data_Team_Bias: DataFrame with the columns "Team", "Favorite for"
            (number of picks) and "Favorite for correct" (number of correct picks).
        Teams_list: teams to keep; rows of all other teams are dropped.

    Returns:
        DataFrame with "Team", "Favorite for", "Favorite for correct" and "%"
        (correct / picks * 100, rounded to 2 decimals, NaN when there were
        no picks), sorted by "%" descending (NaN last) with a fresh index.
    """
    count_col, correct_col = "Favorite for", "Favorite for correct"
    keep = Data_Team_Bias["Team"].isin(Teams_list)
    # .copy() so that adding "%" never writes into the caller's frame.
    ranking = Data_Team_Bias.loc[keep, ["Team", count_col, correct_col]].copy()
    # The count columns may be object dtype, so coerce before dividing.
    picks = pd.to_numeric(ranking[count_col], errors="coerce")
    correct = pd.to_numeric(ranking[correct_col], errors="coerce")
    # Column-wise on purpose: after filtering, the row labels are no longer
    # 0..n-1, so a loop over .loc[i] for i in range(n) would miss teams.
    ranking["%"] = (correct / picks.where(picks != 0) * 100).round(2)
    # Stable sort: tied teams keep their input order, so the ranking is reproducible.
    return ranking.sort_values("%", ascending=False, kind="mergesort").reset_index(drop=True)

def Favorite_val_eval_against(Data_Team_Bias,Teams_list):
    """Rank teams by the hit rate of the "Favorite" picks made *against* them.

    Args:
        Data_Team_Bias: DataFrame with the columns "Team", "Favorite against"
            (number of picks) and "Favorite against correct" (number of correct picks).
        Teams_list: teams to keep; rows of all other teams are dropped.

    Returns:
        DataFrame with "Team", "Favorite against", "Favorite against correct"
        and "%" (correct / picks * 100, rounded to 2 decimals, NaN when there
        were no picks), sorted by "%" descending (NaN last) with a fresh index.
    """
    count_col, correct_col = "Favorite against", "Favorite against correct"
    keep = Data_Team_Bias["Team"].isin(Teams_list)
    # .copy() so that adding "%" never writes into the caller's frame.
    ranking = Data_Team_Bias.loc[keep, ["Team", count_col, correct_col]].copy()
    # The count columns may be object dtype, so coerce before dividing.
    picks = pd.to_numeric(ranking[count_col], errors="coerce")
    correct = pd.to_numeric(ranking[correct_col], errors="coerce")
    # Column-wise on purpose: after filtering, the row labels are no longer
    # 0..n-1, so a loop over .loc[i] for i in range(n) would miss teams.
    ranking["%"] = (correct / picks.where(picks != 0) * 100).round(2)
    # Stable sort: tied teams keep their input order, so the ranking is reproducible.
    return ranking.sort_values("%", ascending=False, kind="mergesort").reset_index(drop=True)

In [None]:
# HP
def HP_val_eval_for(Data_Team_Bias,Teams_list):
    """Rank teams by the hit rate of the "HP" picks made *for* them.

    Args:
        Data_Team_Bias: DataFrame with the columns "Team", "HP for"
            (number of picks) and "HP for correct" (number of correct picks).
        Teams_list: teams to keep; rows of all other teams are dropped.

    Returns:
        DataFrame with "Team", "HP for", "HP for correct" and "%"
        (correct / picks * 100, rounded to 2 decimals, NaN when there were
        no picks), sorted by "%" descending (NaN last) with a fresh index.
    """
    count_col, correct_col = "HP for", "HP for correct"
    keep = Data_Team_Bias["Team"].isin(Teams_list)
    # .copy() so that adding "%" never writes into the caller's frame.
    ranking = Data_Team_Bias.loc[keep, ["Team", count_col, correct_col]].copy()
    # The count columns may be object dtype, so coerce before dividing.
    picks = pd.to_numeric(ranking[count_col], errors="coerce")
    correct = pd.to_numeric(ranking[correct_col], errors="coerce")
    # Column-wise on purpose: after filtering, the row labels are no longer
    # 0..n-1, so a loop over .loc[i] for i in range(n) would miss teams.
    ranking["%"] = (correct / picks.where(picks != 0) * 100).round(2)
    # Stable sort: tied teams keep their input order, so the ranking is reproducible.
    return ranking.sort_values("%", ascending=False, kind="mergesort").reset_index(drop=True)

def HP_val_eval_against(Data_Team_Bias,Teams_list):
    """Rank teams by the hit rate of the "HP" picks made *against* them.

    Args:
        Data_Team_Bias: DataFrame with the columns "Team", "HP against"
            (number of picks) and "HP against correct" (number of correct picks).
        Teams_list: teams to keep; rows of all other teams are dropped.

    Returns:
        DataFrame with "Team", "HP against", "HP against correct" and "%"
        (correct / picks * 100, rounded to 2 decimals, NaN when there were
        no picks), sorted by "%" descending (NaN last) with a fresh index.
    """
    count_col, correct_col = "HP against", "HP against correct"
    keep = Data_Team_Bias["Team"].isin(Teams_list)
    # .copy() so that adding "%" never writes into the caller's frame.
    ranking = Data_Team_Bias.loc[keep, ["Team", count_col, correct_col]].copy()
    # The count columns may be object dtype, so coerce before dividing.
    picks = pd.to_numeric(ranking[count_col], errors="coerce")
    correct = pd.to_numeric(ranking[correct_col], errors="coerce")
    # Column-wise on purpose: after filtering, the row labels are no longer
    # 0..n-1, so a loop over .loc[i] for i in range(n) would miss teams.
    ranking["%"] = (correct / picks.where(picks != 0) * 100).round(2)
    # Stable sort: tied teams keep their input order, so the ranking is reproducible.
    return ranking.sort_values("%", ascending=False, kind="mergesort").reset_index(drop=True)

## Optimal number of Teams included for picking strategy

In [None]:
def number_of_teams_optimizer(Data_Team_Bias, Validation_evaluation,Teams_list, Model, a, b):
    """Score the "only bet on well-predicted teams" strategy for one model.

    The `a` best-ranked teams of the model's "for" ranking and the `b`
    best-ranked teams of its "against" ranking are selected. A game is picked
    when the model predicts a win of a "for" team ("H" at home, "A" away) or
    a loss of an "against" team ("A" when it plays at home, "H" when away).

    Args:
        Data_Team_Bias: per-team bias table passed on to the ranking functions.
        Validation_evaluation: DataFrame with one row per game and the columns
            "Home Team", "Away Team", "Actual", "Odds H", "Odds D", "Odds A"
            plus the prediction column of `Model`.
        Teams_list: teams that may be selected (passed on to the ranking functions).
        Model: one of "Tree", "SVM", "NN", "Democracy", "Favorite",
            "Unanimous", "HP".
        a: number of top teams taken from the "for" ranking.
        b: number of top teams taken from the "against" ranking.

    Returns:
        (score, Number_of_picks, df1): the number of correct picks (float),
        the number of picked games, and the picked games with an extra
        "Correct" column (1.0 / 0.0).

    Raises:
        ValueError: if `Model` is not one of the supported names.
    """
    # Model name -> (prediction column, "for" ranking, "against" ranking)
    strategies = {
        "Tree": ("Tree Prediction", Tree_val_eval_for, Tree_val_eval_against),
        "SVM": ("SVM Prediction", SVM_val_eval_for, SVM_val_eval_against),
        "NN": ("NN Prediction", NN_val_eval_for, NN_val_eval_against),
        "Democracy": ("Democracy", Democracy_val_eval_for, Democracy_val_eval_against),
        "Favorite": ("Fav Strategy", Favorite_val_eval_for, Favorite_val_eval_against),
        "Unanimous": ("Unanimous", Unanimous_val_eval_for, Unanimous_val_eval_against),
        "HP": ("Pick Home", HP_val_eval_for, HP_val_eval_against),
    }
    if Model not in strategies:
        raise ValueError("Unknown Model %r; expected one of %s" % (Model, sorted(strategies)))
    prediction_col, rank_for, rank_against = strategies[Model]

    def _top_teams(ranking, n):
        # Keep the ranking order (best hit rate first). Going through set()
        # would shuffle the teams and make the "top n" arbitrary.
        return ranking["Team"].dropna().drop_duplicates().head(int(n)).tolist()

    predicts_well_for = _top_teams(rank_for(Data_Team_Bias, Teams_list), a)
    predicts_well_against = _top_teams(rank_against(Data_Team_Bias, Teams_list), b)

    data = Validation_evaluation[["Home Team","Away Team", "Actual", prediction_col,"Odds H","Odds D","Odds A"]]
    predicts_home_win = data[prediction_col] == "H"
    predicts_away_win = data[prediction_col] == "A"
    picked = (
        (data["Home Team"].isin(predicts_well_for) & predicts_home_win)
        | (data["Away Team"].isin(predicts_well_for) & predicts_away_win)
        | (data["Home Team"].isin(predicts_well_against) & predicts_away_win)
        | (data["Away Team"].isin(predicts_well_against) & predicts_home_win)
    )
    df1 = data.loc[picked].reset_index(drop=True)

    # 1.0 where the prediction matches the actual result, 0.0 otherwise.
    df1["Correct"] = (df1["Actual"] == df1[prediction_col]).astype(float)

    score = df1["Correct"].sum()
    Number_of_picks = df1.shape[0]

    return score, Number_of_picks, df1

In [None]:
# Grid search on the validation set: for every strategy try every combination of
# a ("for" teams) and b ("against" teams) and record the number of correct picks.
HYPERPARAM_GRID = range(4, 15)  # candidate values for both a and b

validation_records = []

for Model in ["Tree","SVM","NN","Democracy","Unanimous","Favorite","HP"]:
    for a in HYPERPARAM_GRID:
        for b in HYPERPARAM_GRID:
            score, Number_of_picks, _picked_games = number_of_teams_optimizer(
                Data_Team_Bias, Validation_evaluation, Val_set_Teams, Model, a, b)
            validation_records.append([Model, score, Number_of_picks, a, b])

# Build the frame once; appending to a DataFrame inside the loop copies it on every iteration.
Validation_scoring = pd.DataFrame(
    validation_records,
    columns=["Model", "score","Number of picks","Hyperparam 'for'","Hyperparam 'against'"])

In [None]:
# Hit rate of every (model, a, b) combination on the validation set.
validation_hit_rate = Validation_scoring["score"].div(Validation_scoring["Number of picks"])
Validation_scoring["%"] = validation_hit_rate
# Ranking metric: correct picks scaled by the squared hit rate, so a high percentage weighs more.
Validation_scoring["Ranker"] = Validation_scoring["score"].mul(validation_hit_rate.pow(2))
Validation_scoring.head()

In [None]:
# Best-ranked hyperparameter combinations first; the row labels of Validation_scoring are kept.
Ranked_val = Validation_scoring.sort_values(by="Ranker", ascending=False)
Ranked_val.head()

### Preparing the test-set evaluation

In [None]:
# One row per test-set game: the actual result, the three model predictions and
# the derived ensemble / baseline strategies.
n_test_games = Tree_test_target.shape[0]
Result_evaluation = pd.DataFrame(
    {
        "Actual": [Tree_test_target[i] for i in range(n_test_games)],
        "Tree Prediction": [Tree_test_prediction[i] for i in range(n_test_games)],
        "SVM Prediction": [SVM_test_prediction[i] for i in range(n_test_games)],
        "NN Prediction": [NN_test_Result_prediction.loc[i, "NN Prediction"] for i in range(n_test_games)],
    },
    columns=["Actual","Tree Prediction","SVM Prediction","NN Prediction"],
)

Result_evaluation["Home Team"] = Test_set["HomeTeam"]
Result_evaluation["Away Team"] = Test_set["AwayTeam"]

tree_pred = Result_evaluation["Tree Prediction"]
svm_pred = Result_evaluation["SVM Prediction"]
nn_pred = Result_evaluation["NN Prediction"]

# "Unanimous": the common prediction when all three models agree, NaN otherwise.
all_models_agree = (tree_pred == svm_pred) & (tree_pred == nn_pred)
Result_evaluation["Unanimous"] = tree_pred.where(all_models_agree)

# "Democracy": the prediction shared by at least two models, NaN when all three differ.
tree_in_majority = (tree_pred == svm_pred) | (tree_pred == nn_pred)
Result_evaluation["Democracy"] = tree_pred.where(tree_in_majority, svm_pred.where(svm_pred == nn_pred))

# "Fav Strategy": the outcome with the strictly lowest odds; NaN when the lowest odds are tied.
# Each outcome is compared against BOTH other outcomes, so a home favourite can
# never be overwritten by an away team that merely has lower odds than the draw.
test_odds = Test_set.loc[Result_evaluation.index, ["Odds H", "Odds D", "Odds A"]]
odds_h, odds_d, odds_a = test_odds["Odds H"], test_odds["Odds D"], test_odds["Odds A"]
favorite = pd.Series(np.nan, index=Result_evaluation.index, dtype=object)
favorite[(odds_h < odds_a) & (odds_h < odds_d)] = "H"
favorite[(odds_a < odds_h) & (odds_a < odds_d)] = "A"
favorite[(odds_d < odds_h) & (odds_d < odds_a)] = "D"
Result_evaluation["Fav Strategy"] = favorite

Result_evaluation = Result_evaluation[["Home Team","Away Team", "Actual", "Tree Prediction", "SVM Prediction","NN Prediction","Democracy", "Unanimous","Fav Strategy"]]
# Baseline strategy: always pick the home team.
Result_evaluation["Pick Home"] = "H"

In [None]:
# Attach the bookmaker odds of every test game (aligned on the row index).
# NOTE(review): re-running this cell adds the odds columns a second time.
test_set_odds = Test_set[["Odds H","Odds D","Odds A"]]
Result_evaluation = pd.concat([Result_evaluation, test_set_odds], axis="columns")
Result_evaluation.head()

#### Results of the strategy

In [None]:
# Every team that plays (home or away) in the test set.
test_home_teams = Test_set["HomeTeam"].tolist()
test_away_teams = Test_set["AwayTeam"].tolist()
Test_set_Teams = list(set(test_home_teams + test_away_teams))

In [None]:
def _evaluate_on_test_set(model_name):
    """Score `model_name` on the test set using ITS OWN best validation hyperparameters.

    Ranked_val is sorted by "Ranker" descending, so the first row of a model is
    its best ('for', 'against') combination.
    Returns the (score, Number_of_picks, picked games) tuple of number_of_teams_optimizer.
    """
    best = Ranked_val.loc[Ranked_val["Model"] == model_name].reset_index(drop=True)
    return number_of_teams_optimizer(
        Data_Team_Bias, Result_evaluation, Test_set_Teams, model_name,
        best.loc[0, "Hyperparam 'for'"], best.loc[0, "Hyperparam 'against'"])


score_Tree, Number_of_picks_Tree, data_Tree = _evaluate_on_test_set("Tree")
score_SVM, Number_of_picks_SVM, data_SVM = _evaluate_on_test_set("SVM")
score_NN, Number_of_picks_NN, data_NN = _evaluate_on_test_set("NN")

score_D, Number_of_picks_D, data_D = _evaluate_on_test_set("Democracy")
score_U, Number_of_picks_U, data_U = _evaluate_on_test_set("Unanimous")
score_F, Number_of_picks_F, data_F = _evaluate_on_test_set("Favorite")
score_H, Number_of_picks_H, data_H = _evaluate_on_test_set("HP")


In [None]:
# Correct picks, number of picks and hit rate on the test set: Tree, SVM, NN.
base_model_results = [
    (score_Tree, Number_of_picks_Tree),
    (score_SVM, Number_of_picks_SVM),
    (score_NN, Number_of_picks_NN),
]
for correct_picks, total_picks in base_model_results:
    print(correct_picks, total_picks, correct_picks / total_picks)

In [None]:
# Correct picks, number of picks and hit rate on the test set:
# Democracy, Unanimous, Favorite, Home pick.
strategy_results = [
    (score_D, Number_of_picks_D),
    (score_U, Number_of_picks_U),
    (score_F, Number_of_picks_F),
    (score_H, Number_of_picks_H),
]
for correct_picks, total_picks in strategy_results:
    print(correct_picks, total_picks, correct_picks / total_picks)

##### Saving the data to evaluate in another notebook

In [None]:
# General predictions: tab-separated, with header row, without the row index.
result_evaluation_path = r'C:\Users\### LOCAL PATH ###\Result_evaluation.txt'
Result_evaluation.to_csv(result_evaluation_path, sep='\t', header=True, index=None)

#### Predicting 2018/19 season so far

In [None]:
# Data import: cleaned 2018/19 data produced by the ETL notebook (tab-separated).
current_season_path = r"C:\Users\### LOCAL PATH ###\data_current_season.txt"
Data_current_season = pd.read_csv(current_season_path, sep="\t")
Data_current_season.head()

In [None]:
# NOTE: this rebinds Teams_list / Seasons_list, which earlier cells built from `Dataset`.
current_home_teams = list(set(Data_current_season["HomeTeam"]))
current_away_teams = list(set(Data_current_season["AwayTeam"]))
Teams_list = list(set(current_home_teams + current_away_teams))

current_seasons = Data_current_season["Season start"].drop_duplicates()
Seasons_list = list(current_seasons)

In [None]:
# preparing the data
# NN

# adding the prev season's data:
# one summary row per (team, season) with points, W/T/L and goals, split by home and away games
season_summary_columns = ["Team","Season start","Total Points","Home W","Home T","Home L","Home Goals For", "Home Goals Against","Away W","Away T","Away L","Away Goals For", "Away Goals Against"]
season_summary_rows = []

for Team in Teams_list:
    team_home_games = Data_current_season.loc[Data_current_season["HomeTeam"] == Team]
    team_away_games = Data_current_season.loc[Data_current_season["AwayTeam"] == Team]

    for year in Seasons_list:

        home_season = team_home_games.loc[team_home_games["Season start"] == year]
        away_season = team_away_games.loc[team_away_games["Season start"] == year]

        Goals_H_for = home_season["Home Goals"].sum()
        Goals_H_against = home_season["Away Goals"].sum()
        Goals_A_for = away_season["Away Goals"].sum()
        Goals_A_against = away_season["Home Goals"].sum()

        # "Counting" Wins, draws, losses - first at home then away
        # (from the team's point of view: a home game with result "A" is a loss)
        Wins_H = home_season["Result"].str.count("H").sum()
        Ties_H = home_season["Result"].str.count("D").sum()
        Losses_H = home_season["Result"].str.count("A").sum()

        Wins_A = away_season["Result"].str.count("A").sum()
        Ties_A = away_season["Result"].str.count("D").sum()
        Losses_A = away_season["Result"].str.count("H").sum()

        # 3 points per win, 1 per draw
        Total_points = Wins_H * 3 + Ties_H + Wins_A * 3 + Ties_A

        season_summary_rows.append([Team, year, Total_points, Wins_H, Ties_H, Losses_H, Goals_H_for, Goals_H_against, Wins_A, Ties_A, Losses_A, Goals_A_for, Goals_A_against])

# Build the frame once (also valid when there are no teams / seasons at all);
# appending to a DataFrame inside the loop copies it on every iteration.
Season_data_current = pd.DataFrame(season_summary_rows, columns=season_summary_columns)

#
# Combining the data - prev season summary for each Game
Season_data_current["prev season"] = pd.to_numeric(Season_data_current["Season start"]) +1
Season_data_current = Season_data_current.drop("Season start", axis=1)
# To merge
Season_data_current2 = Season_data_current.rename(columns={"Team":"AwayTeam","Total Points":"T2 Total Points","Home W":"T2 Home W","Home T":"T2 Home T","Home L":"T2 Home L", "Home Goals For":"T2 Home Goals For","Home Goals Against":"T2 Home Goals Against","Away W":"T2 Away W","Away T":"T2 Away T","Away L":"T2 Away L","Away Goals For":"T2 Away Goals For","Away Goals Against":"T2 Away Goals Against"})
Season_data_current = Season_data_current.rename(columns={"Team":"HomeTeam","Total Points":"T1 Total Points","Home W":"T1 Home W","Home T":"T1 Home T","Home L":"T1 Home L", "Home Goals For":"T1 Home Goals For","Home Goals Against":"T1 Home Goals Against","Away W":"T1 Away W","Away T":"T1 Away T","Away L":"T1 Away L","Away Goals For":"T1 Away Goals For","Away Goals Against":"T1 Away Goals Against"})

#
Data_current_season["Season start"] = pd.to_numeric(Data_current_season["Season start"])
Data_current_season["prev season"] = pd.to_numeric(Data_current_season["Season start"])

#
# merging season data
Data_current_season = Data_current_season.merge(Season_data_current, how="left",on=["prev season","HomeTeam"])
Data_current_season = Data_current_season.merge(Season_data_current2, how="left", on=["prev season","AwayTeam"])



In [None]:

### Model data prep - NN
# Runs the feature pipeline on the current-season data with the NN settings
# (NN_weight, NN_n, NN_m). The helper functions are not defined in this part of the
# notebook; presumably they are the ones used for the training data — TODO confirm.
# The intermediate names (Dataset_step1, Indicator1, ...) are reused by the SVM and
# Tree cells further down, so they always hold the result of the last pipeline run.
Dataset_step1, Indicator1 = prev_season_weight(NN_weight, Data_current_season)
Prev_Games_avg_data = prev_games_avg_data(NN_n, Indicator1, Dataset_step1)
Prev_Games_avg_data_for_rank, Indicator2 = prev_games_avg_data_for_rank(NN_m, Indicator1, Dataset_step1)
Prev_Games_ranking = prev_games_ranking(Indicator2, Prev_Games_avg_data_for_rank)              
# Rows containing any missing value are dropped and the index is renumbered from 0.
Data_for_model = data_merge(Prev_Games_avg_data, Prev_Games_avg_data_for_rank, Prev_Games_ranking, Dataset_step1).dropna().reset_index(drop=True)


In [None]:

####
# Current Season NN
# Target is the "Result" column; features are every column from position 7 onwards.
Current_season_Target = Data_for_model.loc[:, "Result"]
Current_season_Features = Data_for_model.iloc[:, 7:].values

# Integer-encode the results with the already fitted encoder, then one-hot encode them.
# num_classes pins the one-hot width to the encoder's classes, so the target keeps the
# same number of columns even if the highest-numbered class does not occur in these games.
# (assumes `encoder` is the fitted sklearn LabelEncoder — TODO confirm)
encoded_Y3 = encoder.transform(Current_season_Target)
NN_Current_season_target = np_utils.to_categorical(encoded_Y3, num_classes=len(encoder.classes_))

# scaling
# NOTE(review): scaleData / scaleData2 are defined elsewhere; verify that they apply the
# scaling fitted on the training data rather than refitting on the current season.
if NN_Scaler == 2:
    Current_season_Features = scaleData(Current_season_Features)
elif NN_Scaler == 1:
    Current_season_Features = scaleData2(Current_season_Features)

else:
    # Min-max scale every column to [-1, 1].
    # The column bounds are taken once, before anything is overwritten: reading min/max
    # from the array inside an element-by-element in-place loop lets the bounds shift as
    # soon as the first rows have been rescaled.
    Current_season_Features = Current_season_Features.astype(float)
    if Current_season_Features.size:
        column_min = Current_season_Features.min(axis=0)
        column_span = Current_season_Features.max(axis=0) - column_min
        # constant columns: avoid 0/0, they end up at -1
        column_span[column_span == 0] = 1.0
        Current_season_Features = 2 * (Current_season_Features - column_min) / column_span - 1


In [None]:
## SVM
# Same pipeline calls as for the NN, with the SVM settings (SVM_weight, SVM_n, SVM_m).
# The helpers are not defined in this part of the notebook. The intermediate names are
# overwritten, so from here on they hold the SVM run.
# NOTE(review): dropna() may keep a different set of rows than in Data_for_model when
# the settings differ — confirm the frames stay row-aligned.
Dataset_step1, Indicator1 = prev_season_weight(SVM_weight, Data_current_season)
Prev_Games_avg_data = prev_games_avg_data(SVM_n, Indicator1, Dataset_step1)
Prev_Games_avg_data_for_rank, Indicator2 = prev_games_avg_data_for_rank(SVM_m, Indicator1, Dataset_step1)
Prev_Games_ranking = prev_games_ranking(Indicator2, Prev_Games_avg_data_for_rank)              
# Rows containing any missing value are dropped and the index is renumbered from 0.
Data_for_model_SVM = data_merge(Prev_Games_avg_data, Prev_Games_avg_data_for_rank, Prev_Games_ranking, Dataset_step1).dropna().reset_index(drop=True)

In [None]:
## SVM
# Features are every column from position 7 onwards of the SVM model frame.
Current_season_Features_SVM = Data_for_model_SVM.iloc[:, 7:].values


# scaling
# NOTE(review): scaleData / scaleData2 are defined elsewhere; verify that they apply the
# scaling fitted on the training data rather than refitting on the current season.
if SVM_Scaler == 2:
    Current_season_Features_SVM = scaleData(Current_season_Features_SVM)

elif SVM_Scaler == 1:
    Current_season_Features_SVM = scaleData2(Current_season_Features_SVM)

else:
    # Min-max scale every column to [-1, 1].
    # The column bounds are taken once, before anything is overwritten: reading min/max
    # from the array inside an element-by-element in-place loop lets the bounds shift as
    # soon as the first rows have been rescaled.
    Current_season_Features_SVM = Current_season_Features_SVM.astype(float)
    if Current_season_Features_SVM.size:
        column_min = Current_season_Features_SVM.min(axis=0)
        column_span = Current_season_Features_SVM.max(axis=0) - column_min
        # constant columns: avoid 0/0, they end up at -1
        column_span[column_span == 0] = 1.0
        Current_season_Features_SVM = 2 * (Current_season_Features_SVM - column_min) / column_span - 1


In [None]:
## Tree
# Same pipeline calls once more, with the Tree settings (Tree_weight, Tree_n, Tree_m).
# The helpers are not defined in this part of the notebook. The intermediate names are
# overwritten, so from here on they hold the Tree run.
# NOTE(review): dropna() may keep a different set of rows than in Data_for_model when
# the settings differ — confirm the frames stay row-aligned.
Dataset_step1, Indicator1 = prev_season_weight(Tree_weight, Data_current_season)
Prev_Games_avg_data = prev_games_avg_data(Tree_n, Indicator1, Dataset_step1)
Prev_Games_avg_data_for_rank, Indicator2 = prev_games_avg_data_for_rank(Tree_m, Indicator1, Dataset_step1)
Prev_Games_ranking = prev_games_ranking(Indicator2, Prev_Games_avg_data_for_rank)              
# Rows containing any missing value are dropped and the index is renumbered from 0.
Data_for_model_fit_tree = data_merge(Prev_Games_avg_data, Prev_Games_avg_data_for_rank, Prev_Games_ranking, Dataset_step1).dropna().reset_index(drop=True)

In [None]:
## Tree
# Features are every column from position 7 onwards of the Tree model frame.
# (No dummy columns are created in this cell.)
Current_season_Features_tree = Data_for_model_fit_tree.iloc[:, 7:].values

# scaling
# NOTE(review): scaleData / scaleData2 are defined elsewhere; verify that they apply the
# scaling fitted on the training data rather than refitting on the current season.
if Tree_Scaler == 2:
    Current_season_Features_tree = scaleData(Current_season_Features_tree)

elif Tree_Scaler == 1:
    Current_season_Features_tree = scaleData2(Current_season_Features_tree)


else:
    if Tree_Team_Dummy == 0:
        # Min-max scale to [-1, 1], with column bounds taken once over ALL rows before
        # anything is overwritten (reading min/max inside an in-place loop lets the
        # bounds shift while rows are being rescaled).
        # Only the first `shape[0] - helper` rows are rescaled, as before.
        # NOTE(review): `helper` is defined elsewhere; limiting rows (rather than
        # columns) looks unusual — confirm the intent.
        Current_season_Features_tree = Current_season_Features_tree.astype(float)
        rows_to_scale = max(Current_season_Features_tree.shape[0] - helper, 0)
        if rows_to_scale > 0:
            column_min = Current_season_Features_tree.min(axis=0)
            column_span = Current_season_Features_tree.max(axis=0) - column_min
            # constant columns: avoid 0/0, they end up at -1
            column_span[column_span == 0] = 1.0
            Current_season_Features_tree[:rows_to_scale] = (
                2 * (Current_season_Features_tree[:rows_to_scale] - column_min) / column_span - 1
            )



In [None]:
# Evaluate the trained network on the current-season features against the one-hot
# targets; the return value is shown as the cell output (presumably the loss followed
# by the compiled metrics — confirm against the model's compile call).
NN_model.evaluate(Current_season_Features, NN_Current_season_target)

In [None]:
# Network output per game; the columns are indexed 0, 1 and 2 further down and mapped
# to "A", "D" and "H".
Current_season_prediction = NN_model.predict(Current_season_Features)

In [None]:
# Reduce the three network scores of each game to one label.
# Column 0 stands for "A", column 1 for "D", column 2 for "H". A label is assigned only
# when its score is strictly greater than both others; otherwise the game gets NaN.
nn_labels = []

for i in range(NN_Current_season_target.shape[0]):
    away_score = Current_season_prediction[i, 0]
    draw_score = Current_season_prediction[i, 1]
    home_score = Current_season_prediction[i, 2]

    if away_score > draw_score and away_score > home_score:
        nn_labels.append("A")
    elif draw_score > away_score and draw_score > home_score:
        nn_labels.append("D")
    elif home_score > away_score and home_score > draw_score:
        nn_labels.append("H")
    else:
        nn_labels.append(np.nan)

# One row per game, addressed by the position 0..n-1.
NN_Current_season_Result_prediction = pd.DataFrame({"NN Prediction": nn_labels}, columns=["NN Prediction"], dtype=object)

In [None]:
# Share of games where the SVM prediction equals the actual result.
# NOTE(review): Current_season_Target is taken from the NN model frame (Data_for_model),
# the features from Data_for_model_SVM; this assumes both frames hold the same games in
# the same order — TODO confirm.
metrics.accuracy_score(Current_season_Target, SVM_clf.predict(Current_season_Features_SVM))

In [None]:
# Predict with the SVM on the SVM feature matrix (built from Data_for_model_SVM with
# the SVM scaler setting), the same input the SVM accuracy check uses. Passing
# Current_season_Features here would feed the classifier the NN-prepared features.
# NOTE(review): the predictions are later matched by position to the rows of
# Data_for_model — confirm that Data_for_model_SVM has the same games in the same order.
SVM_prediction = SVM_clf.predict(Current_season_Features_SVM)

In [None]:
# Share of games where the decision-tree prediction equals the actual result.
# NOTE(review): Current_season_Target is taken from the NN model frame (Data_for_model),
# the features from Data_for_model_fit_tree; this assumes both frames hold the same
# games in the same order — TODO confirm.
metrics.accuracy_score(Current_season_Target, Result_Tree.predict(Current_season_Features_tree))

In [None]:
# Decision-tree prediction per game, from the tree-specific feature matrix.
Tree_prediction = Result_Tree.predict(Current_season_Features_tree)

In [None]:
# Game identifiers, actual result and the three odds columns for every game in the
# NN model frame.
Current_season = Data_for_model.loc[:,["Season start","Gameday","HomeTeam","AwayTeam","Result","Odds H","Odds D","Odds A"]]

# Attach the three model predictions, matched to the games by position 0..n-1.
# (assumes every prediction array has one entry per row of Data_for_model, in the
# same order — TODO confirm for the Tree and SVM arrays, which come from other frames)
game_positions = range(Current_season.shape[0])

Current_season["Tree Prediction"] = [Tree_prediction[pos] for pos in game_positions]
Current_season["SVM Prediction"] = [SVM_prediction[pos] for pos in game_positions]
Current_season["NN Prediction"] = [
    NN_Current_season_Result_prediction.loc[pos, "NN Prediction"] for pos in game_positions
]

In [None]:
# Combined picks per game, derived from the three model predictions and the odds.
tree_pick = Current_season["Tree Prediction"]
svm_pick = Current_season["SVM Prediction"]
nn_pick = Current_season["NN Prediction"]

# Pairwise agreement; a missing NN prediction (NaN) never equals anything.
tree_eq_svm = tree_pick == svm_pick
tree_eq_nn = tree_pick == nn_pick
svm_eq_nn = svm_pick == nn_pick

# Democracy: the label at least two models agree on, NaN when all three differ.
# If the tree agrees with either other model its label wins; otherwise the SVM/NN
# label is used when those two agree.
Current_season["Democracy"] = tree_pick.where(tree_eq_svm | tree_eq_nn, svm_pick.where(svm_eq_nn))

# Unanimous: only filled when all three models give the same label.
Current_season["Unanimous"] = tree_pick.where(tree_eq_svm & tree_eq_nn)

# Fav Strategy: the outcome with the strictly lowest odds; NaN when the lowest odds
# are shared by two outcomes or odds are missing.
# The away side is the favourite only if its odds are below BOTH the home and the draw
# odds (comparing against the draw odds alone would let "A" overwrite a home favourite).
odds_home = Current_season["Odds H"]
odds_draw = Current_season["Odds D"]
odds_away = Current_season["Odds A"]

favourite = pd.Series(np.nan, index=Current_season.index, dtype=object)
favourite[(odds_home < odds_away) & (odds_home < odds_draw)] = "H"
favourite[(odds_away < odds_home) & (odds_away < odds_draw)] = "A"
favourite[(odds_draw < odds_home) & (odds_draw < odds_away)] = "D"
Current_season["Fav Strategy"] = favourite

# Home Pick: baseline that always picks the home side.
Current_season["Home Pick"] = "H"
    
    
    

In [None]:
# Preview of the first rows with the model and strategy picks attached.
Current_season.head()

In [None]:
# Per game and per approach: 1.0 if the pick equals the actual result, 0.0 if not,
# NaN if the approach made no pick for that game.
accuracy_source_columns = ["Tree Prediction","SVM Prediction","NN Prediction","Democracy","Unanimous","Fav Strategy","Home Pick"]
picks = Current_season[accuracy_source_columns]

# "No pick" covers a real NaN as well as the literal text "nan" (both read "nan" once
# converted to str), so picks produced either way are treated the same.
no_pick = picks.astype(str) == "nan"

# Compare every pick column with the "Result" column row by row; float dtype keeps the
# columns numeric so that sum()/count() downstream work without object-dtype surprises.
Accuracy = picks.eq(Current_season["Result"], axis=0).astype(float).mask(no_pick)

In [None]:

# Put the game identifiers and the actual result next to the per-approach hit flags
# (column-wise concat, rows aligned on the index).
Current_Accuracy = pd.concat([Current_season[["Season start","Gameday","HomeTeam","AwayTeam","Result"]], Accuracy], axis=1)

In [None]:
# Preview of the first rows of the per-game hit flags.
Current_Accuracy.head()

In [None]:
# Number of correct picks per gameday and approach.
# The columns are selected with a list: selecting several GroupBy columns with a bare
# tuple of names was deprecated in pandas 1.0 and is an error in pandas 2.0.
# The hit flags are cast to float first so the sum is numeric even if they were stored
# in object columns.
accuracy_columns = ["Tree Prediction","SVM Prediction","NN Prediction","Democracy","Unanimous","Fav Strategy","Home Pick"]
Current_Accuracy_correct = (
    Current_Accuracy
    .astype({column: float for column in accuracy_columns})
    .groupby("Gameday", as_index=False)[accuracy_columns]
    .sum()
)

In [None]:
# Number of picks actually made (non-missing entries) per gameday and approach.
# The columns are selected with a list: selecting several GroupBy columns with a bare
# tuple of names was deprecated in pandas 1.0 and is an error in pandas 2.0.
accuracy_columns = ["Tree Prediction","SVM Prediction","NN Prediction","Democracy","Unanimous","Fav Strategy","Home Pick"]
Current_Accuracy_count = Current_Accuracy.groupby("Gameday", as_index=False)[accuracy_columns].count()

In [None]:
# Turn the per-gameday number of correct picks into a hit rate (correct / picks made),
# rounded to two decimals. The first column is "Gameday" and is left untouched.
# The division is done column-wise on floats, so a gameday without any pick for an
# approach gives NaN instead of a division error.
# NOTE(review): Current_Accuracy_correct is overwritten in place; re-running this cell
# without re-running the groupby cell would divide the rates by the counts again.
rate_columns = list(Current_Accuracy_correct.columns[1:])
Current_Accuracy_correct[rate_columns] = (
    Current_Accuracy_correct[rate_columns].astype(float)
    / Current_Accuracy_count[rate_columns].astype(float)
).round(2)
        

In [None]:
# Display the per-gameday table; after the division cell it holds hit rates, not counts.
Current_Accuracy_correct

In [None]:
# Display the number of picks made per gameday and approach.
Current_Accuracy_count

In [None]:
# Overall hit rate per approach over the whole current season:
# correct picks / picks actually made, as a percentage rounded to two decimals.
# Each entry: (label, column in Current_Accuracy, width of the right-aligned value
# field, text appended after the number).
report_layout = (
    ("Tree: ", "Tree Prediction", 13, "% \n"),
    ("SVM: ", "SVM Prediction", 13, "% \n"),
    ("NN: ", "NN Prediction", 13, "% \n"),
    ("Democracy: ", "Democracy", 12, "% \n"),
    ("Unanimous: ", "Unanimous", 12, "% \n"),
    ("Fav Strategy: ", "Fav Strategy", 9, "% \n"),
    ("Home Pick: ", "Home Pick", 10, "%"),
)

report_entries = []
for label, column, value_width, suffix in report_layout:
    correct_picks = Current_Accuracy[column].sum()
    picks_made = Current_Accuracy[column].dropna().shape[0]
    percentage = round((correct_picks / picks_made) * 100, 2)
    # label padded to at least 10 characters, value right-aligned in its field
    report_entries.append(label.ljust(10) + (str(percentage) + suffix).rjust(value_width))

print("Current season accuracy by approach: \n \n", *report_entries)