In [None]:
#import needed Python libraries
import numpy as np
import csv
import random
import datetime as dt
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as scistats
import math
import pylab
import statsmodels as sm
import tensorflow as tf

In [None]:
#graphics parameters of the notebook
# display graphs inline
%matplotlib inline

# Widen pandas' console output and let pandas register its converters with matplotlib
pd.set_option('display.max_columns', 15)
pd.set_option('display.width', 400)
pd.set_option('plotting.matplotlib.register_converters', True)

# Bigger default figures and larger, bold fonts
plt.rc('figure', figsize=(14, 7))
# 'normal' is not a font family matplotlib can resolve (it logs "findfont: Font family
# ['normal'] not found" and falls back); use the generic 'sans-serif' family instead.
plt.rc('font', family='sans-serif', weight='bold', size=15)
#Load one CSV per season, from 2018-2019 back to 2008-2009 (most recent first)
season_paths = [
    "./data/2018_2019.csv",
    "./data/2017_2018.csv",
    "./data/2016_2017.csv",
    "./data/2015_2016.csv",
    "./data/2014_2015.csv",
    "./data/2013_2014.csv",
    "./data/2012_2013.csv",
    "./data/2011_2012.csv",
    "./data/2010_2011.csv",
    "./data/2009_2010.csv",
    "./data/2008_2009.csv",
]
#Older seasons are currently left out:
#data_07_08 = pd.read_csv("./data/2007_2008.csv", parse_dates=True)
#data_06_07 = pd.read_csv("./data/2006_2007.csv", parse_dates=True)
#data_05_06 = pd.read_csv("./data/2005_2006.csv", parse_dates=True)
season_frames = [pd.read_csv(path, parse_dates=True) for path in season_paths]

#Keep one name per season so individual seasons stay accessible
(data_18_19, data_17_18, data_16_17, data_15_16, data_14_15, data_13_14,
 data_12_13, data_11_12, data_10_11, data_09_10, data_08_09) = season_frames

#Stack every season into a single DataFrame (each file's own row index is kept as-is)
raw_data = pd.concat(season_frames)
print(raw_data)

#Select useful features for data visualization and analysis purposes.
#.copy() makes data_table an independent frame: the columns assigned below would
#otherwise be written to a slice of raw_data (SettingWithCopyWarning).
data_table = raw_data[["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG",
                    "FTR", "HTAG", 'B365A', 'B365D', 'B365H', 'BSA',
                    'BSD', 'BSH', 'BWA', 'BWD', 'BWH', 'GBA', 'GBD',
                    'GBH', 'IWA', 'IWD', 'IWH', 'LBA', 'LBD', 'LBH',
                    'PSA', 'PSD', 'PSH', 'SBA', 'SBD', 'SBH', 'SJA',
                    'SJD', 'SJH', 'VCA', 'VCD', 'VCH', 'WHA','WHD', 'WHH']].copy()


def _to_iso_date(raw_date):
    """Convert a 'DD/MM/YY' or 'DD/MM/YYYY' string to 'YYYY-MM-DD'.

    Two-digit years are assumed to belong to the 2000s. Four-digit years are kept
    as they are (prefixing "20" to their first two digits would yield a wrong year).
    """
    day, month, year = raw_date[0:2], raw_date[3:5], raw_date[6:].strip()
    if len(year) == 2:
        year = "20" + year
    return year + "-" + month + "-" + day


#convert date format to YYYY-MM-DD classic format (sorts chronologically as plain text)
data_table['Date'] = data_table['Date'].map(_to_iso_date)

#sort data by date and renumber the rows 0..n-1 in that order
data_table = data_table.sort_values('Date').reset_index(drop=True)

#create matchID column (position of the match in chronological order)
data_table['matchID'] = data_table.index

#create season feature: a season is labelled by the year it starts in, and matches
#played in months 1-6 belong to the season that started the previous year
data_table['Season'] = data_table['Date'].map(lambda x : int(x[0:4]) if int(x[5:7]) > 6 else int(x[0:4]) - 1)

#null values test (only displayed when this is the last expression of a cell)
data_table.isnull().any()

#create teams list
teams = data_table['HomeTeam'].unique()
print(teams)

#create seasons list
seasons = np.sort(data_table['Season'].unique())
print(seasons)

#create dictionary containing teams list by season
teams_by_season = {season : data_table[data_table['Season'] == season]['HomeTeam'].unique() for season in seasons}

In [None]:
#Match-day feature for the home and the away team
#(1st match of a season --> 1, last --> 38 because 20 teams play each season)
data_table_HT_grpby = data_table.groupby('HomeTeam')[['Date']]
data_table_AT_grpby = data_table.groupby('AwayTeam')[['Date']]


def _matchWeek(team, date):
    """Return the match day (1..38) of `team` for the match played on `date`.

    Counts every match the team played strictly before `date`, home or away and in
    any season, then wraps the count modulo 38.
    NOTE(review): the wrap is only exact if each earlier season contributed exactly
    38 matches for this team -- confirm against the data.
    """
    home_dates = data_table_HT_grpby.get_group(team)['Date']
    away_dates = data_table_AT_grpby.get_group(team)['Date']
    played_before = int((home_dates < date).sum()) + int((away_dates < date).sum())
    day = (played_before + 1) % 38
    if day == 0:
        return 38
    return day


def row2matchWeek_HT(row):
    """Match day of the row's home team."""
    return _matchWeek(row['HomeTeam'], row['Date'])


def row2matchWeek_AT(row):
    """Match day of the row's away team."""
    return _matchWeek(row['AwayTeam'], row['Date'])


data_table['HomeTeamDay'] = data_table.apply(row2matchWeek_HT, axis=1)
data_table['AwayTeamDay'] = data_table.apply(row2matchWeek_AT, axis=1)

# Running count of home matches (HomeTeamHomeDay) and away matches (AwayTeamAwayDay)
# played by each team within each season, obtained by cumulating a column of ones.
data_table['ones'] = 1
for season in seasons:
    in_season = data_table['Season'] == season
    for team in teams_by_season[season]:
        home_ones = data_table.loc[(data_table['HomeTeam'] == team) & in_season, 'ones']
        data_table.loc[home_ones.index, 'HomeTeamHomeDay'] = home_ones.cumsum()

        away_ones = data_table.loc[(data_table['AwayTeam'] == team) & in_season, 'ones']
        data_table.loc[away_ones.index, 'AwayTeamAwayDay'] = away_ones.cumsum()
        
# The FTR (full-time result) column holds 'H' when the home team won and 'A' when the
# away team won; any other value is treated as a draw.
def resultConverter(A):
    """Map a full-time result code to the home team's outcome: 'W', 'L' or 'D'."""
    if A == 'H':
        return 'W'
    return 'L' if A == 'A' else 'D'

def resultInverser(A):
    """Flip an outcome to the opponent's point of view ('W' <-> 'L'); anything else is 'D'."""
    if A == 'W':
        return 'L'
    return 'W' if A == 'L' else 'D'
        
def ordinalResultConverter(A):
    """Encode an outcome as a score: 1 for 'W', 0 for 'L', 0.5 for anything else (draw)."""
    if A == 'W':
        return 1
    return 0 if A == 'L' else 0.5

#0/1 indicator columns for the match result: HW = home win, AW = away win, D = draw
for indicator, code in (('HW', 'H'), ('AW', 'A'), ('D', 'D')):
    data_table[indicator] = (data_table.FTR == code).astype('int64')

#the result seen from each side: HR for the home team, AR for the away team
data_table['HR'] = data_table.FTR.map(resultConverter)
data_table['AR'] = data_table.HR.map(resultInverser)

#ordinal version of the home team's result (1 = win, 0.5 = draw, 0 = loss)
data_table['ordinalHR'] = data_table.HR.map(ordinalResultConverter)

In [None]:
grp_by_HT = data_table.groupby('HomeTeam')
grp_by_AT = data_table.groupby('AwayTeam')

grp_by_HT_and_season = data_table.groupby(['HomeTeam', 'Season'])
grp_by_AT_and_season = data_table.groupby(['AwayTeam', 'Season'])


def _set_feature(rows, column, values):
    """Write `values` (a Series indexed by matchID) into `column` for the matches in `rows`."""
    data_table.loc[rows, column] = values.loc[rows].to_numpy()


#past performance features engineering
for team in teams:

    #results of the team, seen from its own side, at home and away
    teamHomeResults_s = grp_by_HT.get_group(team)['HR']
    teamAwayResults_s = grp_by_AT.get_group(team)['AR']
    #all its results in chronological (matchID) order
    teamResults_s = pd.concat([teamHomeResults_s, teamAwayResults_s]).sort_index()
    home_rows = teamHomeResults_s.index
    away_rows = teamAwayResults_s.index

    #(i) k_last_HTR and k_last_ATR --> 6 features.
    #Shifting the result series by k puts, on each match, the result of the k-th
    #previous match of the team (NaN when there is no such match).
    for rows, suffix in ((home_rows, '_last_HTR'), (away_rows, '_last_ATR')):
        for lag in (1, 2, 3):
            _set_feature(rows, str(lag) + suffix, teamResults_s.shift(lag))

    #(ii) k_last_HTHR and k_last_ATAR --> 4 features.
    #Same idea, restricted to previous home matches (for the home team) or previous
    #away matches (for the away team).
    for lag in (1, 2):
        _set_feature(home_rows, str(lag) + '_last_HTHR', teamHomeResults_s.shift(lag))
    for lag in (1, 2):
        _set_feature(away_rows, str(lag) + '_last_ATAR', teamAwayResults_s.shift(lag))

    #(iii) rate based features, computed season by season so that results of a
    #previous season never leak into the current season's rates
    for season in seasons:

        #Everything below is specific to (team, season): skip seasons the team did
        #not play instead of recomputing features from a previous iteration's series.
        if team not in teams_by_season[season]:
            continue

        home_season = grp_by_HT_and_season.get_group((team, season))
        away_season = grp_by_AT_and_season.get_group((team, season))

        #win and draw indicator series (losses are derived from these two later)
        teamHomeResultsW_s = home_season['HW']
        teamAwayResultsW_s = away_season['AW']
        teamResultsW_s = pd.concat([teamHomeResultsW_s, teamAwayResultsW_s]).sort_index()

        teamHomeResultsD_s = home_season['D']
        teamAwayResultsD_s = away_season['D']
        teamResultsD_s = pd.concat([teamHomeResultsD_s, teamAwayResultsD_s]).sort_index()

        season_home_rows = teamHomeResultsW_s.index
        season_away_rows = teamAwayResultsW_s.index

        #shift(1) so that the match itself is never part of its own statistics
        prevW_s = teamResultsW_s.shift(1)
        prevD_s = teamResultsD_s.shift(1)

        #(0) number of wins / draws since the beginning of the season
        #(turned into rates later, by dividing by the match day)
        _set_feature(season_home_rows, 'HTW_rate', prevW_s.cumsum())
        _set_feature(season_away_rows, 'ATW_rate', prevW_s.cumsum())
        _set_feature(season_home_rows, 'HTD_rate', prevD_s.cumsum())
        _set_feature(season_away_rows, 'ATD_rate', prevD_s.cumsum())

        #(i) win / draw rates over the last 7 and 12 matches --> 8 features.
        #The rolling mean is NaN until the window holds min_periods values
        #(5 for the 7-match window, 8 for the 12-match window).
        win7W_s = prevW_s.rolling(window = 7, min_periods = 5).mean()
        win12W_s = prevW_s.rolling(window = 12, min_periods = 8).mean()
        win7D_s = prevD_s.rolling(window = 7, min_periods = 5).mean()
        win12D_s = prevD_s.rolling(window = 12, min_periods = 8).mean()

        _set_feature(season_home_rows, '7_HTW_rate', win7W_s)
        _set_feature(season_home_rows, '12_HTW_rate', win12W_s)
        _set_feature(season_away_rows, '7_ATW_rate', win7W_s)
        _set_feature(season_away_rows, '12_ATW_rate', win12W_s)

        _set_feature(season_home_rows, '7_HTD_rate', win7D_s)
        _set_feature(season_home_rows, '12_HTD_rate', win12D_s)
        _set_feature(season_away_rows, '7_ATD_rate', win7D_s)
        _set_feature(season_away_rows, '12_ATD_rate', win12D_s)

        #(ii) win rate over the last 5 home matches (home team) / away matches (away team)
        prevHomeW_s = teamHomeResultsW_s.shift(1)
        prevAwayW_s = teamAwayResultsW_s.shift(1)
        _set_feature(season_home_rows, '5_HTHW_rate', prevHomeW_s.rolling(window = 5, min_periods = 3).mean())
        _set_feature(season_away_rows, '5_ATAW_rate', prevAwayW_s.rolling(window = 5, min_periods = 3).mean())

        #(iii) number of home wins/draws at home and away wins/draws away since the
        #beginning of the season (turned into rates later)
        _set_feature(season_home_rows, 'HTHW_rate', prevHomeW_s.cumsum())
        _set_feature(season_home_rows, 'HTHD_rate', teamHomeResultsD_s.shift(1).cumsum())
        _set_feature(season_away_rows, 'ATAW_rate', prevAwayW_s.cumsum())
        _set_feature(season_away_rows, 'ATAD_rate', teamAwayResultsD_s.shift(1).cumsum())


        
#compute missing features k_XTL_rate thanks to the k_XTW_rate and k_XTD_rate features
#(loss rate = 1 - win rate - draw rate over the SAME window: the 12-match loss rates
#must be derived from the 12-match win/draw rates, not from the 7-match ones)
data_table.loc[:,'7_HTL_rate'] = 1 - (data_table['7_HTW_rate'] + data_table['7_HTD_rate'])
data_table.loc[:,'12_HTL_rate'] = 1 - (data_table['12_HTW_rate'] + data_table['12_HTD_rate'])
data_table.loc[:,'7_ATL_rate'] = 1 - (data_table['7_ATW_rate'] + data_table['7_ATD_rate'])
data_table.loc[:,'12_ATL_rate'] = 1 - (data_table['12_ATW_rate'] + data_table['12_ATD_rate'])

#turn the season-to-date win/draw counts into rates, then derive the loss rates
#NOTE(review): the counts cover matches before the current one, while the divisor is
#the current match day -- confirm that dividing by the match day is intended.
data_table.loc[:,'HTW_rate'] = data_table['HTW_rate']/data_table['HomeTeamDay']
data_table.loc[:,'ATW_rate'] = data_table['ATW_rate']/data_table['AwayTeamDay']
data_table.loc[:,'HTD_rate'] = data_table['HTD_rate']/data_table['HomeTeamDay']
data_table.loc[:,'ATD_rate'] = data_table['ATD_rate']/data_table['AwayTeamDay']
data_table.loc[:,'HTL_rate'] = 1 - (data_table['HTW_rate'] + data_table['HTD_rate'])
data_table.loc[:,'ATL_rate'] = 1 - (data_table['ATW_rate'] + data_table['ATD_rate'])

#same for the home-at-home / away-when-away counts: HTHW_rate, ..., ATAD_rate,
#followed by the corresponding loss rates
data_table.loc[:,'HTHW_rate'] = data_table['HTHW_rate']/data_table['HomeTeamHomeDay']
data_table.loc[:,'ATAW_rate'] = data_table['ATAW_rate']/data_table['AwayTeamAwayDay']
data_table.loc[:,'HTHD_rate'] = data_table['HTHD_rate']/data_table['HomeTeamHomeDay']
data_table.loc[:,'ATAD_rate'] = data_table['ATAD_rate']/data_table['AwayTeamAwayDay']
data_table.loc[:,'HTHL_rate'] = 1 - (data_table['HTHW_rate'] + data_table['HTHD_rate'])
data_table.loc[:,'ATAL_rate'] = 1 - (data_table['ATAW_rate'] + data_table['ATAD_rate'])

#persist the engineered features
data_table.to_csv("./data/features.csv")


# ELO score

In [None]:
#Elo ranking method parameters
k = 20.0   #update step: rating change per unit of (actual - expected) result
d = 400.0  #rating difference scale used in the expected-result formula
c = 10.0   #base of the exponent in the expected-result formula
#At the end of a season, all but the n_top_excluded highest ratings are averaged to
#seed the teams that were not in the division that season.
#NOTE(review): with 20 teams this averages the 3 lowest-rated teams -- presumably a
#proxy for the sides the newcomers replace; confirm.
n_top_excluded = 17

#Initialization of output containers
ELO_dict = dict()      #(team, date) -> rating of the team once the matches of `date` are played
gammaHT_dict = dict()  #(home team, date) -> expected result of the home team before the match
gammaAT_dict = dict()  #(away team, date) -> expected result of the away team before the match

#intermediate data containers initialization
latest_update_date = dict()   #team -> last date for which ELO_dict has an entry
prev_date_ELO_score = dict()  #team -> most recent rating, input of the next update

prev_season_teams = [team for team in teams]  #teams of the previous season (initially: all teams)
last_teams_ELO_average = 0.0  #rating given to teams that were not in the previous season

for team in teams:
    latest_update_date[team] = '2001-01-01'
    prev_date_ELO_score[team] = 0.0

for season in data_table.Season.unique():
    season_matches = data_table[data_table['Season'] == season]
    season_match_dates = season_matches.Date.unique()
    season_teams = season_matches.HomeTeam.unique()

    #teams that were not in the previous season start from last season's bottom average
    for Steam in season_teams:
        if Steam not in prev_season_teams:
            prev_date_ELO_score[Steam] = last_teams_ELO_average

    #One pass per match date over that date's matches only (the table is not rescanned
    #for every team). sort=False keeps the dates in their order of appearance.
    #Assumes a team plays at most one match per date.
    for date, day_matches in season_matches.groupby('Date', sort=False):
        playing = set(day_matches['HomeTeam']) | set(day_matches['AwayTeam'])

        #a team without a match on this date keeps its rating unchanged
        for team in teams:
            if team not in playing:
                ELO_dict[(team, date)] = prev_date_ELO_score[team]
                latest_update_date[team] = date

        #teams that play are updated from the result of their match
        for Hteam, Ateam, alphaH in zip(day_matches['HomeTeam'], day_matches['AwayTeam'], day_matches['ordinalHR']):
            l0H = prev_date_ELO_score[Hteam]
            l0A = prev_date_ELO_score[Ateam]
            #expected results given the pre-match ratings
            gammaH = 1.0/(1.0 + c**((l0A - l0H)/d))
            gammaA = 1.0 - gammaH
            #actual results (1 = win, 0.5 = draw, 0 = loss)
            alphaA = 1 - alphaH

            #compute new scores
            new_HT_ELO_score = l0H + k * (alphaH - gammaH)
            new_AT_ELO_score = l0A + k * (alphaA - gammaA)

            #put new scores in ELO_dict
            #NOTE(review): the rating stored under the match date already includes the
            #result of that match -- check for target leakage wherever it is used as a
            #feature of the same match.
            ELO_dict[(Hteam, date)] = new_HT_ELO_score
            ELO_dict[(Ateam, date)] = new_AT_ELO_score
            gammaHT_dict[(Hteam, date)] = gammaH
            gammaAT_dict[(Ateam, date)] = gammaA
            latest_update_date[Hteam] = date
            latest_update_date[Ateam] = date

            #update prev_date_ELO_score
            prev_date_ELO_score[Hteam] = new_HT_ELO_score
            prev_date_ELO_score[Ateam] = new_AT_ELO_score

    #end of season: remember the teams and the average rating used to seed newcomers
    ELOs = np.sort(np.array([prev_date_ELO_score[Steam] for Steam in season_teams]))
    last_teams_ELO_average = np.mean(ELOs[0:-n_top_excluded])
    prev_season_teams = season_teams
            
                        
#Build HTeamEloScore, ATeamEloScore and gammaHome from the dictionaries computed above

def HomeTeamEloScore(row):
    """Elo rating stored for the row's home team on the row's date."""
    key = (row['HomeTeam'], row['Date'])
    return ELO_dict[key]

def AwayTeamEloScore(row):
    """Elo rating stored for the row's away team on the row's date."""
    key = (row['AwayTeam'], row['Date'])
    return ELO_dict[key]

def gammaHTeamDate(row):
    """Expected result stored for the row's home team on the row's date."""
    key = (row['HomeTeam'], row['Date'])
    return gammaHT_dict[key]

#one column per lookup, computed row by row
for feature, lookup in (('HTeamEloScore', HomeTeamEloScore),
                        ('ATeamEloScore', AwayTeamEloScore),
                        ('gammaHome', gammaHTeamDate)):
    data_table.loc[:, feature] = data_table.apply(lookup, axis=1)


In [None]:

#Date of the previous match (home or away) of each team, stored on each of its matches
for team in teams:
    homeMatchDates_s = data_table.loc[data_table['HomeTeam'] == team, 'Date']
    awayMatchDates_s = data_table.loc[data_table['AwayTeam'] == team, 'Date']
    matchDates_s = pd.concat([homeMatchDates_s, awayMatchDates_s]).sort_index()
    #shift(1): each match gets the date of the team's previous match (NaN for the first)
    lastMatchDates_s = matchDates_s.shift(1)

    data_table.loc[homeMatchDates_s.index, 'HTLastMatchDate'] = lastMatchDates_s.loc[homeMatchDates_s.index].to_numpy()
    data_table.loc[awayMatchDates_s.index, 'ATLastMatchDate'] = lastMatchDates_s.loc[awayMatchDates_s.index].to_numpy()


def _daysSinceLastMatch(row, lastDateColumn, maxDays = 20):
    """Days between row['Date'] and row[lastDateColumn].

    Returns NaN when there is no previous match or when the gap is `maxDays` or more.
    """
    if pd.isnull(row[lastDateColumn]):
        return np.nan
    ndays = (pd.to_datetime(row['Date']) - pd.to_datetime(row[lastDateColumn])).days
    return ndays if ndays < maxDays else np.nan


def HTdaysBetweenDates(row):
    """Rest days of the home team before this match (NaN if unknown or 20 days or more)."""
    return _daysSinceLastMatch(row, 'HTLastMatchDate')


def ATdaysBetweenDates(row):
    """Rest days of the away team before this match (NaN if unknown or 20 days or more).

    Uses the same 20-day cut-off as the home team. Without it the away feature kept
    arbitrarily long gaps that the home feature discarded, which skewed the ratio.
    """
    return _daysSinceLastMatch(row, 'ATLastMatchDate')


data_table.loc[:, 'HTdaysSinceLastMatch'] = data_table.apply(HTdaysBetweenDates, axis=1)
data_table.loc[:, 'ATdaysSinceLastMatch'] = data_table.apply(ATdaysBetweenDates, axis=1)
#ratio of home-team rest to away-team rest
data_table.loc[:,'DaysSinceLastMatchRate'] = data_table['HTdaysSinceLastMatch'].astype(float)/data_table['ATdaysSinceLastMatch'].astype(float)


In [None]:

#For every lagged-result column, add two 0/1 indicators: <column>_isW and <column>_isL.
#A draw, or a missing previous result, gives 0 in both.
lagged_result_columns = (
    '1_last_HTR', '2_last_HTR', '3_last_HTR',
    '1_last_ATR', '2_last_ATR', '3_last_ATR',
    '1_last_HTHR', '2_last_HTHR',
    '1_last_ATAR', '2_last_ATAR',
)
for source in lagged_result_columns:
    for outcome in ('W', 'L'):
        data_table[source + '_is' + outcome] = data_table[source].map(
            lambda x, outcome=outcome : 1 if x == outcome else 0)


In [None]:
# Custom fold-generation function


def foldsGenerator(ixSet, foldMinSize, foldMaxSize, trInitSize, trOptimalSize = -1):
    """Cut `ixSet` into consecutive folds.

    The first fold holds the first `trInitSize` entries. The following folds get a
    random size drawn from [foldMinSize, foldMaxSize] for as long as more than
    `foldMaxSize` entries remain; whatever is left forms the last fold.

    ixSet         -- sliceable sequence exposing `.size` (e.g. a pandas Index or numpy array)
    trOptimalSize -- accepted but not used
    Returns (number of folds, list of folds).
    """
    folds = [ixSet[0:trInitSize]]
    cursor = trInitSize
    remaining = ixSet.size - trInitSize

    while remaining > foldMaxSize:
        width = random.randint(foldMinSize, foldMaxSize)
        folds.append(ixSet[cursor:cursor + width])
        cursor += width
        remaining -= width

    #the tail (at most foldMaxSize entries) becomes the final fold
    folds.append(ixSet[cursor:])
    return (len(folds), folds)

#test
#sub = foldsGenerator(data_table_tr.index, 40, 55, 700)


In [None]:
#Scores functions implementation

def brierScore(probW, probL, probD, true, classLabels):
    """Mean, over the samples, of the squared errors between predicted probabilities and outcomes.

    probW, probL, probD -- predicted probabilities of win, loss and draw
    true                -- pandas Series of observed labels
    classLabels         -- dict giving, under 'W', 'L' and 'D', the label used in `true`
    """
    squaredError = 0
    for outcome, prob in (('W', probW), ('L', probL), ('D', probD)):
        #1 where the observed label is this outcome, 0 elsewhere
        observed = true.map(lambda x, label=classLabels[outcome] : 1 if x == label else 0).values
        gap = prob - observed
        squaredError = squaredError + gap * gap

    return float(np.sum(squaredError))/float(true.index.size)

def rankProbabilityScore(probW, probL, probD, true, classLabels):
    """Rank probability score: squared errors between cumulative predictions and outcomes.

    The cumulative steps are "loss" and "loss or draw". The squared errors of both
    steps are summed over the samples and divided by twice the number of samples.
    Arguments are the same as for brierScore.
    """
    observed = {
        outcome: true.map(lambda x, label=classLabels[outcome] : 1 if x == label else 0).values
        for outcome in ('W', 'L', 'D')
    }

    #cumulative observed outcomes and cumulative predicted probabilities
    cumTrueL = observed['L']
    cumTrueLD = observed['L'] + observed['D']
    cumProbL = probL
    cumProbLD = probL + probD

    gapL = cumProbL - cumTrueL
    gapLD = cumProbLD - cumTrueLD
    squaredError = gapL * gapL + gapLD * gapLD

    return(float(np.sum(squaredError))/(2.0 * float(true.index.size)))

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from IPython.display import display

#Create X and Y
#get_dummies one-hot encodes the two team-name columns and leaves numeric columns as they are
X = pd.get_dummies(data_table[['HomeTeam', 'AwayTeam', 'HTeamEloScore', 'ATeamEloScore', 'HTdaysSinceLastMatch',
                            'ATdaysSinceLastMatch', 'HTW_rate', 'ATW_rate',
                            'ATD_rate', 'HTD_rate',
                '7_HTW_rate', '12_HTW_rate', '7_ATW_rate', '12_ATW_rate',
                '7_HTD_rate', '12_HTD_rate', '7_ATD_rate', '12_ATD_rate',
                '7_HTL_rate', '12_HTL_rate', '7_ATL_rate', '12_ATL_rate',
                '5_HTHW_rate', '5_ATAW_rate']])
#1-D target: a single-column DataFrame makes the estimators emit a
#DataConversionWarning ("a column-vector y was passed") on every fit
Y = data_table['FTR']

#X preprocessing: fill missing values with the column mean (SimpleImputer default)
#NOTE(review): the imputer is fitted on all rows before the split, and the split is
#shuffled across dates -- both let information from test matches reach training.
imputer = SimpleImputer()
X_imputed = imputer.fit_transform(X)

#Split X and Y into training and Test Sets (seeded so the scores are reproducible)
x_train, x_test, y_train, y_test = train_test_split(X_imputed, Y, shuffle=True, random_state=0)

#Logistic Regression Model Setup
model = LogisticRegression()
model.fit(x_train, y_train)

#Logistic Regression Model Metrics
print("Logestic Regression")
print("Train Score: ", model.score(x_train, y_train))
print("Test Score: ", model.score(x_test, y_test))
print(classification_report(y_test, model.predict(x_test), digits=3))

#Forest model setup
forest = RandomForestClassifier(n_estimators=2, random_state=2)
forest.fit(x_train, y_train)

#Forest Model Metrics
print("Forest Classifier")
print("Train Score: ", forest.score(x_train, y_train))
print("Test Score: ", forest.score(x_test, y_test))
print(classification_report(y_test, forest.predict(x_test), digits=3))

print(y_test.shape)
print(forest.predict(x_test).shape)

In [None]:
# Remove the cap on displayed columns so every column of the table is shown
# (this changes the pandas option for the rest of the session)
pd.set_option('display.max_columns', None)

# Preview the first 20 rows of the feature table
display(data_table.head(20))

In [None]:
import tensorflow.compat.v1 as tf

#Switch TensorFlow to its v1 behaviour (the placeholders and sessions below rely on it)
tf.disable_v2_behavior()

#Other feature selections
# X = pd.get_dummies(data_table[['HomeTeam', 'AwayTeam', 'HTeamEloScore', 'ATeamEloScore', 'HTdaysSinceLastMatch',
#                             'ATdaysSinceLastMatch']])
# X = pd.get_dummies(data_table[['HomeTeam', 'AwayTeam', 'FTAG', 'FTHG']])
# X = pd.get_dummies(data_table[['HomeTeam', 'AwayTeam', 'FTR']])
# X = pd.get_dummies(data_table.drop(['FTR',  'HTAG', 'Date', 'matchID', 'HW', 'AW', 'D', 'AR',
#                                  'ordinalHR', 'Season'], axis=1))

#Setup Data: one-hot encode the team names, keep the numeric features as they are
nn_feature_columns = [
    'HomeTeam', 'AwayTeam', 'HTeamEloScore', 'ATeamEloScore',
    'HTdaysSinceLastMatch', 'ATdaysSinceLastMatch',
    'HTW_rate', 'ATW_rate', 'ATD_rate', 'HTD_rate',
    '7_HTW_rate', '12_HTW_rate', '7_ATW_rate', '12_ATW_rate',
    '7_HTD_rate', '12_HTD_rate', '7_ATD_rate', '12_ATD_rate',
    '7_HTL_rate', '12_HTL_rate', '7_ATL_rate', '12_ATL_rate',
    '5_HTHW_rate', '5_ATAW_rate',
]
X = pd.get_dummies(data_table[nn_feature_columns])

#Class labels: ordinalHR doubled, so that a home win is 2, a draw 1 and a home loss 0
Y = data_table['ordinalHR'].to_numpy() * 2

#X preprocessing: fill missing values with the SimpleImputer default strategy
imputer = SimpleImputer()
X_imputed = imputer.fit_transform(X)

#Split X and Y into training and Test Sets
x_train, x_test, y_train, y_test = train_test_split(X_imputed, Y, shuffle=True)

#Neural Network Setup
#n_inputs is the number of columns of the encoded feature table
n_inputs = X.shape[1]
n_hidden1, n_hidden2 = 300, 100
n_outputs = 3

#From here on X and Y are the TensorFlow placeholders fed with batches of features / labels
X = tf.placeholder(tf.float32, shape = (None, n_inputs), name = 'X')
Y = tf.placeholder(tf.int32, shape = (None), name = 'Y')

#Turns numeric class ids back into result letters so that reports are easier to read
def convert_ordinalHR(x):
    """Map each entry of `x` (0, 1 or 2) to 'A', 'D' or 'H'; other values are skipped.

    `x` must expose `.size` and integer indexing (e.g. a 1-D numpy array).
    """
    letters = {0: 'A', 1: 'D', 2: 'H'}
    entries = (x[i] for i in range(x.size))
    return [letters[value] for value in entries if value in letters]

#Generic builder for one fully connected layer
def neuron_layer(X, n_neurons, name, activation = None):
    """Create a dense layer on top of `X` inside the name scope `name`.

    The kernel is drawn from a truncated normal with standard deviation
    2 / sqrt(n_inputs + n_neurons); the bias starts at zero.
    Returns (layer output, L2 loss of the kernel). The output is X @ W + b,
    passed through `activation` when one is given.
    """
    with tf.name_scope(name):
        fan_in = int(X.get_shape()[1])
        spread = 2 / np.sqrt(fan_in + n_neurons)
        kernel_init = tf.truncated_normal((fan_in, n_neurons), stddev=spread)
        W = tf.Variable(kernel_init, name = 'kernel')
        b = tf.Variable(tf.zeros([n_neurons]), name='bias')
        L2 = tf.nn.l2_loss(W)
        Z = tf.matmul(X,W)+b
        if activation is None:
            return Z, L2
        return activation(Z), L2

#Tensorflow neural layer setup: two ReLU hidden layers and a linear output layer (logits)
with tf.name_scope('dnn'):
    hidden1, L2_1 = neuron_layer(X, n_hidden1, name='hidden1', activation=tf.nn.relu)
    hidden2, L2_2 = neuron_layer(hidden1, n_hidden2, name='hidden2', activation=tf.nn.relu)
    logits, L2_3 = neuron_layer(hidden2, n_outputs, name='outputs', activation=None)


#Loss function: mean cross-entropy plus beta times the L2 loss of the three kernels
beta=0.01
with tf.name_scope('loss'):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels = Y, logits=logits)
    loss = tf.reduce_mean(xentropy+beta*(L2_1+L2_2+L2_3), name='loss')


#Optimizer
learning_rate = 0.000001
with tf.name_scope('train'):
    optimizer = tf.train.AdamOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)

#Metric analysis setup
with tf.name_scope('eval'):
    #a prediction is correct when the true class has the highest logit
    correct = tf.nn.in_top_k(logits, Y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    preds = tf.argmax(input=logits, axis=1)

#Initialize the above tensorflow variables
init = tf.global_variables_initializer()

#Runtime and batch setup
n_epochs = 30000
#NOTE(review): batch_size is not used below -- every step is fed the full training set
batch_size = 50

#Run the model
with tf.Session() as sess:
    init.run()
    #Model Loop
    for epoch in range(n_epochs):
        sess.run(training_op, feed_dict={X: x_train, Y: y_train})
        #Accuracies are only needed when they are reported: evaluating them on
        #every epoch cost two extra full forward passes per training step.
        if(epoch % 200 == 0 or epoch==n_epochs-1):
            acc_train = accuracy.eval(feed_dict={X: x_train, Y: y_train})
            acc_val = accuracy.eval(feed_dict={X: x_test, Y: y_test})
            print(epoch, 'Train accuracy:', acc_train, 'Val accuracy:', acc_val)

    #Retrieve Metrics
    acc_val = accuracy.eval(feed_dict={X: x_test, Y: y_test})
    #from here on `preds` and `y_test` hold values / result letters, no longer the
    #tensor and the numeric labels
    preds = preds.eval(feed_dict = {X:x_test})
    log = logits.eval(feed_dict = {X:x_test})
    preds=convert_ordinalHR(preds)
    y_test=convert_ordinalHR(y_test)

    #Print Metrics
    print('Train accuracy:', acc_train, 'Val accuracy:', acc_val)
    print(classification_report(y_test, preds, digits=3))

In [None]:
from cgi import print_arguments
import tensorflow as tf
from tensorflow.keras import layers

#Functions to manipulate data for use in the model
# Home win = [1,0,0]
# Away win = [0,1,0]
# Draw     = [0,0,1]
def one_hot_y(Y):
    """One-hot encode an array of full-time result letters.

    Args:
        Y: 1-D array of labels; 'H', 'A' and 'D' are recognised.

    Returns:
        Float array of shape ``(len(Y), 3)`` with columns ordered H, A, D.
        A row whose label is none of the three letters stays all zeros.
    """
    column_of = {'H': 0, 'A': 1, 'D': 2}
    Y_new = np.zeros((Y.shape[0],3))
    # Visit every row, the last one included, so the final label is encoded too.
    for i in range(Y.shape[0]):
        column = column_of.get(Y[i])
        if column is not None:
            Y_new[i, column] = 1
    return Y_new

def revert_yoh(Y):
    """Translate a 2-D array of class indices into result letters.

    Args:
        Y: 2-D array of class indices.

    Returns:
        Array of the same two dimensions with dtype ``'<U1'``: 0 becomes 'H',
        1 becomes 'A' and 2 becomes 'D'. Positions holding any other value are
        never written.
    """
    Y_new = np.empty([Y.shape[0],Y.shape[1]], dtype="<U1")
    # The position of each letter in the string is the class index it stands for.
    for code, letter in enumerate('HAD'):
        Y_new[Y == code] = letter
    return Y_new

# Season identifiers used for training (2008..2015) and for testing (2016..2018),
# together with how many seasons each set contains.
trn_ssn = list(range(2008, 2016))
trn_ssn_len = len(trn_ssn)
tst_ssn = list(range(2016, 2019))
tst_ssn_len = len(tst_ssn)
    
# A redundant reshape helper was rewritten into this single function.
# e.g. (a_prev=x_train, season=trn_ssn)
def reshape_to_inputshape(a_prev, season, rounds_per_season=38):
    """Group the rows of a 2-D array into equally sized blocks, one per round.

    Args:
        a_prev: 2-D array of shape ``(rows, features)``, rows in playing order.
        season: sequence of seasons covered by ``a_prev``; only its length is used.
        rounds_per_season: number of blocks each season is cut into. Defaults
            to 38, the value that was previously hard-coded.

    Returns:
        Array of shape ``(len(season) * rounds_per_season, rows_per_round, features)``.

    Raises:
        ValueError: if the rows cannot be divided evenly among the rounds.
    """
    total_rounds = len(season) * rounds_per_season
    n_rows = a_prev.shape[0]
    n_features = a_prev.shape[1]
    # Fail with an explicit message instead of the generic reshape error when
    # the data does not hold a whole number of rows per round.
    if n_rows % total_rounds != 0:
        raise ValueError(
            "cannot split {} rows evenly into {} rounds ({} seasons x {} rounds)".format(
                n_rows, total_rounds, len(season), rounds_per_season))
    rows_per_round = n_rows // total_rounds
    return np.reshape(a_prev, (total_rounds, rows_per_round, n_features))
# NOTE(review): the triple-quoted string below is an earlier feature selection left
# as a bare string expression; it is never assigned or used, so it has no effect.
'''X = pd.get_dummies(data_table[['HomeTeam', 'AwayTeam', 'HTeamEloScore', 'ATeamEloScore', 'HTdaysSinceLastMatch',
                            'ATdaysSinceLastMatch', 'HTdaysSinceLastMatch', 'ATdaysSinceLastMatch', 'HTW_rate', 'ATW_rate',
                            'ATD_rate', 'HTD_rate', '1_last_HTR_isW', '1_last_HTR_isL', '1_last_ATR_isW', '1_last_ATR_isL']])'''

# Worked example of what pd.get_dummies does, on a small unrelated table:
#   Survived  Pclass     Sex   Age     Fare
#0         0       3    male  22.0   7.2500
#1         1       1  female  38.0  71.2833
#2         1       3  female  26.0   7.9250
#3         1       1  female  35.0  53.1000
#4         0       3    male  35.0   8.0500
# -----------------------------------------
# get_dummies(dataset, column = ['sex']): encodes the text values and splits the column into several columns, one per encoded value
#   Survived  Pclass  Age     Fare  Sex_female  Sex_male
#0         0       3   22   7.2500           0         1
#1         1       1   38  71.2833           1         0
#2         1       3   26   7.9250           1         0
#3         1       1   35  53.1000           1         0
#4         0       3   35   8.0500           0         1
# -----------------------------------------
# get_dummies(dataset['sex']):
#   female  male
#0       0     1
#1       1     0
#2       1     0
#3       1     0
#4       0     1

# Columns fed to the model. get_dummies expands the text columns into indicator
# columns (presumably HomeTeam / AwayTeam are the only text columns — confirm).
features = ['HomeTeam', 'AwayTeam', 
            'HTeamEloScore', 'ATeamEloScore', 
            'HTdaysSinceLastMatch', 'ATdaysSinceLastMatch', 
            'HTW_rate', 'ATW_rate', 'ATD_rate', 'HTD_rate', 
            '7_HTW_rate', '12_HTW_rate', '7_ATW_rate', '12_ATW_rate', 
            '7_HTD_rate', '12_HTD_rate', '7_ATD_rate', '12_ATD_rate',
            '7_HTL_rate', '12_HTL_rate', '7_ATL_rate', '12_ATL_rate',
            '5_HTHW_rate', '5_ATAW_rate']
X = pd.get_dummies(data_table[features])

# Full-time result of every match as a flat array of labels.
Y = data_table[['FTR']].to_numpy().ravel()

#XY preprocessing
# Replace missing values using a descriptive statistic (e.g. mean, median, or most frequent) along each column, or using a constant value
# Default = mean
imputer = SimpleImputer()
X_imputed = imputer.fit_transform(X)
# Labels become one-hot rows with columns ordered H, A, D.
Y = one_hot_y(Y)

# Fraction of rows held out for testing: test seasons / all seasons.
test_size = float(tst_ssn_len)/(tst_ssn_len+trn_ssn_len)
#Split X and Y into training and Test Sets
# shuffle=False keeps the rows in their original order, so the test set is the tail of the table.
# NOTE(review): the reshapes below need each part to hold a whole number of
# seasons' rows — confirm the split lands exactly on a season boundary.
x_train, x_test, y_train, y_test = train_test_split(X_imputed, Y, shuffle=False, test_size=test_size)

#Setup XY to have 10 game steps
# reshape_to_inputshape yields (rounds, steps, features); the step count is
# rows / (seasons * 38), presumably 10 matches per round — confirm.
x_train = reshape_to_inputshape(x_train,trn_ssn)

y_train = reshape_to_inputshape(y_train,trn_ssn)
# Bring the step axis to the front so y_train[t] holds the targets of step t.
y_train = np.moveaxis(y_train, 0, 1)
x_test = reshape_to_inputshape(x_test,tst_ssn)
y_test = reshape_to_inputshape(y_test,tst_ssn)
y_test = np.moveaxis(y_test, 0, 1)

Tx= x_train.shape[1] #Time steps
Ty= y_train.shape[0] #Time Steps
num_features = x_train.shape[2] #Features per step

In [None]:
#Create and Setup Model
# Functional model with one shared input of shape (Tx, num_features) and Ty
# separate heads; head t only sees time step t of the input.
inputs = tf.keras.Input(shape=(Tx, num_features))
outputs = []
for t in range(Ty):
    # Slice step t out of the tensor handed to the layer (z) rather than out of
    # the enclosing `inputs` variable, and freeze the current t as a default
    # argument: a plain closure would read whatever value t has when the
    # function is called again later, i.e. the last step for every head.
    x = tf.keras.layers.Lambda(lambda z, t=t: z[:, t, :])(inputs)

    # Restore a length-1 time axis so the slice is 3-D again for the LSTM.
    x = tf.keras.layers.Reshape((1, num_features))(x)

    x = tf.keras.layers.LSTM(units=16, kernel_regularizer=tf.keras.regularizers.l1_l2(l1=1e-5, l2=1e-1))(x)

    x = tf.keras.layers.Dropout(rate=0.8)(x)

    # Three-way softmax: one probability per result class for this step.
    out = tf.keras.layers.Dense(3, activation='softmax')(x)

    outputs.append(out)

fbmodel = tf.keras.Model(inputs=inputs, outputs=outputs)
fbmodel.summary()

# The single loss name is applied to each of the Ty outputs.
fbmodel.compile(
    loss='categorical_crossentropy',
    optimizer=tf.keras.optimizers.Adam(0.001)
#    ,metrics=[tf.keras.metrics.Accuracy()]
)

In [None]:
# Print the layer-by-layer summary of the model built in the previous cell.
fbmodel.summary()

In [None]:
#Train Model
# list(y_train) splits the targets along their first axis into one array per
# model output. shuffle=False keeps the samples in their original order.
# The returned History object is kept in `history`.
history = fbmodel.fit(
    x_train, list(y_train),
    epochs=100,
    batch_size=64,
    #validation_split=0.272727,
    #show = epoch%100==0,
    verbose=1,
    shuffle=False,
    
)

In [None]:
#Model Metrics Data Setup
# predict returns one array per output head; np.asarray stacks them so the
# class axis is axis 2, matching the layout of y_test.
y_pred = fbmodel.predict(x_test)
y_predm = np.asarray(y_pred)
print("shape y_predm", y_predm.shape)
# Most probable class index for every (step, sample) pair.
y_predm = np.argmax(y_predm, axis=2)
print("shape y_predm", y_predm.shape)
# The one-hot targets are reduced to class indices the same way.
y_testm = np.argmax(y_test, axis = 2)
print("shape y_testm", y_testm.shape)

# y_pred_train = fbmodel.predict(x_train)
# y_pred_train = np.asarray(y_pred_train)
# y_predm_train = np.argmax(y_pred_train, axis=2)
# y_trainm = np.argmax(y_train, axis = 2)

# Class indices become 'H' / 'A' / 'D' letters, flattened to one label per match.
y_predm = revert_yoh(y_predm).ravel()
print("shape y_predm", y_predm.shape)
y_testm = revert_yoh(y_testm).ravel()
print("shape y_testm", y_testm.shape)

# y_predm_train = revert_yoh(y_predm_train).ravel()
# y_trainm = revert_yoh(y_trainm).ravel()

#Model Metrics
# print('Train Score: ', accuracy_score(y_trainm,y_predm_train))
# print('Test Score: ', accuracy_score(y_testm, y_predm))
# Per-class precision / recall / f1 on the test seasons.
print(classification_report(y_testm, y_predm, digits=3))

## Classification report

| class | precision | recall | f1-score | support |
| ----- | --------- | ------ | -------- | ------- |
| A | 0.586 | 0.522 | 0.552 | 345 |
| D | 0.252 | 0.123 | 0.165 | 253 |
| H | 0.582 | 0.762 | 0.660 | 542 |
| accuracy | |  | 0.547 | 1140 |
| macro avg | 0.473 | 0.469 | 0.459 | 1140 |
| weighted avg | 0.510 | 0.547 | 0.517 | 1140 |