In [3]:
import numpy as np
import pandas as pd

In [4]:
def read_and_prepare_data():
    df = pd.read_excel('../data/data_from_uefa.xlsx', encoding = 'iso-8859-1')
    
    df = df.drop(columns = ['URL']).rename(columns = {'Team_Home' : 'team_home',
                                                      'Team_Away' : 'team_away',
                                                      'Date' : 'date'
                                                     })
    
    df['date'] = pd.to_datetime(df['date'])
    
    cols_to_transform = ['possession_home', 'possession_away']

    for col in cols_to_transform:
        df[col] = df[col].apply(lambda possession_string : possession_string[:-1]).astype(int)
        
    cols_teams = ['team_home', 'team_away']

    for col in cols_teams:
        df[col] = df[col].apply(lambda team : team if team != 'FYR Macedonia' else 'North Macedonia')
        
    return df

In [5]:
def change_name_home_away(name):
    if name[-4:] == 'home':
        return name[:-4] + 'away'
    elif name[-4:] == 'away':
        return name[:-4] + 'home'
    else:
        return name

In [6]:
df_data = read_and_prepare_data()

In [7]:
df_data.head(10)

Unnamed: 0,team_home,team_away,date,goals_home,goals_away,attempts_total_home,attempts_total_away,attempts_off_target_home,attempts_off_target_away,attempts_on_target_home,...,balls_recovered_home,balls_recovered_away,tackles_home,tackles_away,blocks_home,blocks_away,clearances_home,clearances_away,passes_accuracy_home,passes_accuracy_away
0,Italy,Bosnia and Herzegovina,2020-09-04 18:45:00+00:00,1,1,19,9,11,4,4,...,33,33,2,1,1,4,0,0,87,78
1,Netherlands,Poland,2020-09-04 18:45:00+00:00,1,0,15,2,6,1,4,...,30,33,3,4,0,5,0,0,87,77
2,Bosnia and Herzegovina,Poland,2020-09-07 18:45:00+00:00,1,2,7,14,4,7,2,...,28,25,3,6,2,1,0,0,79,86
3,Netherlands,Italy,2020-09-07 18:45:00+00:00,0,1,11,17,4,10,3,...,36,44,7,4,3,4,0,0,80,89
4,Bosnia and Herzegovina,Netherlands,2020-10-11 16:00:00+00:00,0,0,6,14,2,8,1,...,38,42,3,2,2,3,0,0,78,88
5,Poland,Italy,2020-10-11 18:45:00+00:00,0,0,4,16,0,5,3,...,31,48,2,10,8,1,0,0,69,84
6,Italy,Netherlands,2020-10-14 18:45:00+00:00,1,1,7,13,1,6,6,...,48,32,7,3,5,0,0,0,86,82
7,Poland,Bosnia and Herzegovina,2020-10-14 18:45:00+00:00,3,0,20,6,4,3,8,...,39,36,4,0,1,8,0,0,89,78
8,Netherlands,Bosnia and Herzegovina,2020-11-15 17:00:00+00:00,3,1,17,11,3,6,10,...,35,31,0,1,3,4,0,0,89,84
9,Italy,Poland,2020-11-15 19:45:00+00:00,2,0,19,2,6,2,4,...,33,29,5,8,0,9,0,0,88,83


In [8]:
df_data

Unnamed: 0,team_home,team_away,date,goals_home,goals_away,attempts_total_home,attempts_total_away,attempts_off_target_home,attempts_off_target_away,attempts_on_target_home,...,balls_recovered_home,balls_recovered_away,tackles_home,tackles_away,blocks_home,blocks_away,clearances_home,clearances_away,passes_accuracy_home,passes_accuracy_away
0,Italy,Bosnia and Herzegovina,2020-09-04 18:45:00+00:00,1,1,19,9,11,4,4,...,33,33,2,1,1,4,0,0,87,78
1,Netherlands,Poland,2020-09-04 18:45:00+00:00,1,0,15,2,6,1,4,...,30,33,3,4,0,5,0,0,87,77
2,Bosnia and Herzegovina,Poland,2020-09-07 18:45:00+00:00,1,2,7,14,4,7,2,...,28,25,3,6,2,1,0,0,79,86
3,Netherlands,Italy,2020-09-07 18:45:00+00:00,0,1,11,17,4,10,3,...,36,44,7,4,3,4,0,0,80,89
4,Bosnia and Herzegovina,Netherlands,2020-10-11 16:00:00+00:00,0,0,6,14,2,8,1,...,38,42,3,2,2,3,0,0,78,88
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
559,Russia,Sweden,2018-10-11 19:45:00+00:00,0,0,12,10,6,6,1,...,42,38,5,8,3,4,0,0,83,77
560,Russia,Turkey,2018-10-14 16:00:00+00:00,2,0,11,9,5,5,5,...,44,38,2,3,0,1,0,0,75,80
561,Northern Ireland,Bosnia and Herzegovina,2018-09-08 13:00:00+00:00,1,2,26,5,8,2,8,...,31,31,3,7,0,10,0,0,84,72
562,Switzerland,Georgia,2019-11-15 19:45:00+00:00,1,0,25,14,7,6,10,...,43,41,4,5,4,8,0,0,90,80


In [9]:
df_data.columns.values

array(['team_home', 'team_away', 'date', 'goals_home', 'goals_away',
       'attempts_total_home', 'attempts_total_away',
       'attempts_off_target_home', 'attempts_off_target_away',
       'attempts_on_target_home', 'attempts_on_target_away',
       'attempts_blocked_home', 'attempts_blocked_away', 'corners_home',
       'corners_away', 'offsides_home', 'offsides_away',
       'possession_home', 'possession_away', 'passes_home', 'passes_away',
       'passes_completed_home', 'passes_completed_away',
       'balls_recovered_home', 'balls_recovered_away', 'tackles_home',
       'tackles_away', 'blocks_home', 'blocks_away', 'clearances_home',
       'clearances_away', 'passes_accuracy_home', 'passes_accuracy_away'],
      dtype=object)

In [10]:
import re

In [11]:
def extract_features_home_away_team(df, home_away):
    '''
    Restricts the dataframe to the columns of the home/away-team and the date.
    '''
    cols = ['date'] + [col for col in df.columns if home_away in col]
    
    df = df[cols]
    
    cols_new = [col.replace('_' + home_away, '').replace('_' + home_away, '') for col in cols]
    
    df.columns = cols_new
    
    return df

In [12]:
def prepare_data_for_feature_engineering(df_data):
    '''
    
    '''
    dfs_home_away = [extract_features_home_away_team(df_data.copy(), home_away) for home_away in ['away', 'home']]
    
    df = pd.concat(dfs_home_away).reset_index(drop = True)
    
    return df

In [13]:
df_feat_engi_base = prepare_data_for_feature_engineering(df_data)

In [14]:
df_feat_engi_base

Unnamed: 0,date,team,goals,attempts_total,attempts_off_target,attempts_on_target,attempts_blocked,corners,offsides,possession,passes,passes_completed,balls_recovered,tackles,blocks,clearances,passes_accuracy
0,2020-09-04 18:45:00+00:00,Bosnia and Herzegovina,1,9,4,4,1,2,0,41,411,321,33,1,4,0,78
1,2020-09-04 18:45:00+00:00,Poland,0,2,1,1,0,4,4,35,346,265,33,4,5,0,77
2,2020-09-07 18:45:00+00:00,Poland,2,14,7,5,2,5,1,55,474,409,25,6,1,0,86
3,2020-09-07 18:45:00+00:00,Italy,1,17,10,4,3,5,4,57,595,528,44,4,4,0,89
4,2020-10-11 16:00:00+00:00,Netherlands,0,14,8,4,2,7,2,68,734,646,42,2,3,0,88
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1123,2018-10-11 19:45:00+00:00,Russia,0,12,6,1,5,5,1,59,550,455,42,5,3,0,83
1124,2018-10-14 16:00:00+00:00,Russia,2,11,5,5,1,7,4,44,334,250,44,2,0,0,75
1125,2018-09-08 13:00:00+00:00,Northern Ireland,1,26,8,8,10,18,1,67,554,463,31,3,0,0,84
1126,2019-11-15 19:45:00+00:00,Switzerland,1,25,7,10,8,9,4,63,727,654,43,4,4,0,90


In [15]:
# def transform_to_mirrored_data(df_data):
#     cols_mirrored = [change_name_home_away(col) for col in df_data.columns]
    
#     df_mirrored = df_data.copy()
#     df_mirrored.columns = cols_mirrored
    
#     df_all = pd.concat([df_data, df_mirrored]).reset_index(drop = True)
    
#     cols_to_keep = [col for col in df_all.columns if '_away' not in col] 
    
#     df_all = df_all[cols_to_keep]
    
#     cols_to_keep_2 = [col[:-5] if col[-5:] == '_home' else col for col in cols_to_keep]
    
#     df_all.columns = cols_to_keep_2
    
#     return df_all

In [16]:
# df_data_mirrored = transform_to_mirrored_data(df_data)

In [17]:
def weighted_mean(c, arr):
    '''
    Calculates a weighted mean with exponentially decreasing weights for the last but one entries of the array.
    '''
    n = len(arr)
    
    weights = [2**(-c*m) for m in range(0, n)][::-1]
    
    return np.dot(weights[1:], arr[:-1])/sum(weights)

In [18]:
def normal_mean(arr):
    '''
    Calculates the mean for the last but one entries of the array.
    '''
    n = len(arr)
    
    return sum(arr[:-1])/(n-1)

In [19]:
def perform_feat_engineering_for_country(df_feat_engi_base, country):
    df = df_feat_engi_base
    df = df[df['team'] == country].sort_values(by = ['date'], ascending = [1]).reset_index(drop = True)
    
    df_first_game_duplicated = pd.concat([df.iloc[:1] for i in range(9)])

    df_feat_engi = pd.concat([df_first_game_duplicated, df]).reset_index(drop = True)
    
    col_feat = [col for col in df_train.columns if col not in ['team', 'date']]
    
    # Compute (weighted) average for each feature on the previous games
    for col in col_feat:
        # Compute weighted mean for feature for last 5 and last 10 games
        df_feat_engi[col + '_weighted_mean_5']  = df_feat_engi[col].rolling(5+1).apply(lambda arr : weighted_mean(0.25, arr))
        df_feat_engi[col + '_weighted_mean_10'] = df_feat_engi[col].rolling(10+1).apply(lambda arr : weighted_mean(0.25, arr))
        
        # Compute normal mean for feature for last 5 and last 10 games
        df_feat_engi[col + '_normal_mean_5']  = df_feat_engi[col].rolling(5+1).apply(lambda arr : normal_mean(arr))
        df_feat_engi[col + '_normal_mean_10'] = df_feat_engi[col].rolling(10+1).apply(lambda arr : normal_mean(arr))
    
    df_feat_engi = df_feat_engi[10:].reset_index(drop = True)
    
    return df_feat_engi

In [20]:
def aggregate_data_for_all_countries(df_feat_engi_base):
    countries_all = df_feat_engi_base['team'].unique()
    
    dfs_feat_engi_country = [perform_feat_engineering_for_country(df_feat_engi_base, country) for country in countries_all]
    
    df_feat_engi = pd.concat(dfs_feat_engi_country)
    
    cols_to_drop = ['attempts_blocked', 'attempts_off_target', 'attempts_on_target',
                    'attempts_total', 'balls_recovered', 'blocks', 'clearances', 'corners',
                    'offsides', 'passes_accuracy', 'passes_completed', 'passes', 'possession', 'tackles']
    
    df_feat_engi = df_feat_engi.drop(columns = cols_to_drop).dropna().reset_index(drop = True)
    
    return df_feat_engi

In [21]:
df_feat_engi = aggregate_data_for_all_countries(df_feat_engi_base)

NameError: name 'compute_features_for_country' is not defined

In [None]:
df_feat_engi

In [None]:
df_last_game = df_feat_engi.groupby(['team']).tail(1).reset_index(drop = True).drop(columns = ['date', 'goals'])

In [None]:
# df_last_game.to_excel('features_last_game.xlsx', index = False, encoding = 'iso-8859-1')

In [None]:
def prepare_df_for_model_training(df_data, df_feat_engi):
    df_1 = df_data[['date', 'team_home', 'team_away']].rename(columns = {'team_home' : 'team_A',
                                                                          'team_away' : 'team_B'
                                                                         })
    
    df_2 = df_data[['date', 'team_home', 'team_away']].copy().rename(columns = {'team_home' : 'team_B',
                                                                                 'team_away' : 'team_A'
                                                                                })
    
    df_3 = pd.concat([df_1, df_2]).reset_index(drop = True)
    
    cols_a = [col + '_A' if col not in ['date'] else col for col in df_feat_engi.columns]
    cols_b = [col + '_B' if col not in ['date'] else col for col in df_feat_engi.columns]
    
    df_A = df_feat_engi.copy()
    df_A.columns = cols_a
    
    df_B = df_feat_engi.copy()
    df_B.columns = cols_b
    
    df_merge = pd.merge(df_3, df_A, how = 'inner', on = ['date', 'team_A'])
    df_merge = pd.merge(df_merge, df_B, how = 'inner', on = ['date', 'team_B'])
    
    return df_merge

In [None]:
df_train = prepare_df_for_model_training(df_data, df_feat_engi)

## Training Model

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif, mutual_info_classif

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
class FeatureSelectionTransformer(BaseEstimator, TransformerMixin):
#     def __init__(self):
#         pass
    
    def __init__(self, type_feat = 'weighted_mean_10'):
        self.type_feat = type_feat
    
    def fit(self, X, y = None):
        return self
    
    def transform(self, X):
        cols_feat = X.columns
        cols_feat_selected = [col for col in cols_feat if self.type_feat in col]
        
        return X[cols_feat_selected]

In [None]:
cols_blacklist = ['date', 'team_A', 'team_B', 'goals_A', 'goals_B']
cols_to_keep = [col for col in df_train.columns if col not in cols_blacklist]

In [None]:
X = df_train[cols_to_keep]
y = df_train['goals_A'].astype(int)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

In [None]:
pipe = Pipeline([('Transformer_FeatSelection', FeatureSelectionTransformer()),
                 ('clf', RandomForestClassifier(n_estimators = 200, n_jobs = 4))
                ])

In [None]:
parameters = {'Transformer_FeatSelection__type_feat' : ['weighted_mean_5', 'weighted_mean_10', 'normal_mean_5', 'normal_mean_10'],
              'clf__max_depth' : [3, 5, 10, 20]
             }

In [None]:
cv = GridSearchCV(pipe, param_grid = parameters, verbose = 3)

In [None]:
cv.fit(X_train, y_train)

In [None]:
import pickle

In [None]:
filename = 'model_goals.pkl'
pickle.dump(cv, open(filename, 'wb'))

In [None]:
df_train

## More Stuff

In [None]:
cv.best_params_

In [None]:
cv.predict(X_test)

In [None]:
cv.predict_proba(X_test)[0]

In [None]:
cv.predict_proba(X_test)[1]

In [None]:
cv.predict_proba(X_test)[2]

In [None]:
p1 = cv.predict_proba(X_test)[0]
p2 = cv.predict_proba(X_test)[1]

In [None]:
np.transpose(p2)

In [None]:
# >>> from random import choices
# >>> population = [1, 2, 3, 4, 5, 6]
# >>> weights = [0.1, 0.05, 0.05, 0.2, 0.4, 0.2]
# Now choices(population, weights) generates a single sample:

# >>> choices(population, weights)

In [None]:
from random import choices

In [None]:
[i for i in range(10)]

In [None]:
p1

In [None]:
p1.T

In [None]:
p1_T = np.reshape(p1, (10, 1))

In [None]:
p2_x = np.reshape(p2, (1, 10))

In [None]:
np.dot(p1_T, p2)

In [None]:
np.matmul(p1_T, p2_x)

In [None]:
p1_T.shape

In [None]:
p2

In [None]:
def get_result_max_points(p_goals_A, p_goals_B):
    p_goals_A = np.reshape(p_goals_A, (10, 1))
    p_goals_B = np.reshape(p_goals_B, (1, 10))
    
    mat_prob = np.matmul(p_goals_A, p_goals_B)
    print(mat_prob)
    
    max_expected_points_draw = 2*sum(np.diagonal(mat_prob, 0)) +2*max(np.diagonal(mat_prob, 0))
    print(expected_points_draw)
    
    return mat_prob

In [None]:
A = [0.25, 0.25, 0.5, 0, 0, 0, 0, 0, 0, 0]
B = [0.0, 0.5, 0.5, 0, 0, 0, 0, 0, 0, 0]

mat = get_result_max_points(A, B)

In [None]:
mat

In [None]:
np.diagonal(mat, 1)

In [None]:
def simulate_game(p_goals_A, p_goals_B):
    population_goals = [i for i in range(10)]
    goals_sim_A = choices(population_goals, p_goals_A)[0]
    goals_sim_B = choices(population_goals, p_goals_B)[0]
    
    return goals_sim_A, goals_sim_B

In [None]:
[simulate_game(p1, p2) for i in range(50)]

In [None]:
np.outer(p1, p2)

In [None]:
np.dot(p1, p2)

In [None]:
cv.classes_

In [None]:
cv.param_grid

In [None]:
cv.cv_results_

In [None]:
trf = FeatureSelectionTransformer()

In [None]:
trf.transform(X)

In [None]:
# >>> from sklearn.datasets import load_iris
# >>> from sklearn.feature_selection import SelectKBest
# >>> from sklearn.feature_selection import chi2
# >>> X, y = load_iris(return_X_y=True)
# >>> X.shape
# (150, 4)
# >>> X_new = SelectKBest(chi2, k=2).fit_transform(X, y)
# >>> X_new.shape
# (150, 2)

In [None]:
cols = [col for col in df_tt.columns if col not in ['date', 'team_A', 'team_B', 'goals_A', 'goals_B']]

In [None]:
X = df_tt[cols]
y = df_tt['goals_A']

In [None]:
mmm = SelectKBest(f_classif, k = 5).fit(X, y)

In [None]:
mmm.get_support

In [None]:
df_data['team_home'].unique()