In [23]:
import numpy as np
import pandas as pd

In [24]:
import pickle

In [25]:
from sklearn.base import BaseEstimator, TransformerMixin

In [26]:
class FeatureSelectionTransformer(BaseEstimator, TransformerMixin):
    """Column filter that keeps only the features of one "type".

    A column is kept when the substring ``type_feat`` occurs anywhere in its
    name. The surviving columns are returned in alphabetical order, so the
    output layout does not depend on the column order of the input frame.

    Parameters
    ----------
    type_feat : str, default 'weighted_mean_10'
        Substring a column name must contain to be selected.
    """

    def __init__(self, type_feat = 'weighted_mean_10'):
        self.type_feat = type_feat

    def fit(self, X, y = None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return the columns of ``X`` whose name contains ``self.type_feat``, sorted by name."""
        selected = sorted(name for name in X.columns if self.type_feat in name)
        return X[selected]

In [27]:
# NOTE(review): unpickling executes arbitrary code stored in the file - only
# load model files that come from a trusted source.
# The context manager closes the file handle once the model has been loaded.
with open('model_goals.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [28]:
# NOTE(review): unpickling executes arbitrary code stored in the file - only
# load model files that come from a trusted source.
# The context manager closes the file handle once the model has been loaded.
with open('model_goals_2.pkl', 'rb') as model_2_file:
    model_2 = pickle.load(model_2_file)

In [29]:
# Group composition as a row-major table: the DataFrame built from this list
# labels the six positions of every inner list 'Group A' ... 'Group F', so
# each inner list holds one team per group and each group is read column-wise.
data_schedule = [['Italy', 'Belgium', 'Netherlands', 'England', 'Sweden', 'France'],
                 ['Wales', 'Denmark', 'Austria', 'Croatia', 'Spain', 'Germany'],
                 ['Switzerland', 'Finland', 'Ukraine', 'Czech Republic', 'Slovakia', 'Portugal'],
                 ['Turkey', 'Russia', 'North Macedonia', 'Scotland', 'Poland', 'Hungary']
                ]

In [30]:
# One column per group ('Group A' ... 'Group F'), one row per team slot.
df_schedule = pd.DataFrame(data = data_schedule,
                           columns = ['Group ' + letter for letter in 'ABCDEF'])

In [31]:
df_schedule

Unnamed: 0,Group A,Group B,Group C,Group D,Group E,Group F
0,Italy,Belgium,Netherlands,England,Sweden,France
1,Wales,Denmark,Austria,Croatia,Spain,Germany
2,Switzerland,Finland,Ukraine,Czech Republic,Slovakia,Portugal
3,Turkey,Russia,North Macedonia,Scotland,Poland,Hungary


In [32]:
import itertools

In [33]:
# `encoding` is not a documented read_excel keyword: older pandas only let it
# through via **kwds, and pandas >= 1.0 rejects it with a TypeError. Excel
# workbooks store text as Unicode, so no encoding has to be supplied.
df_feat_teams = pd.read_excel('features_last_game.xlsx')

In [34]:
def prepare_games_in_group(list_of_teams, df_feat_teams):
    """Build one row per pairing of a group, carrying the features of both teams.

    Every unordered pair of ``list_of_teams`` becomes a game with the columns
    ``team_A`` and ``team_B``. The feature table is then joined twice: once
    with all of its columns suffixed ``_A`` (matched on ``team_A``) and once
    suffixed ``_B`` (matched on ``team_B``). The joins are inner joins, so a
    game is dropped when either team has no row in ``df_feat_teams``.

    Parameters
    ----------
    list_of_teams : iterable
        Teams of the group.
    df_feat_teams : pandas.DataFrame
        Feature table; must contain a ``team`` column (it becomes the join
        key ``team_A`` / ``team_B`` after suffixing).

    Returns
    -------
    pandas.DataFrame
        Columns ``team_A``, ``team_B``, then the ``_A`` features, then the
        ``_B`` features.
    """
    pairings = list(itertools.combinations(list_of_teams, 2))
    games = pd.DataFrame(data = pairings, columns = ['team_A', 'team_B'])

    # Same join for both sides of the game; only the suffix differs.
    for side in ('_A', '_B'):
        feats_side = df_feat_teams.copy()
        feats_side.columns = [col + side for col in feats_side.columns]
        games = pd.merge(games, feats_side, how = 'inner', on = 'team' + side)

    return games

In [35]:
def change_A_B_in_cols(name):
    """Swap the trailing side marker of a column name ('_A' <-> '_B').

    Only the suffix is touched: an '_A' or '_B' occurring earlier in the name
    is left alone, and names without such a suffix (including the empty
    string) are returned unchanged.

    NOTE(review): a later cell defines a function with the same name; on a
    top-to-bottom run that later definition is the one in effect.

    Parameters
    ----------
    name : str
        Column name, e.g. 'goals_sim_A'.

    Returns
    -------
    str
        The name with its side suffix swapped, e.g. 'goals_sim_B'.
    """
    if name.endswith('_A'):
        return name[:-2] + '_B'
    if name.endswith('_B'):
        return name[:-2] + '_A'
    return name

In [36]:
from random import choices

In [37]:
from collections import Counter

In [38]:
def simulate_game(p_goals_A, p_goals_B):
    """Draw a random score line for a single game.

    Each side's goal count is sampled from 0..9 using the given weights, so
    both weight sequences must have exactly ten entries (weight ``k`` belongs
    to ``k`` goals). The draw for side A happens before the draw for side B.

    Parameters
    ----------
    p_goals_A, p_goals_B : sequence of float
        Sampling weights for 0, 1, ..., 9 goals of the respective side.

    Returns
    -------
    tuple of int
        ``(goals_A, goals_B)``.
    """
    goal_counts = list(range(10))
    return tuple(choices(goal_counts, weights)[0] for weights in (p_goals_A, p_goals_B))

In [39]:
def compute_points(goals_1, goals_2):
    """Points earned by the side that scored ``goals_1`` against ``goals_2``.

    Returns 3 for a win, 1 for a draw and 0 for a defeat.
    """
    if goals_1 == goals_2:
        return 1
    return 3 if goals_1 > goals_2 else 0

In [40]:
def change_A_B_in_cols(name):
    """Swap the trailing side marker of a column name ('_A' <-> '_B').

    Names that end in neither '_A' nor '_B' are returned unchanged instead of
    having their last two characters replaced (previously 'team' became
    'te_A').

    Parameters
    ----------
    name : str
        Column name, e.g. 'points_sim_A'.

    Returns
    -------
    str
        The name with its side suffix swapped, e.g. 'points_sim_B'.
    """
    if name.endswith('_A'):
        return name[:-2] + '_B'
    if name.endswith('_B'):
        return name[:-2] + '_A'
    return name

In [41]:
class Group():
    """One tournament group: all pairings, their goal distributions and a simulator.

    On construction every unordered pair of teams becomes a game
    (``team_A`` = first team of the pair, ``team_B`` = second team) and the
    model is asked once per game and side for a distribution over goal counts.
    ``simulate_group`` then samples scores from those distributions and ranks
    the teams.

    Parameters
    ----------
    list_of_teams : sequence
        Teams of the group.
    df_feat : pandas.DataFrame
        Feature table with a ``team`` column and exactly one row per team.
    model : object
        Must provide ``predict_proba(frame)``. It receives a frame with the
        columns ``team_A``, ``team_B`` and every feature column suffixed
        ``_A`` / ``_B``, and has to return one row of ten weights per game.
        NOTE(review): the simulation assumes weight ``k`` stands for ``k``
        goals of the ``_A`` side - confirm against the model's classes.

    Raises
    ------
    ValueError
        If a team has no row in ``df_feat`` (or, via pandas' merge validation,
        if a team has more than one row).
    """

    def __init__(self, list_of_teams, df_feat, model):
        self.list_of_teams = list_of_teams
        self.list_of_pairings = list(itertools.combinations(self.list_of_teams, 2))

        self.df_pairings = pd.DataFrame(data = self.get_pairings(), columns = ['team_A', 'team_B'])

        self.df_feat = df_feat[df_feat['team'].isin(list_of_teams)].reset_index(drop = True)

        # Fail early: a team without features would otherwise surface later as
        # a length mismatch between the pairings and the predictions.
        teams_with_feat = set(self.df_feat['team'])
        teams_missing = [team for team in list_of_teams if team not in teams_with_feat]
        if teams_missing:
            raise ValueError('No features available for: ' + ', '.join(str(team) for team in teams_missing))

        df_feat_A = self.df_feat.copy()
        df_feat_A.columns = [col + '_A' for col in df_feat_A.columns]

        df_feat_B = self.df_feat.copy()
        df_feat_B.columns = [col + '_B' for col in df_feat_B.columns]

        # Goals of the first team: the first team plays the '_A' role.
        df_feat_goals_A = self._attach_features(self.df_pairings,
                                                [('team_A', df_feat_A), ('team_B', df_feat_B)])

        # Goals of the second team: same games with the roles swapped, so the
        # second team of every pairing is the one labelled '_A'.
        df_pairings_swapped = self.df_pairings.copy()
        df_pairings_swapped.columns = ['team_B', 'team_A']
        df_feat_goals_B = self._attach_features(df_pairings_swapped,
                                                [('team_B', df_feat_B), ('team_A', df_feat_A)])

        # Row i of both prediction arrays belongs to row i of self.df_pairings.
        self.prob_distr_goals_A = model.predict_proba(df_feat_goals_A)
        self.prob_distr_goals_B = model.predict_proba(df_feat_goals_B)

    @staticmethod
    def _attach_features(df_games, keyed_features):
        """Join feature frames onto the games without changing the row order.

        Left joins are used because the predictions are matched to the
        pairings by row position; inner joins did not reliably keep the order
        of the left frame before pandas 2.2. ``validate`` rejects feature
        tables with more than one row per team, which would duplicate games.
        """
        for key, df_features in keyed_features:
            df_games = pd.merge(df_games, df_features, how = 'left', on = key, validate = 'many_to_one')
        return df_games

    def get_pairings(self):
        """Return the list of (first team, second team) tuples of this group."""
        return self.list_of_pairings

    def simulate_group(self):
        """Simulate every game of the group once and rank the teams.

        Teams are ordered by points, then goal difference, then goals scored
        (all descending).

        Returns
        -------
        list
            ``[first, second, third, third_points, third_goal_difference,
            third_goals, third_victories]`` - the names of the top three teams
            followed by the statistics of the third-placed team.
        """
        population_goals = list(range(10))
        df_results = self.df_pairings.copy()

        # All draws for the first teams happen before those for the second teams.
        df_results['goals_sim_A'] = [choices(population_goals, weights)[0] for weights in self.prob_distr_goals_A]
        df_results['goals_sim_B'] = [choices(population_goals, weights)[0] for weights in self.prob_distr_goals_B]

        goals_A, goals_B = df_results['goals_sim_A'], df_results['goals_sim_B']
        is_draw = goals_A == goals_B
        # 3 points for a win, 1 for a draw, 0 for a defeat.
        df_results['points_sim_A'] = np.select([goals_A > goals_B, is_draw], [3, 1], default = 0)
        df_results['points_sim_B'] = np.select([goals_B > goals_A, is_draw], [3, 1], default = 0)

        # Mirror every game so that each team shows up once in the '_A' role
        # per game it played. Re-selecting the original column order keeps the
        # two frames aligned for concat (no column-sorting FutureWarning).
        swap_sides = {col: col[:-1] + ('B' if col.endswith('_A') else 'A') for col in df_results.columns}
        df_results_mirrored = df_results.rename(columns = swap_sides)[list(df_results.columns)]

        df_results = pd.concat([df_results, df_results_mirrored], axis = 0).reset_index(drop = True)
        df_results['victories_sim_A'] = (df_results['points_sim_A'] == 3).astype(int)

        # Column selection on a groupby needs a list; the bare-tuple form was
        # removed in pandas 2.0.
        cols_to_sum = ['points_sim_A', 'goals_sim_A', 'goals_sim_B', 'victories_sim_A']
        df_results = df_results.groupby('team_A')[cols_to_sum].sum().reset_index()
        df_results['difference_sim_A'] = df_results['goals_sim_A'] - df_results['goals_sim_B']

        df_results = df_results.sort_values(by = ['points_sim_A', 'difference_sim_A', 'goals_sim_A'], ascending = False)
        df_results = df_results.reset_index(drop = True)

        cols_third_team = ['team_A', 'points_sim_A', 'difference_sim_A', 'goals_sim_A', 'victories_sim_A']

        return list(df_results['team_A'].values[:2]) + list(df_results[cols_third_team].iloc[2].values)

In [126]:
class EURO2020():
    """Tournament simulator: six groups followed by the round-of-16 draw.

    Parameters
    ----------
    df_schedule : pandas.DataFrame
        One column per group; the values of a column are the teams of that
        group. The columns are taken to be groups A..F in this order.
    df_feat : pandas.DataFrame
        Team feature table handed on to every ``Group``.
    model : object
        Goal model handed on to every ``Group``.
    """

    # Letters assigned to the six group results, in schedule order.
    GROUP_LETTERS = ['A', 'B', 'C', 'D', 'E', 'F']

    # Key: groups of the four best third-placed teams (letters sorted and
    # concatenated). Value: the group whose third-placed team meets the winner
    # of group B, C, E and F respectively (in that order).
    # NOTE(review): presumably mirrors the official UEFA EURO 2020 bracket
    # table - confirm against the regulations.
    OPPONENTS_OF_THIRDS = {'ABCD' : ['A', 'D', 'B', 'C'],
                           'ABCE' : ['A', 'E', 'B', 'C'],
                           'ABCF' : ['A', 'F', 'B', 'C'],
                           'ABDE' : ['D', 'E', 'A', 'B'],
                           'ABDF' : ['D', 'F', 'A', 'B'],
                           'ABEF' : ['E', 'F', 'B', 'A'],
                           'ACDE' : ['E', 'D', 'C', 'A'],
                           'ACDF' : ['F', 'D', 'C', 'A'],
                           'ACEF' : ['E', 'F', 'C', 'A'],
                           'ADEF' : ['E', 'F', 'D', 'A'],
                           'BCDE' : ['E', 'D', 'B', 'C'],
                           'BCDF' : ['F', 'D', 'C', 'B'],
                           'BCEF' : ['F', 'E', 'C', 'B'],
                           'BDEF' : ['F', 'E', 'D', 'B'],
                           'CDEF' : ['F', 'E', 'D', 'C']
                          }

    def __init__(self, df_schedule, df_feat, model):
        self.groups = [Group(df_schedule[col].values, df_feat, model) for col in df_schedule]

    def matrix_best_thirds(self, df_best_4_thirds):
        """Look up which third-placed teams meet the winners of groups B, C, E and F.

        Parameters
        ----------
        df_best_4_thirds : pandas.DataFrame
            Needs a ``group`` column holding the group letters of the four
            qualified third-placed teams (any order).

        Returns
        -------
        list of str
            Four group letters: the origin of the third-placed opponent of the
            winner of group B, C, E and F. (Previously the table was built but
            nothing was returned.)

        Raises
        ------
        KeyError
            If the combination of groups is not in the table.
        """
        best_4_thirds = ''.join(sorted(df_best_4_thirds['group']))
        return self.OPPONENTS_OF_THIRDS[best_4_thirds]

    def compute_round_of_last_sixteen(self, results_groups, verbose = True):
        """Derive the eight round-of-16 pairings from the group results.

        Parameters
        ----------
        results_groups : list of list
            One entry per group A..F, each of the form ``[winner, runner_up,
            third, points, goal_difference, goals, victories]`` where the
            statistics describe the third-placed team.
        verbose : bool, default True
            Print / display the intermediate tables (uses the notebook's
            ``display``). Pass False to compute silently.

        Returns
        -------
        list of list
            Eight ``[team, opponent]`` pairs.
        """
        ranking_criteria = ['points', 'goal_difference', 'goals', 'victories']
        data_thirds = [result_group[2:] for result_group in results_groups]

        df_ranking_thirds = pd.DataFrame(data = data_thirds, columns = ['team'] + ranking_criteria)
        df_ranking_thirds['group'] = self.GROUP_LETTERS

        df_ranking_thirds = df_ranking_thirds.sort_values(by = ranking_criteria, ascending = False).reset_index(drop = True)

        # Only the four best third-placed teams advance.
        df_best_4_thirds = df_ranking_thirds[['team', 'group']].iloc[:4].sort_values(by = ['group']).reset_index(drop = True)

        dict_group_to_team = dict(zip(df_best_4_thirds['group'], df_best_4_thirds['team']))
        third_vs_B, third_vs_C, third_vs_E, third_vs_F = [dict_group_to_team[group]
                                                          for group in self.matrix_best_thirds(df_best_4_thirds)]

        winners = [result_group[0] for result_group in results_groups]
        runners_up = [result_group[1] for result_group in results_groups]
        A, B, C, D, E, F = range(6)

        round_of_last_sixteen = [[winners[B], third_vs_B],
                                 [winners[A], runners_up[C]],
                                 [winners[F], third_vs_F],
                                 [runners_up[D], runners_up[E]],
                                 [winners[E], third_vs_E],
                                 [winners[D], runners_up[F]],
                                 [winners[C], third_vs_C],
                                 [runners_up[A], runners_up[B]]
                                ]

        if verbose:
            print(results_groups)
            display(df_ranking_thirds)
            display(df_best_4_thirds)

            print(round_of_last_sixteen)

        return round_of_last_sixteen

    def simulate_tournament(self, verbose = True):
        """Simulate all groups once and return the resulting round-of-16 pairings.

        The pairings used to be computed and then discarded; they are now
        returned so that callers can continue with them.
        """
        results_groups = [g.simulate_group() for g in self.groups]

        return self.compute_round_of_last_sixteen(results_groups, verbose = verbose)
    

In [110]:
# Build the tournament from the schedule and the team features; the goal
# distributions come from model_2 (the object loaded into `model` is not used here).
# NOTE(review): the execution counts are out of order (class cell In [126],
# this cell In [110]) - re-run top to bottom on a fresh kernel before trusting
# the recorded outputs.
euro2020 = EURO2020(df_schedule, df_feat_teams, model_2)

In [125]:
# Simulate one tournament. Scores are random draws and no seed is set, so the
# output below changes from run to run.
euro2020.simulate_tournament()

[['Turkey', 'Switzerland', 'Italy', 3, -1, 3, 1], ['Russia', 'Belgium', 'Denmark', 3, -2, 6, 1], ['Netherlands', 'Ukraine', 'Austria', 3, -1, 5, 1], ['Scotland', 'England', 'Czech Republic', 4, -1, 3, 1], ['Spain', 'Sweden', 'Slovakia', 3, -3, 1, 1], ['Germany', 'Hungary', 'France', 3, -1, 4, 1]]


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




Unnamed: 0,team,points,goal_difference,goals,victories,group
0,Czech Republic,4,-1,3,1,D
1,Austria,3,-1,5,1,C
2,France,3,-1,4,1,F
3,Italy,3,-1,3,1,A
4,Denmark,3,-2,6,1,B
5,Slovakia,3,-3,1,1,E


Unnamed: 0,team,group
0,Italy,A
1,Austria,C
2,Czech Republic,D
3,France,F


[['Russia', 'France'], ['Turkey', 'Ukraine'], ['Germany', 'Italy'], ['England', 'Sweden'], ['Spain', 'Austria'], ['Scotland', 'Hungary'], ['Netherlands', 'Czech Republic'], ['Switzerland', 'Belgium']]
