In [38]:
import numpy as np
import pandas as pd

In [39]:
def read_and_prepare_data():
    df = pd.read_excel('../data/data_from_uefa.xlsx', encoding = 'iso-8859-1')
    
    df = df.drop(columns = ['URL']).rename(columns = {'Team_Home' : 'team_home',
                                                      'Team_Away' : 'team_away',
                                                      'Date' : 'date'
                                                     })
    
    df['date'] = pd.to_datetime(df['date'])
    
    cols_to_transform = ['possession_home', 'possession_away']

    for col in cols_to_transform:
        df[col] = df[col].apply(lambda possession_string : possession_string[:-1]).astype(int)
        
    cols_teams = ['team_home', 'team_away']

    for col in cols_teams:
        df[col] = df[col].apply(lambda team : team if team != 'FYR Macedonia' else 'North Macedonia')
        
    return df

In [40]:
def change_name_home_away(name):
    if name[-4:] == 'home':
        return name[:-4] + 'away'
    elif name[-4:] == 'away':
        return name[:-4] + 'home'
    else:
        return name

In [41]:
df_data = read_and_prepare_data()

In [42]:
def transform_to_mirrored_data(df_data):
    cols_mirrored = [change_name_home_away(col) for col in df_data.columns]
    
    df_mirrored = df_data.copy()
    df_mirrored.columns = cols_mirrored
    
    df_all = pd.concat([df_data, df_mirrored]).reset_index(drop = True)
    
    cols_to_keep = [col for col in df_all.columns if '_away' not in col] 
    
    df_all = df_all[cols_to_keep]
    
    cols_to_keep_2 = [col[:-5] if col[-5:] == '_home' else col for col in cols_to_keep]
    
    df_all.columns = cols_to_keep_2
    
    return df_all

In [43]:
df_train = transform_to_mirrored_data(df_data)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  import sys


In [44]:
def weighted_mean(c, arr):
    n = len(arr)
    
    weights = [2**(-c*m) for m in range(0, n)][::-1]
    
    return np.dot(weights[1:], arr[:-1])/sum(weights)

In [45]:
def normal_mean(arr):
    n = len(arr)
    
    return sum(arr[:-1])/(n-1)

In [46]:
def compute_features_for_country(df_train, country):
    df = df_train.copy()
    df = df[df['team'] == country].sort_values(by = ['date'], ascending = [1]).reset_index(drop = True)
    
    df_first_game_duplicated = pd.concat([df.iloc[:1] for i in range(9)])

    df_base = pd.concat([df_first_game_duplicated, df]).reset_index(drop = True)
    
    col_feat = [col for col in df_train.columns if col not in ['team', 'date']]
    
    # Compute (weighted) average for each feature on the previous games
    for col in col_feat:
        # Compute weighted mean for feature for last 5 and last 10 games
        df_base[col + '_weighted_mean_5']  = df_base[col].rolling(5+1).apply(lambda arr : weighted_mean(0.25, arr))
        df_base[col + '_weighted_mean_10'] = df_base[col].rolling(10+1).apply(lambda arr : weighted_mean(0.25, arr))
        
        # Compute normal mean for feature for last 5 and last 10 games
        df_base[col + '_normal_mean_5']  = df_base[col].rolling(5+1).apply(lambda arr : normal_mean(arr))
        df_base[col + '_normal_mean_10'] = df_base[col].rolling(10+1).apply(lambda arr : normal_mean(arr))
    
    df_base = df_base[10:].reset_index(drop = True)
    
    return df_base

In [47]:
dfuu = compute_features_for_country(df_train, 'Albania')

  
  from ipykernel import kernelapp as app


In [48]:
countries_all = df_train['team'].unique()

In [49]:
dfs_data_countries = [compute_features_for_country(df_train, country) for country in countries_all]

  
  from ipykernel import kernelapp as app


In [50]:
df_data_countries = pd.concat(dfs_data_countries)

In [51]:
cols_to_drop = ['attempts_blocked', 'attempts_off_target', 'attempts_on_target',
                'attempts_total', 'balls_recovered', 'blocks', 'clearances', 'corners',
                'offsides', 'passes_accuracy', 'passes_completed', 'passes', 'possession', 'tackles']

In [52]:
df_data_countries = df_data_countries.drop(columns = cols_to_drop)

In [53]:
df_data_countries

Unnamed: 0,date,goals,team,attempts_blocked_weighted_mean_5,attempts_blocked_weighted_mean_10,attempts_blocked_normal_mean_5,attempts_blocked_normal_mean_10,attempts_off_target_weighted_mean_5,attempts_off_target_weighted_mean_10,attempts_off_target_normal_mean_5,...,passes_normal_mean_5,passes_normal_mean_10,possession_weighted_mean_5,possession_weighted_mean_10,possession_normal_mean_5,possession_normal_mean_10,tackles_weighted_mean_5,tackles_weighted_mean_10,tackles_normal_mean_5,tackles_normal_mean_10
0,2018-09-10 18:45:00+00:00,0,Italy,3.586077,3.867853,4.0,4.0,6.275634,6.768743,7.0,...,555.0,555.0,52.894632,57.050832,59.0,59.0,1.793038,1.933927,2.0,2.0
1,2018-10-14 18:45:00+00:00,1,Italy,3.093836,3.494085,3.6,3.8,5.783394,6.394975,6.6,...,530.2,542.6,50.187310,54.995107,56.8,57.9,3.269760,3.055231,3.2,2.6
2,2018-11-17 19:45:00+00:00,0,Italy,3.418274,3.740437,3.8,3.9,5.861711,6.454442,6.6,...,553.0,554.0,52.833137,57.004137,58.6,58.8,3.527049,3.250595,3.6,2.8
3,2019-03-23 19:45:00+00:00,2,Italy,3.691092,3.947593,4.0,4.0,5.927568,6.504449,6.6,...,605.0,580.0,54.565762,58.319755,60.0,59.5,3.497282,3.227993,3.8,2.9
4,2019-03-26 19:45:00+00:00,6,Italy,2.936023,3.374254,3.4,3.7,5.982946,6.546499,6.6,...,629.2,592.1,53.561519,57.557212,59.4,59.2,3.964491,3.582754,4.4,3.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14,2019-11-18 19:45:00+00:00,0,Liechtenstein,1.131170,1.030355,1.2,0.9,2.230245,2.325129,2.8,...,311.6,281.6,28.141311,30.498868,31.6,32.1,3.511064,5.206962,4.4,6.3
15,2020-09-08 18:45:00+00:00,2,Liechtenstein,0.847716,0.833385,1.0,0.8,1.358001,1.823046,1.8,...,304.4,273.8,26.790826,29.304715,30.2,30.6,3.212556,4.927832,3.8,6.1
16,2020-10-10 16:00:00+00:00,0,Liechtenstein,1.205082,1.074559,1.4,1.0,2.800458,2.928955,2.8,...,333.6,285.7,33.128300,33.742867,35.8,33.0,2.508460,4.440154,2.6,5.6
17,2020-10-13 18:45:00+00:00,0,Liechtenstein,2.529229,2.211781,2.4,1.7,3.870775,3.705063,3.8,...,365.0,325.9,38.524609,38.882192,39.6,36.4,3.132995,4.502946,3.2,5.6


In [54]:
def uuuvvv(df_data, df_data_countries):
    df_xx = df_data[['date', 'team_home', 'team_away']].rename(columns = {'team_home' : 'team_A',
                                                                          'team_away' : 'team_B'
                                                                         })
    
    df_yy = df_data[['date', 'team_home', 'team_away']].copy().rename(columns = {'team_home' : 'team_B',
                                                                                 'team_away' : 'team_A'
                                                                                })
    
    df_xx = pd.concat([df_xx, df_yy]).reset_index(drop = True)
    
    cols_a = [col + '_A' if col not in ['date'] else col for col in df_data_countries.columns]
    cols_b = [col + '_B' if col not in ['date'] else col for col in df_data_countries.columns]
    
    df_A = df_data_countries.copy()
    df_A.columns = cols_a
    
    df_B = df_data_countries.copy()
    df_B.columns = cols_b
    
    df_merge = pd.merge(df_xx, df_A, how = 'left', on = ['date', 'team_A'])
    df_merge = pd.merge(df_merge, df_B, how = 'left', on = ['date', 'team_B'])
    
    return df_merge

In [55]:
df_pp = uuuvvv(df_data, df_data_countries)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  # Remove the CWD from sys.path while we load stuff.


In [56]:
df_tt = df_pp.dropna().reset_index(drop = True)

In [57]:
from sklearn.base import BaseEstimator, TransformerMixin

In [58]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif, mutual_info_classif

In [59]:
from sklearn.pipeline import Pipeline

In [69]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [61]:
from sklearn.ensemble import RandomForestClassifier

In [135]:
class FeatureSelectionTransformer(BaseEstimator, TransformerMixin):
#     def __init__(self):
#         pass
    
    def __init__(self, type_feat = 'weighted_mean_10'):
        self.type_feat = type_feat
    
    def fit(self, X, y = None):
        return self
    
    def transform(self, X):
        cols_feat = X.columns
        cols_feat_selected = [col for col in cols_feat if self.type_feat in col]
        
        return X[cols_feat_selected]

In [68]:
df_tt

Unnamed: 0,date,team_A,team_B,goals_A,attempts_blocked_weighted_mean_5_A,attempts_blocked_weighted_mean_10_A,attempts_blocked_normal_mean_5_A,attempts_blocked_normal_mean_10_A,attempts_off_target_weighted_mean_5_A,attempts_off_target_weighted_mean_10_A,...,passes_normal_mean_5_B,passes_normal_mean_10_B,possession_weighted_mean_5_B,possession_weighted_mean_10_B,possession_normal_mean_5_B,possession_normal_mean_10_B,tackles_weighted_mean_5_B,tackles_weighted_mean_10_B,tackles_normal_mean_5_B,tackles_normal_mean_10_B
0,2020-09-04 18:45:00+00:00,Italy,Bosnia and Herzegovina,1.0,4.115240,4.350733,5.2,4.9,6.271875,6.997977,...,555.8,526.2,50.896133,53.608191,56.0,54.0,2.893939,3.650661,3.6,4.4
1,2020-09-04 18:45:00+00:00,Netherlands,Poland,1.0,3.308853,3.900806,4.0,4.4,6.715011,6.242044,...,505.6,475.4,47.466051,51.484991,52.2,52.8,4.305637,4.555010,5.6,5.0
2,2020-09-07 18:45:00+00:00,Bosnia and Herzegovina,Poland,1.0,3.707657,4.064236,4.2,4.3,5.822317,6.209453,...,503.6,476.0,44.699450,48.479980,51.8,52.2,3.363306,4.478718,4.0,5.1
3,2020-09-07 18:45:00+00:00,Netherlands,Italy,0.0,3.495600,4.214594,4.0,4.9,6.502466,6.238070,...,688.2,676.3,58.205986,62.594606,66.2,65.6,2.675918,3.249303,3.2,4.0
4,2020-10-11 16:00:00+00:00,Bosnia and Herzegovina,Netherlands,0.0,3.520708,3.716894,4.2,4.1,6.502954,6.584102,...,671.6,660.6,54.721647,59.652674,63.4,62.3,3.973302,4.905887,4.0,5.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1059,2020-11-12 17:00:00+00:00,North Macedonia,Georgia,1.0,3.651100,3.676323,3.6,3.4,2.502230,3.046163,...,491.0,453.9,49.239538,50.970844,55.4,51.4,4.320377,4.071855,4.8,3.6
1060,2018-10-11 19:45:00+00:00,Sweden,Russia,0.0,0.000000,0.000000,0.0,0.0,5.379115,5.801780,...,328.0,328.0,34.964249,37.711567,39.0,39.0,5.379115,5.801780,6.0,6.0
1061,2018-10-14 16:00:00+00:00,Turkey,Russia,0.0,1.793038,1.933927,2.0,2.0,2.443437,2.714006,...,372.4,350.2,39.886653,41.449248,43.0,41.0,5.132995,5.614895,5.8,5.9
1062,2019-11-15 19:45:00+00:00,Georgia,Switzerland,0.0,4.318207,4.289955,4.6,4.1,5.347431,5.756696,...,515.0,533.4,52.018420,55.527279,57.2,56.7,2.972601,3.740749,3.4,3.9


In [85]:
cols_blacklist = ['date', 'team_A', 'team_B', 'goals_A', 'goals_B']
cols_to_keep = [col for col in df_tt.columns if col not in cols_blacklist]

In [101]:
X = df_tt[cols_to_keep]
y = df_tt['goals_A'].astype(int)

In [99]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

In [139]:
pipe = Pipeline([('Transformer_FeatSelection', FeatureSelectionTransformer()),
                 ('clf', RandomForestClassifier(n_estimators = 200, n_jobs = 4))
                ])

In [155]:
parameters = {'Transformer_FeatSelection__type_feat' : ['weighted_mean_5', 'weighted_mean_10', 'normal_mean_5', 'normal_mean_10'],
              'clf__max_depth' : [3, 5, 10, 20]
             }

In [156]:
cv = GridSearchCV(pipe, param_grid = parameters, verbose = 3)

In [157]:
cv.fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] Transformer_FeatSelection__type_feat=weighted_mean_5, clf__max_depth=3 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  Transformer_FeatSelection__type_feat=weighted_mean_5, clf__max_depth=3, score=0.319, total=   2.4s
[CV] Transformer_FeatSelection__type_feat=weighted_mean_5, clf__max_depth=3 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.3s remaining:    0.0s


[CV]  Transformer_FeatSelection__type_feat=weighted_mean_5, clf__max_depth=3, score=0.338, total=   0.7s
[CV] Transformer_FeatSelection__type_feat=weighted_mean_5, clf__max_depth=3 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.0s remaining:    0.0s


[CV]  Transformer_FeatSelection__type_feat=weighted_mean_5, clf__max_depth=3, score=0.356, total=   0.8s
[CV] Transformer_FeatSelection__type_feat=weighted_mean_5, clf__max_depth=3 
[CV]  Transformer_FeatSelection__type_feat=weighted_mean_5, clf__max_depth=3, score=0.377, total=   0.7s
[CV] Transformer_FeatSelection__type_feat=weighted_mean_5, clf__max_depth=3 
[CV]  Transformer_FeatSelection__type_feat=weighted_mean_5, clf__max_depth=3, score=0.384, total=   0.7s
[CV] Transformer_FeatSelection__type_feat=weighted_mean_5, clf__max_depth=5 
[CV]  Transformer_FeatSelection__type_feat=weighted_mean_5, clf__max_depth=5, score=0.325, total=   0.7s
[CV] Transformer_FeatSelection__type_feat=weighted_mean_5, clf__max_depth=5 
[CV]  Transformer_FeatSelection__type_feat=weighted_mean_5, clf__max_depth=5, score=0.356, total=   0.7s
[CV] Transformer_FeatSelection__type_feat=weighted_mean_5, clf__max_depth=5 
[CV]  Transformer_FeatSelection__type_feat=weighted_mean_5, clf__max_depth=5, score=0.338,

[CV]  Transformer_FeatSelection__type_feat=normal_mean_5, clf__max_depth=5, score=0.356, total=   0.7s
[CV] Transformer_FeatSelection__type_feat=normal_mean_5, clf__max_depth=5 
[CV]  Transformer_FeatSelection__type_feat=normal_mean_5, clf__max_depth=5, score=0.377, total=   0.7s
[CV] Transformer_FeatSelection__type_feat=normal_mean_5, clf__max_depth=5 
[CV]  Transformer_FeatSelection__type_feat=normal_mean_5, clf__max_depth=5, score=0.390, total=   0.7s
[CV] Transformer_FeatSelection__type_feat=normal_mean_5, clf__max_depth=10 
[CV]  Transformer_FeatSelection__type_feat=normal_mean_5, clf__max_depth=10, score=0.362, total=   0.8s
[CV] Transformer_FeatSelection__type_feat=normal_mean_5, clf__max_depth=10 
[CV]  Transformer_FeatSelection__type_feat=normal_mean_5, clf__max_depth=10, score=0.331, total=   0.7s
[CV] Transformer_FeatSelection__type_feat=normal_mean_5, clf__max_depth=10 
[CV]  Transformer_FeatSelection__type_feat=normal_mean_5, clf__max_depth=10, score=0.319, total=   0.7s
[

[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:  1.0min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('Transformer_FeatSelection',
                                        FeatureSelectionTransformer(type_feat='weighted_mean_10')),
                                       ('clf',
                                        RandomForestClassifier(bootstrap=True,
                                                               ccp_alpha=0.0,
                                                               class_weight=None,
                                                               criterion='gini',
                                                               max_depth=None,
                                                               max_features='auto',
                                                               max_leaf_nodes=None,
                                                               max_samples=None,
                                                    

In [158]:
cv.best_params_

{'Transformer_FeatSelection__type_feat': 'weighted_mean_10',
 'clf__max_depth': 3}

In [161]:
cv.predict(X_test)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 0., 1., 0., 0., 1.,
       1., 0., 1., 1., 0., 0., 0., 1., 1., 1., 1., 0., 0., 1., 1., 1., 2.,
       1., 0., 1., 1., 1., 0., 1., 0., 1., 0., 0., 1., 0., 1., 2., 1., 1.,
       1., 0., 0., 1., 2., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 1., 0.,
       2., 1., 0., 0., 1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 1., 2., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0.,
       1., 1., 1., 1., 1., 1., 1., 0., 1., 0., 1., 0., 1., 1., 0., 1., 1.,
       0., 1., 1., 1., 1., 1., 1., 1., 2., 0., 1., 1., 1., 1., 1., 1., 2.,
       1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 0.,
       1., 0., 1., 1., 0., 0., 2., 1., 1., 1., 1., 1., 1., 1., 0., 0., 1.,
       1., 0., 1., 2., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       0., 1., 1., 1., 0., 1., 0., 1., 1., 0., 0., 1., 0., 1., 1., 1., 1.,
       0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 1., 1., 1., 1., 1.,
       1., 0., 1., 1., 1.

In [167]:
cv.predict_proba(X_test)[0]

array([0.20275701, 0.27568007, 0.27407264, 0.1071674 , 0.07632032,
       0.0320423 , 0.0195317 , 0.00628393, 0.00323491, 0.00290971])

In [169]:
cv.predict_proba(X_test)[1]

array([0.23533503, 0.30444064, 0.24306479, 0.11409837, 0.05492369,
       0.03108053, 0.01007087, 0.00297745, 0.00261363, 0.00139501])

In [171]:
cv.predict_proba(X_test)[2]

array([0.27805535, 0.35292796, 0.21073321, 0.08598977, 0.04414463,
       0.0198928 , 0.00563296, 0.00143805, 0.00060396, 0.00058131])

In [172]:
p1 = cv.predict_proba(X_test)[0]
p2 = cv.predict_proba(X_test)[1]

In [174]:
np.transpose(p2)

array([0.23533503, 0.30444064, 0.24306479, 0.11409837, 0.05492369,
       0.03108053, 0.01007087, 0.00297745, 0.00261363, 0.00139501])

In [177]:
# >>> from random import choices
# >>> population = [1, 2, 3, 4, 5, 6]
# >>> weights = [0.1, 0.05, 0.05, 0.2, 0.4, 0.2]
# Now choices(population, weights) generates a single sample:

# >>> choices(population, weights)

In [178]:
from random import choices

In [180]:
[i for i in range(10)]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [194]:
p1

array([0.20275701, 0.27568007, 0.27407264, 0.1071674 , 0.07632032,
       0.0320423 , 0.0195317 , 0.00628393, 0.00323491, 0.00290971])

In [196]:
p1.T

array([0.20275701, 0.27568007, 0.27407264, 0.1071674 , 0.07632032,
       0.0320423 , 0.0195317 , 0.00628393, 0.00323491, 0.00290971])

In [198]:
p1_T = np.reshape(p1, (10, 1))

In [205]:
p2_x = np.reshape(p2, (1, 10))

In [200]:
np.dot(p1_T, p2)

ValueError: shapes (10,1) and (10,) not aligned: 1 (dim 1) != 10 (dim 0)

In [208]:
np.matmul(p1_T, p2_x)

array([[4.77158254e-02, 6.17274735e-02, 4.92830884e-02, 2.31342436e-02,
        1.11361633e-02, 6.30179563e-03, 2.04193849e-03, 6.03698017e-04,
        5.29931903e-04, 2.82847600e-04],
       [6.48771773e-02, 8.39282190e-02, 6.70081181e-02, 3.14546466e-02,
        1.51413674e-02, 8.56828336e-03, 2.77633687e-03, 8.20822505e-04,
        7.20525859e-04, 3.84575846e-04],
       [6.44988930e-02, 8.34388522e-02, 6.66174087e-02, 3.12712415e-02,
        1.50530814e-02, 8.51832361e-03, 2.76014867e-03, 8.16036472e-04,
        7.16324633e-04, 3.82333470e-04],
       [2.52202426e-02, 3.26261119e-02, 2.60486209e-02, 1.22276253e-02,
        5.88602917e-03, 3.33081977e-03, 1.07926843e-03, 3.19085132e-04,
        2.80095986e-04, 1.49499355e-04],
       [1.79608451e-02, 2.32350081e-02, 1.85507829e-02, 8.70802428e-03,
        4.19179386e-03, 2.37207623e-03, 7.68611683e-04, 2.27239632e-04,
        1.99473126e-04, 1.06467443e-04],
       [7.54067666e-03, 9.75497992e-03, 7.78835599e-03, 3.65597471e-03,
   

In [203]:
p1_T.shape

(10, 1)

In [204]:
p2

array([0.23533503, 0.30444064, 0.24306479, 0.11409837, 0.05492369,
       0.03108053, 0.01007087, 0.00297745, 0.00261363, 0.00139501])

In [212]:
def get_result_max_points(p_goals_A, p_goals_B):
    p_goals_A = np.reshape(p_goals_A, (10, 1))
    p_goals_B = np.reshape(p_goals_B, (1, 10))
    
    mat_prob = np.matmul(p_goals_A, p_goals_B)
    
    return mat_prob

In [216]:
A = [0.2, 0.2, 0.6, 0, 0, 0, 0, 0, 0, 0]
B = [0.3, 0.3, 0.4, 0, 0, 0, 0, 0, 0, 0]

mat = get_result_max_points(A, B)

In [217]:
mat

array([[0.06, 0.06, 0.08, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.06, 0.06, 0.08, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.18, 0.18, 0.24, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ]])

In [223]:
np.diagonal(mat, 1)

array([0.06, 0.08, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ])

In [186]:
def simulate_game(p_goals_A, p_goals_B):
    population_goals = [i for i in range(10)]
    goals_sim_A = choices(population_goals, p_goals_A)[0]
    goals_sim_B = choices(population_goals, p_goals_B)[0]
    
    return goals_sim_A, goals_sim_B

In [195]:
[simulate_game(p1, p2) for i in range(50)]

[(1, 2),
 (2, 4),
 (2, 0),
 (5, 4),
 (2, 1),
 (0, 2),
 (0, 0),
 (1, 2),
 (1, 0),
 (2, 2),
 (4, 2),
 (1, 1),
 (3, 3),
 (3, 2),
 (1, 1),
 (0, 4),
 (3, 0),
 (0, 1),
 (1, 1),
 (6, 0),
 (2, 1),
 (2, 0),
 (0, 3),
 (3, 1),
 (1, 4),
 (0, 1),
 (2, 1),
 (0, 1),
 (2, 1),
 (2, 1),
 (3, 1),
 (4, 0),
 (2, 1),
 (2, 0),
 (1, 1),
 (0, 0),
 (2, 0),
 (1, 1),
 (3, 1),
 (8, 3),
 (1, 1),
 (2, 2),
 (5, 3),
 (0, 1),
 (3, 0),
 (4, 1),
 (3, 1),
 (1, 0),
 (2, 1),
 (2, 1)]

In [176]:
np.outer(p1, p2)

array([[4.77158254e-02, 6.17274735e-02, 4.92830884e-02, 2.31342436e-02,
        1.11361633e-02, 6.30179563e-03, 2.04193849e-03, 6.03698017e-04,
        5.29931903e-04, 2.82847600e-04],
       [6.48771773e-02, 8.39282190e-02, 6.70081181e-02, 3.14546466e-02,
        1.51413674e-02, 8.56828336e-03, 2.77633687e-03, 8.20822505e-04,
        7.20525859e-04, 3.84575846e-04],
       [6.44988930e-02, 8.34388522e-02, 6.66174087e-02, 3.12712415e-02,
        1.50530814e-02, 8.51832361e-03, 2.76014867e-03, 8.16036472e-04,
        7.16324633e-04, 3.82333470e-04],
       [2.52202426e-02, 3.26261119e-02, 2.60486209e-02, 1.22276253e-02,
        5.88602917e-03, 3.33081977e-03, 1.07926843e-03, 3.19085132e-04,
        2.80095986e-04, 1.49499355e-04],
       [1.79608451e-02, 2.32350081e-02, 1.85507829e-02, 8.70802428e-03,
        4.19179386e-03, 2.37207623e-03, 7.68611683e-04, 2.27239632e-04,
        1.99473126e-04, 1.06467443e-04],
       [7.54067666e-03, 9.75497992e-03, 7.78835599e-03, 3.65597471e-03,
   

In [173]:
np.dot(p1, p2)

0.21590468933629348

In [170]:
cv.classes_

array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.])

In [146]:
cv.param_grid

{'Transformer_FeatSelection__type_feat': ['weighted_mean_5',
  'weighted_mean_10'],
 'clf__max_depth': [3, 5, 10, 20]}

In [153]:
cv.cv_results_

{'mean_fit_time': array([0.99605694, 0.66063781, 0.69343972, 0.64923706, 0.49782839,
        0.55343161, 0.64843702, 0.66663823]),
 'std_fit_time': array([0.85361018, 0.07191399, 0.04220954, 0.0193857 , 0.00487467,
        0.01908624, 0.043184  , 0.03449022]),
 'mean_score_time': array([0.11340656, 0.10620604, 0.10580606, 0.10560608, 0.10520611,
        0.10520597, 0.10520616, 0.10620613]),
 'std_score_time': array([0.01630568, 0.0007483 , 0.00040004, 0.00048988, 0.00074843,
        0.00039997, 0.00040011, 0.00097972]),
 'param_Transformer_FeatSelection__type_feat': masked_array(data=['weighted_mean_5', 'weighted_mean_5',
                    'weighted_mean_5', 'weighted_mean_5',
                    'weighted_mean_10', 'weighted_mean_10',
                    'weighted_mean_10', 'weighted_mean_10'],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_clf__max_depth': masked_array(data=[3, 5, 10, 20, 3, 5

In [134]:
trf = FeatureSelectionTransformer()

In [132]:
trf.transform(X)

Unnamed: 0,attempts_blocked_weighted_mean_10_A,attempts_off_target_weighted_mean_10_A,attempts_on_target_weighted_mean_10_A,attempts_total_weighted_mean_10_A,balls_recovered_weighted_mean_10_A,blocks_weighted_mean_10_A,clearances_weighted_mean_10_A,corners_weighted_mean_10_A,goals_weighted_mean_10_A,offsides_weighted_mean_10_A,...,blocks_weighted_mean_10_B,clearances_weighted_mean_10_B,corners_weighted_mean_10_B,goals_weighted_mean_10_B,offsides_weighted_mean_10_B,passes_accuracy_weighted_mean_10_B,passes_completed_weighted_mean_10_B,passes_weighted_mean_10_B,possession_weighted_mean_10_B,tackles_weighted_mean_10_B
0,4.350733,6.997977,8.269185,19.617894,32.394677,2.537389,0.000000,6.899966,4.116726,2.230526,...,2.709952,0.000000,6.461972,1.955137,2.981539,84.556986,474.059219,537.252444,53.608191,3.650661
1,3.900806,6.242044,7.412890,17.555740,37.947339,2.086250,1.327467,4.785528,2.637424,2.143022,...,2.688630,0.000000,5.470276,1.925197,2.365329,81.102871,401.922191,476.360472,51.484991,4.555010
2,4.064236,6.209453,5.941799,16.215489,31.365653,3.026326,0.000000,5.477250,1.764878,2.474128,...,3.063133,0.000000,5.116214,1.585854,2.736533,80.144467,379.206983,453.999206,48.479980,4.478718
3,4.214594,6.238070,6.782789,17.235452,36.525201,1.622173,1.116262,4.859443,2.272537,2.329675,...,2.221455,0.000000,6.912125,3.582551,2.084226,86.418340,580.143191,648.230190,62.594606,3.249303
4,3.716894,6.584102,5.492932,15.793928,31.013933,3.618265,0.000000,5.532134,1.536431,2.268905,...,1.891695,0.938661,4.910262,1.844895,2.200636,83.784879,545.029763,625.112855,59.652674,4.905887
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1059,3.676323,3.046163,3.926705,10.649190,34.552728,2.167265,0.000000,3.808335,1.537300,2.310749,...,3.021370,0.000000,3.319682,1.075403,1.956042,81.154173,381.265915,452.018575,50.970844,4.071855
1060,0.000000,5.801780,4.834816,10.636596,43.513346,1.933927,0.000000,3.867853,1.933927,1.933927,...,1.933927,0.000000,2.900890,1.933927,1.933927,71.555281,235.939034,317.163947,37.711567,5.801780
1061,1.933927,2.714006,5.614895,10.262828,31.844701,1.560158,0.000000,5.614895,1.340731,2.275152,...,2.120811,0.000000,3.274658,1.560158,1.747042,73.237237,275.371572,358.652210,41.449248,5.614895
1062,4.289955,5.756696,3.561331,13.607982,33.374911,3.005356,0.039288,3.963334,1.227806,2.178716,...,1.632866,3.716428,7.640405,1.713576,1.629652,81.562770,426.377891,505.071362,55.527279,3.740749


In [63]:
# >>> from sklearn.datasets import load_iris
# >>> from sklearn.feature_selection import SelectKBest
# >>> from sklearn.feature_selection import chi2
# >>> X, y = load_iris(return_X_y=True)
# >>> X.shape
# (150, 4)
# >>> X_new = SelectKBest(chi2, k=2).fit_transform(X, y)
# >>> X_new.shape
# (150, 2)

In [64]:
cols = [col for col in df_tt.columns if col not in ['date', 'team_A', 'team_B', 'goals_A', 'goals_B']]

In [65]:
X = df_tt[cols]
y = df_tt['goals_A']

In [66]:
mmm = SelectKBest(f_classif, k = 5).fit(X, y)

In [67]:
mmm.get_support

<bound method SelectorMixin.get_support of SelectKBest(k=5, score_func=<function f_classif at 0x000000000AF80730>)>