In [52]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn import model_selection
from sklearn.metrics import accuracy_score
import xgboost as xgb
import warnings
import matplotlib.pyplot as plt
import scipy.stats as stats

warnings.filterwarnings('ignore')

# Load the data

In [3]:
train_home_team_statistics_df = pd.read_csv('./datas_final/train_home_team_statistics_df.csv', index_col=0)
train_away_team_statistics_df = pd.read_csv('./datas_final/train_away_team_statistics_df.csv', index_col=0)

train_scores = pd.read_csv('./datas_final/Y_train.csv', index_col=0)

train_home = train_home_team_statistics_df.iloc[:,2:]
train_away = train_away_team_statistics_df.iloc[:,2:]

train_home.columns = 'HOME_' + train_home.columns
train_away.columns = 'AWAY_' + train_away.columns

train_data =  pd.concat([train_home,train_away],join='inner',axis=1)
train_scores = train_scores.loc[train_data.index]

train_data = train_data.replace({np.inf:np.nan,-np.inf:np.nan})


In [4]:
test_home_team_statistics_df = pd.read_csv('./Test_Data/test_home_team_statistics_df.csv', index_col=0)
test_away_team_statistics_df = pd.read_csv('./Test_Data/test_away_team_statistics_df.csv', index_col=0)

test_home = train_home_team_statistics_df.iloc[:,2:]
test_away = train_away_team_statistics_df.iloc[:,2:]

test_home.columns = 'HOME_' + train_home.columns
test_away.columns = 'AWAY_' + train_away.columns

test_data =  pd.concat([train_home,train_away],join='inner',axis=1)

test_data = test_data.replace({np.inf:np.nan,-np.inf:np.nan})


In [None]:
train_home_player_statistics_df = pd.read_csv('./datas_final/train_home_player_statistics_df.csv', index_col=0)
train_away_player_statistics_df = pd.read_csv('./datas_final/train_away_player_statistics_df.csv', index_col=0)

In [5]:
train_home_player_statistics_df = pd.read_csv('./Test_Data/test_home_player_statistics_df.csv', index_col=0)
train_away_player_statistics_df = pd.read_csv('./Test_Data/test_away_player_statistics_df.csv', index_col=0)

KeyboardInterrupt: 

In [8]:
#### Number of data in the train and testing set 

train_data.shape, test_data.shape ### same number of training data and testing data, 280 features only if we do not consider the players

((12303, 280), (12303, 280))

# Distribution of the features

## Sanity check: symmetry of the distribution of the home and away features

In [67]:
features_home[0][5:]

'TEAM_SHOTS_TOTAL_season_sum'

In [68]:
def compare_moments(df1, df2, features_home, features_away):
    comparison = pd.DataFrame(index=[feature[5:] for feature in features_home], columns=['Mean1', 'Mean2', 'Var1', 'Var2', 'Skew1', 'Skew2', 'Kurt1', 'Kurt2'])

    for idx in range(len(features_home)):
        feature_home, feature_away = features_home[idx], feature_away[idx]
        comparison.at[feature_home, 'Mean1'] = df1[feature_home].mean()
        comparison.at[feature_away, 'Mean2'] = df2[feature_away].mean()
        comparison.at[feature_home, 'Var1'] = df1[feature_home].var()
        comparison.at[feature_away, 'Var2'] = df2[feature_away].var()
        comparison.at[feature_home, 'Skew1'] = df1[feature_home].skew()
        comparison.at[feature_away, 'Skew2'] = df2[feature_away].skew()
        comparison.at[feature_home, 'Kurt1'] = df1[feature_home].kurtosis()
        comparison.at[feature_away, 'Kurt2'] = df2[feature_away].kurtosis()
        
    return comparison

In [47]:
features = train_data.columns.tolist() #home and away should be symmetric 
features_home, features_away = [feature for feature in features if feature.split('_')[0] == 'HOME'], [feature for feature in features if feature.split('_')[0] == 'AWAY']
len(features_home), len(features_away)

(140, 140)

In [62]:
train_data_home, train_data_away = train_data[features_home], train_data[features_away]

In [63]:
for idx in range(len(features_home)):
    print(idx)
    statistic, p_value = stats.ks_2samp(train_data_home[features_home[idx]], train_data_away[features_away[idx]])
    print(f"Feature: {idx}, KS Statistic: {statistic}, P-Value: {p_value}")

0
Feature: 0, KS Statistic: 0.01422417296594325, P-Value: 0.16425339390990823
1
Feature: 1, KS Statistic: 0.010403966512232798, P-Value: 0.5149190180904335
2
Feature: 2, KS Statistic: 0.01284239616353744, P-Value: 0.26004703772282367
3
Feature: 3, KS Statistic: 0.012842396163537328, P-Value: 0.2600470377228331
4
Feature: 4, KS Statistic: 0.012679834186783645, P-Value: 0.27359629866351676
5
Feature: 5, KS Statistic: 0.008778346744696353, P-Value: 0.7267588907781861
6
Feature: 6, KS Statistic: 0.005283264244493213, P-Value: 0.995094102413647
7
Feature: 7, KS Statistic: 0.007396569942290543, P-Value: 0.886871182666131
8
Feature: 8, KS Statistic: 0.014386734942696822, P-Value: 0.15511394426689762
9
Feature: 9, KS Statistic: 0.0035763634885800366, P-Value: 0.9999982666055263
10
Feature: 10, KS Statistic: 0.011379338372754622, P-Value: 0.4001558043849973
11
Feature: 11, KS Statistic: 0.00674632203527592, P-Value: 0.9403847024296159
12
Feature: 12, KS Statistic: 0.004958140290985957, P-Value:

In [None]:
comp = compare_moments(train_data_home, train_data_away, features_home, features_away)

In [37]:
'AWAY_TEAM_GOALS_5_last_match_std'.split('_')[0] == 'HOME'

False

In [32]:
[feature for feature in features if features[0].split('_')[0] == 'HOME']

['HOME_TEAM_SHOTS_TOTAL_season_sum',
 'HOME_TEAM_SHOTS_INSIDEBOX_season_sum',
 'HOME_TEAM_SHOTS_OFF_TARGET_season_sum',
 'HOME_TEAM_SHOTS_ON_TARGET_season_sum',
 'HOME_TEAM_SHOTS_OUTSIDEBOX_season_sum',
 'HOME_TEAM_PASSES_season_sum',
 'HOME_TEAM_SUCCESSFUL_PASSES_season_sum',
 'HOME_TEAM_SAVES_season_sum',
 'HOME_TEAM_CORNERS_season_sum',
 'HOME_TEAM_FOULS_season_sum',
 'HOME_TEAM_YELLOWCARDS_season_sum',
 'HOME_TEAM_REDCARDS_season_sum',
 'HOME_TEAM_OFFSIDES_season_sum',
 'HOME_TEAM_ATTACKS_season_sum',
 'HOME_TEAM_PENALTIES_season_sum',
 'HOME_TEAM_SUBSTITUTIONS_season_sum',
 'HOME_TEAM_BALL_SAFE_season_sum',
 'HOME_TEAM_DANGEROUS_ATTACKS_season_sum',
 'HOME_TEAM_INJURIES_season_sum',
 'HOME_TEAM_GOALS_season_sum',
 'HOME_TEAM_GAME_WON_season_sum',
 'HOME_TEAM_GAME_DRAW_season_sum',
 'HOME_TEAM_GAME_LOST_season_sum',
 'HOME_TEAM_SHOTS_TOTAL_season_average',
 'HOME_TEAM_SHOTS_INSIDEBOX_season_average',
 'HOME_TEAM_SHOTS_OFF_TARGET_season_average',
 'HOME_TEAM_SHOTS_ON_TARGET_season_a