# ML_Football: Feature Engineering and Feature Selection

In [1]:
import os
import re
import datetime
import warnings

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

import nltk
nltk.download('punkt')
nltk.download('wordnet')

from scipy import stats
from scipy.stats import (
    pearsonr, chisquare, chi2_contingency,
    f_oneway, ks_2samp, norm
)

import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn import ensemble, linear_model
from sklearn.tree import DecisionTreeClassifier  # import tree here to avoid duplicate
from sklearn.metrics import accuracy_score
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import Lasso, Ridge
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC

import pyodbc
# import missingno as msno
# from pandas_profiling import ProfileReport

# Silence every warning category for the rest of the session.
warnings.simplefilter("ignore")
# Never truncate wide DataFrames when they are displayed.
pd.options.display.max_columns = None


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\eitanb\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\eitanb\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Input and output directories used by this notebook.
# The defaults are the original local checkout; set ML_FOOTBALL_DATA_DIR and/or
# ML_FOOTBALL_RESULT_DIR to run on another machine without editing this cell.
path = os.environ.get(
    "ML_FOOTBALL_DATA_DIR",
    r"C:\Users\eitanb\Documents\GitHub\ML_Football_DS18\DATA",
)
path_result = os.environ.get(
    "ML_FOOTBALL_RESULT_DIR",
    r"C:\Users\eitanb\Documents\GitHub\ML_Football_DS18\result",
)

In [3]:
# Load after_missing_value_KNN.csv from the result directory; the first CSV
# column is used as the row index.
knn_csv = path_result + "/" + "after_missing_value_KNN.csv"
df = pd.read_csv(knn_csv, index_col=0)

In [5]:
# Columns removed before feature selection. Each label is listed exactly once.
features_to_drop = [
    "result_h", "date",
    "homeGoals", "awayGoals", "homeGoalsHalfTime", "awayGoalsHalfTime",
    "redCards_h", "redCards_a", "ownGoals_h", "ownGoals_a",
    "yellowCards_h", "yellowCards_a", "total_assists_h", "total_assists_a",
    "teamgoals_a", "teamgoals_h", "homeGoals_cat", "awayGoals_cat",
    "homeGoalsHT_cat", "awayGoalsHT_cat", "fouls_h", "fouls_a",
]
feature_with_strong_correlation = [
    "game_year", "gameID", "total_xGoalsBuildup_h", "total_xGoalsBuildup_a",
]

features_to_drop_from_model = features_to_drop + feature_with_strong_correlation

# Rebind `df` to the reduced frame rather than mutating it in place, so the
# object returned by read_csv is never modified behind the reader's back.
df = df.drop(columns=features_to_drop_from_model)

##### Use KNN to impute missing values in the features derived from the average of the last 5 games; every team will have at least 5 missing values

In [6]:
# Work on a copy so `df` itself is not modified by the imputation.
df_feature_EN = df.copy()

# Columns handed to the imputer: float64, int64 and category dtypes.
imp_cols = df_feature_EN.select_dtypes(include=['float64', 'int64', 'category']).columns

# Fill each missing entry from the single nearest neighbour.
knn_imputer = KNNImputer(n_neighbors=1)
imputed_data = pd.DataFrame(
    knn_imputer.fit_transform(df_feature_EN[imp_cols]),
    columns=imp_cols,
    # fit_transform returns a bare array. Reuse the frame's own index so the
    # assignment below lines up row-for-row; with the default 0..n-1 index,
    # pandas would align by label and could misplace rows or reintroduce NaN.
    index=df_feature_EN.index,
)
df_feature_EN[imp_cols] = imputed_data

# Total number of missing values left in the frame (displayed as cell output).
df_feature_EN.isnull().sum().sum()

0

##### Dropping features that are strongly correlated with each other, or features that were used to create categories

In [7]:
# Print the frequency table (NaN included) of every column in df_feature_EN.
for col in df_feature_EN:
    counts = df_feature_EN[col].value_counts(dropna=False)
    print(f"\nColumn: {col}")
    print(counts)


Column: leagueID
leagueID
1.0    2660
2.0    2660
4.0    2660
5.0    2558
3.0    2142
Name: count, dtype: int64

Column: season
season
2015.0    1826
2014.0    1826
2017.0    1826
2018.0    1826
2020.0    1826
2016.0    1825
2019.0    1725
Name: count, dtype: int64

Column: homeTeamID
homeTeamID
89.0     133
113.0    133
146.0    133
102.0    133
104.0    133
        ... 
202.0     19
203.0     19
206.0     19
234.0     17
262.0     17
Name: count, Length: 146, dtype: int64

Column: awayTeamID
awayTeamID
82.0     133
106.0    133
101.0    133
105.0    133
107.0    133
        ... 
203.0     19
227.0     19
206.0     19
234.0     17
262.0     17
Name: count, Length: 146, dtype: int64

Column: xGoals_h
xGoals_h
0.000000    1738
1.485590       4
1.768180       4
1.926690       3
2.099380       3
            ... 
1.451740       1
0.807441       1
1.014280       1
1.145080       1
0.323960       1
Name: count, Length: 10207, dtype: int64

Column: shots_h
shots_h
14.0    1019
11.0     999
1

In [8]:
# Persist the imputed frame to the result directory.
engineered_csv = path_result + '/' + 'df_after_Feature_Engeniring.csv'
df_feature_EN.to_csv(engineered_csv)

### Feature Selection

#### Choosing the most effective variables

In [9]:
# Separate the predictors from the target column 'result'.
X = df_feature_EN.drop('result', axis=1)
y = df_feature_EN['result']

## Hyperparameters

### Multivariable Analysis

##### Summarization and Selection of Variables

In [10]:
# Fit models and determine if a feature is selected (1) or not (0)
lasso = Lasso(alpha=0.01).fit(X, y)
lasso_selected = (np.abs(lasso.coef_) > 0).astype(int)

# Fit Ridge model
ridge = Ridge(alpha=0.01).fit(X, y)
ridge_selected = (np.abs(ridge.coef_) > 0).astype(int)

svm = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
svm_selected = (np.abs(svm.coef_[0]) > 0).astype(int)

gb = GradientBoostingClassifier().fit(X, y)
gb_selected = (gb.feature_importances_ > 0).astype(int)

rf = RandomForestClassifier().fit(X, y)
rf_selected = (rf.feature_importances_ > 0).astype(int)

# Create a DataFrame to store results
selection_df = pd.DataFrame({
    'Feature': X.columns,
    'Lasso': lasso_selected, 
    'SVM': svm_selected,
    'GradientBoost': gb_selected,
    'RandomForest': rf_selected,
    'Ridge': ridge_selected
})

# Sum the number of selections for each feature
selection_df['Sum'] = selection_df[['Lasso', 'SVM', 'GradientBoost', 'RandomForest','Ridge']].sum(axis=1)

# Output the results
print(selection_df)

                  Feature  Lasso  SVM  GradientBoost  RandomForest  Ridge  Sum
0                leagueID      0    0              1             1      1    3
1                  season      1    1              1             1      1    5
2              homeTeamID      1    1              1             1      1    5
3              awayTeamID      1    1              1             1      1    5
4                xGoals_h      1    1              1             1      1    5
5                 shots_h      0    1              1             1      1    4
6         shotsOnTarget_h      1    1              1             1      1    5
7                  deep_h      0    1              1             1      1    4
8                  ppda_h      1    1              1             1      1    5
9               corners_h      1    0              1             1      1    4
10       total_xAssists_h      0    0              1             1      1    3
11     total_key_passes_h      1    1              1

In [11]:
 # Keep the features whose vote total ('Sum') is at least 4
final_var = selection_df.loc[selection_df['Sum'] >= 4, 'Feature'].tolist()

In [12]:
# Display the list of selected feature names.
final_var

['season',
 'homeTeamID',
 'awayTeamID',
 'xGoals_h',
 'shots_h',
 'shotsOnTarget_h',
 'deep_h',
 'ppda_h',
 'corners_h',
 'total_key_passes_h',
 'total_xGoalsChain_h',
 'total_blocked_shots_h',
 'total_saved_shots_h',
 'xGoals_a',
 'shots_a',
 'shotsOnTarget_a',
 'deep_a',
 'ppda_a',
 'total_key_passes_a',
 'total_xGoalsChain_a',
 'total_saved_shots_a',
 'game_month',
 'game_day',
 'yellowCards_h_cat',
 'yellowCards_a_cat',
 'total_assists_h_cat',
 'total_assists_a_cat']

### Creating DataFrame with most valuable variables

##### Selected variables - recommended by 4 or more of the 5 models

In [13]:
 # Build the modelling frame: the selected features plus the target column.

# Add the target variable. The membership check keeps this cell idempotent:
# re-running it must not append 'result' a second time, which would give
# df_model a duplicated 'result' column.
if 'result' not in final_var:
    final_var.append('result')

# Subset the dataframe (copy so later edits do not touch df_feature_EN)
df_model = df_feature_EN[final_var].copy()

# Output the result to verify
df_model.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12680 entries, 0 to 12679
Data columns (total 28 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   season                 12680 non-null  float64
 1   homeTeamID             12680 non-null  float64
 2   awayTeamID             12680 non-null  float64
 3   xGoals_h               12680 non-null  float64
 4   shots_h                12680 non-null  float64
 5   shotsOnTarget_h        12680 non-null  float64
 6   deep_h                 12680 non-null  float64
 7   ppda_h                 12680 non-null  float64
 8   corners_h              12680 non-null  float64
 9   total_key_passes_h     12680 non-null  float64
 10  total_xGoalsChain_h    12680 non-null  float64
 11  total_blocked_shots_h  12680 non-null  float64
 12  total_saved_shots_h    12680 non-null  float64
 13  xGoals_a               12680 non-null  float64
 14  shots_a                12680 non-null  float64
 15  shotsOn

In [14]:
# Print the frequency table (NaN included) of every column in df_model.
for col in df_model:
    counts = df_model[col].value_counts(dropna=False)
    print(f"\nColumn: {col}")
    print(counts)


Column: season
season
2015.0    1826
2014.0    1826
2017.0    1826
2018.0    1826
2020.0    1826
2016.0    1825
2019.0    1725
Name: count, dtype: int64

Column: homeTeamID
homeTeamID
89.0     133
113.0    133
146.0    133
102.0    133
104.0    133
        ... 
202.0     19
203.0     19
206.0     19
234.0     17
262.0     17
Name: count, Length: 146, dtype: int64

Column: awayTeamID
awayTeamID
82.0     133
106.0    133
101.0    133
105.0    133
107.0    133
        ... 
203.0     19
227.0     19
206.0     19
234.0     17
262.0     17
Name: count, Length: 146, dtype: int64

Column: xGoals_h
xGoals_h
0.000000    1738
1.485590       4
1.768180       4
1.926690       3
2.099380       3
            ... 
1.451740       1
0.807441       1
1.014280       1
1.145080       1
0.323960       1
Name: count, Length: 10207, dtype: int64

Column: shots_h
shots_h
14.0    1019
11.0     999
12.0     979
13.0     970
10.0     883
15.0     851
9.0      807
16.0     803
8.0      654
17.0     647
18.0     5

In [15]:
# Persist the selected-feature frame to the result directory.
model_csv = path_result + '/' + 'model_after_feature_selection_prediction_during_game.csv'
df_model.to_csv(model_csv)