# ML_Football: Feature Engeniring and feature selection

In [71]:
import os
import re
import datetime
import warnings

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

import nltk
nltk.download('punkt')
nltk.download('wordnet')

from scipy import stats
from scipy.stats import (
    pearsonr, chisquare, chi2_contingency,
    f_oneway, ks_2samp, norm
)

import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn import ensemble, linear_model
from sklearn.tree import DecisionTreeClassifier  # import tree here to avoid duplicate
from sklearn.metrics import accuracy_score
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import Lasso, Ridge
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC

import pyodbc
# import missingno as msno
# from pandas_profiling import ProfileReport

warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\eitanb\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\eitanb\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [72]:
path = r"C:\Users\eitanb\Documents\GitHub\ML_Football_DS18\DATA"
path_result = r"C:\Users\eitanb\Documents\GitHub\ML_Football_DS18\result"

In [73]:
df=pd.read_csv(path_result+"/"+"after_missing_value_KNN.csv", index_col=0)

In [74]:
# Convert date and sort
df["date"] = pd.to_datetime(df["date"], errors='coerce')
df = df.sort_values("date")

# Redefine the rolling function
def add_rolling_features(df, team_col, prefix, columns, window=5):
    for col in columns:
        roll_name = f"{prefix}_{col}_rolling{window}"
        df[roll_name] = (
            df
            .groupby(team_col)[col]
            .shift(1)
            .rolling(window=window, min_periods=1)
            .mean()
            .reset_index(level=0, drop=True)
        )
    return df

# Map result to binary/win/points
df["home_win"] = df["result_h"].map({"W": 1, "D": 0, "L": 0})
df["away_win"] = df["result_h"].map({"W": 0, "D": 0, "L": 1})
df["home_points"] = df["result_h"].map({"W": 3, "D": 1, "L": 0})
df["away_points"] = df["result_h"].map({"W": 0, "D": 1, "L": 3})

# Rolling win rates
df["home_win_rate_5"] = (
    df.groupby("homeTeamID")["home_win"]
    .shift(1).rolling(window=5, min_periods=1).mean().reset_index(level=0, drop=True)
)
df["away_win_rate_5"] = (
    df.groupby("awayTeamID")["away_win"]
    .shift(1).rolling(window=5, min_periods=1).mean().reset_index(level=0, drop=True)
)

# Match-level derived features
#df["xG_diff"] = df["xGoals_h"] - df["xGoals_a"]
#df["goal_diff"] = df["homeGoals"] - df["awayGoals"]

# Define rolling features
rolling_features = [
    "xGoals_h", "shots_h", "shotsOnTarget_h", "xGoals_a", "shots_a", "shotsOnTarget_a",
    "homeGoals", "awayGoals", "fouls_h", "fouls_a", 
    "ppda_h", "ppda_a", "deep_h", "deep_a",
     "home_points", "away_points",'corners_h','corners_a','total_key_passes_h','total_key_passes_a','total_blocked_shots_h',
       'total_saved_shots_h','total_blocked_shots_a',
       'total_saved_shots_a','total_assists_a','total_assists_h',
]

# Apply rolling features
df = add_rolling_features(df, "homeTeamID", "home", rolling_features, window=5)
df = add_rolling_features(df, "awayTeamID", "away", rolling_features, window=5)

# Add difference features
#df["xGoals_rolling_diff"] = df["home_xGoals_h_rolling5"] - df["away_xGoals_a_rolling5"]
#df["win_rate_diff_5"] = df["home_win_rate_5"] - df["away_win_rate_5"]
#df["points_avg_diff_5"] = df["home_home_points_rolling5"] - df["away_away_points_rolling5"]
#df["goal_diff_rolling_diff"] = df["home_goal_diff_rolling5"] - df["away_goal_diff_rolling5"]
#df["xG_diff_rolling_diff"] = df["home_xG_diff_rolling5"] - df["away_xG_diff_rolling5"]

# Display enhanced features
# Show the new columns added
columns_added = [col for col in df.columns if "rolling" in col or "win_rate" in col or "diff" in col or "points_avg" in col]
df[["gameID", "homeTeamID", "awayTeamID", "date"] + columns_added].head(10)



Unnamed: 0,gameID,homeTeamID,awayTeamID,date,home_win_rate_5,away_win_rate_5,home_xGoals_h_rolling5,home_shots_h_rolling5,home_shotsOnTarget_h_rolling5,home_xGoals_a_rolling5,home_shots_a_rolling5,home_shotsOnTarget_a_rolling5,home_homeGoals_rolling5,home_awayGoals_rolling5,home_fouls_h_rolling5,home_fouls_a_rolling5,home_ppda_h_rolling5,home_ppda_a_rolling5,home_deep_h_rolling5,home_deep_a_rolling5,home_home_points_rolling5,home_away_points_rolling5,home_corners_h_rolling5,home_corners_a_rolling5,home_total_key_passes_h_rolling5,home_total_key_passes_a_rolling5,home_total_blocked_shots_h_rolling5,home_total_saved_shots_h_rolling5,home_total_blocked_shots_a_rolling5,home_total_saved_shots_a_rolling5,home_total_assists_a_rolling5,home_total_assists_h_rolling5,away_xGoals_h_rolling5,away_shots_h_rolling5,away_shotsOnTarget_h_rolling5,away_xGoals_a_rolling5,away_shots_a_rolling5,away_shotsOnTarget_a_rolling5,away_homeGoals_rolling5,away_awayGoals_rolling5,away_fouls_h_rolling5,away_fouls_a_rolling5,away_ppda_h_rolling5,away_ppda_a_rolling5,away_deep_h_rolling5,away_deep_a_rolling5,away_home_points_rolling5,away_away_points_rolling5,away_corners_h_rolling5,away_corners_a_rolling5,away_total_key_passes_h_rolling5,away_total_key_passes_a_rolling5,away_total_blocked_shots_h_rolling5,away_total_saved_shots_h_rolling5,away_total_blocked_shots_a_rolling5,away_total_saved_shots_a_rolling5,away_total_assists_a_rolling5,away_total_assists_h_rolling5
5437,6185.0,177.0,161.0,2014-08-08 19:30:00,0.4,0.4,0.612752,15.0,6.2,0.592654,8.0,2.4,1.6,0.8,11.8,7.4,9.45924,16.91106,4.2,4.2,1.4,1.4,3.4,4.4,10.6,7.0,2.4,1.8,2.6,1.4,0.2,1.2,1.177608,11.0,5.2,0.898379,13.4,4.6,1.4,1.2,11.6,9.6,9.71878,10.99454,5.4,3.6,1.4,1.4,5.6,4.6,7.6,11.0,2.2,3.6,2.4,2.4,0.8,0.6
5443,6191.0,168.0,210.0,2014-08-09 20:00:00,0.4,0.2,0.63796,12.4,4.4,0.492725,14.4,4.8,1.4,1.4,10.0,9.2,10.47778,11.80558,7.4,2.8,1.4,1.4,3.6,3.0,8.8,11.2,2.2,3.0,1.8,2.4,0.8,0.8,0.663808,17.6,6.4,0.519442,6.4,2.4,2.8,1.0,10.0,5.8,6.39664,21.15582,6.2,1.6,2.4,0.6,3.6,3.4,13.6,6.0,3.0,1.8,1.4,1.4,0.8,2.4
5444,6192.0,170.0,174.0,2014-08-09 20:00:00,0.4,0.2,1.068028,12.8,4.6,0.518463,12.2,4.4,1.6,1.4,7.0,11.2,8.09466,10.29868,6.8,1.6,1.4,1.4,3.8,2.0,9.2,9.2,2.2,3.0,1.2,2.0,0.8,0.8,0.871684,14.6,4.8,0.663495,7.0,3.0,2.0,1.2,10.8,9.6,6.6911,15.47702,7.2,2.4,2.0,0.8,4.2,3.6,11.4,6.2,2.4,2.4,1.2,1.6,1.0,1.4
5442,6190.0,166.0,176.0,2014-08-09 20:00:00,0.4,0.0,1.150806,12.8,4.0,0.374059,11.2,4.2,1.2,1.8,9.4,6.8,9.80904,12.26182,7.2,2.4,1.4,1.4,3.6,4.0,9.8,9.0,2.0,2.4,2.2,1.4,1.2,0.8,0.512846,18.8,7.8,0.382525,8.4,2.8,3.2,0.8,10.2,5.6,9.35022,21.06452,6.6,2.8,3.0,0.0,4.4,4.0,14.0,7.8,3.6,2.8,2.2,2.0,0.8,2.8
5440,6188.0,169.0,175.0,2014-08-09 20:00:00,0.6,0.2,0.941866,16.4,6.2,0.455852,9.2,2.8,2.0,1.2,12.0,5.2,8.90644,16.6913,5.8,3.0,1.8,1.2,3.6,4.2,12.6,8.2,3.4,2.2,2.6,1.6,1.0,1.8,0.790904,16.8,8.0,0.492996,12.0,3.2,2.8,0.8,11.8,5.8,10.1097,14.84376,5.4,3.8,2.4,0.6,5.4,3.8,11.8,10.0,3.2,3.2,2.6,2.4,0.8,2.2
5439,6187.0,209.0,165.0,2014-08-09 20:00:00,0.6,0.2,0.790904,18.6,7.6,0.435749,9.4,2.8,2.2,1.2,12.0,5.6,10.13858,15.292,5.8,3.6,1.8,1.2,5.2,3.8,13.8,7.8,3.0,2.2,3.2,1.4,0.8,2.0,0.451528,15.6,7.4,0.507669,12.2,3.0,2.4,0.4,12.4,7.6,11.33094,16.02948,5.0,4.4,2.4,0.6,4.0,4.0,10.6,10.4,3.6,3.4,2.6,2.6,0.4,1.8
5438,6186.0,162.0,164.0,2014-08-09 20:00:00,0.4,0.2,0.952128,16.2,6.8,0.577981,7.8,2.6,2.0,1.2,11.2,5.6,8.238,15.72534,4.6,3.6,1.4,1.4,4.8,4.2,11.8,6.6,2.0,1.6,2.6,1.2,0.6,1.6,0.89955,13.2,6.0,0.898379,13.2,4.0,1.8,0.8,13.0,9.8,11.29518,11.08402,7.2,5.0,2.0,0.8,5.4,4.6,9.4,11.2,3.0,4.0,2.4,3.2,0.6,1.0
5441,6189.0,160.0,180.0,2014-08-09 20:00:00,0.6,0.2,0.941866,16.0,5.8,0.212896,10.6,3.6,2.0,1.6,11.6,4.8,9.35862,17.44728,4.8,2.4,1.8,1.2,3.6,4.2,12.0,9.0,2.8,2.0,2.8,1.0,1.2,1.8,0.790904,16.8,7.2,0.389863,8.8,2.8,2.4,0.8,9.2,5.2,9.85366,18.38572,6.0,3.0,2.4,0.6,3.6,4.6,12.4,8.0,3.8,2.8,2.6,2.0,0.8,2.2
5445,6193.0,178.0,163.0,2014-08-10 16:00:00,0.4,0.2,1.10716,13.0,5.0,0.381547,15.6,5.8,1.8,1.8,6.6,11.2,9.90854,10.83596,6.4,3.0,1.4,1.4,5.6,3.0,9.0,11.6,3.0,3.2,2.0,1.8,1.2,1.0,0.788906,15.4,5.0,0.633838,7.2,3.2,2.0,1.0,8.8,13.2,6.72704,15.27012,7.6,2.0,2.0,0.8,3.4,2.6,12.0,6.4,3.4,3.0,1.0,2.0,0.8,1.4
5446,6194.0,171.0,179.0,2014-08-10 20:00:00,0.4,0.2,1.036978,14.0,6.0,0.794271,14.6,5.4,2.2,1.6,8.2,13.6,8.65636,9.20476,8.2,4.4,1.6,1.0,4.6,3.2,9.8,10.8,3.2,2.6,1.6,2.4,1.2,1.2,0.788906,18.2,5.8,0.680484,7.8,3.2,2.2,1.2,8.8,11.4,6.90482,19.82076,7.6,2.2,2.0,0.8,4.8,2.2,14.8,6.6,3.8,2.4,1.4,1.8,1.0,1.6


In [75]:
features_to_drop = list(set([
    "xGoals_h", "shots_h", "shotsOnTarget_h",
    "xGoals_a", "shots_a", "shotsOnTarget_a",
    "homeGoals", "awayGoals", "homeGoalsHalfTime", "awayGoalsHalfTime",
    "home_win", "away_win", "home_points", "away_points",
    "result_h", "date",
    "redCards_h", "redCards_a", "ownGoals_h", "ownGoals_a",
    "yellowCards_h", "yellowCards_a", "total_assists_h", "total_assists_a",
    "teamgoals_a", "teamgoals_h", "homeGoals_cat", "awayGoals_cat",
    "homeGoalsHT_cat", "awayGoalsHT_cat",
    "fouls_h", "fouls_a", "ppda_h", "ppda_a", "deep_h", "deep_a", 'corners_h',
 'total_key_passes_h',
 'total_xGoalsChain_h',
 'total_blocked_shots_h',
 'total_saved_shots_h',
 'total_key_passes_a',
 'total_xGoalsChain_a',
 'total_blocked_shots_a',
 'total_saved_shots_a',
 'game_month',
 'game_day',
 'yellowCards_h_cat',
 'total_assists_h_cat',
 'total_assists_a_cat','corners_a',
 'yellowCards_a_cat',
]))
feature_with_strong_correlation = [
    'total_xAssists_h', 'total_xAssists_a', 'game_year',
    'gameID', 'total_xGoalsBuildup_h', 'total_xGoalsBuildup_a'
]

features_to_drop_from_model=features_to_drop+feature_with_strong_correlation

df.drop(features_to_drop_from_model, axis=1, inplace=True)

##### Use KNN for missing values that was acquired from the average of the last 5 games, every team will be at least 5 mising values

In [76]:
df_feature_EN=df.copy()
from sklearn.impute import KNNImputer

imp_cols = df_feature_EN.select_dtypes(include = ['float64', 'int64','category']).columns
knn_imputer = KNNImputer(n_neighbors=1)
imputed_data = pd.DataFrame(knn_imputer.fit_transform(df_feature_EN[imp_cols]),columns = imp_cols)
df_feature_EN[imp_cols] = imputed_data
df_feature_EN.isnull().sum().sum()

0

##### droping values data have strong correlation between them or featurs that we use them to create categories

In [77]:
#checking values counts against all the numeric data
for col in df_feature_EN.columns:
    print(f"\nColumn: {col}")
    print(df_feature_EN[col].value_counts(dropna=False))


Column: leagueID
leagueID
2.0    2660
4.0    2660
1.0    2660
5.0    2558
3.0    2142
Name: count, dtype: int64

Column: season
season
2015.0    1826
2014.0    1826
2017.0    1826
2018.0    1826
2020.0    1826
2016.0    1825
2019.0    1725
Name: count, dtype: int64

Column: homeTeamID
homeTeamID
88.0     133
72.0     133
74.0     133
83.0     133
150.0    133
        ... 
206.0     19
103.0     19
173.0     19
234.0     17
262.0     17
Name: count, Length: 146, dtype: int64

Column: awayTeamID
awayTeamID
146.0    133
113.0    133
101.0    133
98.0     133
88.0     133
        ... 
203.0     19
173.0     19
208.0     19
262.0     17
234.0     17
Name: count, Length: 146, dtype: int64

Column: redCards_h_cat
redCards_h_cat
0.0    12680
Name: count, dtype: int64

Column: redCards_a_cat
redCards_a_cat
0.0    12680
Name: count, dtype: int64

Column: ownGoals_h_cat
ownGoals_h_cat
0.0    12680
Name: count, dtype: int64

Column: ownGoals_a_cat
ownGoals_a_cat
0.0    12680
Name: count, dtype: i

In [78]:
df_feature_EN.to_csv(path_result+'/'+'df_after_Feature_Engeniring.csv')

### Feature Selection

#### Choosing the most effective variables

In [79]:
y=df_feature_EN['result']
X = df_feature_EN.drop(columns=['result'])

## Hyperparameters

### Multivariable Analysis

##### Summarization and Selection of Variables

In [80]:
# Fit models and determine if a feature is selected (1) or not (0)
lasso = Lasso(alpha=0.01).fit(X, y)
lasso_selected = (np.abs(lasso.coef_) > 0).astype(int)

# Fit Ridge model
ridge = Ridge(alpha=0.01).fit(X, y)
ridge_selected = (np.abs(ridge.coef_) > 0).astype(int)

svm = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
svm_selected = (np.abs(svm.coef_[0]) > 0).astype(int)

gb = GradientBoostingClassifier().fit(X, y)
gb_selected = (gb.feature_importances_ > 0).astype(int)

rf = RandomForestClassifier().fit(X, y)
rf_selected = (rf.feature_importances_ > 0).astype(int)

# Create a DataFrame to store results
selection_df = pd.DataFrame({
    'Feature': X.columns,
    'Lasso': lasso_selected, 
    'SVM': svm_selected,
    'GradientBoost': gb_selected,
    'RandomForest': rf_selected,
    'Ridge': ridge_selected
})

# Sum the number of selections for each feature
selection_df['Sum'] = selection_df[['Lasso', 'SVM', 'GradientBoost', 'RandomForest','Ridge']].sum(axis=1)

# Output the results
print(selection_df)

                                Feature  Lasso  SVM  GradientBoost  \
0                              leagueID      0    1              1   
1                                season      1    1              1   
2                            homeTeamID      1    1              1   
3                            awayTeamID      1    1              1   
4                        redCards_h_cat      0    0              0   
..                                  ...    ...  ...            ...   
57    away_total_saved_shots_h_rolling5      0    1              1   
58  away_total_blocked_shots_a_rolling5      0    0              1   
59    away_total_saved_shots_a_rolling5      0    1              1   
60        away_total_assists_a_rolling5      0    0              1   
61        away_total_assists_h_rolling5      0    0              1   

    RandomForest  Ridge  Sum  
0              1      1    4  
1              1      1    5  
2              1      1    5  
3              1      1    5  
4   

In [85]:
 # Selecting variables with a sum of selections >= 4
final_var = selection_df[selection_df['Sum'] >= 4]['Feature'].tolist()

In [86]:
final_var

['leagueID',
 'season',
 'homeTeamID',
 'awayTeamID',
 'home_shots_h_rolling5',
 'home_shotsOnTarget_h_rolling5',
 'home_shots_a_rolling5',
 'home_shotsOnTarget_a_rolling5',
 'home_fouls_h_rolling5',
 'home_fouls_a_rolling5',
 'home_ppda_h_rolling5',
 'home_ppda_a_rolling5',
 'home_deep_h_rolling5',
 'home_corners_h_rolling5',
 'home_total_key_passes_h_rolling5',
 'home_total_assists_a_rolling5',
 'away_shotsOnTarget_h_rolling5',
 'away_fouls_h_rolling5',
 'away_fouls_a_rolling5',
 'away_ppda_h_rolling5',
 'away_deep_h_rolling5',
 'away_deep_a_rolling5',
 'away_total_key_passes_h_rolling5',
 'away_total_key_passes_a_rolling5',
 'away_total_saved_shots_h_rolling5',
 'away_total_saved_shots_a_rolling5']

### Creating DataFrame with most valuable variables

##### Selected variables - recommended by 3 or more models

In [87]:
 # Selecting variables with a sum of selections >= 4
#final_var = selection_df[selection_df['Sum'] > 4]['Feature'].tolist()

# Add target variable
final_var.append('result')

# Append features we believe are important (use extend instead of append)
#final_var.extend(['yellowCards_a_cat', 'corners_a','total_blocked_shots_a', 'home_shots_a_rolling5', 
 #                 'home_fouls_h_rolling5','away_ppda_a_rolling5'])

# Drop specific ID columns (if they exist)

#final_var.remove(['game_day','game_month'])

# Subset the dataframe
df_model = df_feature_EN[final_var].copy()

# Output the result to verify
df_model.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12680 entries, 5437 to 12679
Data columns (total 27 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   leagueID                           12680 non-null  float64
 1   season                             12680 non-null  float64
 2   homeTeamID                         12680 non-null  float64
 3   awayTeamID                         12680 non-null  float64
 4   home_shots_h_rolling5              12680 non-null  float64
 5   home_shotsOnTarget_h_rolling5      12680 non-null  float64
 6   home_shots_a_rolling5              12680 non-null  float64
 7   home_shotsOnTarget_a_rolling5      12680 non-null  float64
 8   home_fouls_h_rolling5              12680 non-null  float64
 9   home_fouls_a_rolling5              12680 non-null  float64
 10  home_ppda_h_rolling5               12680 non-null  float64
 11  home_ppda_a_rolling5               12680 non-null  float

In [88]:
#checking values counts against all the numeric data
for col in df_model.columns:
    print(f"\nColumn: {col}")
    print(df_model[col].value_counts(dropna=False))


Column: leagueID
leagueID
2.0    2660
4.0    2660
1.0    2660
5.0    2558
3.0    2142
Name: count, dtype: int64

Column: season
season
2015.0    1826
2014.0    1826
2017.0    1826
2018.0    1826
2020.0    1826
2016.0    1825
2019.0    1725
Name: count, dtype: int64

Column: homeTeamID
homeTeamID
88.0     133
72.0     133
74.0     133
83.0     133
150.0    133
        ... 
206.0     19
103.0     19
173.0     19
234.0     17
262.0     17
Name: count, Length: 146, dtype: int64

Column: awayTeamID
awayTeamID
146.0    133
113.0    133
101.0    133
98.0     133
88.0     133
        ... 
203.0     19
173.0     19
208.0     19
262.0     17
234.0     17
Name: count, Length: 146, dtype: int64

Column: home_shots_h_rolling5
home_shots_h_rolling5
13.200000    443
14.200000    433
14.000000    405
12.800000    403
13.000000    403
            ... 
9.333333       1
19.250000      1
6.800000       1
19.750000      1
8.500000       1
Name: count, Length: 139, dtype: int64

Column: home_shotsOnTarget_

In [89]:
df_model.to_csv(path_result+'/'+'model_after_feature_selection.csv')