<a href="https://colab.research.google.com/github/EmiljaB/Auto-Feature-Selector-FIFA19/blob/Auto-FeatureSelector/Auto_Feature_Selector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Task 7: AutoFeatureSelector Tool
## This task tests your understanding of the feature selection methods outlined in the lecture and your ability to apply them to a real-world dataset: select the best features and build an automated feature selection tool for your toolkit.

### Use your knowledge of different feature selector methods to build an Automatic Feature Selection tool
- Pearson Correlation
- Chi-Square
- RFE
- Embedded
- Tree (Random Forest)
- Tree (Light GBM)

### Dataset: FIFA 19 Player Skills
#### Attributes: FIFA 2019 player attributes such as Age, Nationality, Overall, Potential, Club, Value, Wage, Preferred Foot, International Reputation, Weak Foot, Skill Moves, Work Rate, Position, Jersey Number, Joined, Loaned From, Contract Valid Until, Height, Weight, LS, ST, RS, LW, LF, CF, RF, RW, LAM, CAM, RAM, LM, LCM, CM, RCM, RM, LWB, LDM, CDM, RDM, RWB, LB, LCB, CB, RCB, RB, Crossing, Finishing, HeadingAccuracy, ShortPassing, Volleys, Dribbling, Curve, FKAccuracy, LongPassing, BallControl, Acceleration, SprintSpeed, Agility, Reactions, Balance, ShotPower, Jumping, Stamina, Strength, LongShots, Aggression, Interceptions, Positioning, Vision, Penalties, Composure, Marking, StandingTackle, SlidingTackle, GKDiving, GKHandling, GKKicking, GKPositioning, GKReflexes, and Release Clause.

In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as ss
from collections import Counter
import math
from scipy import stats

In [None]:
player_df = pd.read_csv("fifa19.csv")

In [None]:
# Numeric columns kept for modelling; 'Overall' is later turned into the target.
numcols = ['Overall', 'Crossing','Finishing',  'ShortPassing',  'Dribbling','LongPassing', 'BallControl', 'Acceleration','SprintSpeed', 'Agility',  'Stamina','Volleys','FKAccuracy','Reactions','Balance','ShotPower','Strength','LongShots','Aggression','Interceptions']
# Columns handed to pd.get_dummies for one-hot encoding.
catcols = ['Preferred Foot','Position','Body Type','Nationality','Weak Foot']

In [None]:
player_df = player_df[numcols+catcols]

In [None]:
# Build the modelling frame: the numeric columns as-is, side by side with the
# one-hot encoding of the categorical columns.
traindf = pd.concat([player_df[numcols], pd.get_dummies(player_df[catcols])],axis=1)
# Remember the full column list (it still contains the 'Overall' column).
features = traindf.columns

# Drop every row that has a missing value in any column.
traindf = traindf.dropna()

In [None]:
# Re-wrap the frame with the saved column list. `features` was taken from
# traindf itself and dropna() only removed rows, so the columns are unchanged.
traindf = pd.DataFrame(traindf,columns=features)

In [None]:
# Binary target: True for players whose 'Overall' rating is 87 or higher.
y = traindf['Overall']>=87
# Feature matrix: a copy of the frame with the target column removed.
X = traindf.copy()
del X['Overall']

In [None]:
X.head()

Unnamed: 0,Crossing,Finishing,ShortPassing,Dribbling,LongPassing,BallControl,Acceleration,SprintSpeed,Agility,Stamina,...,Nationality_Uganda,Nationality_Ukraine,Nationality_United Arab Emirates,Nationality_United States,Nationality_Uruguay,Nationality_Uzbekistan,Nationality_Venezuela,Nationality_Wales,Nationality_Zambia,Nationality_Zimbabwe
0,84.0,95.0,90.0,97.0,87.0,96.0,91.0,86.0,91.0,72.0,...,False,False,False,False,False,False,False,False,False,False
1,84.0,94.0,81.0,88.0,77.0,94.0,89.0,91.0,87.0,88.0,...,False,False,False,False,False,False,False,False,False,False
2,79.0,87.0,84.0,96.0,78.0,95.0,94.0,90.0,96.0,81.0,...,False,False,False,False,False,False,False,False,False,False
3,17.0,13.0,50.0,18.0,51.0,42.0,57.0,58.0,60.0,43.0,...,False,False,False,False,False,False,False,False,False,False
4,93.0,82.0,92.0,86.0,91.0,91.0,78.0,76.0,79.0,90.0,...,False,False,False,False,False,False,False,False,False,False


In [None]:
len(X.columns)


223

Set a fixed number of features to select

In [None]:
# Column names of X, in column order.
feature_name = list(X.columns)
# Maximum number of features each selector is asked to keep.
num_feats=30

# Filter Feature Selection - Pearson Correlation
### Pearson Correlation function

In [None]:
def cor_selector(X, y, num_feats):
    """Pick the features most strongly correlated with the target.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features, one per column.
    y : pandas.Series
        Target values, aligned with the rows of ``X``.
    num_feats : int
        How many features to keep.

    Returns
    -------
    tuple
        ``(mask, names)``: ``mask`` is a boolean array in the column order of
        ``X`` marking the kept features; ``names`` is the index of kept
        feature names ordered from strongest to weakest absolute correlation.
    """
    # The sign is irrelevant for ranking, only the strength of the relation.
    strength = abs(X.corrwith(y))
    top_names = strength.nlargest(num_feats).index
    return X.columns.isin(top_names), top_names


In [None]:
cor_support, cor_feature = cor_selector(X, y,num_feats)
print(str(len(cor_feature)), 'selected features')

30 selected features


### List the selected features from Pearson Correlation

In [None]:
cor_feature

Index(['Reactions', 'Body Type_C. Ronaldo', 'Body Type_Messi',
       'Body Type_Neymar', 'Body Type_Courtois',
       'Body Type_PLAYER_BODY_TYPE_25', 'Position_LF', 'Position_RF',
       'ShortPassing', 'Volleys', 'LongPassing', 'FKAccuracy', 'BallControl',
       'Finishing', 'LongShots', 'ShotPower', 'Dribbling',
       'Nationality_Belgium', 'Crossing', 'Agility', 'Weak Foot', 'Stamina',
       'Nationality_Slovenia', 'Nationality_Gabon', 'Strength', 'SprintSpeed',
       'Acceleration', 'Nationality_Uruguay', 'Position_LAM',
       'Nationality_Costa Rica'],
      dtype='object')

## Filter Feature Selection - Chi-Square

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler

#### Chi-Squared Selector function

In [None]:
def chi_squared_selector(X, y, num_feats):
    """Select the ``num_feats`` best features according to the chi-squared test.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features, one per column.
    y : array-like
        Class labels, one per row of ``X``.
    num_feats : int
        Number of features to keep (passed to ``SelectKBest`` as ``k``).

    Returns
    -------
    tuple
        ``(chi_support, chi_feature)``: a boolean mask in the column order of
        ``X`` and the index of the selected column names.
    """
    # chi2 rejects negative inputs and its score depends on the magnitude of
    # the feature, so raw 0-100 ratings and 0/1 dummies are not comparable.
    # Rescaling every column to [0, 1] puts them on the same footing.
    X_scaled = MinMaxScaler().fit_transform(X)

    chi_selector = SelectKBest(score_func=chi2, k=num_feats)
    chi_selector.fit(X_scaled, y)

    # Boolean mask of the selected features, in the column order of X
    chi_support = chi_selector.get_support()

    # The scaler returns a plain array, so names are looked up on the original X
    chi_feature = X.columns[chi_support]

    return chi_support, chi_feature


In [None]:
chi_support, chi_feature = chi_squared_selector(X, y,num_feats)
print(str(len(chi_feature)), 'selected features')

30 selected features


### List the selected features from Chi-Square

In [None]:
chi_feature

Index(['Crossing', 'Finishing', 'ShortPassing', 'Dribbling', 'LongPassing',
       'BallControl', 'Acceleration', 'SprintSpeed', 'Agility', 'Stamina',
       'Volleys', 'FKAccuracy', 'Reactions', 'Balance', 'ShotPower',
       'Strength', 'LongShots', 'Aggression', 'Interceptions', 'Position_LF',
       'Position_RF', 'Body Type_C. Ronaldo', 'Body Type_Courtois',
       'Body Type_Messi', 'Body Type_Neymar', 'Body Type_PLAYER_BODY_TYPE_25',
       'Nationality_Belgium', 'Nationality_Gabon', 'Nationality_Slovenia',
       'Nationality_Uruguay'],
      dtype='object')

# Wrapper Feature Selection - Recursive Feature Elimination

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

### RFE Selector function

In [None]:
def rfe_selector(X, y, num_feats):
    """Select features by recursive feature elimination with logistic regression.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features, one per column.
    y : array-like
        Class labels, one per row of ``X``.
    num_feats : int
        Number of features to keep.

    Returns
    -------
    tuple
        ``(rfe_support, rfe_feature)``: a boolean mask in the column order of
        ``X`` and the index of the selected column names.
    """
    # RFE discards the features with the smallest coefficients, which is only
    # meaningful when all features share one scale; it also helps the solver
    # converge.
    X_scaled = MinMaxScaler().fit_transform(X)

    model = LogisticRegression(max_iter=1000, random_state=42, n_jobs=-1)

    # Named `eliminator` so the local does not shadow this function's name.
    # step=10 removes ten features per elimination round.
    eliminator = RFE(estimator=model, n_features_to_select=num_feats, step=10)
    eliminator.fit(X_scaled, y)

    # Boolean mask of the surviving features, in the column order of X
    rfe_support = eliminator.get_support()

    # The scaler returns a plain array, so names are looked up on the original X
    rfe_feature = X.columns[rfe_support]

    return rfe_support, rfe_feature


In [None]:
rfe_support, rfe_feature = rfe_selector(X, y,num_feats)
print(str(len(rfe_feature)), 'selected features')

30 selected features


### List the selected features from RFE

In [None]:
rfe_feature

Index(['Reactions', 'Position_CAM', 'Position_CM', 'Position_GK',
       'Position_LAM', 'Position_LCB', 'Position_LM', 'Position_LW',
       'Position_RB', 'Position_RCB', 'Position_RS', 'Position_RW',
       'Position_ST', 'Body Type_Courtois', 'Body Type_Lean',
       'Nationality_Brazil', 'Nationality_Costa Rica', 'Nationality_Croatia',
       'Nationality_England', 'Nationality_France', 'Nationality_Gabon',
       'Nationality_Germany', 'Nationality_Greece', 'Nationality_Italy',
       'Nationality_Netherlands', 'Nationality_Portugal',
       'Nationality_Senegal', 'Nationality_Slovenia', 'Nationality_Uruguay',
       'Nationality_Wales'],
      dtype='object')

# Embedded Selection - Lasso: SelectFromModel

In [None]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

In [None]:
from sklearn.linear_model import LogisticRegression
import numpy as np

def embedded_log_reg_selector(X, y, num_feats):
    """Select the features with the largest logistic-regression coefficients.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features, one per column.
    y : array-like
        Class labels, one per row of ``X``.
    num_feats : int
        Number of features to keep. Values below 1 select nothing; values
        above the number of columns select every column.

    Returns
    -------
    tuple
        ``(embedded_lr_support, embedded_lr_feature)``: a boolean mask in the
        column order of ``X`` and the index of the selected column names.
    """
    # Coefficient magnitudes are only comparable across features that share a
    # scale; scaling is also what the solver's convergence warning recommends.
    X_scaled = MinMaxScaler().fit_transform(X)

    model = LogisticRegression(max_iter=1000, random_state=42)
    model.fit(X_scaled, y)

    # Only the first row of coef_ is used as the importance score.
    importance = np.abs(model.coef_[0])

    # Clamp the count: a plain `[-num_feats:]` slice would return *every*
    # feature for num_feats == 0 and a wrong tail for negative values.
    keep = max(0, min(num_feats, importance.size))
    indices = np.argsort(importance)[importance.size - keep:]

    # Boolean mask of the selected features, in the column order of X
    embedded_lr_support = np.zeros(importance.shape, dtype=bool)
    embedded_lr_support[indices] = True

    embedded_lr_feature = X.columns[embedded_lr_support]

    return embedded_lr_support, embedded_lr_feature


In [None]:
embedded_lr_support, embedded_lr_feature = embedded_log_reg_selector(X, y, num_feats)
print(str(len(embedded_lr_feature)), 'selected features')

30 selected features


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
embedded_lr_feature

Index(['Preferred Foot_Left', 'Preferred Foot_Right', 'Position_CAM',
       'Position_CF', 'Position_CM', 'Position_LAM', 'Position_LM',
       'Position_LW', 'Position_RB', 'Position_RM', 'Position_RS',
       'Position_RW', 'Body Type_Lean', 'Body Type_Normal', 'Body Type_Stocky',
       'Nationality_Costa Rica', 'Nationality_Croatia', 'Nationality_Gabon',
       'Nationality_Germany', 'Nationality_Italy', 'Nationality_Mexico',
       'Nationality_Netherlands', 'Nationality_Poland', 'Nationality_Portugal',
       'Nationality_Senegal', 'Nationality_Serbia', 'Nationality_Slovenia',
       'Nationality_Ukraine', 'Nationality_Uruguay', 'Nationality_Wales'],
      dtype='object')

# Tree based (Random Forest): SelectFromModel

In [None]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

In [None]:
def embedded_rf_selector(X, y, num_feats):
    """Select the features with the highest random-forest importances.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features, one per column.
    y : array-like
        Class labels, one per row of ``X``.
    num_feats : int
        Number of features to keep. Values below 1 select nothing; values
        above the number of columns select every column.

    Returns
    -------
    tuple
        ``(embedded_rf_support, embedded_rf_feature)``: a boolean mask in the
        column order of ``X`` and the index of the selected column names.
    """
    # Fixed seed so repeated runs rank the features identically.
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X, y)

    importances = model.feature_importances_

    # Clamp the count: a plain `[-num_feats:]` slice would return *every*
    # feature for num_feats == 0 and a wrong tail for negative values.
    keep = max(0, min(num_feats, importances.size))
    indices = np.argsort(importances)[importances.size - keep:]

    # Boolean mask of the selected features, in the column order of X
    embedded_rf_support = np.zeros(importances.shape, dtype=bool)
    embedded_rf_support[indices] = True

    embedded_rf_feature = X.columns[embedded_rf_support]

    return embedded_rf_support, embedded_rf_feature


In [None]:
embedded_rf_support, embedded_rf_feature = embedded_rf_selector(X, y, num_feats)
print(str(len(embedded_rf_feature)), 'selected features')


30 selected features


In [None]:
embedded_rf_feature

Index(['Crossing', 'Finishing', 'ShortPassing', 'Dribbling', 'LongPassing',
       'BallControl', 'Acceleration', 'SprintSpeed', 'Agility', 'Stamina',
       'Volleys', 'FKAccuracy', 'Reactions', 'Balance', 'ShotPower',
       'Strength', 'LongShots', 'Aggression', 'Interceptions', 'Weak Foot',
       'Preferred Foot_Right', 'Position_RCB', 'Body Type_Courtois',
       'Body Type_Lean', 'Body Type_Normal', 'Nationality_Belgium',
       'Nationality_Brazil', 'Nationality_Italy', 'Nationality_Slovenia',
       'Nationality_Spain'],
      dtype='object')

# Tree based (Light GBM): SelectFromModel

In [None]:
from sklearn.feature_selection import SelectFromModel
from lightgbm import LGBMClassifier

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
def embedded_lgbm_selector(X, y, num_feats):
    """Select the features with the highest LightGBM importances.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features, one per column.
    y : array-like
        Class labels, one per row of ``X``.
    num_feats : int
        Number of features to keep. Values below 1 select nothing; values
        above the number of columns select every column.

    Returns
    -------
    tuple
        ``(embedded_lgbm_support, embedded_lgbm_feature)``: a boolean mask in
        the column order of ``X`` and the index of the selected column names.
    """
    # Fixed seed so repeated runs rank the features identically.
    model = LGBMClassifier(n_estimators=100, random_state=42)
    model.fit(X, y)

    importances = model.feature_importances_

    # Clamp the count: a plain `[-num_feats:]` slice would return *every*
    # feature for num_feats == 0 and a wrong tail for negative values.
    keep = max(0, min(num_feats, importances.size))
    indices = np.argsort(importances)[importances.size - keep:]

    # Boolean mask of the selected features, in the column order of X
    embedded_lgbm_support = np.zeros(importances.shape, dtype=bool)
    embedded_lgbm_support[indices] = True

    embedded_lgbm_feature = X.columns[embedded_lgbm_support]

    return embedded_lgbm_support, embedded_lgbm_feature


In [None]:
embedded_lgbm_support, embedded_lgbm_feature = embedded_lgbm_selector(X, y, num_feats)
print(str(len(embedded_lgbm_feature)), 'selected features')

[LightGBM] [Info] Number of positive: 55, number of negative: 18104
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004517 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1812
[LightGBM] [Info] Number of data points in the train set: 18159, number of used features: 124
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.003029 -> initscore=-5.796555
[LightGBM] [Info] Start training from score -5.796555
30 selected features


In [None]:
embedded_lgbm_feature

Index(['Crossing', 'Finishing', 'ShortPassing', 'Dribbling', 'LongPassing',
       'BallControl', 'Acceleration', 'SprintSpeed', 'Agility', 'Stamina',
       'Volleys', 'FKAccuracy', 'Reactions', 'Balance', 'ShotPower',
       'Strength', 'LongShots', 'Aggression', 'Interceptions', 'Weak Foot',
       'Preferred Foot_Left', 'Position_LCB', 'Body Type_Lean',
       'Nationality_Belgium', 'Nationality_Finland', 'Nationality_France',
       'Nationality_Italy', 'Nationality_Senegal', 'Nationality_Slovakia',
       'Nationality_Slovenia'],
      dtype='object')

# Putting all of it together: AutoFeatureSelector Tool

In [None]:
print(len(feature_name))
print(len(cor_support))
print(len(chi_support))
print(len(rfe_support))
print(len(embedded_lr_support))
print(len(embedded_rf_support))
print(len(embedded_lgbm_support))


223
223
223
223
223
223
223


In [None]:
# Show every row when a DataFrame is printed instead of truncating the output.
pd.set_option('display.max_rows', None)

# put all selection together: one boolean column per selector, one row per feature
feature_selection_df = pd.DataFrame({
    'Feature': feature_name,
    'Pearson': cor_support,
    'Chi-2': chi_support,
    'RFE': rfe_support,
    'Logistics': embedded_lr_support,
    'Random Forest': embedded_rf_support,
    'LightGBM': embedded_lgbm_support
})

# Count the selected times for each feature
# Only sum the columns that are binary support indicators
feature_selection_df['Total'] = feature_selection_df[['Pearson', 'Chi-2', 'RFE', 'Logistics', 'Random Forest', 'LightGBM']].sum(axis=1)

# Rank by vote count; ties are ordered by feature name, both descending
feature_selection_df = feature_selection_df.sort_values(['Total', 'Feature'], ascending=False)
# Re-number the rows from 1 so the index reads as a rank
feature_selection_df.index = range(1, len(feature_selection_df) + 1)
top_features = feature_selection_df.head(num_feats)

print(top_features)  # Display the top selected features


                   Feature  Pearson  Chi-2    RFE  Logistics  Random Forest  \
1     Nationality_Slovenia     True   True   True       True           True   
2                Reactions     True   True   True      False           True   
3                  Volleys     True   True  False      False           True   
4                 Strength     True   True  False      False           True   
5                  Stamina     True   True  False      False           True   
6              SprintSpeed     True   True  False      False           True   
7                ShotPower     True   True  False      False           True   
8             ShortPassing     True   True  False      False           True   
9      Nationality_Uruguay     True   True   True       True          False   
10       Nationality_Italy    False  False   True       True           True   
11       Nationality_Gabon     True   True   True       True          False   
12     Nationality_Belgium     True   True  False   

# Can you build a Python script that takes a dataset and a list of the feature selection methods you want to try, and outputs the best (maximum votes) features across all methods?

In [None]:
def preprocess_dataset(dataset_path, num_cols=None, cat_cols=None,
                       target='Overall', threshold=87, max_feats=10):
    """Load a CSV and turn it into a feature matrix and target for the selectors.

    Only the requested columns are used. The previous version one-hot encoded
    every text column of the file, filled missing categories with a bogus
    ``0`` level and returned the raw rating as the target instead of the
    binary label the selectors in this notebook are run against.

    Parameters
    ----------
    dataset_path : str
        Path of the CSV file to read.
    num_cols : list of str, optional
        Numeric columns to keep. Defaults to the notebook-level ``numcols``.
    cat_cols : list of str, optional
        Columns to one-hot encode. Defaults to the notebook-level ``catcols``.
    target : str, optional
        Name of the target column; it is always removed from the features.
    threshold : number or None, optional
        The target becomes ``df[target] >= threshold``. Pass ``None`` to keep
        the raw target values.
    max_feats : int, optional
        Upper bound for the returned ``num_feats``.

    Returns
    -------
    tuple
        ``(X, y, num_feats)``: feature frame, target series and the number of
        features to select (``max_feats`` capped at the number of columns).
    """
    df = pd.read_csv(dataset_path)

    # Fall back to the column lists defined at the top of the notebook.
    num_cols = list(numcols if num_cols is None else num_cols)
    cat_cols = list(catcols if cat_cols is None else cat_cols)

    feature_cols = [col for col in num_cols if col != target]
    parts = [df[[target]], df[feature_cols]]
    if cat_cols:
        # get_dummies fails on a frame without columns, hence the guard.
        parts.append(pd.get_dummies(df[cat_cols], drop_first=True))

    # Rows with missing values are dropped rather than filled with 0, which
    # would invent ratings of 0.
    frame = pd.concat(parts, axis=1).dropna()

    X = frame.drop(columns=[target])
    y = frame[target] if threshold is None else frame[target] >= threshold

    num_feats = min(max_feats, X.shape[1])

    return X, y, num_feats


In [None]:
def autoFeatureSelector(dataset_path, methods=None):
    """Run the requested feature selectors and tally their votes per feature.

    Parameters
    ----------
    dataset_path : str
        Path of the CSV file, forwarded to ``preprocess_dataset``.
    methods : iterable of str or str, optional
        Any of ``'pearson'``, ``'chi-square'``, ``'rfe'``, ``'log-reg'``,
        ``'rf'`` and ``'lgbm'``. A single name may be passed as a plain string.
        ``None`` or an empty list runs no selector.

    Returns
    -------
    pandas.DataFrame
        One row per feature chosen by at least one method, with a 0/1 column
        per method and a ``Total`` vote count, sorted by ``Total`` and then
        ``Feature``, both descending.

    Raises
    ------
    ValueError
        If ``methods`` contains a name that is not a known selector.
    """
    known = ('pearson', 'chi-square', 'rfe', 'log-reg', 'rf', 'lgbm')

    # Normalise the argument: no shared mutable default, and a bare string is
    # one method name (a substring test would make 'rfe' also trigger 'rf').
    if methods is None:
        requested = []
    elif isinstance(methods, str):
        requested = [methods]
    else:
        requested = list(methods)

    # Fail before the dataset is loaded instead of silently dropping a voter.
    unknown = [name for name in requested if name not in known]
    if unknown:
        raise ValueError(
            f"Unknown feature selection method(s): {unknown}; "
            f"expected any of {list(known)}"
        )

    # Preprocessing the dataset
    X, y, num_feats = preprocess_dataset(dataset_path)

    # Initialize a DataFrame to collect feature selection results
    feature_selection_df = pd.DataFrame({'Feature': X.columns})

    # (method key, result column, selector); built here so the selector
    # functions are only looked up once they are actually needed.
    selectors = (
        ('pearson', 'Pearson', cor_selector),
        ('chi-square', 'Chi-square', chi_squared_selector),
        ('rfe', 'RFE', rfe_selector),
        ('log-reg', 'Logistics', embedded_log_reg_selector),
        ('rf', 'Random Forest', embedded_rf_selector),
        ('lgbm', 'LightGBM', embedded_lgbm_selector),
    )
    for key, column, selector in selectors:
        if key in requested:
            support, _ = selector(X, y, num_feats)
            feature_selection_df[column] = support.astype(int)  # bool mask -> 0/1 votes

    # Count the selected times for each feature
    feature_selection_df['Total'] = feature_selection_df.iloc[:, 1:].sum(axis=1)

    # Keep features with at least one vote; the name is a secondary sort key
    # so that ties come out in a deterministic order.
    voted = feature_selection_df[feature_selection_df['Total'] > 0]
    best_features = voted.sort_values(['Total', 'Feature'], ascending=False)

    return best_features


In [None]:
best_features = autoFeatureSelector(dataset_path="fifa19.csv", methods=['pearson', 'chi-square', 'rfe', 'log-reg', 'rf', 'lgbm'])
best_features