In [1]:
# Load libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load data

df = pd.read_csv('BankChurners.csv')
df.head(3)

Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,...,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2
0,768805383,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,39,...,12691.0,777,11914.0,1.335,1144,42,1.625,0.061,9.3e-05,0.99991
1,818770008,Existing Customer,49,F,5,Graduate,Single,Less than $40K,Blue,44,...,8256.0,864,7392.0,1.541,1291,33,3.714,0.105,5.7e-05,0.99994
2,713982108,Existing Customer,51,M,3,Graduate,Married,$80K - $120K,Blue,36,...,3418.0,0,3418.0,2.594,1887,20,2.333,0.0,2.1e-05,0.99998


In [3]:
# Preprocessing

## Create the target variable
def make_target(df, column):
    target_dummies = pd.get_dummies(df['Attrition_Flag'])
    df = pd.concat([df, target_dummies], axis = 1)
    return df

## To keep things simple, we'll just use the int columns as the feature columns. 
def get_int_columns(df, dtype):
    features = []
    for col, t in zip(df.columns, list(df.dtypes)):
        if t == dtype:
            features.append(col)
    return features

df = make_target(df, column = 'Attrition_flag')

target = 'Attrited Customer'
features = get_int_columns(df, dtype='int64')

y = df[target]
X = df[features]

In [4]:
# Method 1.1: Filter Methods - Correlation/ANOVA

correlation_threshold = 0.10

def correlation_selection(df,
                          features, 
                          target,
                          threshold):
    
    correlations = df[features + [target]].corr()[target]
    selected_features = correlations[abs(correlations)>threshold]
    
    remove_target = selected_features.index[selected_features.index != target]
    return selected_features[remove_target]

selected = correlation_selection(df,
                                 features,
                                 target,
                                 threshold = 0.10)

print(selected)

Total_Relationship_Count   -0.150005
Months_Inactive_12_mon      0.152449
Contacts_Count_12_mon       0.204491
Total_Revolving_Bal        -0.263053
Total_Trans_Amt            -0.168598
Total_Trans_Ct             -0.371403
Name: Attrited Customer, dtype: float64


In [5]:
# Method 1.2: Chi-Square, ANOVA, F-Test, Mutual Info Gain

from sklearn.feature_selection import (
    SelectKBest, 
    chi2, 
    f_classif, 
    f_regression,
    r_regression,
    mutual_info_classif,
    mutual_info_regression
)

# Estimatore: chi2. All below estimators can be selected.
    # chi2: Chi-Squared statistics comparing features against categorical target.
    # f_regression: F-statistic between the feature and the target.
    # f_classif: ANOVA F-value between feature and target.
    # r_regression: Pearson Correlation. Similar to the previous cell.
    # mutual_info_classif: Mutual information for a discrete target.
    # mutual_info_regression: Mutual information for a continuous target
# Number of features: 4

kb = SelectKBest(chi2, k=4)
X_new = kb.fit_transform(X,y)
X_new = pd.DataFrame(X_new)
X_new.columns = kb.get_feature_names_out()

X_new

Unnamed: 0,CLIENTNUM,Total_Revolving_Bal,Total_Trans_Amt,Total_Trans_Ct
0,768805383,777,1144,42
1,818770008,864,1291,33
2,713982108,0,1887,20
3,769911858,2517,1171,20
4,709106358,0,816,28
...,...,...,...,...
10122,772366833,1851,15476,117
10123,710638233,2186,8764,69
10124,716506083,0,10291,60
10125,717406983,0,8395,62


In [6]:
# Get feature names and scores

pd.DataFrame(kb.scores_, index=kb.feature_names_in_)

Unnamed: 0,0
CLIENTNUM,40219430.0
Customer_Age,4.654893
Dependent_count,2.626071
Months_on_book,3.367526
Total_Relationship_Count,144.3982
Months_Inactive_12_mon,102.6671
Contacts_Count_12_mon,211.0394
Total_Revolving_Bal,400235.4
Total_Trans_Amt,754246.1
Total_Trans_Ct,11865.37


In [7]:
# Method 2.1 : Wrapper Methods - Forward Stepwise

from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# estimator: This is the estimator/model we want to evaluate the features on. You can input random forest or
            #logistic regression or any model you'd use for your problem.
# n_features_to_select: The number of features we want to select.
# direction: Denote 'forward' for forward stepwise. Denote 'backward' for backward stepwise.

kb = SequentialFeatureSelector(LogisticRegression(),
                               n_features_to_select=4,
                              direction = 'forward',
                              scoring= 'neg_mean_absolute_error')
X_new = kb.fit_transform(X,y)
X_new = pd.DataFrame(X_new)
X_new.columns = kb.get_feature_names_out()

X_new

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Unnamed: 0,Total_Relationship_Count,Contacts_Count_12_mon,Total_Revolving_Bal,Total_Trans_Ct
0,5,3,777,42
1,6,2,864,33
2,4,0,0,20
3,3,1,2517,20
4,5,0,0,28
...,...,...,...,...
10122,3,3,1851,117
10123,4,3,2186,69
10124,5,4,0,60
10125,4,3,0,62


In [8]:
# Method 2.2: Wrapper Methods - Backward Stepwise

from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression

kb = SequentialFeatureSelector(LogisticRegression(),
                               n_features_to_select=4,
                              direction = 'backward')
X_new = kb.fit_transform(X,y)
X_new = pd.DataFrame(X_new)
X_new.columns = kb.get_feature_names_out()

X_new

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Unnamed: 0,Total_Relationship_Count,Contacts_Count_12_mon,Total_Revolving_Bal,Total_Trans_Amt
0,5,3,777,1144
1,6,2,864,1291
2,4,0,0,1887
3,3,1,2517,1171
4,5,0,0,816
...,...,...,...,...
10122,3,3,1851,15476
10123,4,3,2186,8764
10124,5,4,0,10291
10125,4,3,0,8395


In [9]:
# Method 2.3: Wrapper Methods - Recursive Feature Elimination

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

kb = RFE(LogisticRegression(), n_features_to_select=4)
X_new = kb.fit_transform(X,y)
X_new = pd.DataFrame(X_new)
X_new.columns = kb.get_feature_names_out()

X_new

Unnamed: 0,CLIENTNUM,Total_Revolving_Bal,Total_Trans_Amt,Total_Trans_Ct
0,768805383,777,1144,42
1,818770008,864,1291,33
2,713982108,0,1887,20
3,769911858,2517,1171,20
4,709106358,0,816,28
...,...,...,...,...
10122,772366833,1851,15476,117
10123,710638233,2186,8764,69
10124,716506083,0,10291,60
10125,717406983,0,8395,62


In [10]:
# # Install a pip package in the current Jupyter kernel
# import sys
# !{sys.executable} -m pip install mlxtend

In [11]:
# Method 2.4: Wrapper Methods - Exhaustive Stepwise

from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

lr = LogisticRegression()

efs1 = EFS(lr, 
           min_features=1,
           max_features=4,
           scoring='accuracy',
           print_progress=True,
           cv=5)

efs1 = efs1.fit(X, y)

print('Best accuracy score: %.2f' % efs1.best_score_)
print('Best subset (indices):', efs1.best_idx_)
print('Best subset (corresponding names):', efs1.best_feature_names_)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best accuracy score: 0.86
Best subset (indices): (4, 6, 7, 8)
Best subset (corresponding names): ('Total_Relationship_Count', 'Contacts_Count_12_mon', 'Total_Revolving_Bal', 'Total_Trans_Amt')


In [12]:
# Method 2.5: Wrapper Methods - Bi-Directional Elimination

from mlxtend.feature_selection import SequentialFeatureSelector as SFS

sbs = SFS(LogisticRegression(),
         k_features=4,
         forward=True,
         floating=True,
         cv=0)
sbs.fit(X, y)
sbs.k_feature_names_

('Total_Relationship_Count',
 'Months_Inactive_12_mon',
 'Contacts_Count_12_mon',
 'Total_Trans_Amt')

In [13]:
# Methods 2.6: Wrapper Methods - Variance Threshold

from sklearn.feature_selection import VarianceThreshold

selector = VarianceThreshold()
selector.fit_transform(X)

array([[768805383,        45,         3, ...,       777,      1144,
               42],
       [818770008,        49,         5, ...,       864,      1291,
               33],
       [713982108,        51,         3, ...,         0,      1887,
               20],
       ...,
       [716506083,        44,         1, ...,         0,     10291,
               60],
       [717406983,        30,         2, ...,         0,      8395,
               62],
       [714337233,        43,         2, ...,      1961,     10294,
               61]])