**Connect with me on LinkedIn**: https://www.linkedin.com/in/dheerajkumar1997/

## Filter Methods: Machine Learning Pipeline

### Step-1: Import Dependencies and Load Dataset

In [1]:
# Import Dependencies
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

In [2]:
# Read the Santander Customer Satisfaction training table and show (rows, columns)
data = pd.read_csv(
    './dataset/Santander-Customer-Satisfaction-data/train.csv'
)
data.shape

(76020, 371)

In [3]:
# Hold out 30% of the rows for testing; TARGET is the label, every other column is a feature
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['TARGET']),
    data['TARGET'],
    test_size=0.3,
    random_state=0,
)
(X_train.shape, X_test.shape)

((53214, 370), (22806, 370))

In [4]:
# Snapshot the unfiltered split so the baseline models can be trained on it later
X_train_original, X_test_original = X_train.copy(), X_test.copy()

### Step-2: Get and Remove Constant Features from Dataset

In [5]:
# Identify constant features: columns whose standard deviation on the training set is exactly 0.
# (They are only collected here; the actual removal happens in the next cell.)
constant_features = X_train.std().loc[lambda std: std == 0].index.tolist()

In [6]:
# Drop the constant columns from both splits (the list was derived from the training set only)
X_train = X_train.drop(columns=constant_features)
X_test = X_test.drop(columns=constant_features)

(X_train.shape, X_test.shape)

((53214, 332), (22806, 332))

### Step-3: Remove Quasi-Constant Features from Dataset

In [7]:
# Detect quasi-constant features with a variance threshold of 0.01.
# For a 0/1 feature, variance p * (1 - p) = 0.01 means roughly 99% of rows share one value.
sel = VarianceThreshold(threshold=0.01).fit(X_train)  # fit learns the per-feature variances
sel.get_support().sum()  # number of features that are NOT quasi-constant

268

In [8]:
# Names of the columns the selector retains (looked up by integer position)
features_to_keep = X_train.columns[sel.get_support(indices=True)]

In [9]:
# Remove the quasi-constant features; the selector was fitted on the training set only
X_train, X_test = sel.transform(X_train), sel.transform(X_test)

(X_train.shape, X_test.shape)

((53214, 268), (22806, 268))

In [10]:
# Transform the arrays back to DataFrames.
# sel.transform() returned plain arrays, so both the column names and the row index were
# lost. Restore the column names from `features_to_keep` and re-attach the row index of the
# matching target Series, so X and y stay aligned by label as well as by position.
X_train = pd.DataFrame(X_train, columns=features_to_keep, index=y_train.index)

X_test = pd.DataFrame(X_test, columns=features_to_keep, index=y_test.index)

### Step-4: Detect and Remove Duplicate Features

In [11]:
# Check for duplicated features in the training set.
# For every group of identical columns the first occurrence is kept and each later copy
# is recorded exactly once in `duplicated_feat`.
duplicated_feat = []
already_flagged = set()  # fast membership test alongside the ordered list
for i, col_1 in enumerate(X_train.columns):
    if i % 10 == 0:  # lightweight progress indicator
        print(i)

    # A column already known to duplicate an earlier one adds nothing new: anything
    # equal to it was flagged when that earlier column was processed.
    if col_1 in already_flagged:
        continue

    for col_2 in X_train.columns[i + 1:]:
        if col_2 not in already_flagged and X_train[col_1].equals(X_train[col_2]):
            duplicated_feat.append(col_2)
            already_flagged.add(col_2)

len(duplicated_feat)

0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260


16

In [12]:
# Drop the later copies of duplicated columns from both splits
X_train = X_train.drop(columns=duplicated_feat)
X_test = X_test.drop(columns=duplicated_feat)

(X_train.shape, X_test.shape)

((53214, 252), (22806, 252))

In [13]:
# Snapshot the data after removing constant, quasi-constant and duplicated features,
# so model performance at this stage can be measured later
X_train_basic_filter, X_test_basic_filter = X_train.copy(), X_test.copy()

### Step-5: Detect and Remove Features with Correlation

In [14]:
# Find and remove correlated features
def correlation(dataset, threshold):
    """Return the names of columns that are strongly correlated with an earlier column.

    A column is selected when the absolute value of its correlation (as computed by
    ``dataset.corr()``) with at least one column positioned before it is strictly
    greater than ``threshold``. The first column of each correlated pair is never
    selected on account of that pair.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are compared pairwise.
    threshold : float
        Absolute correlation above which the later column of a pair is selected.

    Returns
    -------
    set
        Names of the selected columns.
    """
    abs_corr = dataset.corr().abs()  # the sign of the coefficient is irrelevant here
    # Keep only the strictly-lower triangle so each pair is inspected once, always
    # comparing a column against the columns that precede it. NaN coefficients
    # compare as False and therefore never select a column.
    above = np.tril(abs_corr.to_numpy() > threshold, k=-1)
    return set(abs_corr.columns[above.any(axis=1)])
 
# Columns whose absolute correlation with an earlier column exceeds 0.8
corr_features = correlation(X_train, threshold=0.8)
print('correlated features: ', len(corr_features))

correlated features:  133


In [15]:
# Drop the correlated columns from both splits (the set was derived from the training set only)
X_train = X_train.drop(columns=list(corr_features))
X_test = X_test.drop(columns=list(corr_features))

(X_train.shape, X_test.shape)

((53214, 119), (22806, 119))

### Step-6: Performance of Final Dataset on Machine Learning Algorithms

#### 1. Random Forest Classifier

In [16]:
# Function to build Random Forest Classifier and compare performance in train and test set
def run_randomForests(X_train, X_test, y_train, y_test):
    """Fit a random forest on the training data and print train/test ROC-AUC.

    The classifier uses 200 trees of maximum depth 4 with a fixed random state.
    ROC-AUC is computed from the second column of ``predict_proba``
    (class 1 for a 0/1 target). Nothing is returned; the scores are printed.
    """
    rf = RandomForestClassifier(n_estimators=200, random_state=39, max_depth=4)
    rf.fit(X_train, y_train)

    splits = (
        ('Train set', X_train, y_train),
        ('Test set', X_test, y_test),
    )
    for title, features, target in splits:
        print(title)
        scores = rf.predict_proba(features)[:, 1]
        print('Random Forests roc-auc: {}'.format(roc_auc_score(target, scores)))

In [17]:
# Baseline: random forest on the original feature set, which still contains constant,
# duplicated and correlated features (only the ID column is excluded)
run_randomForests(
    X_train_original.drop(columns=['ID']),
    X_test_original.drop(columns=['ID']),
    y_train,
    y_test,
)

Train set
Random Forests roc-auc: 0.8012315676132568
Test set
Random Forests roc-auc: 0.7900499757912425


In [18]:
# Random forest after removing constant, quasi-constant and duplicated features
run_randomForests(
    X_train_basic_filter.drop(columns=['ID']),
    X_test_basic_filter.drop(columns=['ID']),
    y_train,
    y_test,
)

Train set
Random Forests roc-auc: 0.8016570464386661
Test set
Random Forests roc-auc: 0.7910330692426437


In [19]:
# Random forest on the final dataset, with correlated features removed as well
run_randomForests(
    X_train.drop(columns=['ID']),
    X_test.drop(columns=['ID']),
    y_train,
    y_test,
)

Train set
Random Forests roc-auc: 0.8073914001626228
Test set
Random Forests roc-auc: 0.793766724733034


We can see that removing constant, quasi-constant, duplicated and correlated features reduced the feature space dramatically (from 370 to 119 columns) without hurting the performance of the random forest (test ROC-AUC of 0.790 vs 0.794). If anything, the model now makes slightly better predictions, most likely because high-dimensional feature spaces negatively affect the performance of random forests.

#### 2. Logistic Regression

In [20]:
# Function to build logistic regression and compare performance in train and test set
def run_logistic(X_train, X_test, y_train, y_test):
    """Fit a logistic regression on the training data and print train/test ROC-AUC.

    The model is created with default settings apart from a fixed random state.
    ROC-AUC is computed from the second column of ``predict_proba``
    (class 1 for a 0/1 target). Nothing is returned; the scores are printed.
    """
    logit = LogisticRegression(random_state=44)
    logit.fit(X_train, y_train)

    train_scores = logit.predict_proba(X_train)[:, 1]
    print('Train set')
    print('Logistic Regression roc-auc: {}'.format(roc_auc_score(y_train, train_scores)))

    test_scores = logit.predict_proba(X_test)[:, 1]
    print('Test set')
    print('Logistic Regression roc-auc: {}'.format(roc_auc_score(y_test, test_scores)))

In [21]:
# Logistic regression on the original feature set.
# The scaler is fitted on the training rows only and then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train_original.drop(columns=['ID']))

run_logistic(
    scaler.transform(X_train_original.drop(columns=['ID'])),
    scaler.transform(X_test_original.drop(columns=['ID'])),
    y_train,
    y_test,
)

Train set
Logistic Regression roc-auc: 0.8070006916699176
Test set
Logistic Regression roc-auc: 0.7936258651482251


  np.exp(prob, prob)


In [22]:
# Logistic regression after removing constant, quasi-constant and duplicated features.
# The scaler is fitted on the training rows only and then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train_basic_filter.drop(columns=['ID']))

run_logistic(
    scaler.transform(X_train_basic_filter.drop(columns=['ID'])),
    scaler.transform(X_test_basic_filter.drop(columns=['ID'])),
    y_train,
    y_test,
)

Train set
Logistic Regression roc-auc: 0.8063203720892692
Test set
Logistic Regression roc-auc: 0.7932285496615372


  np.exp(prob, prob)


In [23]:
# Logistic regression on the final dataset, with correlated features removed as well.
# The scaler is fitted on the training rows only and then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train.drop(columns=['ID']))

run_logistic(
    scaler.transform(X_train.drop(columns=['ID'])),
    scaler.transform(X_test.drop(columns=['ID'])),
    y_train,
    y_test,
)

Train set
Logistic Regression roc-auc: 0.7964875051006453
Test set
Logistic Regression roc-auc: 0.7929020262989868


**Connect with me on LinkedIn**: https://www.linkedin.com/in/dheerajkumar1997/