**Connect With Me in Linkedin** :- https://www.linkedin.com/in/dheerajkumar1997/

## Feature Selection: Constant Features

**Aim: Find and Remove Constant Features from a dataset.**

**Constant Features:** Constant features are those features that show the same value for all observations in the dataset i.e. same value for all rows in a dataset. These features do not provide any distinctive information which can be useful for a machine learning algorithm.

In [1]:
# Import Dependencies
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# VarianceThreshold Function to find constant features
# It is a Feature selector that removes all low-variance features.
# This feature selection algorithm looks only at the features (X), not the desired outputs (y), and can thus be used for unsupervised learning.
from sklearn.feature_selection import VarianceThreshold

In [2]:
# Load Dataset
df = pd.read_csv('./dataset/Santander-Customer-Satisfaction-data/train.csv', nrows=50000)
df.shape

(50000, 371)

In [3]:
# Check for the presence of null
[col for col in df.columns if df[col].isnull().sum() > 0]

[]

In [4]:
df.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0


In [5]:
# Train Test Split
X = df.drop(['TARGET'], axis=1)
X.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var29_ult3,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016


In [6]:
y = df['TARGET']
y.head()

0    0
1    0
2    0
3    0
4    0
Name: TARGET, dtype: int64

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((35000, 370), (35000,), (15000, 370), (15000,))

In [8]:
# Remove constant features using VarianceThreshold
feature_selector = VarianceThreshold(threshold=0)

# Fit finds the features with zero variance
feature_selector.fit(X_train)

VarianceThreshold(threshold=0)

In [9]:
# Get Feature Support
# Returns True if value is not constant else False
feature_selector.get_support()

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False, False, False, False,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True, False, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [10]:
# Number of Features that are not constant
sum(feature_selector.get_support())

334

In [11]:
# Features that are constant
[x for x in X_train.columns if x not in X_train.columns[feature_selector.get_support()]]

['ind_var2_0',
 'ind_var2',
 'ind_var27_0',
 'ind_var28_0',
 'ind_var28',
 'ind_var27',
 'ind_var41',
 'ind_var46_0',
 'ind_var46',
 'num_var27_0',
 'num_var28_0',
 'num_var28',
 'num_var27',
 'num_var41',
 'num_var46_0',
 'num_var46',
 'saldo_var28',
 'saldo_var27',
 'saldo_var41',
 'saldo_var46',
 'imp_amort_var18_hace3',
 'imp_amort_var34_hace3',
 'imp_reemb_var13_hace3',
 'imp_reemb_var17_hace3',
 'imp_reemb_var33_hace3',
 'imp_trasp_var17_out_hace3',
 'imp_trasp_var33_out_hace3',
 'num_var2_0_ult1',
 'num_var2_ult1',
 'num_reemb_var13_hace3',
 'num_reemb_var17_hace3',
 'num_reemb_var33_hace3',
 'num_trasp_var17_out_hace3',
 'num_trasp_var33_out_hace3',
 'saldo_var2_ult1',
 'saldo_medio_var13_medio_hace3']

In [12]:
# Remove constant features from Training and Test Data
X_train = feature_selector.transform(X_train)
X_test = feature_selector.transform(X_test)
X_train.shape, X_test.shape

((35000, 334), (15000, 334))

### Variance Threshold from Scratch

In [13]:
# Load Dataset
data = pd.read_csv('./dataset/Santander-Customer-Satisfaction-data/train.csv', nrows=50000)
data.shape

(50000, 371)

In [14]:
# Train Test Split
X_1 = data.drop(['TARGET'], axis=1)
X_1.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var29_ult3,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016


In [15]:
y_1 = data['TARGET']
y_1.head()

0    0
1    0
2    0
3    0
4    0
Name: TARGET, dtype: int64

In [16]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_1, y_1, test_size=0.3, random_state=101)
X_train1.shape, y_train1.shape, X_test1.shape, y_test1.shape

((35000, 370), (35000,), (15000, 370), (15000,))

In [17]:
# Find all constant Features
# For each feature in training data, if it's standard deviation == 0, it is a constant variable.
const_features = [feat for feat in X_train1.columns if X_train1[feat].std() == 0]

In [18]:
len(const_features)

36

In [19]:
# Removing constants from data
X_train1.drop(labels=const_features, axis=1, inplace=True)
X_test1.drop(labels=const_features, axis=1, inplace=True)

X_train1.shape, X_test1.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


((35000, 334), (15000, 334))

The Variance Threshold method works only on the numerical values. So, for the categorical values we have two options:

1. Convert the categorical values into numerical values like using one hot encoding.
2. Transform values into object and call the unique function on them.

In [20]:
# Load Dataset
data2 = pd.read_csv('./dataset/Santander-Customer-Satisfaction-data/train.csv', nrows=50000)
data2.shape

(50000, 371)

In [21]:
# Train Test Split
X_2 = data2.drop(['TARGET'], axis=1)
X_2.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var29_ult3,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016


In [22]:
y_2 = data2['TARGET']
y_2.head()

0    0
1    0
2    0
3    0
4    0
Name: TARGET, dtype: int64

In [23]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_2, y_2, test_size=0.3, random_state=101)
X_train2.shape, y_train2.shape, X_test2.shape, y_test2.shape

((35000, 370), (35000,), (15000, 370), (15000,))

In [24]:
# Convert Numerical values to categorical values
X_train2 = X_train2.astype('O')
X_train2.dtypes

ID                               object
var3                             object
var15                            object
imp_ent_var16_ult1               object
imp_op_var39_comer_ult1          object
imp_op_var39_comer_ult3          object
imp_op_var40_comer_ult1          object
imp_op_var40_comer_ult3          object
imp_op_var40_efect_ult1          object
imp_op_var40_efect_ult3          object
imp_op_var40_ult1                object
imp_op_var41_comer_ult1          object
imp_op_var41_comer_ult3          object
imp_op_var41_efect_ult1          object
imp_op_var41_efect_ult3          object
imp_op_var41_ult1                object
imp_op_var39_efect_ult1          object
imp_op_var39_efect_ult3          object
imp_op_var39_ult1                object
imp_sal_var16_ult1               object
ind_var1_0                       object
ind_var1                         object
ind_var2_0                       object
ind_var2                         object
ind_var5_0                       object


In [25]:
# Find all constant Features
const_features = [feat for feat in X_train2.columns if len(X_train2[feat].unique()) == 1]
len(const_features)

36

**Connect With Me in Linkedin** :- https://www.linkedin.com/in/dheerajkumar1997/