In [71]:
'''
Identifying & removing Constant Value Features in a dataset
'''

'\nHandling Constant Value Features in a dataset\n'

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold

In [5]:
data = pd.read_csv('train_santander.csv')
print(data.shape)
data.head()

(76020, 371)


Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0


In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels = ['TARGET'], axis = 1),
    data['TARGET'],
    test_size = 0.3,
    random_state = 0)

X_train.shape, X_test.shape

((53214, 370), (22806, 370))

In [75]:
# Select features with variance=0, i.e having constant values
constant_feature_sel = VarianceThreshold(threshold=0)
constant_feature_sel.fit(X_train)

constant_features_list = [x for x in X_train.columns if x not in X_train.columns[constant_feature_sel.get_support()]]
print(len(constant_features_list))
constant_features_list

38


['ind_var2_0',
 'ind_var2',
 'ind_var27_0',
 'ind_var28_0',
 'ind_var28',
 'ind_var27',
 'ind_var41',
 'ind_var46_0',
 'ind_var46',
 'num_var27_0',
 'num_var28_0',
 'num_var28',
 'num_var27',
 'num_var41',
 'num_var46_0',
 'num_var46',
 'saldo_var28',
 'saldo_var27',
 'saldo_var41',
 'saldo_var46',
 'delta_imp_reemb_var33_1y3',
 'delta_num_reemb_var33_1y3',
 'imp_amort_var18_hace3',
 'imp_amort_var34_hace3',
 'imp_reemb_var13_hace3',
 'imp_reemb_var33_hace3',
 'imp_reemb_var33_ult1',
 'imp_trasp_var17_out_hace3',
 'imp_trasp_var33_out_hace3',
 'num_var2_0_ult1',
 'num_var2_ult1',
 'num_reemb_var13_hace3',
 'num_reemb_var33_hace3',
 'num_reemb_var33_ult1',
 'num_trasp_var17_out_hace3',
 'num_trasp_var33_out_hace3',
 'saldo_var2_ult1',
 'saldo_medio_var13_medio_hace3']

In [76]:
X_train = constant_feature_sel.transform(X_train)
X_test = constant_feature_sel.transform(X_test)
X_train.shape, X_test.shape

((53214, 332), (22806, 332))

In [77]:
'''
Identifying & removing Quasi-Constant features in a dataset
'''

'\nHandling Quasi-Constant features in a dataset\n'

In [78]:
data = pd.read_csv('train_santander.csv')

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels = ['TARGET'], axis = 1),
    data['TARGET'],
    test_size = 0.3,
    random_state = 0)

X_train.shape, X_test.shape

((53214, 370), (22806, 370))

In [79]:
# Removing constant features from data
constant_features = [feat for feat in X_train.columns if X_train[feat].std() == 0]

X_train.drop(labels=constant_features, axis=1, inplace=True)
X_test.drop(labels=constant_features, axis=1, inplace=True)

In [81]:
# Quasi-constant features using VarianceThreshold
qconstant_feature_sel = VarianceThreshold(threshold=0.01) 
qconstant_feature_sel.fit(X_train)

qconstant_features = [x for x in X_train.columns if x not in X_train.columns[qconstant_feature_sel.get_support()]]
print(len(qconstant_features))
qconstant_features

64


['ind_var1',
 'ind_var6_0',
 'ind_var6',
 'ind_var13_largo',
 'ind_var13_medio_0',
 'ind_var13_medio',
 'ind_var14',
 'ind_var17_0',
 'ind_var17',
 'ind_var18_0',
 'ind_var18',
 'ind_var19',
 'ind_var20_0',
 'ind_var20',
 'ind_var29_0',
 'ind_var29',
 'ind_var30_0',
 'ind_var31_0',
 'ind_var31',
 'ind_var32_cte',
 'ind_var32_0',
 'ind_var32',
 'ind_var33_0',
 'ind_var33',
 'ind_var34_0',
 'ind_var34',
 'ind_var40',
 'ind_var39',
 'ind_var44_0',
 'ind_var44',
 'num_var6_0',
 'num_var6',
 'num_var13_medio_0',
 'num_var13_medio',
 'num_var18_0',
 'num_var18',
 'num_var29_0',
 'num_var29',
 'num_var33',
 'num_var34_0',
 'num_var34',
 'delta_imp_aport_var33_1y3',
 'delta_num_aport_var33_1y3',
 'ind_var7_emit_ult1',
 'ind_var7_recib_ult1',
 'num_aport_var33_hace3',
 'num_aport_var33_ult1',
 'num_var7_emit_ult1',
 'num_compra_var44_hace3',
 'num_meses_var13_medio_ult3',
 'num_meses_var17_ult3',
 'num_meses_var29_ult3',
 'num_meses_var33_ult3',
 'num_meses_var44_ult3',
 'num_reemb_var13_ult1',

In [84]:
X_train = qconstant_feature_sel.transform(X_train)
X_test = qconstant_feature_sel.transform(X_test)
X_train.shape, X_test.shape

((53214, 268), (22806, 268))

In [85]:
'''
Identifying & removing Duplicated features in a dataset
'''

'\nHandling Duplicated features in a dataset\n'

In [2]:
data = pd.read_csv('train_santander.csv')

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels = ['TARGET'], axis = 1),
    data['TARGET'],
    test_size = 0.3,
    random_state = 0)

X_train.shape, X_test.shape

((53214, 370), (22806, 370))

In [3]:
data_T = X_train.T
data_T.head()

Unnamed: 0,7526,51929,46677,46194,20190,10777,57058,40266,75243,50225,...,52620,39512,48600,55026,41993,21243,45891,42613,43567,68268
ID,15083.0,103706.0,93351.0,92424.0,40437.0,21599.0,113842.0,80549.0,150235.0,100282.0,...,105033.0,79105.0,97108.0,109781.0,84101.0,42533.0,91801.0,85345.0,87270.0,136296.0
var3,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,53.0,2.0,9.0,2.0,2.0
var15,23.0,48.0,40.0,83.0,79.0,23.0,23.0,24.0,42.0,64.0,...,26.0,28.0,25.0,33.0,23.0,46.0,29.0,23.0,30.0,55.0
imp_ent_var16_ult1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0,0.0
imp_op_var39_comer_ult1,0.0,0.0,0.0,0.0,535.41,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1770.48,0.0,0.0,72.48,0.0


In [6]:
duplicated_features = data_T[data_T.duplicated()].index.values
duplicated_features

array(['ind_var2', 'ind_var13_medio', 'ind_var18', 'ind_var26',
       'ind_var25', 'ind_var27_0', 'ind_var28_0', 'ind_var28',
       'ind_var27', 'ind_var29_0', 'ind_var29', 'ind_var32', 'ind_var34',
       'ind_var37', 'ind_var41', 'ind_var39', 'ind_var46_0', 'ind_var46',
       'num_var13_medio', 'num_var18', 'num_var26', 'num_var25',
       'num_var27_0', 'num_var28_0', 'num_var28', 'num_var27',
       'num_var29_0', 'num_var29', 'num_var32', 'num_var34', 'num_var37',
       'num_var41', 'num_var39', 'num_var46_0', 'num_var46',
       'saldo_var28', 'saldo_var27', 'saldo_var29', 'saldo_var41',
       'saldo_var46', 'delta_imp_reemb_var33_1y3',
       'delta_num_reemb_var13_1y3', 'delta_num_reemb_var17_1y3',
       'delta_num_reemb_var33_1y3', 'delta_num_trasp_var17_in_1y3',
       'delta_num_trasp_var17_out_1y3', 'delta_num_trasp_var33_in_1y3',
       'delta_num_trasp_var33_out_1y3', 'imp_amort_var18_hace3',
       'imp_amort_var34_hace3', 'imp_reemb_var13_hace3',
       'imp_reemb

In [8]:
data_unique = data_T.drop_duplicates(keep='first').T
data_unique.shape

(53214, 305)

In [9]:
data = pd.read_csv('train_santander.csv')

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels = ['TARGET'], axis = 1),
    data['TARGET'],
    test_size = 0.3,
    random_state = 0)

X_train.shape, X_test.shape

((53214, 370), (22806, 370))

In [10]:
duplicated_feat = []
for i in range(0, len(X_train.columns)):

    col_1 = X_train.columns[i]

    for col_2 in X_train.columns[i + 1:]:

        if X_train[col_1].equals(X_train[col_2]):
            print(col_1)
            print(col_2)
            print()

            duplicated_feat.append(col_2)

ind_var2_0
ind_var2

ind_var2_0
ind_var27_0

ind_var2_0
ind_var28_0

ind_var2_0
ind_var28

ind_var2_0
ind_var27

ind_var2_0
ind_var41

ind_var2_0
ind_var46_0

ind_var2_0
ind_var46

ind_var2_0
num_var27_0

ind_var2_0
num_var28_0

ind_var2_0
num_var28

ind_var2_0
num_var27

ind_var2_0
num_var41

ind_var2_0
num_var46_0

ind_var2_0
num_var46

ind_var2_0
saldo_var28

ind_var2_0
saldo_var27

ind_var2_0
saldo_var41

ind_var2_0
saldo_var46

ind_var2_0
delta_imp_reemb_var33_1y3

ind_var2_0
delta_num_reemb_var33_1y3

ind_var2_0
imp_amort_var18_hace3

ind_var2_0
imp_amort_var34_hace3

ind_var2_0
imp_reemb_var13_hace3

ind_var2_0
imp_reemb_var33_hace3

ind_var2_0
imp_reemb_var33_ult1

ind_var2_0
imp_trasp_var17_out_hace3

ind_var2_0
imp_trasp_var33_out_hace3

ind_var2_0
num_var2_0_ult1

ind_var2_0
num_var2_ult1

ind_var2_0
num_reemb_var13_hace3

ind_var2_0
num_reemb_var33_hace3

ind_var2_0
num_reemb_var33_ult1

ind_var2_0
num_trasp_var17_out_hace3

ind_var2_0
num_trasp_var33_out_hace3

ind_var2_0


ind_var46
saldo_var41

ind_var46
saldo_var46

ind_var46
delta_imp_reemb_var33_1y3

ind_var46
delta_num_reemb_var33_1y3

ind_var46
imp_amort_var18_hace3

ind_var46
imp_amort_var34_hace3

ind_var46
imp_reemb_var13_hace3

ind_var46
imp_reemb_var33_hace3

ind_var46
imp_reemb_var33_ult1

ind_var46
imp_trasp_var17_out_hace3

ind_var46
imp_trasp_var33_out_hace3

ind_var46
num_var2_0_ult1

ind_var46
num_var2_ult1

ind_var46
num_reemb_var13_hace3

ind_var46
num_reemb_var33_hace3

ind_var46
num_reemb_var33_ult1

ind_var46
num_trasp_var17_out_hace3

ind_var46
num_trasp_var33_out_hace3

ind_var46
saldo_var2_ult1

ind_var46
saldo_medio_var13_medio_hace3

num_var6_0
num_var29_0

num_var6
num_var29

num_var13_medio_0
num_var13_medio

num_var18_0
num_var18

num_var26_0
num_var26

num_var25_0
num_var25

num_var27_0
num_var28_0

num_var27_0
num_var28

num_var27_0
num_var27

num_var27_0
num_var41

num_var27_0
num_var46_0

num_var27_0
num_var46

num_var27_0
saldo_var28

num_var27_0
saldo_var27

num_var27_

saldo_var41
num_var2_0_ult1

saldo_var41
num_var2_ult1

saldo_var41
num_reemb_var13_hace3

saldo_var41
num_reemb_var33_hace3

saldo_var41
num_reemb_var33_ult1

saldo_var41
num_trasp_var17_out_hace3

saldo_var41
num_trasp_var33_out_hace3

saldo_var41
saldo_var2_ult1

saldo_var41
saldo_medio_var13_medio_hace3

saldo_var46
delta_imp_reemb_var33_1y3

saldo_var46
delta_num_reemb_var33_1y3

saldo_var46
imp_amort_var18_hace3

saldo_var46
imp_amort_var34_hace3

saldo_var46
imp_reemb_var13_hace3

saldo_var46
imp_reemb_var33_hace3

saldo_var46
imp_reemb_var33_ult1

saldo_var46
imp_trasp_var17_out_hace3

saldo_var46
imp_trasp_var33_out_hace3

saldo_var46
num_var2_0_ult1

saldo_var46
num_var2_ult1

saldo_var46
num_reemb_var13_hace3

saldo_var46
num_reemb_var33_hace3

saldo_var46
num_reemb_var33_ult1

saldo_var46
num_trasp_var17_out_hace3

saldo_var46
num_trasp_var33_out_hace3

saldo_var46
saldo_var2_ult1

saldo_var46
saldo_medio_var13_medio_hace3

delta_imp_reemb_var13_1y3
delta_num_reemb_var13_1y

num_trasp_var33_out_hace3
saldo_var2_ult1

num_trasp_var33_out_hace3
saldo_medio_var13_medio_hace3

saldo_var2_ult1
saldo_medio_var13_medio_hace3

