In [20]:
import pandas as pd
import numpy as np
import gc
import warnings
warnings.filterwarnings("ignore")

In [21]:
# Load application_train.csv into a DataFrame.
# NOTE(review): the absolute Windows path ties this notebook to one machine;
# consider reading from a configurable data directory instead.
df = pd.read_csv(r'D:\application_train.csv')

In [22]:
df.shape

(307511, 122)

In [23]:
df.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
# Stratified Sampling

In [25]:
# Draw 10% of the rows from each TARGET class separately so the sample keeps
# the class proportions of the full data set.
# A fixed seed makes the draw reproducible: without it every kernel restart
# produces a different sample and therefore different selected features.
SAMPLE_SEED = 42

df_sample_1 = df.loc[df.TARGET == 1].sample(frac=0.1, replace=False, random_state=SAMPLE_SEED)
print('label 1 sample size:', str(df_sample_1.shape[0]))

df_sample_0 = df.loc[df.TARGET == 0].sample(frac=0.1, replace=False, random_state=SAMPLE_SEED)
print('label 0 sample size:', str(df_sample_0.shape[0]))

# Recombine both classes and order the rows by applicant id.
df_sample = pd.concat([df_sample_0, df_sample_1], axis=0).sort_values('SK_ID_CURR')

label 1 sample size: 2482
label 0 sample size: 28269


In [26]:
# Imputing missing values

In [27]:
# Partition the sampled columns by dtype: 'object' columns are treated as
# categorical, every other dtype as numerical. Column order is preserved.
is_object = {
    col: df_sample[col].dtype == 'object'
    for col in df_sample.columns.tolist()
}
categorical_list = [col for col, flag in is_object.items() if flag]
numerical_list = [col for col, flag in is_object.items() if not flag]

print('Number of categorical features:', len(categorical_list))
print('Number of numerical features:', len(numerical_list))

Number of categorical features: 16
Number of numerical features: 106


In [28]:
from sklearn.impute import SimpleImputer
# Replace NaN in the numeric columns with each column's mean, where the mean
# is computed on the sample itself.
# NOTE(review): numerical_list holds every non-object column, so it presumably
# still contains SK_ID_CURR and TARGET at this point (they are only dropped
# later) — confirm that passing them through the imputer is intended.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
df_sample[numerical_list] = imputer.fit_transform(df_sample[numerical_list])

In [29]:
# Deal with Categorical features: OneHotEncoding

In [30]:
# Drop the reference to the full raw frame and force a collection to free
# memory; only the sample is used from here on in the cells shown.
# NOTE: this makes the cell non-idempotent — running it a second time raises
# NameError because `df` no longer exists.
del df
gc.collect()

# One-hot encode the categorical (object-dtype) columns. drop_first=True omits
# the first level of each column so the dummies are not perfectly collinear.
df_sample = pd.get_dummies(df_sample, drop_first=True)
print(df_sample.shape)

(30751, 227)


In [31]:
# Feature matrix and target

In [33]:
# Everything except the applicant id and the label is a candidate feature.
non_feature_columns = ['SK_ID_CURR', 'TARGET']
X = df_sample.drop(columns=non_feature_columns)
y = df_sample['TARGET']
feature_name = list(X.columns)

# feature_name

### FEATURE SELECTION

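Select 100 features from the 225 candidates (227 columns after encoding, minus `SK_ID_CURR` and `TARGET`) <br>
xxx_support: boolean list marking, for each feature, whether it was selected<br>
xxx_feature: the names of the selected features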

#### Filter 

##### Pearson Correlation filter
normalization : No <br>
Impute missing values : yes

In [36]:
def cor_selector(X, y, num_feats=100):
    """Keep the features with the largest absolute Pearson correlation to the target.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; every column must be numeric (or boolean).
    y : array-like
        Target values, one per row of ``X``.
    num_feats : int, default 100
        Maximum number of features to keep. Values larger than the number of
        columns keep every column; values <= 0 keep none.

    Returns
    -------
    cor_support : list of bool
        One flag per column of ``X`` (in column order), True if selected.
    cor_feature : list
        Names of the selected columns, ordered from the weakest to the
        strongest absolute correlation.
    """
    # Work from X's own columns rather than a notebook-level global so the
    # function does not depend on hidden kernel state.
    columns = X.columns.tolist()

    # A constant column has zero variance, so its correlation is NaN; the
    # accompanying floating-point warning is silenced for this computation.
    with np.errstate(divide='ignore', invalid='ignore'):
        correlations = np.array(
            [np.corrcoef(X[col], y)[0, 1] for col in columns], dtype=float
        )

    # Treat undefined correlations as "no correlation".
    correlations = np.where(np.isnan(correlations), 0.0, correlations)

    # Clamp the request so that 0 really means "nothing" (a bare [-0:] slice
    # would return everything) and oversized requests return all columns.
    keep = min(max(int(num_feats), 0), len(columns))
    order = np.argsort(np.abs(correlations))
    top_positions = order[len(order) - keep:]

    cor_feature = [columns[pos] for pos in top_positions]

    # Boolean mask aligned with X's column order.
    selected = set(cor_feature)
    cor_support = [col in selected for col in columns]

    return cor_support, cor_feature

In [37]:
# Apply the Pearson filter to the full feature matrix and report how many
# features it kept.
cor_support, cor_feature = cor_selector(X, y)
print(len(cor_feature), 'selected values')

100 selected values


### Chi-squared filter

Normalization: MinMaxScaler (chi2 requires non-negative feature values)<br>
Impute missing values: yes

In [38]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler

In [40]:
# Scale every feature into [0, 1] first: chi2 only accepts non-negative values.
X_norm = MinMaxScaler().fit_transform(X)

# Score each feature against the target on the whole sample and keep the
# 100 highest-scoring ones.
chi_selector = SelectKBest(score_func=chi2, k=100).fit(X_norm, y)
chi_selector

SelectKBest(k=100, score_func=<function chi2 at 0x00000247521783A8>)

In [41]:
# Boolean mask over X's columns, then the matching column names.
chi_support = chi_selector.get_support()
chi_feature = X.columns[chi_support].tolist()
print(len(chi_feature), 'selected features')

100 selected features


**Forward Selection**: Forward selection is an iterative method in which we start with no features in the model. In each iteration, we add the feature that best improves the model, until adding a new variable no longer improves its performance.

**Backward Elimination**: In backward elimination, we start with all the features and, at each iteration, remove the least significant feature whose removal improves the performance of the model. We repeat this until no improvement is observed on removal of features.

**Recursive Feature Elimination (RFE Wrapper)**: It is a greedy optimization algorithm which aims to find the best-performing feature subset. It repeatedly creates models and sets aside the best or the worst performing feature at each iteration. It constructs the next model with the remaining features until all the features are exhausted. It then ranks the features based on the order of their elimination.

### Wrapper

Normalization: depends on the model used; yes for LR<br>
Impute missing values: depends on the model used; yes for LR

In [42]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [43]:
# Recursive feature elimination: refit a logistic regression on the scaled
# features, dropping 10 features per round until 100 remain.
# NOTE(review): the estimator uses its default settings and warnings are
# silenced at the top of the notebook, so a solver convergence warning would
# go unnoticed here — worth checking.
rfe_selector = RFE(
    estimator=LogisticRegression(),
    n_features_to_select=100,
    step=10,
    verbose=5,
).fit(X_norm, y)
rfe_selector

Fitting estimator with 225 features.
Fitting estimator with 215 features.
Fitting estimator with 205 features.
Fitting estimator with 195 features.
Fitting estimator with 185 features.
Fitting estimator with 175 features.
Fitting estimator with 165 features.
Fitting estimator with 155 features.
Fitting estimator with 145 features.
Fitting estimator with 135 features.
Fitting estimator with 125 features.
Fitting estimator with 115 features.
Fitting estimator with 105 features.


RFE(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                 fit_intercept=True, intercept_scaling=1,
                                 l1_ratio=None, max_iter=100,
                                 multi_class='auto', n_jobs=None, penalty='l2',
                                 random_state=None, solver='lbfgs', tol=0.0001,
                                 verbose=0, warm_start=False),
    n_features_to_select=100, step=10, verbose=5)

In [45]:
# Boolean mask over X's columns, then the matching column names.
rfe_support = rfe_selector.get_support()
rfe_feature = X.columns[rfe_support].tolist()
print(len(rfe_feature), 'selected features')

100 selected features


## Embedded 

Normalization: Yes<br>
Impute missing values: Yes

In [51]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

In [53]:
# Embedded selection: fit an L2-penalised logistic regression on the scaled
# features and keep those whose importance is at least 1.25x the median.
# Unlike the fixed-size selectors, the number of features kept here depends on
# the fitted coefficients.
# `threshold` is passed by keyword: current scikit-learn releases accept only
# `estimator` positionally and raise TypeError for a positional threshold.
embeded_lr_selector = SelectFromModel(
    LogisticRegression(penalty="l2"),
    threshold='1.25*median',
)
embeded_lr_selector.fit(X_norm, y)

SelectFromModel(estimator=LogisticRegression(C=1.0, class_weight=None,
                                             dual=False, fit_intercept=True,
                                             intercept_scaling=1, l1_ratio=None,
                                             max_iter=100, multi_class='auto',
                                             n_jobs=None, penalty='l2',
                                             random_state=None, solver='lbfgs',
                                             tol=0.0001, verbose=0,
                                             warm_start=False),
                max_features=None, norm_order=1, prefit=False,
                threshold='1.25*median')

In [54]:
# Boolean mask over X's columns, then the matching column names.
embeded_lr_support = embeded_lr_selector.get_support()
embeded_lr_feature = X.columns[embeded_lr_support].tolist()

print(len(embeded_lr_feature), 'selected features')

84 selected features
