In [25]:
import numpy as np
import pandas as pd 

# sklearn preprocessing for dealing with categorical variables
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer

import os

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split

from lightgbm import LGBMClassifier


In [2]:
# Read the four raw CSV tables: a train/test pair named after the bureau data
# and a train/test pair named after the previous-loan data.
train_bureau, test_bureau = (
    pd.read_csv(csv_path)
    for csv_path in ('train_bureau_raw.csv', 'test_bureau_raw.csv')
)
train_previous, test_previous = (
    pd.read_csv(csv_path)
    for csv_path in ('train_previous_raw.csv', 'test_previous_raw.csv')
)

# Column names of the two training tables
bureau_columns = train_bureau.columns.tolist()
previous_columns = train_previous.columns.tolist()

In [3]:
# Split the column names into three groups. Comprehensions over the column
# lists (instead of list(set(...))) keep the columns in file order, so the
# groups are identical on every run rather than depending on set iteration
# order.
bureau_column_set = set(bureau_columns)
previous_column_set = set(previous_columns)

# Bureau only features
bureau_features = [col for col in bureau_columns if col not in previous_column_set]

# Previous only features
previous_features = [col for col in previous_columns if col not in bureau_column_set]

# Original features will be in both datasets
original_features = [col for col in previous_columns if col in bureau_column_set]

print('There are %d original features.' % len(original_features))
print('There are %d bureau and bureau balance features.' % len(bureau_features))
print('There are %d previous Home Credit loan features.' % len(previous_features))

There are 123 original features.
There are 179 bureau and bureau balance features.
There are 1043 previous Home Credit loan features.


In [4]:
# Labels and row identifiers, taken from the bureau tables
train_labels = train_bureau['TARGET']

train_ids = train_bureau['SK_ID_CURR']
test_ids = test_bureau['SK_ID_CURR']

# The merge key has to be part of the subset taken from the previous tables.
# The membership check makes this cell safe to re-run: an unconditional append
# would add the key a second time and duplicate the key column in the subset.
if 'SK_ID_CURR' not in previous_features:
    previous_features.append('SK_ID_CURR')

# Merge the dataframes avoiding duplicating columns by subsetting train_previous
train = train_bureau.merge(train_previous[previous_features], on = 'SK_ID_CURR')
test = test_bureau.merge(test_previous[previous_features], on = 'SK_ID_CURR')


In [5]:
# Report the (rows, columns) of the merged training and testing frames
for shape_label, merged_frame in (('Training shape: ', train), ('Testing shape: ', test)):
    print(shape_label, merged_frame.shape)

Training shape:  (307511, 1345)
Testing shape:  (48744, 1344)


In [6]:
# Draw a 1000-row random subset of the training frame; the fixed seed makes
# the draw repeatable.
train1 = train.sample(random_state = 42, n = 1000)

# Show the (rows, columns) of the subset
print('Training shape: ', train1.shape)

Training shape:  (1000, 1345)


In [7]:
# Target column of the sampled training rows
tr_labels = train1.loc[:, 'TARGET']

In [8]:
# Draw a 1000-row random subset of the testing frame (fixed seed, repeatable)
test1 = test.sample(n = 1000, random_state = 42)

# The label previously read 'Training shape: ' although this is the testing sample
print('Testing shape: ', test1.shape)

Training shape:  (1000, 1344)


In [26]:
def feat_sel(train, test, corr_threshold = 0.9, missing_threshold = 0.75):
    """Reduce `train` and `test` to a common, smaller set of features.

    Steps, in order:
      1. Label-encode object columns that have at most two distinct values,
         one-hot encode the remaining ones, and keep only the columns present
         in both frames.
      2. Drop every column whose absolute correlation (computed on the
         training data) with an earlier column exceeds `corr_threshold`.
      3. Drop every column whose fraction of missing values exceeds
         `missing_threshold` in either frame.
      4. Median-impute, then min-max scale to [0, 1]; both transformers are
         fitted on the training data only.
      5. Repeatedly fit an LGBMClassifier on two different train/validation
         splits, average the feature importances, and remove the features
         whose average importance is 0.0, until no such feature is left.

    Progress is reported with print().

    Parameters
    ----------
    train : pandas.DataFrame
        Training data; must contain the columns 'SK_ID_CURR' and 'TARGET'.
    test : pandas.DataFrame
        Testing data; must contain the column 'SK_ID_CURR'.
    corr_threshold : float, default 0.9
        Absolute-correlation limit used in step 2.
    missing_threshold : float, default 0.75
        Missing-value fraction limit used in step 3.

    Returns
    -------
    tuple of numpy.ndarray
        (train, test): the imputed and scaled matrices restricted to the
        retained features, in the same column order. Column names are not
        part of the result.
    """
    # Number of LightGBM fits whose importances are averaged per round
    n_fits = 2

    tr_labels = train['TARGET']

    # drop() returns new frames, so the caller's frames are left untouched
    train = train.drop(columns = ['SK_ID_CURR', 'TARGET'])
    test = test.drop(columns = ['SK_ID_CURR'])

    print('Phase Label encoder')
    le = LabelEncoder()

    for col in train:
        # Only object columns with 2 or fewer distinct values (NaN counts as one)
        if train[col].dtype == 'object' and train[col].nunique(dropna = False) <= 2:
            # Fit on the training data, then transform both frames.
            # NOTE: a value present only in `test` makes transform() raise.
            le.fit(train[col])
            train[col] = le.transform(train[col])
            test[col] = le.transform(test[col])

    print('\nDummyfication')

    # One-hot encode what is still categorical
    train = pd.get_dummies(train)
    test = pd.get_dummies(test)

    # Keep only the columns the two frames have in common
    train, test = train.align(test, join = 'inner', axis = 1)

    print('\nNumber of feature in the training data after label encoder and get dummy: ', train.shape[1])
    print('Number of feature in the testing data after label encoder and get dummy: ', test.shape[1])

    print('\nPhase correlation')

    corr = train.corr().abs()

    # Strict upper triangle: each pair appears once and the diagonal is excluded.
    # The builtin `bool` replaces `np.bool`, which NumPy 1.24 removed.
    upper = corr.where(np.triu(np.ones(corr.shape), k = 1).astype(bool))

    tr_drop = [column for column in upper.columns if (upper[column] > corr_threshold).any()]

    print('\nNumber of variable dropped because they were too correlated :', len(tr_drop))

    train_kept = train.drop(columns = tr_drop)
    test_kept = test.drop(columns = tr_drop)

    print('\nNumber of feature in the training data after the drop of the variable too much correlated', train_kept.shape[1])
    print('Number of feature in the testing data after the drop of the variable too much correlated', test_kept.shape[1])

    print('\nPhase Nan')

    # Fraction of missing values per column, evaluated separately on each frame
    train_missing = train_kept.isnull().mean()
    test_missing = test_kept.isnull().mean()

    train_sparse = train_missing.index[train_missing > missing_threshold]
    test_sparse = test_missing.index[test_missing > missing_threshold]

    # Rendered as '75' for the default threshold, so the default output is unchanged
    percent = '{:g}'.format(missing_threshold * 100)
    print('\nNumber of columns with more than ' + percent + '% of missing values in train :', len(train_sparse))
    print('Number of columns with more than ' + percent + '% of missing values in test :', len(test_sparse))

    train_kept = train_kept.drop(columns = train_sparse)
    test_kept = test_kept.drop(columns = test_sparse)

    # Re-align so a column dropped from either frame is dropped from both
    train, test = train_kept.align(test_kept, join = 'inner', axis = 1)

    print('\nNumber of feature in the training data after removing missing values:', train.shape[1])
    print('Number of feature in the testing data after removing missing values :', test.shape[1])

    # Impute and scale; both are fitted on the training data only
    imputer = SimpleImputer(strategy = 'median')
    scaler = MinMaxScaler(feature_range = (0, 1))

    train = scaler.fit_transform(imputer.fit_transform(train))
    test = scaler.transform(imputer.transform(test))

    print('\nStart of the feature selection with LGBM attributes : feature_importances_')
    print('')

    # LightGBM 4.0 removed the `early_stopping_rounds` and `verbose` arguments
    # of fit() in favour of callbacks. Use the callbacks when this LightGBM
    # version provides them, otherwise fall back to the legacy arguments.
    try:
        from lightgbm import early_stopping, log_evaluation
        use_callbacks = True
    except ImportError:
        use_callbacks = False

    # From here on the data are plain arrays without missing values, so
    # features are removed with a boolean mask; nothing has to be re-imputed.
    while train.shape[1] > 0:

        model = LGBMClassifier()

        # Average the importances over fits on different train/validation splits
        feat_imp = np.zeros(train.shape[1])

        for seed in range(n_fits):
            X_train, X_valid, y_train, y_valid = train_test_split(
                train, tr_labels, test_size = 0.20, random_state = seed)

            if use_callbacks:
                # Fresh callback objects for every fit so no state is carried over
                fit_options = {'callbacks': [early_stopping(100), log_evaluation(200)]}
            else:
                fit_options = {'early_stopping_rounds': 100, 'verbose': 200}

            model.fit(X_train, y_train, eval_set = [(X_valid, y_valid)],
                      eval_metric = 'auc', **fit_options)

            feat_imp += model.feature_importances_

        feat_imp = feat_imp / n_fits

        keep = feat_imp != 0.0
        n_zero = int((~keep).sum())

        print('\nThe number of features with 0.0 importance is:', n_zero)

        train = train[:, keep]
        test = test[:, keep]

        print(train.shape)
        print(test.shape)

        # Stop once a round removes nothing
        if n_zero == 0:
            break

    print('\n End of the features selection, we now have {} variables'.format(train.shape[1]))
    return train, test

In [27]:
%%time
train2, test2 = feat_sel(train1, test1)

Phase Label encoder

Dummyfication

Number of feature in the training data after label encoder and get dummy:  1441
Number of feature in the testing data after label encoder and get dummy:  1441

Phase correlation

Number of variable dropped because they were too correlated : 614

Number of feature in the training data after the drop of the variable too much correlated 827
Number of feature in the testing data after the drop of the variable too much correlated 827

Phase Nan

Number of columns with more than 75% of missing values in train : 108
Number of columns with more than 75% of missing values in test : 19

Number of feature in the training data after removing missing values: 719
Number of feature in the testing data after removing missing values : 719

Start of the feature selection with LGBM attributes : feature_importances_

Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[15]	valid_0's auc: 0.67964	valid_0's binary_

In [28]:
# Shapes of the two matrices returned by feat_sel, one per line
print(train2.shape, test2.shape, sep = '\n')

(1000, 283)
(1000, 283)


In [7]:
%%time
feat_sel(train, test)

Phase Label encoder

Dummyfication

Number of feature in the training data after label encoder and get dummy:  1461
Number of feature in the testing data after label encoder and get dummy:  1461

Phase correlation

Nombre de variable drop because they were too correlated : 594

Number of feature in the training data after the drop of the variable too correlated 867
Number of feature in the testing data after the drop of the variable too correlated 867

Phase Nan

Number of columns with more than 75% of missing values in train : 18
Number of columns with more than 75% of missing values in test : 18
Number of feature in the training data after removing missing values: 849
Number of feature in the testing data after removing missing values : 849
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[98]	valid_0's auc: 0.780222	valid_0's binary_logloss: 0.236598
Training until validation scores don't improve for 100 rounds
Did not mee

(0, (307511, 514), (48744, 514))