In [1]:
import numpy as np, pandas as pd

%matplotlib inline

import matplotlib.pyplot as plt
plt.style.use('bmh')
import seaborn as sns

import lightgbm as lgb

import gc

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder



# 9 Preparing Datasets for Modeling

### 9.1 Merging New Features

In [2]:
train_bureau = pd.read_csv('train_1.csv')
test_bureau = pd.read_csv('test_1.csv')

train_previous = pd.read_csv('train_2.csv')
test_previous = pd.read_csv('test_2.csv')

In [3]:
print(train_bureau.shape)
print(train_previous.shape)

(307511, 333)
(307511, 1125)


In [4]:
bureau_cols = list(train_bureau.columns)
previous_cols = list(train_previous.columns)

bureau_feats = list(set(bureau_cols) - set(previous_cols))
previous_feats = list(set(previous_cols) - set(bureau_cols))
original_feats = list(set(previous_cols) & set(bureau_cols))

print('There are %d original features.' % len(original_feats))
print('There are %d bureau and bureau balance features.' % len(bureau_feats))
print('There are %d previous Home Credit loan features.' % len(previous_feats))

There are 122 original features.
There are 211 bureau and bureau balance features.
There are 1003 previous Home Credit loan features.


In [5]:
train_labels = train_bureau['TARGET']
previous_feats.append('SK_ID_CURR')

train_ids = train_bureau['SK_ID_CURR']
test_ids = test_bureau['SK_ID_CURR']

train = train_bureau.merge(train_previous[previous_feats], on = 'SK_ID_CURR')
test = test_bureau.merge(test_previous[previous_feats], on = 'SK_ID_CURR')

print(train.shape)
print(test.shape)

(307511, 1336)
(48744, 1335)


In [6]:
train = pd.get_dummies(train)
test = pd.get_dummies(test)

train, test = train.align(test, join = 'inner', axis = 1)
print(train.shape)
print(test.shape)


(307511, 1456)
(48744, 1456)


In [7]:
cols_with_id = [x for x in train.columns if 'SK_ID_CURR' in x]
train = train.drop(columns = cols_with_id)
test = test.drop(columns = cols_with_id)

### 9.2 Removing Collinear Variables

In [8]:
corr_matrix = train.corr().abs()
corr_matrix

Unnamed: 0,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,...,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes
CNT_CHILDREN,1.000000,0.012882,0.002145,0.021374,0.001827,0.025573,0.330938,0.239818,0.183395,0.028019,...,0.001830,0.005272,0.000709,0.001607,0.002032,0.020892,0.025088,0.011036,0.038644,0.004525
AMT_INCOME_TOTAL,0.012882,1.000000,0.156870,0.191657,0.159610,0.074796,0.027261,0.064223,0.027805,0.008506,...,0.000718,0.011696,0.006149,0.023886,0.003886,0.032753,0.016523,0.003369,0.050174,0.002894
AMT_CREDIT,0.002145,0.156870,1.000000,0.770138,0.986968,0.099738,0.055436,0.066838,0.009621,0.006575,...,0.000869,0.014314,0.007987,0.027255,0.005799,0.046644,0.009756,0.007373,0.058256,0.004308
AMT_ANNUITY,0.021374,0.191657,0.770138,1.000000,0.775109,0.118429,0.009445,0.104332,0.038514,0.011268,...,0.002166,0.015589,0.010077,0.035318,0.005968,0.054988,0.009189,0.007711,0.068145,0.003910
AMT_GOODS_PRICE,0.001827,0.159610,0.986968,0.775109,1.000000,0.103520,0.053442,0.064842,0.011565,0.009267,...,0.000796,0.013696,0.009955,0.028315,0.005877,0.049759,0.012888,0.009135,0.062878,0.004848
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WALLSMATERIAL_MODE_Panel,0.020892,0.032753,0.046644,0.054988,0.049759,0.092652,0.013597,0.004193,0.066690,0.015459,...,0.020005,0.092112,0.045358,0.039892,0.038117,1.000000,0.270257,0.069666,0.502903,0.039280
"WALLSMATERIAL_MODE_Stone, brick",0.025088,0.016523,0.009756,0.009189,0.012888,0.010782,0.010964,0.006327,0.023389,0.005677,...,0.027683,0.091023,0.044822,0.039421,0.037666,0.270257,1.000000,0.068843,0.491544,0.007605
WALLSMATERIAL_MODE_Wooden,0.011036,0.003369,0.007373,0.007711,0.009135,0.056263,0.008758,0.002846,0.000159,0.012702,...,0.121678,0.023464,0.011554,0.010162,0.009710,0.069666,0.068843,1.000000,0.078964,0.273271
EMERGENCYSTATE_MODE_No,0.038644,0.050174,0.058256,0.068145,0.062878,0.103513,0.019852,0.010907,0.069039,0.015230,...,0.053146,0.165333,0.076786,0.073260,0.060550,0.502903,0.491544,0.078964,1.000000,0.090624


In [9]:
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))


In [10]:
threshold = 0.9
to_drop = [column for column in upper.columns if any(upper[column]>threshold)]

In [11]:
len(to_drop)

601

In [12]:
train = train.drop(columns = to_drop)
test = test.drop(columns = to_drop)
print(train.shape)
print(test.shape)

(307511, 854)
(48744, 854)


### 9.3 Columns with Missing Values
We remove columns with missing values above a certain threshold. We don't remove all since LGBM can very well deal with them.

In [13]:
train_missing = (train.isnull().sum() / len(train)).sort_values(ascending=False)
train_missing.head()

client_credit_AMT_PAYMENT_CURRENT_min_mean          0.801438
client_credit_AMT_PAYMENT_CURRENT_mean_max          0.801438
client_credit_CNT_DRAWINGS_ATM_CURRENT_mean_mean    0.801178
client_credit_AMT_DRAWINGS_ATM_CURRENT_max_min      0.801178
client_credit_AMT_DRAWINGS_ATM_CURRENT_mean_min     0.801178
dtype: float64

In [14]:
test_missing = (test.isnull().sum() / len(test)).sort_values(ascending = False)
test_missing.head()
train_missing = train_missing.index[train_missing > 0.75]
test_missing = test_missing.index[test_missing > 0.75]
all_missing = list(set(set(train_missing) | set(test_missing)))


In [15]:
print(train.shape)
print(test.shape)

(307511, 854)
(48744, 854)


In [16]:
train = pd.get_dummies(train.drop(columns = all_missing))
test = pd.get_dummies(test.drop(columns = all_missing))

train, test = train.align(test, join = 'inner', axis = 1)

### 9.4 Removing Unimportant Features

In [17]:
import re
train = train.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
test = test.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

In [18]:
feature_importances = np.zeros(train.shape[1])

model = lgb.LGBMClassifier(objective = 'binary', boosting_type = 'goss', n_estimators = 10000, class_weight = 'balanced')

In [19]:
for i in range(2):
    train_features, valid_features, train_y, valid_y = train_test_split(train, train_labels, test_size = 0.25, random_state = i)
    model.fit(train_features, train_y, early_stopping_rounds = 100, eval_set = [(valid_features, valid_y)], eval_metric = 'auc', verbose = 200)
    
    feature_importances += model.feature_importances_



[200]	valid_0's auc: 0.782332	valid_0's binary_logloss: 0.508747




[200]	valid_0's auc: 0.783974	valid_0's binary_logloss: 0.508746


In [20]:
feature_importances = feature_importances / 2
feature_importances = pd.DataFrame({'feature': list(train.columns), 'importance': feature_importances}).sort_values('importance', ascending = False)

feature_importances.head(15)

Unnamed: 0,feature,importance
24,EXT_SOURCE_1,160.0
26,EXT_SOURCE_3,121.5
25,EXT_SOURCE_2,120.0
654,client_installments_AMT_PAYMENT_min_sum,117.0
5,DAYS_BIRTH,100.5
2,AMT_CREDIT,82.0
3,AMT_ANNUITY,69.5
6,DAYS_EMPLOYED,61.0
78,bureau_DAYS_CREDIT_ENDDATE_max,50.5
71,bureau_DAYS_CREDIT_max,47.5


In [21]:
zero_features = list(feature_importances[feature_importances['importance'] <= 0.1]['feature'])
print(len(zero_features))

295


In [22]:
train = train.drop(columns = zero_features)
test = test.drop(columns = zero_features)
print(train.shape)
print(test.shape)

(307511, 541)
(48744, 541)


In [23]:
train.to_csv('train_3.csv', index = False)
test.to_csv('test_3.csv', index = False)

In [25]:
train_labels.to_csv('train_labels.csv', index = False)
train_ids.to_csv('train_ids.csv', index = False)
test_ids.to_csv('test_ids.csv', index = False)