In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import gc

import lightgbm as lgb

In [2]:
# pca to reduce feature space? collinearity? correlations? noisy features removal? Tikhonov regularization 
# train model with boost library

In [3]:
### Remove collinear features

In [4]:
# Regularization is the process of introducing additional information in order to solve ill-posed problems or prevent overfitting.

In [5]:
#every time we are saving a csv, dtypes are lost by default. Define the following read and write function to preserve converted types in the first row to avoid another conversion after every loading.

import os
import json

def to_csv(df, path):
    
    dtypes = df.dtypes.apply(lambda x: x.name).to_dict()
    jtypes = json.dumps(dtypes)

    fileName = os.path.splitext(path)

    # save df as usual along with a json representation of the dictionary
    df.to_csv(path, index=False)

    f = open(fileName[0]+'Types',"w")
    f.write(jtypes)
    f.close()

    # free memory
    gc.enable()
    del df
    gc.collect()

def read_csv(path):
    
    fileName = os.path.splitext(path)
    
    jtypes = json.load(open(fileName[0]+'Types'))
    
    return pd.read_csv(path, dtype=jtypes)

In [6]:
#load final train
train = read_csv('../../data/trainjoined.csv')

In [7]:
#compute the upper triangle of the Pearson correlation coefficient matrix. The pearson coefficient is computed between every pair of features
corrMatrix = train.drop('TARGET', axis=1).corr().abs()
tableCorrelations = corrMatrix.where(np.triu(np.ones(corrMatrix.shape), k=1).astype(np.bool)).stack().reset_index()

In [8]:
highCorr = [corr[1:] for corr in tableCorrelations.itertuples() if (corr[3] > 0.90)]

In [9]:
gc.enable()
del tableCorrelations
gc.collect()

46

In [10]:
highCollFeat = set([corr[0] for corr in highCorr])

In [11]:
len(highCollFeat)

217

In [12]:
highCollFeat

{'AMT_CREDIT',
 'APARTMENTS_AVG',
 'APARTMENTS_MODE',
 'BASEMENTAREA_AVG',
 'BASEMENTAREA_MODE',
 'COMMONAREA_AVG',
 'COMMONAREA_MODE',
 'ELEVATORS_AVG',
 'ELEVATORS_MODE',
 'ENTRANCES_AVG',
 'ENTRANCES_MODE',
 'FLAG_EMP_PHONE',
 'FLOORSMAX_AVG',
 'FLOORSMAX_MODE',
 'FLOORSMIN_AVG',
 'FLOORSMIN_MODE',
 'HOUSETYPE_MODE_block of flats',
 'LIVINGAPARTMENTS_AVG',
 'LIVINGAPARTMENTS_MODE',
 'LIVINGAREA_AVG',
 'LIVINGAREA_MEDI',
 'LIVINGAREA_MODE',
 'NAME_INCOME_TYPE_Pensioner',
 'NONLIVINGAPARTMENTS_AVG',
 'NONLIVINGAPARTMENTS_MODE',
 'NONLIVINGAREA_AVG',
 'ORGANIZATION_TYPE_XNA',
 'REGION_RATING_CLIENT',
 'YEARS_BEGINEXPLUATATION_AVG',
 'YEARS_BEGINEXPLUATATION_MODE',
 'YEARS_BUILD_AVG',
 'YEARS_BUILD_MODE',
 'bureau_AMT_CREDIT_MAX_OVERDUE_max',
 'bureau_AMT_CREDIT_MAX_OVERDUE_mean',
 'bureau_AMT_CREDIT_SUM_DEBT_max',
 'bureau_AMT_CREDIT_SUM_LIMIT_max',
 'bureau_AMT_CREDIT_SUM_OVERDUE_max',
 'bureau_CNT_CREDIT_PROLONG_max',
 'bureau_CREDIT_ACTIVE_Active_mean',
 'bureau_CREDIT_CURRENCY_curr

In [13]:
train.drop(highCollFeat, axis=1, inplace=True)

In [14]:
len(train.columns)

557

In [15]:
test = read_csv('../../data/testjoined.csv')

In [16]:
# align test and set by features as always

target = train['TARGET']

#Align the training and testing data, keep only columns present in both dataframes
train, test = train.align(test, join = 'inner', axis = 1)

#Add the target back in
train['TARGET'] = target

In [17]:
test.shape

(48744, 556)

In [18]:
train.shape

(307511, 557)

In [19]:
to_csv(train, '../../data/trainjoincoll.csv')

In [20]:
# store collinear-free features test
to_csv(test, '../../data/testjoincoll.csv')

In [21]:
#read final test and train set. Remember that this dataframes are the result of mergind the whole data and removing collinear features
train = read_csv('../../data/trainjoincoll.csv')
test = read_csv('../../data/testjoincoll.csv')

In [None]:
#In this notebook we employed a number of feature selection methods. These methods are necessary to reduce the number of features to increase model interpretability, decrease model runtime, and increase generalization performance on the test set

In [85]:
#Compute Pearson correlation coefficients: https://en.wikipedia.org/wiki/Pearson_correlation_coefficient. between every variable and the target

def computePearson(feature):
    corr = np.absolute(np.corrcoef(train[feature], train['TARGET'])[0,1])
    if np.isnan(corr):
        corr = 0
    return (feature, corr)

corrTargetList = [ computePearson(feat) for feat in train if (feat != 'TARGET') and (feat != 'SK_ID_CURR') ]

corrSupport = [True if corrTargetList[i][1] != 0 else False for i in range(0, len(corrTargetList)-1)]
#correlations = train.corr()['TARGET'].sort_values()
#
#print('Most Positive Correlations:\n', correlations.tail(10))
#print('Most Negative Correlations:', correlations.head(10))

#Explanation: since the DAYS_BIRTH feature increases negatively, and the correlation is positive, it means that as the client gets older he will be less likely to default. Moreover, among the top ones we have features as EXT_SOURCE_1/3, EDUCATION TYPE .. . However, none of the features seem to be strongly correlated w/ the target w/ respect to Evans (1996) general interpretatoins (http://www.statstutor.ac.uk/resources/uploaded/pearsons.pdf). Indeed having an abs value Pearson coefficient between .00-.19 is considered as "very week" correlation.

In [86]:
corrSupport

[True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,

In [81]:
corrTargetList[300]

('prev_creditcard_MONTHS_BALANCE_max_sum', 0)

In [37]:
len(corrTargetList)

555

In [26]:
#Visualize w/ a density plot how the younger clients tend to default more likely
#plt.figure(figsize = (10, 8))
#
## KDE plot of loans that were repaid on time
#sns.kdeplot(train.loc[train['TARGET'] == 0, 'DAYS_BIRTH'] / -365, label = 'target == 0')
#
## KDE plot of loans which were not repaid on time
#sns.kdeplot(train.loc[train['TARGET'] == 1, 'DAYS_BIRTH'] / -365, label = 'target == 1')
## Labeling of plot
#plt.xlabel('Age (years)'); plt.ylabel('Density'); plt.title('Distribution of Ages');

In [None]:
# Create the null importances distributions : these are created fitting the model over several runs on a shuffled version of the target. This shows how the model can make sense of a feature irrespective of the target.

In [30]:
### NULL IMPORTANCE ###
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
import time
from lightgbm import LGBMClassifier
import lightgbm as lgb

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
%matplotlib inline

In [None]:
#We show that this method can be used to correct for the bias of feature importance computed with RF and MI. 

In [None]:
# remove all features w/ 0 overall importance using: Person Coefficient, Chi coefficient, Logistic regression and RF w/ lightGBM.
