In [2]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

import gc

import lightgbm as lgb

In [None]:
# pca to reduce feature space? collinearity? correlations? noisy features removal? Tikhonov regularization 
# train model with boost library

In [None]:
### Remove collinear features

In [None]:
# Regularization is the process of introducing additional information in order to solve ill-posed problems or prevent overfitting.

In [3]:
#every time we are saving a csv, dtypes are lost by default. Define the following read and write function to preserve converted types in the first row to avoid another conversion after every new load.

import os
import json

def to_csv(df, path):
    
    dtypes = df.dtypes.apply(lambda x: x.name).to_dict()
    jtypes = json.dumps(dtypes)

    fileName = os.path.splitext(path)

    # save df as usual along with a json representation of the dictionary
    df.to_csv(path, index=False)

    f = open(fileName[0]+'Types',"w")
    f.write(jtypes)
    f.close()

    # free memory
    gc.enable()
    del df
    gc.collect()

def read_csv(path):
    
    fileName = os.path.splitext(path)
    
    jtypes = json.load(open(fileName[0]+'Types'))
    
    return pd.read_csv(path, dtype=jtypes)

def read_csvTmp(path):
    
    fileName = os.path.splitext(path)
    
    jtypes = json.load(open(fileName[0]+'Types'))
    
    return pd.read_csv(path, dtype=jtypes, nrows=10000)

In [None]:
# Load final train
train = read_csv('../../data/trainjoined.csv')

In [None]:
# to be removed, issue fixed in _traintest
le = sklearn.preprocessing.LabelEncoder()
for c in train:
    if train[c].dtype == 'bool':
        le.fit(train[c])
        train[c]=le.transform(train[c])


In [None]:
# Impute the median over the remaining nan values
from sklearn.impute import SimpleImputer 
imputer = SimpleImputer(missing_values=np.nan, strategy='median')

imputer.fit(train)
train.loc[:] = imputer.transform(train)

In [None]:
# Check for null values
nulls= train.isnull().sum()
nulls= nulls[nullssing_values=np.n > 0]

nulls.shape

In [None]:
#compute the upper triangle of the Pearson correlation coefficient matrix. The pearson coefficient is computed between every pair of features
corrMatrix = train.drop('TARGET', axis=1).corr().abs()
tableCorrelations = corrMatrix.where(np.triu(np.ones(corrMatrix.shape), k=1).astype(np.bool)).stack().reset_index()

In [None]:
highCorr = [corr[1:] for corr in tableCorrelations.itertuples() if (corr[3] > 0.90)]

In [None]:
gc.enable()
del tableCorrelations
gc.collect()

In [None]:
highCollFeat = set([corr[0] for corr in highCorr])

In [None]:
len(highCollFeat)

In [None]:
highCollFeat

In [None]:
train.drop(highCollFeat, axis=1, inplace=True)

In [None]:
len(train.columns)

In [None]:
test = read_csv('../../data/testjoined.csv')

In [None]:
# to be removed, issue fixed in _traintest
le = sklearn.preprocessing.LabelEncoder()
for c in test:
    if test[c].dtype == 'bool':
        le.fit(test[c])
        test[c]=le.transform(test[c])


In [None]:
# align test and set by features as always

target = train['TARGET']

#Align the training and testing data, keep only columns present in both dataframes
train, test = train.align(test, join = 'inner', axis = 1)

#Add the target back in
train['TARGET'] = target

In [None]:
test.shape

In [None]:
train.shape

In [None]:
to_csv(train, '../../data/trainjoincoll.csv')

In [None]:
# store collinear-free features test
to_csv(test, '../../data/testjoincoll.csv')

In [None]:
#read final test and train set. Remember that this dataframes are the result of merging the whole data and removing collinear features
train = read_csv('../../data/trainjoincoll.csv')

In [None]:
train = read_csvTmp('../../data/trainjoincoll.csv')

In [None]:
len(train.columns)

In [None]:
test = read_csv('../../data/testjoincoll.csv')

In [None]:
## Feature Importance ##

In [None]:
#In this notebook we employed a number of feature selection methods. These methods are necessary to reduce the number of features to increase model interpretability, decrease model runtime, and increase generalization performance on the test set

In [None]:
#Compute Pearson correlation coefficients: https://en.wikipedia.org/wiki/Pearson_correlation_coefficient. between every variable and the target

def computePearson(feature):
    corr = np.absolute(np.corrcoef(train[feature], train['TARGET'])[0,1])
    if np.isnan(corr):
        corr = 0
    return (feature, corr)

pcorrTarget = [ computePearson(feat) for feat in train if (feat != 'TARGET') and (feat != 'SK_ID_CURR') ]
# Sort from descending order
sortedPcorr = sorted(pcorrTarget, key=lambda feat:feat[1], reverse=True)

In [None]:
# None of the features seem to be strongly correlated w/ the target w/ respect to Evans (1996) general interpretatoins (http://www.statstutor.ac.uk/resources/uploaded/pearsons.pdf). Indeed having an abs value Pearson coefficient between .00-.19 is considered as "very week" correlation.
sortedPcorr[0]

In [None]:
# Compute the support of P coeff i.e. mark the features which are sligthly correlated to the target
pcorrSupport = [True if feat[1] > 0.03 else False for feat in pcorrTarget]

In [None]:
len(pcorrSupport)

In [None]:
pcorrSupport

In [None]:
# As for _traintest, noisy features test. For efficiency test only on new features and Fold=8
from featexp import get_trend_stats

# select the joined new features
newFeatsTrain = train.loc[:,'DAYS_EMPLOYED_ANOM':] 

# Build a validation set
msk = np.random.rand(len(newFeatsTrain)) < 0.7
trainset = newFeatsTrain[msk].astype(np.float32)
validationset = newFeatsTrain[~msk].astype(np.float32)

# Compute noisy's statistics for each feature wrt the target
stats = get_trend_stats(data=trainset, target_col='TARGET', data_test=validationset)

In [None]:
stats

In [None]:
total_trend_correlations=stats['Trend_correlation']
for i in range(0,7):
    msk = np.random.rand(len(newFeatsTrain)) < 0.7
    trainset = newFeatsTrain[msk].astype(np.float32)
    validationset = newFeatsTrain[~msk].astype(np.float32)
    
    ith_stats = get_trend_stats(data=trainset, target_col='TARGET', data_test=validationset)
    ith_tc = ith_stats['Trend_correlation']
    
    total_trend_correlations += ith_tc

averaged_trend_correlations = total_trend_correlations / 8

In [None]:
averaged_trend_correlations

In [None]:
stats['Trend_correlation'] = averaged_trend_correlations
# Select returned noisy feats.
noisyFeats = stats[stats['Trend_correlation'] < 0.80]['Feature'].to_list()

#drop noisy features
train.drop(noisyFeats, axis=1, inplace=True)

In [None]:
len(noisyFeats)

In [None]:
train.shape

In [None]:
# Store train without noisy features
to_csv(train, '../../data/trainjoincollnoisy.csv')

In [4]:
train = read_csv('../../data/trainjoincollnoisy.csv')

In [None]:
# Noisy support for the new features
noisySupport = [False if feat in noisyFeats else True for feat in newFeatsTrain.columns if feat != 'TARGET']

In [None]:
gc.enable()
del stats
gc.collect()

In [6]:
# Feature Importance: Tree based methods
# Tree-based models (and consequently ensembles of trees) can determine an "importance" for each feature by measuring the reduction in impurity for including the feature in the model. We will use a Gradient Boosted Model from the LightGBM library to assess feature importances.


# Remove the target for training and cast to int
target = train['TARGET'].astype(int)
train.drop('TARGET', axis = 1, inplace=True)

# An array to store feats importance

In [7]:
featureImportancePimp = pd.read_csv('../../data/featimp.csv')

In [8]:
randomId = featureImportancePimp.index.get_loc(featureImportancePimp[featureImportancePimp['Feature'] == 'random'].index[0])
featureImportancePimp = featureImportancePimp.iloc[:181,0]

In [None]:
trainImp = train[importantFeatures]

In [13]:
# An array to store feats importance
featureImportance = np.zeros(train.shape[1])

In [None]:
target = trainImp['TARGET'].astype(int)
trainImp.drop('TARGET', axis = 1, inplace=True)


In [None]:
target

In [9]:
# Create the model with hyperparameters similar to https://www.kaggle.com/willkoehrsen/introduction-to-feature-selection
model = lgb.LGBMClassifier(objective='binary', boosting_type = 'goss', n_estimators = 10000, class_weight = 'balanced')

In [None]:
from sklearn.model_selection import train_test_split

# Due to the last version of lgb, we got an error related to special chars in column names. https://www.kaggle.com/c/data-science-bowl-2019/discussion/120344 commands fixed it
trainImp.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in trainImp.columns]

# Fit twice to avoid overfitting
for i in range(2):
    
    # Split into training and validation set
    trainfeats, validFeats, trainY, validY = train_test_split(trainImp, target, test_size = 0.25, random_state = i)
    
    # Train using early stopping
    model.fit(trainfeats, trainY, early_stopping_rounds=80, eval_set = [(validFeats, validY)], eval_metric = 'auc', verbose = 200)
    
    # Record the feature importances
    featureImportance += model.feature_importances_

In [14]:
from sklearn.model_selection import train_test_split

# Due to the last version of lgb, we got an error related to special chars in column names. https://www.kaggle.com/c/data-science-bowl-2019/discussion/120344 commands fixed it
train.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in train.columns]

# Fit twice to avoid overfitting and Gini bias
for i in range(2):
    
    # Split into training and validation set
    trainX, validX, trainY, validY = train_test_split(train, target, test_size = 0.25, random_state = i)
    
    # Train using early stopping
    model.fit(trainX, trainY, early_stopping_rounds=80, eval_set = [(validX, validY)], eval_metric = 'auc', verbose = 200)
    
    # Record the feature importances
    featureImportance += model.feature_importances_

Training until validation scores don't improve for 80 rounds
[200]	valid_0's auc: 0.780906	valid_0's binary_logloss: 0.513547
Early stopping, best iteration is:
[240]	valid_0's auc: 0.781624	valid_0's binary_logloss: 0.503968
Training until validation scores don't improve for 80 rounds
Early stopping, best iteration is:
[118]	valid_0's auc: 0.783012	valid_0's binary_logloss: 0.533104


In [15]:
featureImportance = featureImportance / 2
featureImportance = pd.DataFrame({'feature': list(train.columns), 'importance': featureImportance}).sort_values('importance', ascending = False)

In [None]:
featureImportance.head(20)

In [16]:
# Find the features with zero importance and remove em
zeroFeats = featureImportance[featureImportance['importance'] == 0.0]['feature'].to_list()
print('There are %d features with 0.0 importance' % len(zeroFeats))
featureImportance.tail()

There are 68 features with 0.0 importance


Unnamed: 0,feature,importance
202,prev_installments_NUM_INSTALMENT_VERSION_min_min,0.0
93,OCCUPATION_TYPE_IT_staff,0.0
91,OCCUPATION_TYPE_HR_staff,0.0
393,prev_NAME_SELLER_INDUSTRY_Tourism_mean,0.0
469,bureau_CREDIT_TYPE_Real_estate_loan_mean,0.0


In [18]:
featureImportancePimp = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in featureImportancePimp]

In [22]:
featureImportancePimp = set(featureImportancePimp).intersection(set(train.columns))

In [28]:
len(featureImportancePimp)

174

In [24]:
importantFeaturesLgb = set(featureImportance['feature']).difference(set(zeroFeats))

In [25]:
len(importantFeaturesLgb)

402

In [29]:
importantFeatures = importantFeaturesLgb.union(featureImportancePimp)

In [30]:
len(importantFeatures)

413

In [31]:
# compute again the feature importance with the new dataset and check if the model gives 0 to any of the founded features
trainFinal = train[list(importantFeatures)]

In [32]:
featureImportanceFinal = np.zeros(trainFinal.shape[1])

In [33]:
# Fit twice to avoid overfitting and Gini bias
for i in range(2):
    
    # Split into training and validation set
    trainX, validX, trainY, validY = train_test_split(trainFinal, target, test_size = 0.25, random_state = i)
    
    # Train using early stopping
    model.fit(trainX, trainY, early_stopping_rounds=80, eval_set = [(validX, validY)], eval_metric = 'auc', verbose = 200)
    
    # Record the feature importances
    featureImportanceFinal += model.feature_importances_

Training until validation scores don't improve for 80 rounds
[200]	valid_0's auc: 0.780209	valid_0's binary_logloss: 0.513187
Early stopping, best iteration is:
[157]	valid_0's auc: 0.780951	valid_0's binary_logloss: 0.52386
Training until validation scores don't improve for 80 rounds
Early stopping, best iteration is:
[118]	valid_0's auc: 0.783012	valid_0's binary_logloss: 0.533104


In [35]:
featureImportanceFinal = featureImportanceFinal / 2
featureImportanceFinal = pd.DataFrame({'feature': list(trainFinal.columns), 'importance': featureImportanceFinal}).sort_values('importance', ascending = False)

In [36]:
# Find the features with zero importance and remove em
zeroFeats = featureImportanceFinal[featureImportanceFinal['importance'] == 0.0]['feature'].to_list()
print('There are %d features with 0.0 importance' % len(zeroFeats))
featureImportanceFinal.tail()

There are 27 features with 0.0 importance


Unnamed: 0,feature,importance
34,ORGANIZATION_TYPE_Industry__type_1,0.0
132,ORGANIZATION_TYPE_Restaurant,0.0
241,prev_creditcard_AMT_DRAWINGS_ATM_CURRENT_max_sum,0.0
310,ORGANIZATION_TYPE_Other,0.0
128,WALLSMATERIAL_MODE_Monolithic,0.0


In [38]:
importantFeatures = set(featureImportanceFinal['feature']).difference(set(zeroFeats))

In [39]:
trainFfinal = trainFinal[list(importantFeatures)]

In [40]:
trainFfinal.shape

(307511, 386)

In [None]:
# Null importance

# Article: https://academic.oup.com/bioinformatics/article/26/10/1340/193348
# As suggested in: https://www.kaggle.com/ogrellier/feature-selection-with-null-importances/output

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
import time
from lightgbm import LGBMClassifier
import lightgbm as lgb
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
%matplotlib inline

In [None]:
# Remove the target for training and cast to int
target = train['TARGET'].astype(int)
train.drop(['TARGET', 'SK_ID_CURR'], axis = 1, inplace=True)

In [None]:
target

In [None]:
# Features Importance and AUC scores retrival using Lgb in RF 

def featureImpAuc(featuresDf, target, shuffle=False):

    # Due to the last version of lgb, we got an error related to special chars in column names. https://www.kaggle.com/c/              data-science-bowl-2019/discussion/120344 commands fixed it
    featuresDf.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in featuresDf.columns]
    
    # Compute importance for a randomly permuted target
    if shuffle:
        target = np.random.permutation(target)
    
    # Fit the model using adapted parameters from Kaggle's notebook
    # We will do hyper-parameters tuning => do not care too much
    lgbParms = {
       
        'objective': 'binary',
        'boosting_type': 'rf',
        'subsample': 0.623, # bagging fraction, for each tree use a fraction of data. used to speed up training and avoid overfitting
        'bagging_freq': 1, # apply begging at every iteration
        'colsample_bytree': 0.8, #feature fraction, same as subsample but for features
        'num_leaves': 127, 
        'max_depth': 8,
        'n_jobs': 4
    }

    lgbTrain = lgb.Dataset(featuresDf, target)
    model = lgb.train(params=lgbParms, train_set=lgbTrain, num_boost_round=200)
    
    # Get feature importances
    featureImportance = pd.DataFrame({'feature': list(featuresDf.columns),
                                      'importance_gain': model.feature_importance(importance_type='gain'),
                                      'importance_split': model.feature_importance(importance_type='split'),
                                      'auc_score': roc_auc_score(target, model.predict(featuresDf)) 
                                      }).sort_values('importance_gain', ascending = False)

    return featureImportance

In [None]:
# Get actual importance
featureActualImportance = featureImpAuc(train, target)

In [None]:
featureActualImportance.head()

In [None]:
#PCA
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

data = train.drop(columns=['SK_ID_CURR', 'TARGET'])
mmscaler = MinMaxScaler()
X = mmscaler.fit_transform(data) 

In [None]:
X.shape

In [None]:
X_pca = PCA().fit(X)

#Plotting the Cumulative Summation of the Explained Variance
plt.figure()
plt.plot(np.cumsum(X_pca.explained_variance_ratio_))
plt.xlabel('Number of Components')
plt.ylabel('Variance (%)') #for each component
plt.title('Pulsar Dataset Explained Variance')
plt.show()

In [None]:
print('200 principal components account for {:.4f}% of the variance.'.format(100 * np.sum(X_pca.explained_variance_ratio_[:200])))

In [None]:
pca = PCA(n_components=200)
trainPca=pca.fit_transform(X)

In [None]:
train_Pca = pd.DataFrame(trainPca)

In [None]:
train_Pca['SK_ID_CURR'] = train['SK_ID_CURR']
train_Pca['TARGET'] = train['TARGET']

In [None]:
train_Pca['TARGET'] = train_Pca['TARGET'].astype(int)

In [None]:
target = train_Pca['TARGET']


In [None]:
train_Pca = train_Pca.drop('TARGET', axis=1)

In [None]:
train_Pca = train_Pca.drop('SK_ID_CURR', axis=1)

In [None]:
gc.enable()
del train
gc.collect()

In [None]:
feature_importances = np.zeros(train_Pca.shape[1])

# Create the model with several hyperparameters
model = lgb.LGBMClassifier(objective='binary', boosting_type = 'goss', n_estimators = 10000, class_weight = 'balanced')

In [None]:



for i in range(2):
    
    # Split into training and validation set
    train_features, valid_features, train_y, valid_y = train_test_split(train_Pca, target, test_size = 0.25, random_state = i)
    
    # Train using early stopping
    model.fit(train_features, train_y, early_stopping_rounds=100, eval_set = [(valid_features, valid_y)], 
              eval_metric = 'auc', verbose = 200)
    
    # Record the feature importances
    feature_importances += model.feature_importances_

In [None]:
# Make sure to average feature importances! 
feature_importances = feature_importances / 2
feature_importances = pd.DataFrame({'feature': list(train_Pca.columns), 'importance': feature_importances}).sort_values('importance', ascending = False)

feature_importances.head()

In [None]:
zero_features = list(feature_importances[feature_importances['importance'] == 0.0]['feature'])
print('There are %d features with 0.0 importance' % len(zero_features))
feature_importances.tail()

In [None]:
corrSupport

In [None]:
corrTargetList[300]

In [None]:
len(corrTargetList)

In [None]:
#Visualize w/ a density plot how the younger clients tend to default more likely
#plt.figure(figsize = (10, 8))
#
## KDE plot of loans that were repaid on time
#sns.kdeplot(train.loc[train['TARGET'] == 0, 'DAYS_BIRTH'] / -365, label = 'target == 0')
#
## KDE plot of loans which were not repaid on time
#sns.kdeplot(train.loc[train['TARGET'] == 1, 'DAYS_BIRTH'] / -365, label = 'target == 1')
## Labeling of plot
#plt.xlabel('Age (years)'); plt.ylabel('Density'); plt.title('Distribution of Ages');

In [None]:
# Create the null importances distributions : these are created fitting the model over several runs on a shuffled version of the target. This shows how the model can make sense of a feature irrespective of the target.

In [None]:
### NULL IMPORTANCE ###
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
import time
from lightgbm import LGBMClassifier
import lightgbm as lgb

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
%matplotlib inline

In [None]:
#We show that this method can be used to correct for the bias of feature importance computed with RF and MI. 

In [None]:
# remove all features w/ 0 overall importance using: Person Coefficient, Chi coefficient, Logistic regression and RF w/ lightGBM.
