In [None]:
## FEATURE SELECTION ##
# In this notebook we employ a number of feature selection methods. Reducing the number of features decreases model runtime and improves model interpretability and generalization performance.

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import gc
gc.enable()
import lightgbm as lgb
import featexp
import utilities

In [2]:
# Load the final (joined) training set from CSV through the project's utilities.read_csv helper
train = utilities.read_csv('../../datatmp/data/trainjoined.csv')

In [3]:
# Re-apply the project's dtype conversion to the freshly loaded frame.
# The original author was unsure why this is needed again (perhaps the types
# were lost during alignment?) — TODO confirm the root cause.
train = utilities.convert_types(train)

In [4]:
# Sanity check: (rows, columns) of the loaded training frame
train.shape

(307511, 777)

In [6]:
# Replace every remaining NaN with the median of its column
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='median', missing_values=np.nan)
# fit_transform learns the per-column medians and applies them in one call;
# assigning through .loc[:] writes the imputed values back into the existing frame
train.loc[:] = imputer.fit_transform(train)

In [7]:
# Verify the imputation: count NaNs per column and keep only the columns
# that still contain at least one
nulls = train.isnull().sum().loc[lambda counts: counts > 0]
# Fraction of missing rows per offending column; a shape of (0,) means no NaNs remain
(nulls / len(train)).shape

(0,)

In [8]:
## Multicollinearity ##
# Multicollinearity could negatively affect permutation importance (used later) and regression models.
# In fact, when features are collinear, permuting one feature has little effect on the model's performance, because the model can get the same information from a correlated feature.
# Open question: are we losing all the manually created features here?

In [9]:
# Absolute Pearson correlation between every pair of features (TARGET is
# excluded from the computation; train itself is not modified here)
corrMatrix = train.drop('TARGET', axis=1).corr().abs()

# Keep only the strict upper triangle (k=1 skips the diagonal) so that every
# unordered feature pair appears once and self-correlations are excluded.
# The mask is built with the builtin bool dtype: the np.bool alias was
# deprecated in NumPy 1.20 and removed in 1.24.
upperMask = np.triu(np.ones(corrMatrix.shape, dtype=bool), k=1)

# Long format: one row per (feature, feature, |correlation|) entry
tableCorrelations = corrMatrix.where(upperMask).stack().reset_index()

In [10]:
# Collect every feature pair whose absolute correlation exceeds 0.92.
# Each entry is a plain tuple: (first feature, second feature, |correlation|).
highCorr = [
    pair
    for pair in tableCorrelations.itertuples(index=False, name=None)
    if pair[2] > 0.92
]

In [11]:
# Release the pairwise-correlation table now that the high-correlation pairs
# have been extracted, then trigger a garbage-collection pass
del tableCorrelations
gc.collect()

126

In [12]:
# For each highly correlated pair, take the feature in the first position
# (deduplicated through a set) and drop it from train, so the second member
# of the pair is the one that may survive
highCollFeat = {first for first, *_ in highCorr}
train.drop(columns=highCollFeat, inplace=True)

In [13]:
# How many columns are left after dropping the collinear features?
# (the count includes the TARGET column, which is still part of train)
len(train.columns)

572

In [19]:
# Load the joined test set from CSV through the project's utilities.read_csv helper
test = utilities.read_csv('../../datatmp/data/testjoined.csv')

In [20]:
# Align train and test on their shared columns (inner join along axis=1).
# The inner join keeps only columns present in both frames, so TARGET is
# saved beforehand and re-attached to train afterwards.
target = train['TARGET']
train, test = train.align(test, join = 'inner', axis = 1)

# Add the target back in
train['TARGET'] = target

In [21]:
# Sanity check: (rows, columns) of the test frame after alignment
test.shape

(48744, 571)

In [22]:
# For some reason the dtypes have to be converted again after the alignment.
# NOTE(review): the second positional argument (True) presumably enables the
# memory-usage report printed below — confirm against utilities.convert_types.
train = utilities.convert_types(train, True)
test = utilities.convert_types(test, True)

Original Memory Usage: 1.41 gb.
New Memory Usage: 0.7 gb.
Original Memory Usage: 0.11 gb.
New Memory Usage: 0.11 gb.


In [23]:
# Checkpoint: persist the collinearity-filtered train and test sets to CSV
# through the project's utilities.to_csv helper
utilities.to_csv(train, '../../datatmp/data/trainjoincoll.csv')
utilities.to_csv(test, '../../datatmp/data/testjoincoll.csv')

In [None]:
# Optional (disabled): reload the collinearity-filtered checkpoint instead of
# recomputing the cells above.
# NOTE(review): these paths point at '../../data/' whereas the checkpoint is
# written to '../../datatmp/data/' — confirm the location before re-enabling.
#train = utilities.read_csv('../../data/trainjoincoll.csv')
#test = utilities.read_csv('../../data/testjoincoll.csv')

In [24]:
# As with _traintest, test for noisy features. For efficiency only the newly
# joined features are handled (the original ones were already done in _traintest).
from featexp import get_trend_stats

# Seed NumPy's global RNG so the random split below, and any later
# np.random.rand-based splits, give the same result on Restart & Run All.
SEED = 42
np.random.seed(SEED)

# Select the joined new features: every column from 'DAYS_EMPLOYED_ANOM' to
# the last one (label slices are inclusive). TARGET must fall inside this
# slice because it is used as target_col below.
newFeatsTrain = train.loc[:, 'DAYS_EMPLOYED_ANOM':]

# Build a validation set: each row goes to the training part with
# probability 0.75, otherwise to the validation part
msk = np.random.rand(len(newFeatsTrain)) < 0.75
trainset = newFeatsTrain[msk].astype(np.float32)
validationset = newFeatsTrain[~msk].astype(np.float32)

# Compute the trend statistics of each feature w.r.t. the target, comparing
# the training part against the validation part
stats = get_trend_stats(data=trainset, target_col='TARGET', data_test=validationset)

Only one bin created for prev_NFLAG_LAST_APPL_IN_DAY_mean. Correlation can't be calculated
Only one bin created for prev_NFLAG_LAST_APPL_IN_DAY_max. Correlation can't be calculated
Only one bin created for prev_NFLAG_LAST_APPL_IN_DAY_min. Correlation can't be calculated
Only one bin created for prev_DAYS_FIRST_DRAWING_ANOM_max. Correlation can't be calculated
Only one bin created for prev_installments_NUM_INSTALMENT_NUMBER_min_min. Correlation can't be calculated
Only one bin created for prev_cash_SK_DPD_min_min. Correlation can't be calculated
Only one bin created for prev_cash_SK_DPD_min_sum. Correlation can't be calculated
Only one bin created for prev_cash_SK_DPD_DEF_min_min. Correlation can't be calculated
Only one bin created for prev_cash_SK_DPD_DEF_min_sum. Correlation can't be calculated
Only one bin created for prev_cash_NAME_CONTRACT_STATUS_Amortized debt_mean_min. Correlation can't be calculated
Only one bin created for prev_cash_NAME_CONTRACT_STATUS_Amortized debt_mean_sum

In [25]:
# Repeat the random 75/25 split a few more times and average the trend
# correlation of every feature over all splits (the one already stored in
# `stats` plus the extra ones computed here).
N_EXTRA_SPLITS = 4

# Work on a copy: the accumulation below is in-place, and without the copy it
# could write through to stats['Trend_correlation'] depending on the pandas version.
total_trend_correlations = stats['Trend_correlation'].copy()
for _ in range(N_EXTRA_SPLITS):
    msk = np.random.rand(len(newFeatsTrain)) < 0.75
    trainset = newFeatsTrain[msk].astype(np.float32)
    validationset = newFeatsTrain[~msk].astype(np.float32)

    ith_stats = get_trend_stats(data=trainset, target_col='TARGET', data_test=validationset)
    ith_tc = ith_stats['Trend_correlation']

    # Series addition aligns on the index; this assumes get_trend_stats lists
    # the features in the same row order on every call — TODO confirm
    total_trend_correlations += ith_tc

# Divide by the total number of splits so the divisor always matches the loop count
averaged_trend_correlations = total_trend_correlations / (N_EXTRA_SPLITS + 1)

Only one bin created for prev_NFLAG_LAST_APPL_IN_DAY_mean. Correlation can't be calculated
Only one bin created for prev_NFLAG_LAST_APPL_IN_DAY_max. Correlation can't be calculated
Only one bin created for prev_NFLAG_LAST_APPL_IN_DAY_min. Correlation can't be calculated
Only one bin created for prev_DAYS_FIRST_DRAWING_ANOM_max. Correlation can't be calculated
Only one bin created for prev_installments_NUM_INSTALMENT_NUMBER_min_min. Correlation can't be calculated
Only one bin created for prev_cash_SK_DPD_min_min. Correlation can't be calculated
Only one bin created for prev_cash_SK_DPD_min_sum. Correlation can't be calculated
Only one bin created for prev_cash_SK_DPD_DEF_min_min. Correlation can't be calculated
Only one bin created for prev_cash_SK_DPD_DEF_min_sum. Correlation can't be calculated
Only one bin created for prev_cash_NAME_CONTRACT_STATUS_Amortized debt_mean_min. Correlation can't be calculated
Only one bin created for prev_cash_NAME_CONTRACT_STATUS_Amortized debt_mean_sum

In [26]:
# Drop the reference to the new-features slice of train, which is no longer
# needed, then trigger a garbage-collection pass
del newFeatsTrain
gc.collect()

43

In [27]:
# Replace the single-split trend correlation with the average over all splits
stats['Trend_correlation'] = averaged_trend_correlations

# A feature is considered noisy when its averaged trend correlation is below
# 0.75 (tunable threshold).
# NOTE(review): the original note says a Trend_correlation of 0 means the
# correlation could not be calculated and that such features should be kept,
# but 0 < 0.75, so this filter selects them for removal — confirm the intent.
is_noisy = stats['Trend_correlation'] < 0.75
noisyFeats = stats.loc[is_noisy, 'Feature'].tolist()

# Remove the noisy features from train
train.drop(columns=noisyFeats, inplace=True)

In [28]:
# Number of features flagged as noisy and removed from train
len(noisyFeats)

93

In [29]:
# How many columns are left in train after removing the noisy features?
train.shape

(307511, 479)

In [30]:
# test has not been re-aligned yet, so it still carries the noisy features
test.shape

(48744, 571)

In [31]:
# Re-align train and test on their shared columns (inner join along axis=1)
# so the features dropped from train are removed from test as well.
# The inner join keeps only columns present in both frames, so TARGET is
# saved beforehand and re-attached to train afterwards.
target = train['TARGET']
train, test = train.align(test, join = 'inner', axis = 1)

# Add the target back in
train['TARGET'] = target

In [None]:
# Sanity check: (rows, columns) of test after the second alignment
test.shape

In [32]:
# The test set was not imputed earlier in this notebook, so fill its NaNs
# with per-column medians now.
# NOTE(review): the medians are computed from test itself, whereas train was
# imputed with its own medians — confirm this is intended.
imputer = SimpleImputer(strategy='median', missing_values=np.nan)
test.loc[:] = imputer.fit_transform(test)

In [34]:
# Store test train without noisy features
utilities.to_csv(train, '../../datatmp/data/trainjoincollnoisy.csv')
utilities.to_csv(test, '../../datatmp/data/testjoincollnoisy.csv')

In [None]:
# Continue to the models