In [1]:
## Feature Importance ##

In [2]:
# After noise and correlation detection shrank the number of features, we ended up with 470 features, which is still a considerable amount.
# We therefore apply feature selection by means of permutation feature importance, in the hope of further reducing the space of predictors and, perhaps, gaining some insight into the interpretation of the model.
# Permutation importance tries to overcome the bias of the more common Gini (impurity-based) importance by permuting each feature several times and measuring how much the score on a validation set is affected.
# In particular, impurity-based feature importances for trees are strongly biased and favor high-cardinality features (typically numerical features).
# We computed the permutation importance (pimp) using LightGBM (LGB) as the classifier, since it performed best among the algorithms we tried.

In [3]:
import pandas as pd 
import numpy as np
import sklearn
import gc
gc.enable()
import lightgbm as lgb
import utilities

In [4]:
# Read the train and test tables through the project's `utilities` CSV reader
trainPath = '../../datatmp/data/trainjoincollnoisy.csv'
testPath = '../../datatmp/data/testjoincollnoisy.csv'
train, test = utilities.read_csv(trainPath), utilities.read_csv(testPath)

In [5]:
# Detach the label and the test identifiers from the frames (as integer Series),
# then remove the identifier from the training predictors as well.
target = train.pop('TARGET').astype(int)
testIds = test.pop('SK_ID_CURR').astype(int)

train.drop(columns=['SK_ID_CURR'], inplace=True)

In [6]:
# The latest lgb version raised an error about special characters in column names.
# Replacing every non-alphanumeric character (blanks included) with '_' fixed it.
def _safeColumnName(name):
    """Return `name` as a string with each non-alphanumeric character turned into '_'."""
    return "".join(ch if ch.isalnum() else "_" for ch in str(name))

train.columns = [_safeColumnName(name) for name in train.columns]
test.columns = [_safeColumnName(name) for name in test.columns]

In [7]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.inspection import permutation_importance

# Train LGB and hence compute its permutation importance
def trainNpredLgb(train, y, nFolds, pimp=True):
    """Cross-validate a LightGBM classifier and average its permutation importance.

    For every fold a fresh `LGBMClassifier` is fitted on the training part and
    evaluated on the held-out part. When `pimp` is true, the permutation
    importance (AUC drop) of that fold's model is measured on the SAME held-out
    rows, so the importance is never computed on data the model was trained on,
    and the per-fold results are averaged over all folds.

    Parameters
    ----------
    train : pandas.DataFrame
        Predictors only (no target / id columns); rows are selected with `.iloc`.
    y : pandas.Series
        Binary target, positionally aligned with `train`.
    nFolds : int
        Number of shuffled K-Fold splits.
    pimp : bool, default True
        Whether to compute the permutation importance. When false, the returned
        importance frame contains only zeros.

    Returns
    -------
    tuple
        `(auc, importanceDf)`: the ROC AUC of the out-of-fold predictions over
        all rows, and a one-row DataFrame (index 0, one column per feature)
        holding the fold-averaged permutation importance.
    """
    valPreds = np.zeros(train.shape[0])
    importanceDf = pd.DataFrame(columns=train.columns)
    importanceDf.loc[0] = np.zeros(train.shape[1])

    # Unbalanced dataset better to fold (use stratified K-Folds?)
    folds = KFold(n_splits=nFolds, shuffle=True, random_state=42)

    for fold, (trainIds, valIds) in enumerate(folds.split(train, y)):

        trainX, trainY = train.iloc[trainIds], y.iloc[trainIds]
        valX, valY = train.iloc[valIds], y.iloc[valIds]

        # LightGBM parameters found by Bayesian optimization (from https://www.kaggle.com/tilii7/olivier-lightgbm-parameters-by-bayesian-opt/code)
        # NOTE(review): `silent` here and `verbose` / `early_stopping_rounds` in fit()
        # appear to be gone from recent LightGBM releases (callbacks replace them) -
        # confirm against the installed version before upgrading.
        clf = lgb.LGBMClassifier(
            objective = 'binary',
            boosting_type = 'gbdt',
            nthread=4,
            n_estimators=5000,
            learning_rate=0.03,
            num_leaves=34,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.041545473,
            reg_lambda=0.0735294,
            min_split_gain=0.0222415,
            min_child_weight=39.3259775,
            silent=-1,
            verbose=-1, )

        clf.fit(
            trainX,
            trainY,
            eval_set=[(trainX, trainY), (valX, valY)],
            eval_metric='auc',
            verbose=100,
            early_stopping_rounds=200
        )

        # Show validation scores
        valPreds[valIds] = clf.predict_proba(valX, num_iteration=clf.best_iteration_)[:, 1]
        print('Fold %2d AUC : %.6f' %(fold + 1, roc_auc_score(valY, valPreds[valIds])))

        # Feature Importance
        if pimp:
            # Use this fold's held-out rows: a separate random split would mostly
            # contain rows the model was fitted on and give optimistic importances.
            print('Computing features permutation importance')
            resultAuc = permutation_importance(clf, valX, valY, scoring='roc_auc', n_repeats=2, random_state=42, n_jobs=-1)
            # Accumulating importance / n_splits over the folds yields the fold average
            importanceDf.loc[0] += (resultAuc.importances_mean / folds.n_splits)

            print('5 Most important features for Fold %2d' %(fold + 1))
            best5Ids = resultAuc.importances_mean.argsort()[::-1][:5]
            print(train.columns[best5Ids])

        # Free space
        del trainX, trainY, valX, valY
        gc.collect()

    return roc_auc_score(y, valPreds), importanceDf

In [8]:
# 5-Fold
%time auscore, importances = trainNpredLgb(train, target, 5)

Training until validation scores don't improve for 200 rounds
[100]	training's auc: 0.784843	training's binary_logloss: 0.240265	valid_1's auc: 0.765697	valid_1's binary_logloss: 0.244769
[200]	training's auc: 0.80784	training's binary_logloss: 0.230811	valid_1's auc: 0.778215	valid_1's binary_logloss: 0.239896
[300]	training's auc: 0.821694	training's binary_logloss: 0.225044	valid_1's auc: 0.782862	valid_1's binary_logloss: 0.2381
[400]	training's auc: 0.832835	training's binary_logloss: 0.220492	valid_1's auc: 0.785045	valid_1's binary_logloss: 0.237187
[500]	training's auc: 0.842018	training's binary_logloss: 0.216701	valid_1's auc: 0.786146	valid_1's binary_logloss: 0.236701
[600]	training's auc: 0.849901	training's binary_logloss: 0.213399	valid_1's auc: 0.787015	valid_1's binary_logloss: 0.236336
[700]	training's auc: 0.857528	training's binary_logloss: 0.210199	valid_1's auc: 0.787434	valid_1's binary_logloss: 0.236154
[800]	training's auc: 0.864693	training's binary_logloss: 0

In [9]:
# Report the AUC measured on the out-of-fold predictions over the whole training set
print('Full AUC train score %.6f' % (auscore,))

Full AUC train score 0.788084


In [10]:
# Save feats importance: write the one-row frame of permutation importances
# returned by trainNpredLgb (one column per feature) via the project's `utilities` CSV writer.
utilities.to_csv(importances, '../../datatmp/data/importances.csv')