In [1]:
pip install fastparquet lightgbm

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
%matplotlib inline

from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression

from lightgbm import LGBMClassifier
import lightgbm as lgb

s3 = boto3.resource('s3')
bucket_name = 'analytics-data-science-competitions'
bucket = s3.Bucket(bucket_name)

file_key_1 = 'Tabular-Playground-Series/Tabular-Playground-Nov-2022/sample_submission.csv'

bucket_object_1 = bucket.Object(file_key_1)
file_object_1 = bucket_object_1.get()
file_content_stream_1 = file_object_1.get('Body')

## Reading data-files
submission = pd.read_csv(file_content_stream_1)
df = pd.read_parquet('s3://analytics-data-science-competitions/Tabular-Playground-Series/Tabular-Playground-Nov-2022/preds_concat_gzip.parquet', engine = 'fastparquet')

In [3]:
preds_df = df.clip(0, 1) ## Notice that some of the model prediction files have negative likehookds and greater than 1 likelihoods
train = preds_df[preds_df['target'].notnull()]
test = preds_df[preds_df['target'].isnull()] 

In [4]:
def get_feature_importances(data, shuffle, seed=None):
    # Gather real features
    train_features = [f for f in data if f not in ['target']]
    # Go over fold and keep track of CV score (train and valid) and feature importances
    
    # Shuffle target if required
    y = data['target'].copy()
    if shuffle:
        # Here you could as well use a binomial distribution
        y = data['target'].copy().sample(frac = 1.0)
    
    # Fit LightGBM in RF mode, yes it's quicker than sklearn RandomForest
    dtrain = lgb.Dataset(data[train_features], y, free_raw_data = False, silent = True)
    lgb_params = {
        'objective': 'binary',
        'boosting_type': 'rf',
        'subsample': 0.623,
        'colsample_bytree': 0.7,
        'num_leaves': 127,
        'max_depth': 8,
        'seed': seed,
        'bagging_freq': 1,
        'n_jobs': 4
    }
    
    # Fit the model
    clf = lgb.train(params=lgb_params, train_set=dtrain, num_boost_round=200)

    # Get feature importances
    imp_df = pd.DataFrame()
    imp_df["feature"] = list(train_features)
    imp_df["importance_gain"] = clf.feature_importance(importance_type='gain')
    imp_df["importance_split"] = clf.feature_importance(importance_type='split')
    imp_df['trn_score'] = log_loss(y, clf.predict(data[train_features]))
    
    return imp_df

In [5]:
# Seed the unexpected randomness of this world
np.random.seed(253)
# Get the actual importance, i.e. without shuffling
actual_imp_df = get_feature_importances(data=train, shuffle=False)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [6]:
actual_imp_df.head()

Unnamed: 0,feature,importance_gain,importance_split,trn_score
0,0.7301891713.csv,98.469757,107,0.481366
1,0.6750726968.csv,95.595743,55,0.481366
2,0.7194704070.csv,44.631556,40,0.481366
3,0.7107007521.csv,32.283775,29,0.481366
4,0.6952032365.csv,21.002132,29,0.481366


In [7]:
null_imp_df = pd.DataFrame()
nb_runs = 80
import time
start = time.time()
dsp = ''
for i in range(nb_runs):
    # Get current run importances
    imp_df = get_feature_importances(data=train, shuffle=True)
    imp_df['run'] = i + 1 
    # Concat the latest importances with the old ones
    null_imp_df = pd.concat([null_imp_df, imp_df], axis=0)
    # Erase previous message
    for l in range(len(dsp)):
        print('\b', end='', flush=True)
    # Display current run and time used
    spent = (time.time() - start) / 60
    dsp = 'Done with %4d of %4d (Spent %5.1f min)' % (i + 1, nb_runs, spent)
    print(dsp, end='', flush=True)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with    1 of   80 (Spent   1.0 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with    2 of   80 (Spent   2.0 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with    3 of   80 (Spent   2.9 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with    4 of   80 (Spent   3.9 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with    5 of   80 (Spent   4.8 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with    6 of   80 (Spent   5.7 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with    7 of   80 (Spent   6.6 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with    8 of   80 (Spent   7.5 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with    9 of   80 (Spent   8.4 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   10 of   80 (Spent   9.3 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   11 of   80 (Spent  10.3 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   12 of   80 (Spent  11.2 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   13 of   80 (Spent  12.1 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   14 of   80 (Spent  13.0 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   15 of   80 (Spent  14.0 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   16 of   80 (Spent  14.9 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   17 of   80 (Spent  15.7 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   18 of   80 (Spent  16.8 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   19 of   80 (Spent  17.6 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   20 of   80 (Spent  18.7 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   21 of   80 (Spent  19.8 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   22 of   80 (Spent  20.7 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   23 of   80 (Spent  21.7 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   24 of   80 (Spent  22.5 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   25 of   80 (Spent  23.4 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   26 of   80 (Spent  24.5 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   27 of   80 (Spent  25.5 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   28 of   80 (Spent  26.4 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   29 of   80 (Spent  27.4 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   30 of   80 (Spent  28.3 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   31 of   80 (Spent  29.1 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   32 of   80 (Spent  30.0 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   33 of   80 (Spent  31.1 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   34 of   80 (Spent  32.1 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   35 of   80 (Spent  33.0 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   36 of   80 (Spent  33.8 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   37 of   80 (Spent  34.8 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   38 of   80 (Spent  35.7 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   39 of   80 (Spent  36.7 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   40 of   80 (Spent  37.5 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   41 of   80 (Spent  38.3 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   42 of   80 (Spent  39.2 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   43 of   80 (Spent  40.1 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   44 of   80 (Spent  40.9 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   45 of   80 (Spent  41.9 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   46 of   80 (Spent  42.8 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   47 of   80 (Spent  43.8 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   48 of   80 (Spent  44.7 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   49 of   80 (Spent  45.5 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   50 of   80 (Spent  46.4 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   51 of   80 (Spent  47.4 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   52 of   80 (Spent  48.3 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   53 of   80 (Spent  49.2 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   54 of   80 (Spent  50.1 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   55 of   80 (Spent  50.9 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   56 of   80 (Spent  51.8 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   57 of   80 (Spent  52.7 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   58 of   80 (Spent  53.6 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   59 of   80 (Spent  54.5 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   60 of   80 (Spent  55.4 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   61 of   80 (Spent  56.2 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   62 of   80 (Spent  57.0 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   63 of   80 (Spent  57.8 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   64 of   80 (Spent  58.7 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   65 of   80 (Spent  59.6 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   66 of   80 (Spent  60.5 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   67 of   80 (Spent  61.4 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   68 of   80 (Spent  62.4 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   69 of   80 (Spent  63.2 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   70 of   80 (Spent  64.3 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   71 of   80 (Spent  65.3 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   72 of   80 (Spent  66.3 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   73 of   80 (Spent  67.2 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   74 of   80 (Spent  68.0 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   75 of   80 (Spent  68.9 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   76 of   80 (Spent  69.8 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   77 of   80 (Spent  71.0 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   78 of   80 (Spent  72.0 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   79 of   80 (Spent  72.8 min)



[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261381
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Done with   80 of   80 (Spent  73.8 min)

In [8]:
feature_scores = []
for _f in actual_imp_df['feature'].unique():
    f_null_imps_gain = null_imp_df.loc[null_imp_df['feature'] == _f, 'importance_gain'].values
    f_act_imps_gain = actual_imp_df.loc[actual_imp_df['feature'] == _f, 'importance_gain'].mean()
    gain_score = np.log(1e-10 + f_act_imps_gain / (1 + np.percentile(f_null_imps_gain, 75)))  # Avoid didvide by zero
    f_null_imps_split = null_imp_df.loc[null_imp_df['feature'] == _f, 'importance_split'].values
    f_act_imps_split = actual_imp_df.loc[actual_imp_df['feature'] == _f, 'importance_split'].mean()
    split_score = np.log(1e-10 + f_act_imps_split / (1 + np.percentile(f_null_imps_split, 75)))  # Avoid didvide by zero
    feature_scores.append((_f, split_score, gain_score))

scores_df = pd.DataFrame(feature_scores, columns=['feature', 'split_score', 'gain_score'])
scores_df.to_csv('scores_df.csv', index = False)

# plt.figure(figsize=(16, 16))
# gs = gridspec.GridSpec(1, 2)
# # Plot Split importances
# ax = plt.subplot(gs[0, 0])
# sns.barplot(x='split_score', y='feature', data=scores_df.sort_values('split_score', ascending=False).iloc[0:70], ax=ax)
# ax.set_title('Feature scores wrt split importances', fontweight='bold', fontsize=14)
# # Plot Gain importances
# ax = plt.subplot(gs[0, 1])
# sns.barplot(x='gain_score', y='feature', data=scores_df.sort_values('gain_score', ascending=False).iloc[0:70], ax=ax)
# ax.set_title('Feature scores wrt gain importances', fontweight='bold', fontsize=14)
# plt.tight_layout()