In [1]:
# ! pip install steppy

In [2]:
import logging
import os
import random
import sys
import multiprocessing as mp
from functools import reduce

import glob
import numpy as np
import pandas as pd
from tqdm import tqdm
import yaml
from attrdict import AttrDict
from functools import partial

def create_submission(meta, predictions):
    submission = pd.DataFrame({'SK_ID_CURR': meta['SK_ID_CURR'].tolist(),
                               'TARGET': predictions
                               })
    return submission


def verify_submission(submission, sample_submission):
    assert submission.shape == sample_submission.shape, \
        'Expected submission to have shape {} but got {}'.format(sample_submission.shape, submission.shape)

    for submission_id, correct_id in zip(submission['SK_ID_CURR'].values, sample_submission['SK_ID_CURR'].values):
        assert correct_id == submission_id, \
            'Wrong id: expected {} but got {}'.format(correct_id, submission_id)


def get_logger():
    return logging.getLogger('home-credit')


def init_logger():
    logger = logging.getLogger('home-credit')
    logger.setLevel(logging.INFO)
    message_format = logging.Formatter(fmt='%(asctime)s %(name)s >>> %(message)s',
                                       datefmt='%Y-%m-%d %H-%M-%S')

    # console handler for validation info
    ch_va = logging.StreamHandler(sys.stdout)
    ch_va.setLevel(logging.INFO)

    ch_va.setFormatter(fmt=message_format)

    # add the handlers to the logger
    logger.addHandler(ch_va)

    return logger


def read_params(ctx, fallback_file):
    if ctx.params.__class__.__name__ == 'OfflineContextParams':
        neptune_config = read_yaml(fallback_file)
        params = neptune_config.parameters
    else:
        params = ctx.params
    return params


def read_yaml(filepath):
    with open(filepath) as f:
        config = yaml.load(f)
    return AttrDict(config)


def parameter_eval(param):
    try:
        return eval(param)
    except Exception:
        return param


def persist_evaluation_predictions(experiment_directory, y_pred, raw_data, id_column, target_column):
    raw_data.loc[:, 'y_pred'] = y_pred.reshape(-1)
    predictions_df = raw_data.loc[:, [id_column, target_column, 'y_pred']]
    filepath = os.path.join(experiment_directory, 'evaluation_predictions.csv')
    logging.info('evaluation predictions csv shape: {}'.format(predictions_df.shape))
    predictions_df.to_csv(filepath, index=None)


def set_seed(seed=90210):
    random.seed(seed)
    np.random.seed(seed)


def calculate_rank(predictions):
    rank = (1 + predictions.rank().values) / (predictions.shape[0] + 1)
    return rank


def chunk_groups(groupby_object, chunk_size):
    n_groups = groupby_object.ngroups
    group_chunk, index_chunk = [], []
    for i, (index, df) in enumerate(groupby_object):
        group_chunk.append(df)
        index_chunk.append(index)

        if (i + 1) % chunk_size == 0 or i + 1 == n_groups:
            group_chunk_, index_chunk_ = group_chunk.copy(), index_chunk.copy()
            group_chunk, index_chunk = [], []
            yield index_chunk_, group_chunk_


def parallel_apply(groups, func, index_name='Index', num_workers=1, chunk_size=100000):
    n_chunks = np.ceil(1.0 * groups.ngroups / chunk_size)
    indeces, features = [], []
    for index_chunk, groups_chunk in tqdm(chunk_groups(groups, chunk_size), total=n_chunks):
        with mp.Pool(num_workers) as executor:
            features_chunk = executor.map(func, groups_chunk)
        features.extend(features_chunk)
        indeces.extend(index_chunk)

    features = pd.DataFrame(features)
    features.index = indeces
    features.index.name = index_name
    return features


def read_oof_predictions(prediction_dir, train_filepath, id_column, target_column):
    labels = pd.read_csv(train_filepath, usecols=[id_column, target_column])

    filepaths_train, filepaths_test = [], []
    for filepath in sorted(glob.glob('{}/*'.format(prediction_dir))):
        if filepath.endswith('_oof_train.csv'):
            filepaths_train.append(filepath)
        elif filepath.endswith('_oof_test.csv'):
            filepaths_test.append(filepath)

    train_dfs = []
    for filepath in filepaths_train:
        train_dfs.append(pd.read_csv(filepath))
    train_dfs = reduce(lambda df1, df2: pd.merge(df1, df2, on=[id_column, 'fold_id']), train_dfs)
    train_dfs.columns = _clean_columns(train_dfs, keep_colnames=[id_column, 'fold_id'])
    train_dfs = pd.merge(train_dfs, labels, on=[id_column])

    test_dfs = []
    for filepath in filepaths_test:
        test_dfs.append(pd.read_csv(filepath))
    test_dfs = reduce(lambda df1, df2: pd.merge(df1, df2, on=[id_column, 'fold_id']), test_dfs)
    test_dfs.columns = _clean_columns(test_dfs, keep_colnames=[id_column, 'fold_id'])
    return train_dfs, test_dfs


def _clean_columns(df, keep_colnames):
    new_colnames = keep_colnames
    feature_colnames = df.drop(keep_colnames, axis=1).columns
    for i, colname in enumerate(feature_colnames):
        new_colnames.append('model_{}'.format(i))
    return new_colnames

def safe_div(a, b):
    try:
        return float(a) / float(b)
    except:
        return 0.0

In [3]:
from steppy.base import BaseTransformer

class BasicHandCraftedFeatures(BaseTransformer):
    def __init__(self, num_workers=1, **kwargs):
        self.num_workers = num_workers
        self.features = None

    @property
    def feature_names(self):
        feature_names = list(self.features.columns)
        feature_names.remove('SK_ID_CURR')
        return feature_names

    def transform(self, **kwargs):
        return {'features_table': self.features}

    def load(self, filepath):
        self.features = joblib.load(filepath)
        return self

    def persist(self, filepath):
        joblib.dump(self.features, filepath)


class ApplicationFeatures(BaseTransformer):
    def __init__(self, categorical_columns, numerical_columns):
        self.categorical_columns = categorical_columns
        self.numerical_columns = numerical_columns
        self.engineered_numerical_columns = ['annuity_income_percentage',
                                             'car_to_birth_ratio',
                                             'car_to_employ_ratio',
                                             'children_ratio',
                                             'credit_to_annuity_ratio',
                                             'credit_to_goods_ratio',
                                             'credit_to_income_ratio',
                                             'days_employed_percentage',
                                             'income_credit_percentage',
                                             'income_per_child',
                                             'income_per_person',
                                             'payment_rate',
                                             'phone_to_birth_ratio',
                                             'phone_to_employ_ratio',
                                             'external_sources_weighted',
                                             'external_sources_min',
                                             'external_sources_max',
                                             'external_sources_sum',
                                             'external_sources_mean',
                                             'external_sources_nanmedian',
                                             'short_employment',
                                             'young_age',
                                             'cnt_non_child',
                                             'child_to_non_child_ratio',
                                             'income_per_non_child',
                                             'credit_per_person',
                                             'credit_per_child',
                                             'credit_per_non_child',
                                             ]

    def transform(self, X, **kwargs):
        X['annuity_income_percentage'] = X['AMT_ANNUITY'] / X['AMT_INCOME_TOTAL']
        X['car_to_birth_ratio'] = X['OWN_CAR_AGE'] / X['DAYS_BIRTH']
        X['car_to_employ_ratio'] = X['OWN_CAR_AGE'] / X['DAYS_EMPLOYED']
        X['children_ratio'] = X['CNT_CHILDREN'] / X['CNT_FAM_MEMBERS']
        X['credit_to_annuity_ratio'] = X['AMT_CREDIT'] / X['AMT_ANNUITY']
        X['credit_to_goods_ratio'] = X['AMT_CREDIT'] / X['AMT_GOODS_PRICE']
        X['credit_to_income_ratio'] = X['AMT_CREDIT'] / X['AMT_INCOME_TOTAL']
        X['days_employed_percentage'] = X['DAYS_EMPLOYED'] / X['DAYS_BIRTH']
        X['income_credit_percentage'] = X['AMT_INCOME_TOTAL'] / X['AMT_CREDIT']
        X['income_per_child'] = X['AMT_INCOME_TOTAL'] / (1 + X['CNT_CHILDREN'])
        X['income_per_person'] = X['AMT_INCOME_TOTAL'] / X['CNT_FAM_MEMBERS']
        X['payment_rate'] = X['AMT_ANNUITY'] / X['AMT_CREDIT']
        X['phone_to_birth_ratio'] = X['DAYS_LAST_PHONE_CHANGE'] / X['DAYS_BIRTH']
        X['phone_to_employ_ratio'] = X['DAYS_LAST_PHONE_CHANGE'] / X['DAYS_EMPLOYED']
        X['external_sources_weighted'] = X.EXT_SOURCE_1 * 2 + X.EXT_SOURCE_2 * 3 + X.EXT_SOURCE_3 * 4
        X['cnt_non_child'] = X['CNT_FAM_MEMBERS'] - X['CNT_CHILDREN']
        X['child_to_non_child_ratio'] = X['CNT_CHILDREN'] / X['cnt_non_child']
        X['income_per_non_child'] = X['AMT_INCOME_TOTAL'] / X['cnt_non_child']
        X['credit_per_person'] = X['AMT_CREDIT'] / X['CNT_FAM_MEMBERS']
        X['credit_per_child'] = X['AMT_CREDIT'] / (1 + X['CNT_CHILDREN'])
        X['credit_per_non_child'] = X['AMT_CREDIT'] / X['cnt_non_child']
        for function_name in ['min', 'max', 'sum', 'mean', 'nanmedian']:
            X['external_sources_{}'.format(function_name)] = eval('np.{}'.format(function_name))(
                X[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']], axis=1)

        X['short_employment'] = (X['DAYS_EMPLOYED'] < -2000).astype(int)
        X['young_age'] = (X['DAYS_BIRTH'] < -14000).astype(int)

        return {'numerical_features': X[self.engineered_numerical_columns + self.numerical_columns],
                'categorical_features': X[self.categorical_columns]
}

class POSCASHBalanceFeatures(BasicHandCraftedFeatures):
    def __init__(self, last_k_agg_periods, last_k_trend_periods, num_workers=1, **kwargs):
        super().__init__(num_workers=num_workers)
        self.last_k_agg_periods = last_k_agg_periods
        self.last_k_trend_periods = last_k_trend_periods

        self.num_workers = num_workers
        self.features = None

    def fit(self, pos_cash, **kwargs):
        pos_cash['is_contract_status_completed'] = pos_cash['NAME_CONTRACT_STATUS'] == 'Completed'
        pos_cash['pos_cash_paid_late'] = (pos_cash['SK_DPD'] > 0).astype(int)
        pos_cash['pos_cash_paid_late_with_tolerance'] = (pos_cash['SK_DPD_DEF'] > 0).astype(int)

        features = pd.DataFrame({'SK_ID_CURR': pos_cash['SK_ID_CURR'].unique()})
        groupby = pos_cash.groupby(['SK_ID_CURR'])
        func = partial(POSCASHBalanceFeatures.generate_features,
                       agg_periods=self.last_k_agg_periods,
                       trend_periods=self.last_k_trend_periods)
        g = parallel_apply(groupby, func, index_name='SK_ID_CURR', num_workers=self.num_workers).reset_index()
        features = features.merge(g, on='SK_ID_CURR', how='left')

        self.features = features
        return self

    @staticmethod
    def generate_features(gr, agg_periods, trend_periods):
        one_time = POSCASHBalanceFeatures.one_time_features(gr)
        all = POSCASHBalanceFeatures.all_installment_features(gr)
        agg = POSCASHBalanceFeatures.last_k_installment_features(gr, agg_periods)
        trend = POSCASHBalanceFeatures.trend_in_last_k_installment_features(gr, trend_periods)
        last = POSCASHBalanceFeatures.last_loan_features(gr)
        features = {**one_time, **all, **agg, **trend, **last}
        return pd.Series(features)

    @staticmethod
    def one_time_features(gr):
        gr_ = gr.copy()
        gr_.sort_values(['MONTHS_BALANCE'], inplace=True)
        features = {}

        features['pos_cash_remaining_installments'] = gr_['CNT_INSTALMENT_FUTURE'].tail(1)
        features['pos_cash_completed_contracts'] = gr_['is_contract_status_completed'].agg('sum')

        return features

    @staticmethod
    def all_installment_features(gr):
        return POSCASHBalanceFeatures.last_k_installment_features(gr, periods=[10e16])

    @staticmethod
    def last_k_installment_features(gr, periods):
        gr_ = gr.copy()
        gr_.sort_values(['MONTHS_BALANCE'], ascending=False, inplace=True)

        features = {}
        for period in periods:
            if period > 10e10:
                period_name = 'all_installment_'
                gr_period = gr_.copy()
            else:
                period_name = 'last_{}_'.format(period)
                gr_period = gr_.iloc[:period]

            features = add_features_in_group(features, gr_period, 'pos_cash_paid_late',
                                             ['count', 'mean'],
                                             period_name)
            features = add_features_in_group(features, gr_period, 'pos_cash_paid_late_with_tolerance',
                                             ['count', 'mean'],
                                             period_name)
            features = add_features_in_group(features, gr_period, 'SK_DPD',
                                             ['sum', 'mean', 'max', 'std', 'skew', 'kurt'],
                                             period_name)
            features = add_features_in_group(features, gr_period, 'SK_DPD_DEF',
                                             ['sum', 'mean', 'max', 'std', 'skew', 'kurt'],
                                             period_name)
        return features

    @staticmethod
    def trend_in_last_k_installment_features(gr, periods):
        gr_ = gr.copy()
        gr_.sort_values(['MONTHS_BALANCE'], ascending=False, inplace=True)

        features = {}
        for period in periods:
            gr_period = gr_.iloc[:period]

            features = add_trend_feature(features, gr_period,
                                         'SK_DPD', '{}_period_trend_'.format(period)
                                         )
            features = add_trend_feature(features, gr_period,
                                         'SK_DPD_DEF', '{}_period_trend_'.format(period)
                                         )
            features = add_trend_feature(features, gr_period,
                                         'CNT_INSTALMENT_FUTURE', '{}_period_trend_'.format(period)
                                         )
        return features

    @staticmethod
    def last_loan_features(gr):
        gr_ = gr.copy()
        gr_.sort_values(['MONTHS_BALANCE'], ascending=False, inplace=True)
        last_installment_id = gr_['SK_ID_PREV'].iloc[0]
        gr_ = gr_[gr_['SK_ID_PREV'] == last_installment_id]

        features={}
        features = add_features_in_group(features, gr_, 'pos_cash_paid_late',
                                         ['count', 'sum', 'mean'],
                                         'last_loan_')
        features = add_features_in_group(features, gr_, 'pos_cash_paid_late_with_tolerance',
                                         ['mean'],
                                         'last_loan_')
        features = add_features_in_group(features, gr_, 'SK_DPD',
                                         ['sum', 'mean', 'max', 'std'],
                                         'last_loan_')
        features = add_features_in_group(features, gr_, 'SK_DPD_DEF',
                                         ['sum', 'mean', 'max', 'std'],
                                         'last_loan_')

        return features



In [4]:
import numpy as np
import pandas as pd
import gc
import time
import category_encoders as ce
from contextlib import contextmanager
import lightgbm as lgb
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from scipy.cluster.vq import kmeans2, whiten
from sklearn.preprocessing import Imputer
from sklearn.decomposition import truncated_svd
import category_encoders as ce
from catboost import CatBoostClassifier
from sklearn import preprocessing
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

num_rows = None
EPS = 1e-100



In [5]:
descretize = lambda x, n: list(map(str, list(pd.qcut(x, n, duplicates='drop'))))

def binary_encoder(df, n_train):
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    enc = ce.BinaryEncoder(impute_missing=True, cols=categorical_columns).fit(df[0:n_train], df[0:n_train]['TARGET'])
    df = enc.transform(df)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df[new_columns]

def helmert_encoder(df, n_train):
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    enc = ce.HelmertEncoder(impute_missing=True, cols=categorical_columns).fit(df[0:n_train], df[0:n_train]['TARGET'])
    df = enc.transform(df)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df[new_columns]

def target_encoder(df, n_train):
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    enc = ce.TargetEncoder(impute_missing=True, cols=categorical_columns).fit(df[0:n_train], df[0:n_train]['TARGET'])
    df = enc.transform(df)
    return df[categorical_columns]

def poly_encoder(df, n_train):
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    enc = ce.PolynomialEncoder(impute_missing=True, cols=categorical_columns).fit(df[0:n_train], df[0:n_train]['TARGET'])
    df = enc.transform(df)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df[new_columns]

In [6]:
# df = pd.read_csv('/media/limbo/Home-Credit/data/application_train.csv.zip', nrows=num_rows)
# n_train = df.shape[0]

# test_df = pd.read_csv('/media/limbo/Home-Credit/data/application_test.csv.zip', nrows=num_rows)
# df = df.append(test_df).reset_index()

In [7]:
#target_encoder(df, n_train)

In [8]:
# binary_encoder(df, n_train).shape

In [9]:
train = pd.read_csv('../data/application_train.csv.zip', nrows=num_rows)
n_train = train.shape[0]

In [10]:
def application_train_test(num_rows=num_rows, nan_as_category=False):
    # Read data and merge
    df = pd.read_csv('../data/application_train.csv.zip', nrows=num_rows)

    n_train = df.shape[0]

    test_df = pd.read_csv('../data/application_test.csv.zip', nrows=num_rows)
    print("Train samples: {}, test samples: {}".format(len(df), len(test_df)))
    df = df.append(test_df).reset_index()

    df['CODE_GENDER'].replace('XNA', np.nan, inplace=True)
    df['DAYS_EMPLOYED'].replace(365243, np.nan, inplace=True)
    df['NAME_FAMILY_STATUS'].replace('Unknown', np.nan, inplace=True)
    df['ORGANIZATION_TYPE'].replace('XNA', np.nan, inplace=True)
    
    df['annuity_income_percentage'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']
    df['car_to_birth_ratio'] = df['OWN_CAR_AGE'] / df['DAYS_BIRTH']
    df['car_to_employ_ratio'] = df['OWN_CAR_AGE'] / df['DAYS_EMPLOYED']
    df['children_ratio'] = df['CNT_CHILDREN'] / df['CNT_FAM_MEMBERS']
    df['credit_to_annuity_ratio'] = df['AMT_CREDIT'] / df['AMT_ANNUITY']
    df['credit_to_goods_ratio'] = df['AMT_CREDIT'] / df['AMT_GOODS_PRICE']
    df['credit_to_income_ratio'] = df['AMT_CREDIT'] / df['AMT_INCOME_TOTAL']
    df['days_employed_percentage'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
    df['income_credit_percentage'] = df['AMT_INCOME_TOTAL'] / df['AMT_CREDIT']
    df['income_per_child'] = df['AMT_INCOME_TOTAL'] / (1 + df['CNT_CHILDREN'])
    df['income_per_person'] = df['AMT_INCOME_TOTAL'] / df['CNT_FAM_MEMBERS']
    df['payment_rate'] = df['AMT_ANNUITY'] / df['AMT_CREDIT']
    df['phone_to_birth_ratio'] = df['DAYS_LAST_PHONE_CHANGE'] / df['DAYS_BIRTH']
    df['phone_to_employ_ratio'] = df['DAYS_LAST_PHONE_CHANGE'] / df['DAYS_EMPLOYED']
    df['external_sources_weighted'] = df.EXT_SOURCE_1 * 2 + df.EXT_SOURCE_2 * 3 + df.EXT_SOURCE_3 * 4
    df['cnt_non_child'] = df['CNT_FAM_MEMBERS'] - df['CNT_CHILDREN']
    df['child_to_non_child_ratio'] = df['CNT_CHILDREN'] / df['cnt_non_child']
    df['income_per_non_child'] = df['AMT_INCOME_TOTAL'] / df['cnt_non_child']
    df['credit_per_person'] = df['AMT_CREDIT'] / df['CNT_FAM_MEMBERS']
    df['credit_per_child'] = df['AMT_CREDIT'] / (1 + df['CNT_CHILDREN'])
    df['credit_per_non_child'] = df['AMT_CREDIT'] / df['cnt_non_child']
    for function_name in ['min', 'max', 'sum', 'mean', 'nanmedian']:
        df['external_sources_{}'.format(function_name)] = eval('np.{}'.format(function_name))(
        df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']], axis=1)

    df['short_employment'] = (df['DAYS_EMPLOYED'] < -2000).astype(int)
    df['young_age'] = (df['DAYS_BIRTH'] < -14000).astype(int)
    
    return df

In [11]:
df = application_train_test(num_rows=num_rows, nan_as_category=False)

Train samples: 307511, test samples: 48744


  r = func(a, **kwargs)


In [12]:
categorical_columns = [col for col in df.columns if df[col].dtype == 'object']

In [13]:
df[categorical_columns].head()

Unnamed: 0,CODE_GENDER,EMERGENCYSTATE_MODE,FLAG_OWN_CAR,FLAG_OWN_REALTY,FONDKAPREMONT_MODE,HOUSETYPE_MODE,NAME_CONTRACT_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,NAME_INCOME_TYPE,NAME_TYPE_SUITE,OCCUPATION_TYPE,ORGANIZATION_TYPE,WALLSMATERIAL_MODE,WEEKDAY_APPR_PROCESS_START
0,M,No,N,Y,reg oper account,block of flats,Cash loans,Secondary / secondary special,Single / not married,House / apartment,Working,Unaccompanied,Laborers,Business Entity Type 3,"Stone, brick",WEDNESDAY
1,F,No,N,N,reg oper account,block of flats,Cash loans,Higher education,Married,House / apartment,State servant,Family,Core staff,School,Block,MONDAY
2,M,,Y,Y,,,Revolving loans,Secondary / secondary special,Single / not married,House / apartment,Working,Unaccompanied,Laborers,Government,,MONDAY
3,F,,N,Y,,,Cash loans,Secondary / secondary special,Civil marriage,House / apartment,Working,Unaccompanied,Laborers,Business Entity Type 3,,WEDNESDAY
4,M,,N,Y,,,Cash loans,Secondary / secondary special,Single / not married,House / apartment,Working,Unaccompanied,Core staff,Religion,,THURSDAY


In [14]:
df.shape

(356255, 151)

In [15]:
gc.collect()

14

In [16]:
def one_hot_encoder(df, nan_as_category=False):
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    df = pd.get_dummies(df, columns= categorical_columns, dummy_na= nan_as_category)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns

In [17]:
def bureau_and_balance(num_rows = None, nan_as_category = True):
    
    bureau = pd.read_csv('../data/bureau.csv', nrows = num_rows)
    bureau['bureau_credit_active_binary'] = (bureau['CREDIT_ACTIVE'] != 'Closed').astype(int)
    bureau['bureau_credit_enddate_binary'] = (bureau['DAYS_CREDIT_ENDDATE'] > 0).astype(int)
    features = pd.DataFrame({'SK_ID_CURR': bureau['SK_ID_CURR'].unique()})

    groupby = bureau.groupby(by=['SK_ID_CURR'])

    g = groupby['DAYS_CREDIT'].agg('count').reset_index()
    g.rename(index=str, columns={'DAYS_CREDIT': 'bureau_number_of_past_loans'}, inplace=True)
    features = features.merge(g, on=['SK_ID_CURR'], how='left')

    g = groupby['CREDIT_TYPE'].agg('nunique').reset_index()
    g.rename(index=str, columns={'CREDIT_TYPE': 'bureau_number_of_loan_types'}, inplace=True)
    features = features.merge(g, on=['SK_ID_CURR'], how='left')

    g = groupby['bureau_credit_active_binary'].agg('mean').reset_index()
    g.rename(index=str, columns={'bureau_credit_active_binary': 'bureau_credit_active_binary'}, inplace=True)
    features = features.merge(g, on=['SK_ID_CURR'], how='left')

    g = groupby['AMT_CREDIT_SUM_DEBT'].agg('sum').reset_index()
    g.rename(index=str, columns={'AMT_CREDIT_SUM_DEBT': 'bureau_total_customer_debt'}, inplace=True)
    features = features.merge(g, on=['SK_ID_CURR'], how='left')

    g = groupby['AMT_CREDIT_SUM'].agg('sum').reset_index()
    g.rename(index=str, columns={'AMT_CREDIT_SUM': 'bureau_total_customer_credit'}, inplace=True)
    features = features.merge(g, on=['SK_ID_CURR'], how='left')

    g = groupby['AMT_CREDIT_SUM_OVERDUE'].agg('sum').reset_index()
    g.rename(index=str, columns={'AMT_CREDIT_SUM_OVERDUE': 'bureau_total_customer_overdue'}, inplace=True)
    features = features.merge(g, on=['SK_ID_CURR'], how='left')

    g = groupby['CNT_CREDIT_PROLONG'].agg('sum').reset_index()
    g.rename(index=str, columns={'CNT_CREDIT_PROLONG': 'bureau_average_creditdays_prolonged'}, inplace=True)
    features = features.merge(g, on=['SK_ID_CURR'], how='left')

    g = groupby['bureau_credit_enddate_binary'].agg('mean').reset_index()
    g.rename(index=str, columns={'bureau_credit_enddate_binary': 'bureau_credit_enddate_percentage'}, inplace=True)
    features = features.merge(g, on=['SK_ID_CURR'], how='left')

    features['bureau_average_of_past_loans_per_type'] = \
        features['bureau_number_of_past_loans'] / features['bureau_number_of_loan_types']

    features['bureau_debt_credit_ratio'] = \
        features['bureau_total_customer_debt'] / features['bureau_total_customer_credit']

    features['bureau_overdue_debt_ratio'] = \
    features['bureau_total_customer_overdue'] / features['bureau_total_customer_debt']
    
    return features

In [18]:
bureau = bureau_and_balance(num_rows)

In [19]:
bureau.shape

(305811, 12)

In [20]:
bureau.head()

Unnamed: 0,SK_ID_CURR,bureau_number_of_past_loans,bureau_number_of_loan_types,bureau_credit_active_binary,bureau_total_customer_debt,bureau_total_customer_credit,bureau_total_customer_overdue,bureau_average_creditdays_prolonged,bureau_credit_enddate_percentage,bureau_average_of_past_loans_per_type,bureau_debt_credit_ratio,bureau_overdue_debt_ratio
0,215354,11,3,0.545455,284463.18,5973945.3,0.0,0,0.454545,3.666667,0.047617,0.0
1,162297,6,3,0.5,0.0,8230386.15,0.0,0,0.166667,2.0,0.0,
2,402440,1,1,1.0,76905.0,89910.0,0.0,0,1.0,1.0,0.855355,0.0
3,238881,8,2,0.375,552730.5,1285239.06,0.0,0,0.125,4.0,0.43006,0.0
4,222183,8,3,0.625,1185081.84,7158960.0,0.0,0,0.625,2.666667,0.165538,0.0


In [66]:
#df[bureau.columns].to_csv('../data/neptune_bureau_0.csv')

In [67]:
df.shape

(356255, 218)

In [21]:
df = df.join(bureau, how='left', on='SK_ID_CURR', lsuffix='_left', rsuffix='_right')
# del bureau
gc.collect()

189

In [22]:
df['SK_ID_CURR'] = df['SK_ID_CURR_left'].values
del df['SK_ID_CURR_left']
del df['SK_ID_CURR_right']

In [23]:
df.head()

Unnamed: 0,index,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_WEEK,...,bureau_credit_active_binary,bureau_total_customer_debt,bureau_total_customer_credit,bureau_total_customer_overdue,bureau_average_creditdays_prolonged,bureau_credit_enddate_percentage,bureau_average_of_past_loans_per_type,bureau_debt_credit_ratio,bureau_overdue_debt_ratio,SK_ID_CURR
0,0,24700.5,406597.5,351000.0,202500.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,765000.0,0.0,0.0,1.0,1.0,0.0,,100002
1,1,35698.5,1293502.5,1129500.0,270000.0,0.0,0.0,0.0,0.0,0.0,...,0.25,43393.5,1372036.5,0.0,0.0,0.375,8.0,0.031627,0.0,100003
2,2,6750.0,135000.0,135000.0,67500.0,0.0,0.0,0.0,0.0,0.0,...,0.333333,457915.5,1133982.0,0.0,0.0,0.333333,6.0,0.403812,0.0,100004
3,3,29686.5,312682.5,297000.0,135000.0,,,,,,...,1.0,77346.0,90000.0,0.0,0.0,1.0,1.0,0.8594,0.0,100006
4,4,21865.5,513000.0,513000.0,121500.0,0.0,0.0,0.0,0.0,0.0,...,1.0,373176.0,462150.0,0.0,0.0,1.0,1.0,0.807478,0.0,100007


In [46]:
def previous_applications(num_rows=None, nan_as_category=True):
    prev_applications = pd.read_csv('../data/previous_application.csv', nrows = num_rows)
    
    features = pd.DataFrame({'SK_ID_CURR': prev_applications['SK_ID_CURR'].unique()})

    prev_app_sorted = prev_applications.sort_values(['SK_ID_CURR', 'DAYS_DECISION'])
    prev_app_sorted_groupby = prev_app_sorted.groupby(by=['SK_ID_CURR'])

    prev_app_sorted['previous_application_prev_was_approved'] = (
        prev_app_sorted['NAME_CONTRACT_STATUS'] == 'Approved').astype('int')
    g = prev_app_sorted_groupby['previous_application_prev_was_approved'].last().reset_index()
    features = features.merge(g, on=['SK_ID_CURR'], how='left')

    prev_app_sorted['previous_application_prev_was_refused'] = (
        prev_app_sorted['NAME_CONTRACT_STATUS'] == 'Refused').astype('int')
    g = prev_app_sorted_groupby['previous_application_prev_was_refused'].last().reset_index()
    features = features.merge(g, on=['SK_ID_CURR'], how='left')

    g = prev_app_sorted_groupby['SK_ID_PREV'].agg('nunique').reset_index()
    g.rename(index=str, columns={'SK_ID_PREV': 'previous_application_number_of_prev_application'}, inplace=True)
    features = features.merge(g, on=['SK_ID_CURR'], how='left')

    g = prev_app_sorted.groupby(by=['SK_ID_CURR'])['previous_application_prev_was_refused'].mean().reset_index()
    g.rename(index=str, columns={
        'previous_application_prev_was_refused': 'previous_application_fraction_of_refused_applications'},
             inplace=True)
    features = features.merge(g, on=['SK_ID_CURR'], how='left')

    prev_app_sorted['prev_applications_prev_was_revolving_loan'] = (
        prev_app_sorted['NAME_CONTRACT_TYPE'] == 'Revolving loans').astype('int')
    g = prev_app_sorted.groupby(by=['SK_ID_CURR'])[
        'prev_applications_prev_was_revolving_loan'].last().reset_index()
    features = features.merge(g, on=['SK_ID_CURR'], how='left')

    for number in [1, 2, 3, 8, 16, 32]:
        prev_applications_tail = prev_app_sorted_groupby.tail(number)

        tail_groupby = prev_applications_tail.groupby(by=['SK_ID_CURR'])

        g = tail_groupby['CNT_PAYMENT'].agg('mean').reset_index()
        g.rename(index=str,
                 columns={'CNT_PAYMENT': 'previous_application_term_of_last_{}_credits_mean'.format(number)},
                 inplace=True)
        features = features.merge(g, on=['SK_ID_CURR'], how='left')

        g = tail_groupby['DAYS_DECISION'].agg('mean').reset_index()
        g.rename(index=str,
                 columns={'DAYS_DECISION': 'previous_application_days_decision_about_last_{}_credits_mean'.format(
                     number)},
                 inplace=True)
        features = features.merge(g, on=['SK_ID_CURR'], how='left')

        g = tail_groupby['DAYS_FIRST_DRAWING'].agg('mean').reset_index()
        g.rename(index=str,
                 columns={
                     'DAYS_FIRST_DRAWING': 'previous_application_days_first_drawing_last_{}_credits_mean'.format(
                         number)},
                 inplace=True)
    features = features.merge(g, on=['SK_ID_CURR'], how='left')
    return features

# Preprocess POS_CASH_balance.csv
def pos_cash(num_rows = None, nan_as_category = True):
    pos = pd.read_csv('../data/POS_CASH_balance.csv', nrows=num_rows)
    pos['is_contract_status_completed'] = pos['NAME_CONTRACT_STATUS'] == 'Completed'
    pos['pos_cash_paid_late'] = (pos['SK_DPD'] > 0).astype(int)
    pos['pos_cash_paid_late_with_tolerance'] = (pos['SK_DPD_DEF'] > 0).astype(int)

    features = pd.DataFrame({'SK_ID_CURR': pos['SK_ID_CURR'].unique()})
#     groupby = pos.groupby(['SK_ID_CURR'])
#     func = partial(POSCASHBalanceFeatures.generate_features,
#                        agg_periods=[15, 30, 60, 180],
#                        trend_periods=[15, 30, 60, 180])
#     g = parallel_apply(groupby, func, index_name='SK_ID_CURR', num_workers=4).reset_index()
    
#     features = features.merge(g, on='SK_ID_CURR', how='left')
    return features

# Preprocess installments_payments.csv
def installments_payments(num_rows = None, nan_as_category = True):
    installments = pd.read_csv('/media/limbo/Home-Credit/data/installments_payments.csv', nrows = num_rows)
    
    installments['installment_paid_late_in_days'] = installments['DAYS_ENTRY_PAYMENT'] - installments[
            'DAYS_INSTALMENT']
    installments['installment_paid_late'] = (installments['installment_paid_late_in_days'] > 0).astype(int)
    installments['installment_paid_over_amount'] = installments['AMT_PAYMENT'] - installments['AMT_INSTALMENT']
    installments['installment_paid_over'] = (installments['installment_paid_over_amount'] > 0).astype(int)

    features = pd.DataFrame({'SK_ID_CURR': installments['SK_ID_CURR'].unique()})
#     groupby = installments.groupby(['SK_ID_CURR'])

#     func = partial(InstallmentPaymentsFeatures.generate_features,
#                    agg_periods=self.last_k_agg_periods,
#                    period_fractions=self.last_k_agg_period_fractions,
#                    trend_periods=self.last_k_trend_periods)
#     g = parallel_apply(groupby, func, index_name='SK_ID_CURR', num_workers=self.num_workers).reset_index()
    #features = features.merge(g, on='SK_ID_CURR', how='left')
    return features

# Preprocess credit_card_balance.csv
def credit_card_balance(num_rows = None, nan_as_category = True):
    credit_card = pd.read_csv('../data/credit_card_balance.csv', nrows = num_rows)
    credit_card['number_of_installments'] = credit_card.groupby(
            by=['SK_ID_CURR', 'SK_ID_PREV'])['CNT_INSTALMENT_MATURE_CUM'].agg('max').reset_index()[
            'CNT_INSTALMENT_MATURE_CUM']

    credit_card['credit_card_max_loading_of_credit_limit'] = credit_card.groupby(
        by=['SK_ID_CURR', 'SK_ID_PREV', 'AMT_CREDIT_LIMIT_ACTUAL']).apply(
        lambda x: x.AMT_BALANCE.max() / x.AMT_CREDIT_LIMIT_ACTUAL.max()).reset_index()[0]

    features = pd.DataFrame({'SK_ID_CURR': credit_card['SK_ID_CURR'].unique()})

    groupby = credit_card.groupby(by=['SK_ID_CURR'])

    g = groupby['SK_ID_PREV'].agg('nunique').reset_index()
    g.rename(index=str, columns={'SK_ID_PREV': 'credit_card_number_of_loans'}, inplace=True)
    features = features.merge(g, on=['SK_ID_CURR'], how='left')

    g = groupby['SK_DPD'].agg('mean').reset_index()
    g.rename(index=str, columns={'SK_DPD': 'credit_card_average_of_days_past_due'}, inplace=True)
    features = features.merge(g, on=['SK_ID_CURR'], how='left')

    g = groupby['AMT_DRAWINGS_ATM_CURRENT'].agg('sum').reset_index()
    g.rename(index=str, columns={'AMT_DRAWINGS_ATM_CURRENT': 'credit_card_drawings_atm'}, inplace=True)
    features = features.merge(g, on=['SK_ID_CURR'], how='left')

    g = groupby['AMT_DRAWINGS_CURRENT'].agg('sum').reset_index()
    g.rename(index=str, columns={'AMT_DRAWINGS_CURRENT': 'credit_card_drawings_total'}, inplace=True)
    features = features.merge(g, on=['SK_ID_CURR'], how='left')

    g = groupby['number_of_installments'].agg('sum').reset_index()
    g.rename(index=str, columns={'number_of_installments': 'credit_card_total_installments'}, inplace=True)
    features = features.merge(g, on=['SK_ID_CURR'], how='left')

    g = groupby['credit_card_max_loading_of_credit_limit'].agg('mean').reset_index()
    g.rename(index=str,
             columns={'credit_card_max_loading_of_credit_limit': 'credit_card_avg_loading_of_credit_limit'},
             inplace=True)
    features = features.merge(g, on=['SK_ID_CURR'], how='left')

    features['credit_card_cash_card_ratio'] = features['credit_card_drawings_atm'] / features[
        'credit_card_drawings_total']

    features['credit_card_installments_per_loan'] = (
        features['credit_card_total_installments'] / features['credit_card_number_of_loans'])

    return features

def credit_card_balance_dynamic(num_rows = None, nan_as_category = True):
    credit_card = pd.read_csv('../data/credit_card_balance.csv', nrows = num_rows)
    features = pd.DataFrame({'SK_ID_CURR': credit_card['SK_ID_CURR'].unique()})

    credit_card_sorted = credit_card.sort_values(['SK_ID_CURR', 'MONTHS_BALANCE'])

    groupby = credit_card_sorted.groupby(by=['SK_ID_CURR'])
    credit_card_sorted['credit_card_monthly_diff'] = groupby['AMT_BALANCE'].diff()
    groupby = credit_card_sorted.groupby(by=['SK_ID_CURR'])

    g = groupby['credit_card_monthly_diff'].agg('mean').reset_index()
    features = features.merge(g, on=['SK_ID_CURR'], how='left')
    return features



In [25]:
cc = credit_card_balance(num_rows)



In [26]:
df = df.join(cc, how='left', on='SK_ID_CURR', lsuffix='_left', rsuffix='_right')
# del bureau
gc.collect()

14

In [27]:
df['SK_ID_CURR'] = df['SK_ID_CURR_left'].values
del df['SK_ID_CURR_left']
del df['SK_ID_CURR_right']

In [28]:
cc_dynamic = credit_card_balance_dynamic(num_rows)

In [29]:
df = df.join(cc_dynamic, how='left', on='SK_ID_CURR', lsuffix='_left', rsuffix='_right')
# del bureau
gc.collect()

84

In [30]:
df['SK_ID_CURR'] = df['SK_ID_CURR_left'].values
del df['SK_ID_CURR_left']
del df['SK_ID_CURR_right']

In [33]:
# if __name__ == '__main__':
pos = pos_cash(num_rows)

In [34]:
df = df.join(pos, how='left', on='SK_ID_CURR', lsuffix='_left', rsuffix='_right')
# del bureau
gc.collect()

56

In [35]:
df['SK_ID_CURR'] = df['SK_ID_CURR_left'].values
del df['SK_ID_CURR_left']
del df['SK_ID_CURR_right']

In [41]:
prev = previous_applications(num_rows)


In [68]:
#df[prev.columns].to_csv('../data/previous_applications_0.csv')

In [42]:
df = df.join(prev, how='left', on='SK_ID_CURR', lsuffix='_left', rsuffix='_right')

gc.collect()

327

In [43]:
df['SK_ID_CURR'] = df['SK_ID_CURR_left'].values
del df['SK_ID_CURR_left']
del df['SK_ID_CURR_right']

In [47]:
ins = installments_payments(num_rows)

gc.collect()

275

In [48]:
df = df.join(ins, how='left', on='SK_ID_CURR', lsuffix='_left', rsuffix='_right')
# del bureau
gc.collect()

7

In [49]:
df['SK_ID_CURR'] = df['SK_ID_CURR_left'].values
del df['SK_ID_CURR_left']
del df['SK_ID_CURR_right']

In [None]:
df['SK_ID_CURR']

In [51]:
df.head()

Unnamed: 0,index,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_WEEK,...,previous_application_term_of_last_3_credits_mean,previous_application_days_decision_about_last_3_credits_mean,previous_application_term_of_last_8_credits_mean,previous_application_days_decision_about_last_8_credits_mean,previous_application_term_of_last_16_credits_mean,previous_application_days_decision_about_last_16_credits_mean,previous_application_term_of_last_32_credits_mean,previous_application_days_decision_about_last_32_credits_mean,previous_application_days_first_drawing_last_32_credits_mean,SK_ID_CURR
0,0,24700.5,406597.5,351000.0,202500.0,0.0,0.0,0.0,0.0,0.0,...,24.0,-221.0,30.0,-393.875,16.0,-889.0,12.0,-1197.421053,303954.833333,100002
1,1,35698.5,1293502.5,1129500.0,270000.0,0.0,0.0,0.0,0.0,0.0,...,22.0,-822.666667,19.2,-943.6,19.2,-943.6,19.2,-943.6,365243.0,100003
2,2,6750.0,135000.0,135000.0,67500.0,0.0,0.0,0.0,0.0,0.0,...,24.0,-421.666667,25.714286,-1013.625,24.0,-1179.111111,24.0,-1179.111111,365243.0,100004
3,3,29686.5,312682.5,297000.0,135000.0,,,,,,...,6.0,-735.0,6.0,-735.0,6.0,-735.0,6.0,-735.0,365243.0,100006
4,4,21865.5,513000.0,513000.0,121500.0,0.0,0.0,0.0,0.0,0.0,...,10.0,-266.5,10.0,-266.5,10.0,-266.5,10.0,-266.5,365243.0,100007


In [62]:
test_file_path = "Level_1_stack/test_lgbm_k_3.csv"
validation_file_path = 'Level_1_stack/validation_lgbm_k_3.csv'
num_folds = 5

In [53]:
new_features = pd.read_csv('selected_features.csv', header=0, index_col=None)

In [55]:
uniuqe = [f for f in new_features.columns if f not in df.columns]
uniuqe
new_df = pd.concat([df, new_features[uniuqe]], axis=1)

In [56]:
new_df.shape

(356255, 214)

In [57]:
last_3_months_features = pd.read_csv('../data/last_3_months_payments.csv', header=0, index_col=None)

In [58]:
uniuqe_0 = [f for f in last_3_months_features.columns if f not in new_df.columns]
new_df = pd.concat([new_df, last_3_months_features[uniuqe_0]], axis=1)

In [59]:
df = new_df

In [60]:
df.shape

(356255, 218)

In [63]:
encoding = 'ohe'

train_df = df.iloc[0:n_train]
test_df = df.iloc[n_train:]

print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
gc.collect()
# Cross validation model
folds = KFold(n_splits=num_folds, shuffle=True, random_state=1001)
# Create arrays and dataframes to store results
oof_preds = np.zeros(train_df.shape[0])
sub_preds = np.zeros(test_df.shape[0])
feature_importance_df = pd.DataFrame()
feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]

#feats = [col for col in feats_0 if df[col].dtype == 'object']


print(train_df[feats].shape)
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        
      
        categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
        
        if encoding == 'ohe':
            
            enc = ce.OneHotEncoder(impute_missing=True, cols=categorical_columns).fit(train_df[feats].iloc[train_idx],
                                                                                       train_df['TARGET'].iloc[train_idx])
            x_train = enc.transform(train_df[feats].iloc[train_idx])
            x_valid = enc.transform(train_df[feats].iloc[valid_idx])
            x_test = enc.transform(test_df[feats])
            print(x_train.shape, x_valid.shape, x_test.shape)
            
            dtrain = lgb.Dataset(data=x_train, 
                             label=train_df['TARGET'].iloc[train_idx], 
                             free_raw_data=False, silent=True)
            dvalid = lgb.Dataset(data=x_valid, 
                             label=train_df['TARGET'].iloc[valid_idx], 
                             free_raw_data=False, silent=True)
            gc.collect()
            
        params = {
            'objective': 'regression',
            'boosting_type': 'goss',
            'nthread': 8,
            'learning_rate': 0.02,  
            'num_leaves': 2 ** 5,
            'colsample_bytree': 0.25,
            'subsample': 0.9,
            'max_depth': 8,
            'reg_alpha': 1,
            'reg_lambda': 2 ** 5,
            'min_split_gain': 0.1,
            'min_child_weight': 2 ** 5,
            'seed': 666,
            'scale_pos_weight' : 3,
            'verbose': -1,
            'metric': 'rmse',
            'max_bin': 2 ** 7
        }
        
        clf = lgb.train(
            params=params,
            train_set=dtrain,
            num_boost_round=10000,
            valid_sets=[dtrain, dvalid],
            early_stopping_rounds=100,
            verbose_eval=False
        )
        
        oof_preds[valid_idx] = clf.predict(dvalid.data)
        sub_preds += clf.predict(x_test) / folds.n_splits

        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(dvalid.label, oof_preds[valid_idx])))
        del clf, dtrain, dvalid
        gc.collect()

print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))
        
sub_df = test_df[['SK_ID_CURR']].copy()
sub_df['TARGET'] = sub_preds
sub_df[['SK_ID_CURR', 'TARGET']].to_csv(test_file_path, index= False)

val_df = train_df[['SK_ID_CURR', 'TARGET']].copy()
val_df['TARGET'] = oof_preds
val_df[['SK_ID_CURR', 'TARGET']].to_csv(validation_file_path, index= False)        
            
            
        
        
    



Starting LightGBM. Train shape: (307511, 218), test shape: (48744, 218)
(307511, 215)
(246008, 364) (61503, 364) (48744, 364)
Fold  1 AUC : 0.782377
(246009, 363) (61502, 363) (48744, 363)
Fold  2 AUC : 0.754661
(246009, 364) (61502, 364) (48744, 364)
Fold  3 AUC : 0.774387
(246009, 364) (61502, 364) (48744, 364)
Fold  4 AUC : 0.778663
(246009, 364) (61502, 364) (48744, 364)
Fold  5 AUC : 0.779519
Full AUC score 0.770298
