In [35]:
import numpy as np
import pandas as pd
import gc
import time
import category_encoders as ce
from contextlib import contextmanager
import lightgbm as lgb
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from scipy.cluster.vq import kmeans2, whiten
from sklearn.preprocessing import Imputer
from sklearn.decomposition import truncated_svd
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import LinearSVC
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

num_rows = None

In [2]:
descretize = lambda x, n: list(map(str, list(pd.qcut(x, n, duplicates='drop'))))

# One-hot encoding for categorical columns with get_dummies
def one_hot_encoder(df, nan_as_category = True):
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    df = pd.get_dummies(df, columns= categorical_columns, dummy_na= nan_as_category)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns


In [3]:
# Preprocess application_train.csv and application_test.csv
def application_train_test(num_rows = None, nan_as_category=False):
    # Read data and merge
    df = pd.read_csv('/media/limbo/Home-Credit/data/application_train.csv.zip', nrows= num_rows)
    
    n_train = df.shape[0]
    
    test_df = pd.read_csv('/media/limbo/Home-Credit/data/application_test.csv.zip', nrows= num_rows)
    print("Train samples: {}, test samples: {}".format(len(df), len(test_df)))
    df = df.append(test_df).reset_index()
    
    
    df['CODE_GENDER'].replace('XNA', np.nan, inplace=True)
    df['DAYS_EMPLOYED'].replace(365243, np.nan, inplace=True)
    df['NAME_FAMILY_STATUS'].replace('Unknown', np.nan, inplace=True)
    df['ORGANIZATION_TYPE'].replace('XNA', np.nan, inplace=True)
    
    # Optional: Remove 4 applications with XNA CODE_GENDER (train set)
    df = df[df['CODE_GENDER'] != 'XNA']
    
    docs = [_f for _f in df.columns if 'FLAG_DOC' in _f]
    live = [_f for _f in df.columns if ('FLAG_' in _f) & ('FLAG_DOC' not in _f) & ('_FLAG_' not in _f)]
    
    # NaN values for DAYS_EMPLOYED: 365.243 -> nan
    df['DAYS_EMPLOYED'].replace(365243, np.nan, inplace= True)

    inc_by_org = df[['AMT_INCOME_TOTAL', 'ORGANIZATION_TYPE']].groupby('ORGANIZATION_TYPE').median()['AMT_INCOME_TOTAL']

    df['NEW_CREDIT_TO_ANNUITY_RATIO'] = df['AMT_CREDIT'] / df['AMT_ANNUITY']
    df['NEW_AMT_INCOME_TOTAL_RATIO'] = df['AMT_CREDIT'] / df['AMT_INCOME_TOTAL']
    df['NEW_CREDIT_TO_GOODS_RATIO'] = df['AMT_CREDIT'] / df['AMT_GOODS_PRICE']
    df['NEW_DOC_IND_AVG'] = df[docs].mean(axis=1)
    df['NEW_DOC_IND_STD'] = df[docs].std(axis=1)
    df['NEW_DOC_IND_KURT'] = df[docs].kurtosis(axis=1)
    df['NEW_LIVE_IND_SUM'] = df[live].sum(axis=1)
    df['NEW_LIVE_IND_STD'] = df[live].std(axis=1)
    df['NEW_LIVE_IND_KURT'] = df[live].kurtosis(axis=1)
    df['NEW_INC_PER_CHLD'] = df['AMT_INCOME_TOTAL'] / (1 + df['CNT_CHILDREN'])
    df['NEW_INC_BY_ORG'] = df['ORGANIZATION_TYPE'].map(inc_by_org)
    df['NEW_EMPLOY_TO_BIRTH_RATIO'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
    df['NEW_ANNUITY_TO_INCOME_RATIO'] = df['AMT_ANNUITY'] / (1 + df['AMT_INCOME_TOTAL'])
    df['NEW_SOURCES_PROD'] = df['EXT_SOURCE_1'] * df['EXT_SOURCE_2'] * df['EXT_SOURCE_3']
    df['NEW_EXT_SOURCES_MEAN'] = df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)
    df['NEW_SCORES_STD'] = df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].std(axis=1)
    df['NEW_SCORES_STD'] = df['NEW_SCORES_STD'].fillna(df['NEW_SCORES_STD'].mean())
    df['NEW_CAR_TO_BIRTH_RATIO'] = df['OWN_CAR_AGE'] / df['DAYS_BIRTH']
    df['NEW_CAR_TO_EMPLOY_RATIO'] = df['OWN_CAR_AGE'] / df['DAYS_EMPLOYED']
    df['NEW_PHONE_TO_BIRTH_RATIO'] = df['DAYS_LAST_PHONE_CHANGE'] / df['DAYS_BIRTH']
    df['NEW_PHONE_TO_EMPLOY_RATIO'] = df['DAYS_LAST_PHONE_CHANGE'] / df['DAYS_EMPLOYED']
    df['NEW_CREDIT_TO_INCOME_RATIO'] = df['AMT_CREDIT'] / df['AMT_INCOME_TOTAL']
    
    df['children_ratio'] = df['CNT_CHILDREN'] / df['CNT_FAM_MEMBERS']
    
    
    df['NEW_EXT_SOURCES_MEDIAN'] = df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].median(axis=1)
    
    df['NEW_DOC_IND_SKEW'] = df[docs].skew(axis=1)
    df['NEW_LIVE_IND_SKEW'] = df[live].skew(axis=1)
    
    
    df['ind_0'] = df['DAYS_EMPLOYED'] - df['DAYS_EMPLOYED'].replace([np.inf, -np.inf], np.nan).fillna(df['DAYS_EMPLOYED'].dropna().median()).mean()
    df['ind_1'] = df['DAYS_EMPLOYED'] - df['DAYS_EMPLOYED'].replace([np.inf, -np.inf], np.nan).fillna(df['DAYS_EMPLOYED'].dropna().median()).median()
    
    df['ind_2'] = df['DAYS_BIRTH'] - df['DAYS_BIRTH'].replace([np.inf, -np.inf], np.nan).fillna(df['DAYS_BIRTH'].dropna().median()).mean()
    df['ind_3'] = df['DAYS_BIRTH'] - df['DAYS_BIRTH'].replace([np.inf, -np.inf], np.nan).fillna(df['DAYS_BIRTH'].dropna().median()).median()
    
    
    df['ind_4'] = df['AMT_INCOME_TOTAL'] - df['AMT_INCOME_TOTAL'].replace([np.inf, -np.inf], np.nan).fillna(df['AMT_INCOME_TOTAL'].dropna().median()).mean()
    df['ind_5'] = df['AMT_INCOME_TOTAL'] - df['AMT_INCOME_TOTAL'].replace([np.inf, -np.inf], np.nan).fillna(df['AMT_INCOME_TOTAL'].dropna().median()).median() 
   
    
    df['ind_6'] = df['AMT_CREDIT'] - df['AMT_CREDIT'].replace([np.inf, -np.inf], np.nan).fillna(df['AMT_CREDIT'].dropna().median()).mean()
    df['ind_7'] = df['AMT_CREDIT'] - df['AMT_CREDIT'].replace([np.inf, -np.inf], np.nan).fillna(df['AMT_CREDIT'].dropna().median()).median() 
   
    df['ind_8'] = df['AMT_ANNUITY'] - df['AMT_ANNUITY'].replace([np.inf, -np.inf], np.nan).fillna(df['AMT_ANNUITY'].dropna().median()).mean()
    df['ind_9'] = df['AMT_ANNUITY'] - df['AMT_ANNUITY'].replace([np.inf, -np.inf], np.nan).fillna(df['AMT_ANNUITY'].dropna().median()).median() 
    
    df['ind_10'] = df['AMT_CREDIT'] - df['AMT_INCOME_TOTAL'].replace([np.inf, -np.inf], np.nan).fillna(df['AMT_INCOME_TOTAL'].dropna().median()).mean()
    df['ind_11'] = df['AMT_CREDIT'] - df['AMT_INCOME_TOTAL'].replace([np.inf, -np.inf], np.nan).fillna(df['AMT_INCOME_TOTAL'].dropna().median()).median() 
    
    
    AGGREGATION_RECIPIES = [
    (['CODE_GENDER', 'NAME_EDUCATION_TYPE'], [('AMT_ANNUITY', 'max'),
                                              ('AMT_CREDIT', 'max'),
                                              ('EXT_SOURCE_1', 'mean'),
                                              ('EXT_SOURCE_2', 'mean'),
                                              ('OWN_CAR_AGE', 'max'),
                                              ('OWN_CAR_AGE', 'sum')]),
    (['CODE_GENDER', 'ORGANIZATION_TYPE'], [('AMT_ANNUITY', 'mean'),
                                            ('AMT_INCOME_TOTAL', 'mean'),
                                            ('DAYS_REGISTRATION', 'mean'),
                                            ('EXT_SOURCE_1', 'mean'),
                                            ('NEW_CREDIT_TO_ANNUITY_RATIO', 'mean')]),
    (['CODE_GENDER', 'REG_CITY_NOT_WORK_CITY'], [('AMT_ANNUITY', 'mean'),
                                                 ('CNT_CHILDREN', 'mean'),
                                                 ('DAYS_ID_PUBLISH', 'mean')]),
    (['CODE_GENDER', 'NAME_EDUCATION_TYPE', 'OCCUPATION_TYPE', 'REG_CITY_NOT_WORK_CITY'], [('EXT_SOURCE_1', 'mean'),
                                                                                           ('EXT_SOURCE_2', 'mean')]),
    (['NAME_EDUCATION_TYPE', 'OCCUPATION_TYPE'], [('AMT_CREDIT', 'mean'),
                                                  ('AMT_REQ_CREDIT_BUREAU_YEAR', 'mean'),
                                                  ('APARTMENTS_AVG', 'mean'),
                                                  ('BASEMENTAREA_AVG', 'mean'),
                                                  ('EXT_SOURCE_1', 'mean'),
                                                  ('EXT_SOURCE_2', 'mean'),
                                                  ('EXT_SOURCE_3', 'mean'),
                                                  ('NONLIVINGAREA_AVG', 'mean'),
                                                  ('OWN_CAR_AGE', 'mean')]),
    (['NAME_EDUCATION_TYPE', 'OCCUPATION_TYPE', 'REG_CITY_NOT_WORK_CITY'], [('ELEVATORS_AVG', 'mean'),
                                                                            ('EXT_SOURCE_1', 'mean')]),
    (['OCCUPATION_TYPE'], [('AMT_ANNUITY', 'median'),
                           ('CNT_CHILDREN', 'median'),
                           ('CNT_FAM_MEMBERS', 'median'),
                           ('DAYS_BIRTH', 'median'),
                           ('DAYS_EMPLOYED', 'median'),
                           ('NEW_CREDIT_TO_ANNUITY_RATIO', 'median'),
                           ('DAYS_REGISTRATION', 'median'),
                           ('EXT_SOURCE_1', 'median'),
                           ('EXT_SOURCE_2', 'median'),
                           ('EXT_SOURCE_3', 'median')]),
]

    
    for groupby_cols, specs in AGGREGATION_RECIPIES:
        group_object = df.groupby(groupby_cols)
        for select, agg in specs:
            groupby_aggregate_name = '{}_{}_{}'.format('_'.join(groupby_cols), agg, select)
            df = df.merge(group_object[select]
                              .agg(agg)
                              .reset_index()
                              .rename(index=str,
                                      columns={select: groupby_aggregate_name})
                              [groupby_cols + [groupby_aggregate_name]],
                              on=groupby_cols,
                              how='left')
    
    df['retirement_age'] = (df['DAYS_BIRTH'] > -14000).astype(int)
    df['long_employment'] = (df['DAYS_EMPLOYED'] > -2000).astype(int)
    df['cnt_non_child'] = df['CNT_FAM_MEMBERS'] - df['CNT_CHILDREN']
    df['child_to_non_child_ratio'] = df['CNT_CHILDREN'] / df['cnt_non_child']
    df['income_per_non_child'] = df['AMT_INCOME_TOTAL'] / df['cnt_non_child']
    df['credit_per_person'] = df['AMT_CREDIT'] / df['CNT_FAM_MEMBERS']
    df['credit_per_child'] = df['AMT_CREDIT'] / (1 + df['CNT_CHILDREN'])
    df['credit_per_non_child'] = df['AMT_CREDIT'] / df['cnt_non_child']
    
    df['cnt_non_child'] = df['CNT_FAM_MEMBERS'] - df['CNT_CHILDREN']
    df['child_to_non_child_ratio'] = df['CNT_CHILDREN'] / df['cnt_non_child']
    df['income_per_non_child'] = df['AMT_INCOME_TOTAL'] / df['cnt_non_child']
    df['credit_per_person'] = df['AMT_CREDIT'] / df['CNT_FAM_MEMBERS']
    df['credit_per_child'] = df['AMT_CREDIT'] / (1 + df['CNT_CHILDREN'])
    df['credit_per_non_child'] = df['AMT_CREDIT'] / df['cnt_non_child']
    
    df['p_0'] = descretize(df['credit_per_non_child'].values, 2 ** 5)
    df['p_1'] = descretize(df['credit_per_person'].values, 2 ** 5)
    df['p_2'] = descretize(df['credit_per_child'].values, 2 ** 5)
    df['p_3'] = descretize(df['retirement_age'].values, 2 ** 5)
    df['p_4'] = descretize(df['income_per_non_child'].values, 2 ** 5)
    df['p_5'] = descretize(df['child_to_non_child_ratio'].values, 2 ** 5)
    
    df['p_6'] = descretize(df['NEW_CREDIT_TO_ANNUITY_RATIO'].values, 2 ** 5)
    df['p_7'] = descretize(df['NEW_CREDIT_TO_ANNUITY_RATIO'].values, 2 ** 6)
    df['p_8'] = descretize(df['NEW_CREDIT_TO_ANNUITY_RATIO'].values, 2 ** 7)
    
    
    df['pe_0'] = descretize(df['credit_per_non_child'].values, 2 ** 6)
    df['pe_1'] = descretize(df['credit_per_person'].values, 2 ** 6)
    df['pe_2'] = descretize(df['credit_per_child'].values, 2 ** 6)
    df['pe_3'] = descretize(df['retirement_age'].values, 2 ** 6)
    df['pe_4'] = descretize(df['income_per_non_child'].values, 2 ** 6)
    df['pe_5'] = descretize(df['child_to_non_child_ratio'].values, 2 ** 6)
          
    
    c = df['NEW_CREDIT_TO_ANNUITY_RATIO'].replace([np.inf, -np.inf], np.nan).fillna(999).values
    a, b = kmeans2(np.log1p(c), 2, iter=333)  
    df['x_0'] = b

    a, b = kmeans2(np.log1p(c), 4, iter=333)  
    df['x_1'] = b

    a, b = kmeans2(np.log1p(c), 8, iter=333)  
    df['x_2'] = b

    a, b = kmeans2(np.log1p(c), 16, iter=333)  
    df['x_3'] = b

    a, b = kmeans2(np.log1p(c), 32, iter=333)  
    df['x_4'] = b

    a, b = kmeans2(np.log1p(c), 64, iter=333)  
    df['x_5'] = b
    
    a, b = kmeans2(np.log1p(c), 128, iter=333)  
    df['x_6'] = b

    a, b = kmeans2(np.log1p(c), 150, iter=333)  
    df['x_7'] = b

    a, b = kmeans2(np.log1p(c), 256, iter=333)  
    df['x_8'] = b

    a, b = kmeans2(np.log1p(c), 512, iter=333)  
    df['x_9'] = b
    
    a, b = kmeans2(np.log1p(c), 1024, iter=333)  
    df['x_10'] = b

    
    # Categorical features with Binary encode (0 or 1; two categories)
    for bin_feature in ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']:
        df[bin_feature], uniques = pd.factorize(df[bin_feature])
            
    # Categorical features with One-Hot encode
    df, cat_cols = one_hot_encoder(df, nan_as_category)
    
    del test_df
    gc.collect()
    return df

In [4]:
df = application_train_test(num_rows=None, nan_as_category=False)

Train samples: 307511, test samples: 48744


  r = func(a, **kwargs)


In [5]:
gc.collect()

14

In [6]:
def bureau_and_balance(num_rows = None, nan_as_category = True):
    bureau = pd.read_csv('../data/bureau.csv', nrows = num_rows)
    bb = pd.read_csv('../data/bureau_balance.csv', nrows = num_rows)
    bb, bb_cat = one_hot_encoder(bb, nan_as_category)
    bureau, bureau_cat = one_hot_encoder(bureau, nan_as_category)
    
    # Bureau balance: Perform aggregations and merge with bureau.csv
    bb_aggregations = {'MONTHS_BALANCE': ['min', 'max', 'size', 'median']}
    for col in bb_cat:
        bb_aggregations[col] = ['median']
    bb_agg = bb.groupby('SK_ID_BUREAU').agg(bb_aggregations)
    bb_agg.columns = pd.Index([e[0] + "_" + e[1].upper() for e in bb_agg.columns.tolist()])
    bureau = bureau.join(bb_agg, how='left', on='SK_ID_BUREAU')
    bureau.drop(['SK_ID_BUREAU'], axis=1, inplace= True)
    del bb, bb_agg
    gc.collect()
    
    # Bureau and bureau_balance numeric features
    num_aggregations = {
        'DAYS_CREDIT': ['min', 'max', 'median', 'var'],
        'DAYS_CREDIT_ENDDATE': ['min', 'max', 'median'],
        'DAYS_CREDIT_UPDATE': ['median'],
        'CREDIT_DAY_OVERDUE': ['max', 'median'],
        'AMT_CREDIT_MAX_OVERDUE': ['median'],
        'AMT_CREDIT_SUM': ['max', 'median', 'sum'],
        'AMT_CREDIT_SUM_DEBT': ['max', 'median', 'sum'],
        'AMT_CREDIT_SUM_OVERDUE': ['median'],
        'AMT_CREDIT_SUM_LIMIT': ['median', 'sum'],
        'AMT_ANNUITY': ['max', 'median'],
        'CNT_CREDIT_PROLONG': ['sum'],
        'MONTHS_BALANCE_MIN': ['min', 'median'],
        'MONTHS_BALANCE_MAX': ['max', 'median'],
        'MONTHS_BALANCE_SIZE': ['median', 'sum']
    }
    # Bureau and bureau_balance categorical features
    cat_aggregations = {}
    for cat in bureau_cat: cat_aggregations[cat] = ['mean']
    for cat in bb_cat: cat_aggregations[cat + "_MEDIAN"] = ['mean']
    
    bureau_agg = bureau.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})
    bureau_agg.columns = pd.Index(['BURO_' + e[0] + "_" + e[1].upper() for e in bureau_agg.columns.tolist()])
    # Bureau: Active credits - using only numerical aggregations
    active = bureau[bureau['CREDIT_ACTIVE_Active'] == 1]
    active_agg = active.groupby('SK_ID_CURR').agg(num_aggregations)
    cols = active_agg.columns.tolist()
    active_agg.columns = pd.Index(['ACTIVE_' + e[0] + "_" + e[1].upper() for e in active_agg.columns.tolist()])
    bureau_agg = bureau_agg.join(active_agg, how='left', on='SK_ID_CURR')
    del active, active_agg
    gc.collect()
    # Bureau: Closed credits - using only numerical aggregations
    closed = bureau[bureau['CREDIT_ACTIVE_Closed'] == 1]
    closed_agg = closed.groupby('SK_ID_CURR').agg(num_aggregations)
    closed_agg.columns = pd.Index(['CLOSED_' + e[0] + "_" + e[1].upper() for e in closed_agg.columns.tolist()])
    bureau_agg = bureau_agg.join(closed_agg, how='left', on='SK_ID_CURR')
    
    for e in cols:
        bureau_agg['NEW_RATIO_BURO_' + e[0] + "_" + e[1].upper()] = bureau_agg['ACTIVE_' + e[0] + "_" + e[1].upper()] / bureau_agg['CLOSED_' + e[0] + "_" + e[1].upper()]
    
    del closed, closed_agg, bureau
    gc.collect()
    return bureau_agg

In [7]:
bureau = bureau_and_balance(num_rows)

In [8]:
df = df.join(bureau, how='left', on='SK_ID_CURR')
del bureau
gc.collect()

7

In [9]:
def previous_applications(num_rows=None, nan_as_category=True):
    prev = pd.read_csv('../data/previous_application.csv', nrows = num_rows)
    prev, cat_cols = one_hot_encoder(prev, nan_as_category= True)
    # Days 365.243 values -> nan
    prev['DAYS_FIRST_DRAWING'].replace(365243, np.nan, inplace= True)
    prev['DAYS_FIRST_DUE'].replace(365243, np.nan, inplace= True)
    prev['DAYS_LAST_DUE_1ST_VERSION'].replace(365243, np.nan, inplace= True)
    prev['DAYS_LAST_DUE'].replace(365243, np.nan, inplace= True)
    prev['DAYS_TERMINATION'].replace(365243, np.nan, inplace= True)
    # Add feature: value ask / value received percentage
    prev['APP_CREDIT_PERC'] = prev['AMT_APPLICATION'] / prev['AMT_CREDIT']
    # Previous applications numeric features
    num_aggregations = {
        'AMT_ANNUITY': ['min', 'max', 'median'],
        'AMT_APPLICATION': ['min', 'max', 'median'],
        'AMT_CREDIT': ['min', 'max', 'median'],
        'APP_CREDIT_PERC': ['min', 'max', 'median', 'var'],
        'AMT_DOWN_PAYMENT': ['min', 'max', 'median'],
        'AMT_GOODS_PRICE': ['min', 'max', 'median'],
        'HOUR_APPR_PROCESS_START': ['min', 'max', 'median'],
        'RATE_DOWN_PAYMENT': ['min', 'max', 'median'],
        'DAYS_DECISION': ['min', 'max', 'median'],
        'CNT_PAYMENT': ['median', 'sum'],
    }
    # Previous applications categorical features
    cat_aggregations = {}
    for cat in cat_cols:
        cat_aggregations[cat] = ['mean']
    
    prev_agg = prev.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})
    prev_agg.columns = pd.Index(['PREV_' + e[0] + "_" + e[1].upper() for e in prev_agg.columns.tolist()])
    # Previous Applications: Approved Applications - only numerical features
    approved = prev[prev['NAME_CONTRACT_STATUS_Approved'] == 1]
    approved_agg = approved.groupby('SK_ID_CURR').agg(num_aggregations)
    cols = approved_agg.columns.tolist()
    approved_agg.columns = pd.Index(['APPROVED_' + e[0] + "_" + e[1].upper() for e in approved_agg.columns.tolist()])
    prev_agg = prev_agg.join(approved_agg, how='left', on='SK_ID_CURR')
    # Previous Applications: Refused Applications - only numerical features
    refused = prev[prev['NAME_CONTRACT_STATUS_Refused'] == 1]
    refused_agg = refused.groupby('SK_ID_CURR').agg(num_aggregations)
    refused_agg.columns = pd.Index(['REFUSED_' + e[0] + "_" + e[1].upper() for e in refused_agg.columns.tolist()])
    prev_agg = prev_agg.join(refused_agg, how='left', on='SK_ID_CURR')
    del refused, refused_agg, approved, approved_agg, prev
    
    for e in cols:
        prev_agg['NEW_RATIO_PREV_' + e[0] + "_" + e[1].upper()] = prev_agg['APPROVED_' + e[0] + "_" + e[1].upper()] / prev_agg['REFUSED_' + e[0] + "_" + e[1].upper()]
    
    gc.collect()
    return prev_agg

# Preprocess POS_CASH_balance.csv
def pos_cash(num_rows = None, nan_as_category = True):
    pos = pd.read_csv('../data/POS_CASH_balance.csv', nrows = num_rows)
    pos, cat_cols = one_hot_encoder(pos, nan_as_category= True)
    # Features
    aggregations = {
        'MONTHS_BALANCE': ['max', 'median', 'size'],
        'SK_DPD': ['max', 'median'],
        'SK_DPD_DEF': ['max', 'median']
    }
    for cat in cat_cols:
        aggregations[cat] = ['mean']
    
    pos_agg = pos.groupby('SK_ID_CURR').agg(aggregations)
    pos_agg.columns = pd.Index(['POS_' + e[0] + "_" + e[1].upper() for e in pos_agg.columns.tolist()])
    # Count pos cash accounts
    pos_agg['POS_COUNT'] = pos.groupby('SK_ID_CURR').size()
    del pos
    gc.collect()
    return pos_agg
    
# Preprocess installments_payments.csv
def installments_payments(num_rows = None, nan_as_category = True):
    ins = pd.read_csv('/media/limbo/Home-Credit/data/installments_payments.csv', nrows = num_rows)
    ins, cat_cols = one_hot_encoder(ins, nan_as_category= True)
    # Percentage and difference paid in each installment (amount paid and installment value)
    ins['PAYMENT_PERC'] = ins['AMT_PAYMENT'] / ins['AMT_INSTALMENT']
    ins['PAYMENT_DIFF'] = ins['AMT_INSTALMENT'] - ins['AMT_PAYMENT']
    
    ins['PAYMENT_PERC_median'] = ins['PAYMENT_PERC'] - ins['PAYMENT_PERC'].replace([np.inf, -np.inf], np.nan).fillna(ins['PAYMENT_PERC'].dropna().median()).median()
    ins['PAYMENT_PERC_MEDIAN'] = ins['PAYMENT_PERC'] - ins['PAYMENT_PERC'].replace([np.inf, -np.inf], np.nan).fillna(ins['PAYMENT_PERC'].dropna().median()).median()
    
    ins['PAYMENT_DIFF_median'] = ins['PAYMENT_DIFF'] - ins['PAYMENT_DIFF'].replace([np.inf, -np.inf], np.nan).fillna(ins['PAYMENT_DIFF'].dropna().median()).median()
    ins['PAYMENT_DIFF_MEDIAN'] = ins['PAYMENT_DIFF'] - ins['PAYMENT_DIFF'].replace([np.inf, -np.inf], np.nan).fillna(ins['PAYMENT_DIFF'].dropna().median()).median()
    
    
    ins['pay_0'] = descretize(ins['PAYMENT_PERC'].values, 2 ** 6)
    ins['pay_1'] = descretize(ins['PAYMENT_DIFF'].values, 2 ** 6)
    ins['pay_2'] = descretize(ins['PAYMENT_PERC_MEDIAN'].values, 2 ** 6)
    ins['pay_3'] = descretize(ins['PAYMENT_PERC_MEDIAN'].values, 2 ** 6)
    ins['pay_4'] = descretize(ins['PAYMENT_DIFF_median'].values, 2 ** 6)
    ins['pay_5'] = descretize(ins['PAYMENT_DIFF_MEDIAN'].values, 2 ** 6)
    
    # Days past due and days before due (no negative values)
    ins['DPD'] = ins['DAYS_ENTRY_PAYMENT'] - ins['DAYS_INSTALMENT']
    ins['DBD'] = ins['DAYS_INSTALMENT'] - ins['DAYS_ENTRY_PAYMENT']
    ins['DPD'] = ins['DPD'].apply(lambda x: x if x > 0 else 0)
    ins['DBD'] = ins['DBD'].apply(lambda x: x if x > 0 else 0)
    
    
    ins['day_0'] = descretize(ins['DAYS_ENTRY_PAYMENT'].values, 2 ** 6)
    ins['day_1'] = descretize(ins['DAYS_INSTALMENT'].values, 2 ** 6)
    ins['day_2'] = descretize(ins['DBD'].values, 2 ** 6)
    
    
    # Features: Perform aggregations
    aggregations = {
        'NUM_INSTALMENT_VERSION': ['nunique'],
        'DPD': ['max', 'median', 'sum'],
        'DBD': ['max', 'median', 'sum'],
        'PAYMENT_PERC': ['max', 'median', 'sum', 'var'],
        'PAYMENT_DIFF': ['max', 'median', 'sum', 'var'],
        'AMT_INSTALMENT': ['max', 'median', 'sum'],
        'AMT_PAYMENT': ['min', 'max', 'median', 'sum'],
        'DAYS_ENTRY_PAYMENT': ['max', 'median', 'sum']
    }
    for cat in cat_cols:
        aggregations[cat] = ['median']
    ins_agg = ins.groupby('SK_ID_CURR').agg(aggregations)
    ins_agg.columns = pd.Index(['INSTAL_' + e[0] + "_" + e[1].upper() for e in ins_agg.columns.tolist()])
    # Count installments accounts
    ins_agg['INSTAL_COUNT'] = ins.groupby('SK_ID_CURR').size()
    del ins
    gc.collect()
    return ins_agg


# Preprocess credit_card_balance.csv
def credit_card_balance(num_rows = None, nan_as_category = True):
    cc = pd.read_csv('../data/credit_card_balance.csv', nrows = num_rows)
    cc, cat_cols = one_hot_encoder(cc, nan_as_category= True)
    # General aggregations
    cc.drop(['SK_ID_PREV'], axis= 1, inplace = True)
    cc_agg = cc.groupby('SK_ID_CURR').agg(['min', 'max', 'mean', 'sum', 'var'])
    cc_agg.columns = pd.Index(['CC_' + e[0] + "_" + e[1].upper() for e in cc_agg.columns.tolist()])
    # Count credit card lines
    cc_agg['CC_COUNT'] = cc.groupby('SK_ID_CURR').size()
    del cc
    gc.collect()
    return cc_agg


In [10]:
prev = previous_applications(num_rows)
print("Previous applications df shape:", prev.shape)
df = df.join(prev, how='left', on='SK_ID_CURR')
del prev
gc.collect()

pos = pos_cash(num_rows)
print("Pos-cash balance df shape:", pos.shape)
df = df.join(pos, how='left', on='SK_ID_CURR')
del pos
gc.collect()

ins = installments_payments(num_rows)
print("Installments payments df shape:", ins.shape)
df = df.join(ins, how='left', on='SK_ID_CURR')
del ins
gc.collect()



Previous applications df shape: (338857, 279)
Pos-cash balance df shape: (337252, 18)
Installments payments df shape: (339587, 26)


7

In [11]:
cc = credit_card_balance(num_rows)
print("Credit card balance df shape:", cc.shape)
df = df.join(cc, how='left', on='SK_ID_CURR')
del cc
gc.collect()

print(df.shape)


Credit card balance df shape: (103558, 141)
(356255, 1518)


In [12]:
df.head()

Unnamed: 0,index,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_WEEK,...,CC_NAME_CONTRACT_STATUS_Signed_MAX,CC_NAME_CONTRACT_STATUS_Signed_MEAN,CC_NAME_CONTRACT_STATUS_Signed_SUM,CC_NAME_CONTRACT_STATUS_Signed_VAR,CC_NAME_CONTRACT_STATUS_nan_MIN,CC_NAME_CONTRACT_STATUS_nan_MAX,CC_NAME_CONTRACT_STATUS_nan_MEAN,CC_NAME_CONTRACT_STATUS_nan_SUM,CC_NAME_CONTRACT_STATUS_nan_VAR,CC_COUNT
0,0,24700.5,406597.5,351000.0,202500.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
1,1,35698.5,1293502.5,1129500.0,270000.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2,2,6750.0,135000.0,135000.0,67500.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
3,3,29686.5,312682.5,297000.0,135000.0,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
4,4,21865.5,513000.0,513000.0,121500.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,


In [36]:
test_file_path = "Level_1_stack/test_adab-0.csv"
validation_file_path = 'Level_1_stack/validation_adab-0.csv'
num_folds = 5

In [14]:
imputer = Imputer(missing_values='NaN', strategy='median', axis=0, verbose=0, copy=True)

In [15]:
gc.collect()

0

In [16]:
#df = pd.concat([df.iloc[0:20000], df.iloc[320000:]], axis=0)

In [17]:
df = pd.DataFrame(imputer.fit_transform(df.values), index=df.index, columns=df.columns)

In [18]:
df.shape

(356255, 1518)

In [19]:
train = pd.read_csv('/media/limbo/Home-Credit/data/application_train.csv.zip', nrows= num_rows)
n_train = train.shape[0]

In [None]:
# train_df = df[df['TARGET'].notnull()]
# test_df = df[df['TARGET'].isnull()]

train_df = df.iloc[0:n_train]
test_df = df.iloc[n_train:]

val_df = train_df[['SK_ID_CURR', 'TARGET']].copy()

print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
gc.collect()
# Cross validation model
folds = KFold(n_splits=num_folds, shuffle=True, random_state=1001)
# Create arrays and dataframes to store results
oof_preds = np.zeros(train_df.shape[0])
sub_preds = np.zeros(test_df.shape[0])
feature_importance_df = pd.DataFrame()
feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]

print(pd.isnull(train_df[feats]).sum())


train_df = train_df.replace([np.inf, -np.inf], np.nan).fillna(0)
test_df = test_df.replace([np.inf, -np.inf], np.nan).fillna(0)
#train_df = pd.DataFrame(imputer.fit_transform(train_df.replace([np.inf, -np.inf], np.nan)), columns=train_df.columns, index=train_df.index)
#test_df = pd.DataFrame(imputer.fit_transform(test_df.replace([np.inf, -np.inf], np.nan)), columns=test_df.columns, index=test_df.index)

v_threshold = 0.03
vt = VarianceThreshold(v_threshold)
vt.fit(train_df[feats])


x_train = vt.transform(train_df[feats])
x_test = vt.transform(test_df[feats])


print(train_df.shape, x_train.shape)


for n_fold, (train_idx, valid_idx) in enumerate(folds.split(x_train, train_df['TARGET'])):
    
    model = AdaBoostClassifier(base_estimator=None, n_estimators=150, learning_rate=0.3, 
                               algorithm='SAMME.R', random_state=666)
   
    model.fit(x_train[train_idx], train_df['TARGET'].iloc[train_idx].values)
    oof_preds[valid_idx] = model.predict_proba(x_train[valid_idx])[:, 1]
    sub_preds += model.predict_proba(x_test)[:, 1] / folds.n_splits

    print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(train_df['TARGET'].iloc[valid_idx].values, oof_preds[valid_idx])))
    gc.collect()

print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))
# Write submission file and plot feature importance

sub_df = test_df[['SK_ID_CURR']].copy()
sub_df['TARGET'] = sub_preds
sub_df[['SK_ID_CURR', 'TARGET']].to_csv(test_file_path, index= False)


val_df['TARGET'] = oof_preds
val_df[['SK_ID_CURR', 'TARGET']].to_csv(validation_file_path, index= False)

gc.collect()

Starting LightGBM. Train shape: (307511, 1518), test shape: (48744, 1518)
AMT_ANNUITY                                   0
AMT_CREDIT                                    0
AMT_GOODS_PRICE                               0
AMT_INCOME_TOTAL                              0
AMT_REQ_CREDIT_BUREAU_DAY                     0
AMT_REQ_CREDIT_BUREAU_HOUR                    0
AMT_REQ_CREDIT_BUREAU_MON                     0
AMT_REQ_CREDIT_BUREAU_QRT                     0
AMT_REQ_CREDIT_BUREAU_WEEK                    0
AMT_REQ_CREDIT_BUREAU_YEAR                    0
APARTMENTS_AVG                                0
APARTMENTS_MEDI                               0
APARTMENTS_MODE                               0
BASEMENTAREA_AVG                              0
BASEMENTAREA_MEDI                             0
BASEMENTAREA_MODE                             0
CNT_CHILDREN                                  0
CNT_FAM_MEMBERS                               0
CODE_GENDER                                   0
COMMONAREA_AVG