In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from scipy.stats import chi2
import statsmodels.discrete.discrete_model as dm_

# Raise pandas' display limits (rows, columns, line width) for this notebook.
for _option_name, _option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(_option_name, _option_value)

%matplotlib inline

from preprocessor import *
from feature_engineering import *

import toad

In [2]:
# Directory holding the preprocessed input tables. The trailing slash matters:
# file names are appended with plain string concatenation.
data_path = '../../data/home-credit-default-risk/'


def _read_table(file_name):
    """Read the CSV file ``file_name`` located under ``data_path``."""
    return pd.read_csv(data_path + file_name)


# One frame per preprocessed source table.
application_train_ft = _read_table('application_train_processed.csv')
bureau_ft = _read_table('bureau_preprocessed.csv')
bureau_balance_ft = _read_table('bureau_balance_preprocessed.csv')
credit_card_ft = _read_table('credit_card_balance_preprocessed.csv')
installment_ft = _read_table('installment_payments_preprocessed.csv')
pos_card_ft = _read_table('pos_cash_balance_preprocessed.csv')
prev_application_ft = _read_table('previous_application_preprocessed.csv')


# Application features

In [3]:
# Ratio features: each entry is (new column, numerator column, denominator column).
# The new columns are written onto application_train_ft in place, in this order.
_ratio_specs = (
    ('DAYS_EMPLOYED_PERC', 'DAYS_EMPLOYED', 'DAYS_BIRTH'),
    ('INCOME_CREDIT_PERC', 'AMT_INCOME_TOTAL', 'AMT_CREDIT'),
    ('INCOME_PER_PERSON', 'AMT_INCOME_TOTAL', 'CNT_FAM_MEMBERS'),
    ('ANNUITY_INCOME_PERC', 'AMT_ANNUITY', 'AMT_INCOME_TOTAL'),
    ('PAYMENT_RATE', 'AMT_ANNUITY', 'AMT_CREDIT'),
)
for _feature, _numerator, _denominator in _ratio_specs:
    application_train_ft[_feature] = (
        application_train_ft[_numerator] / application_train_ft[_denominator]
    )

## The following creates two features: id_renewal_days_delay, income_per_person

In [4]:
# fe_application comes from one of the wildcard imports at the top of the notebook.
# Its result is unpacked into the replacement application frame and a second value
# that is presumably the list of feature names it added -- confirm in the helper.
_fe_application_result = fe_application(application_train_ft)
application_train_ft, new_application_fts = _fe_application_result

In [5]:
# df_ft is the running feature table that the following cells extend via left
# merges on SK_ID_CURR. It starts as an alias of application_train_ft (no copy).
df_ft = application_train_ft

# Bureau features

In [6]:
# Mean of every numeric bureau column per SK_ID_CURR.
# numeric_only=True keeps this working on pandas >= 2.0, where mean() raises a
# TypeError on non-numeric columns instead of silently dropping them.
bureau_avg = bureau_ft.groupby('SK_ID_CURR').mean(numeric_only=True)

# Number of bureau rows (non-null SK_ID_BUREAU) per SK_ID_CURR; assignment aligns
# on the SK_ID_CURR index shared with bureau_avg.
bureau_avg['buro_count'] = bureau_ft.groupby('SK_ID_CURR')['SK_ID_BUREAU'].count()

# Prefix the aggregated columns with 'b_' before merging. The merge key lives in
# the index at this point, so it is not renamed.
bureau_avg.columns = ['b_' + f_ for f_ in bureau_avg.columns]
df_ft = df_ft.merge(right=bureau_avg.reset_index(), how='left', on='SK_ID_CURR')


## The following creates two features: bureau_debt_credit_ratio, bureau_average_of_past_loans_per_type

In [7]:
# fe_bureau is a project helper; its two results are used here as a frame and a
# list of column names to pick from that frame.
bureau_new_ft, new_bureau_fts = fe_bureau(bureau_ft)

# Keep only the new feature columns plus the merge key.
# NOTE(review): the left join assumes bureau_new_ft has one row per SK_ID_CURR;
# if fe_bureau returns row-level data this will duplicate df_ft rows -- confirm.
_bureau_merge_cols = new_bureau_fts + ['SK_ID_CURR']
df_ft = pd.merge(df_ft, bureau_new_ft[_bureau_merge_cols], how='left', on='SK_ID_CURR')

# Credit card features

In [8]:
# Number of credit-card rows (non-null SK_ID_PREV) per SK_ID_CURR.
nb_prevs = credit_card_ft[['SK_ID_CURR', 'SK_ID_PREV']].groupby('SK_ID_CURR').count()

# Replace SK_ID_PREV with that count on a derived frame only. credit_card_ft is
# passed to fe_credit_card in the next cell, so its SK_ID_PREV column must keep
# the real identifiers rather than being overwritten by a row count.
_credit_card_for_avg = credit_card_ft.assign(
    SK_ID_PREV=credit_card_ft['SK_ID_CURR'].map(nb_prevs['SK_ID_PREV'])
)

# Average of all numeric columns per SK_ID_CURR. numeric_only=True avoids the
# TypeError that pandas >= 2.0 raises when a non-numeric column is present.
avg_cc_bal = _credit_card_for_avg.groupby('SK_ID_CURR').mean(numeric_only=True)
del _credit_card_for_avg  # release the temporary copy

avg_cc_bal.columns = ['cc_bal_' + f_ for f_ in avg_cc_bal.columns]
df_ft = df_ft.merge(right=avg_cc_bal.reset_index(), how='left', on='SK_ID_CURR')

## The following creates features: credit_card_total_installments, credit_card_number_of_loans, credit_card_installments_per_loan, credit_card_drawings_atm, credit_card_drawings_total, credit_card_cash_card_ratio, credit_card_monthly_diff

In [9]:
# fe_credit_card is a project helper; its two results are used here as a frame
# and a list of column names to pick from that frame.
credti_card_new_ft, new_credit_fts = fe_credit_card(credit_card_ft)

# Keep only the new feature columns plus the merge key.
# NOTE(review): the left join assumes one row per SK_ID_CURR in the helper's
# output; otherwise df_ft rows get duplicated -- confirm.
_credit_merge_cols = new_credit_fts + ['SK_ID_CURR']
df_ft = pd.merge(df_ft, credti_card_new_ft[_credit_merge_cols], how='left', on='SK_ID_CURR')

# POS cash features

In [10]:
# Number of POS-cash rows (non-null SK_ID_PREV) per SK_ID_CURR.
pos_count = pos_card_ft[['SK_ID_CURR', 'SK_ID_PREV']].groupby('SK_ID_CURR').count()

# Replace SK_ID_PREV with that count on a derived frame only. pos_card_ft is
# passed to fe_pos_cash in the next cell, so its SK_ID_PREV column must keep the
# real identifiers rather than being overwritten by a row count.
_pos_for_avg = pos_card_ft.assign(
    SK_ID_PREV=pos_card_ft['SK_ID_CURR'].map(pos_count['SK_ID_PREV'])
)

# Average of all numeric POS-cash columns per SK_ID_CURR. numeric_only=True
# avoids the TypeError that pandas >= 2.0 raises on non-numeric columns.
pos_avg = _pos_for_avg.groupby('SK_ID_CURR').mean(numeric_only=True)
del _pos_for_avg  # release the temporary copy

# NOTE(review): unlike the bureau / credit-card / installment / previous-application
# aggregates, these columns are merged without a prefix, so df_ft receives raw
# names such as SK_ID_PREV. Left as is to keep output column names stable.
df_ft = df_ft.merge(right=pos_avg.reset_index(), how='left', on='SK_ID_CURR')

## The following creates features: pos_cash_paid_late_12month_cnt, SK_DPD_12month_sum, pos_cash_paid_late_last_cnt, SK_DPD_last_sum

In [11]:
# fe_pos_cash is a project helper called with period=12; its two results are
# used here as a frame and a list of column names to pick from that frame.
pos_card_new_ft, new_pos_fts = fe_pos_cash(pos_card_ft, period=12)

# Keep only the new feature columns plus the merge key.
# NOTE(review): the left join assumes one row per SK_ID_CURR in the helper's
# output; otherwise df_ft rows get duplicated -- confirm.
_pos_merge_cols = new_pos_fts + ['SK_ID_CURR']
df_ft = pd.merge(df_ft, pos_card_new_ft[_pos_merge_cols], how='left', on='SK_ID_CURR')

# Installment features

In [12]:
# Days past due (DPD) and days before due (DBD), both floored at zero.
# Series.where keeps a difference where it is strictly positive and writes 0
# everywhere else. A NaN difference also becomes 0, because `NaN > 0` is False.
# This is vectorised, so no Python-level function call is made per row.
days_past_due = installment_ft['DAYS_ENTRY_PAYMENT'] - installment_ft['DAYS_INSTALMENT']
days_before_due = installment_ft['DAYS_INSTALMENT'] - installment_ft['DAYS_ENTRY_PAYMENT']
installment_ft['DPD'] = days_past_due.where(days_past_due > 0, 0)
installment_ft['DBD'] = days_before_due.where(days_before_due > 0, 0)

In [13]:
# Number of installment rows (non-null SK_ID_PREV) per SK_ID_CURR.
cnt_inst = installment_ft[['SK_ID_CURR', 'SK_ID_PREV']].groupby('SK_ID_CURR').count()

# Replace SK_ID_PREV with that count on a derived frame only. installment_ft is
# passed to fe_install in the next cell, so its SK_ID_PREV column must keep the
# real identifiers rather than being overwritten by a row count.
_installment_for_avg = installment_ft.assign(
    SK_ID_PREV=installment_ft['SK_ID_CURR'].map(cnt_inst['SK_ID_PREV'])
)

# Average of all numeric installment columns per SK_ID_CURR. numeric_only=True
# avoids the TypeError that pandas >= 2.0 raises on non-numeric columns.
avg_inst = _installment_for_avg.groupby('SK_ID_CURR').mean(numeric_only=True)
del _installment_for_avg  # release the temporary copy

avg_inst.columns = ['i_' + f_ for f_ in avg_inst.columns]
df_ft = df_ft.merge(right=avg_inst.reset_index(), how='left', on='SK_ID_CURR')

## The following creates features: installment_paid_late_30day_cnt, installment_paid_late_30day_sum, installment_paid_over_30day_cnt, installment_paid_over_30day_sum

In [14]:
# fe_install is a project helper called with period=30; its two results are used
# here as a frame and a list of column names to pick from that frame.
installment_new_ft, new_installment_fts = fe_install(installment_ft, period=30)

# Keep only the new feature columns plus the merge key.
# NOTE(review): the left join assumes one row per SK_ID_CURR in the helper's
# output; otherwise df_ft rows get duplicated -- confirm.
_installment_merge_cols = new_installment_fts + ['SK_ID_CURR']
df_ft = pd.merge(df_ft, installment_new_ft[_installment_merge_cols], how='left', on='SK_ID_CURR')

# Previous application features

In [15]:
# Number of previous-application rows (non-null SK_ID_PREV) per SK_ID_CURR.
prev_apps_count = prev_application_ft[['SK_ID_CURR', 'SK_ID_PREV']].groupby('SK_ID_CURR').count()

# Replace SK_ID_PREV with that count on a derived frame only, so the loaded
# prev_application_ft keeps its real identifiers and this cell stays re-runnable
# without depending on an already-overwritten column.
_prev_apps_for_avg = prev_application_ft.assign(
    SK_ID_PREV=prev_application_ft['SK_ID_CURR'].map(prev_apps_count['SK_ID_PREV'])
)

# Average of all numeric previous-application columns per SK_ID_CURR.
# numeric_only=True avoids the TypeError that pandas >= 2.0 raises on
# non-numeric columns.
prev_apps_avg = _prev_apps_for_avg.groupby('SK_ID_CURR').mean(numeric_only=True)
del _prev_apps_for_avg  # release the temporary copy

prev_apps_avg.columns = ['p_' + col for col in prev_apps_avg.columns]
df_ft = df_ft.merge(right=prev_apps_avg.reset_index(), how='left', on='SK_ID_CURR')

In [20]:
# Persist the assembled feature table in the same directory as the input data.
_features_path = data_path + 'features.pkl'
df_ft.to_pickle(_features_path)

In [18]:
from sklearn.model_selection import train_test_split

# Fixed seed so the 70/30 split, and everything fitted on it afterwards, is
# reproducible across kernel restarts.
SPLIT_SEED = 42
train, test = train_test_split(df_ft, test_size=0.3, random_state=SPLIT_SEED)

In [19]:
# Feature screening with toad on the training split. `empty`, `iv` and `corr` are
# the thresholds passed for missing rate, information value and correlation.
# SK_ID_CURR is excluded from screening, and return_drop=True also returns the
# record of what was dropped.
_selection_kwargs = dict(
    target='TARGET',
    empty=0.5,
    iv=0.02,
    corr=0.7,
    return_drop=True,
    exclude=['SK_ID_CURR'],
)
train_selected, dropped = toad.selection.select(train, **_selection_kwargs)

  ix, cn = np.where(np.triu(corr.values, 1) > threshold)
