In [4]:
import numpy as np
import pandas as pd
import gc
import time
import category_encoders as ce
from contextlib import contextmanager
import lightgbm as lgb
import xgboost as xgb
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import category_encoders as ce
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from scipy.cluster.vq import kmeans2, whiten
from sklearn.decomposition import truncated_svd
from catboost import CatBoostClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

num_rows = None

In [5]:
application_train = pd.read_csv("../data/application_train.csv")
application_test = pd.read_csv("../data/application_test.csv")
POS_CASH = pd.read_csv('../data/POS_CASH_balance.csv')
credit_card = pd.read_csv('../data/credit_card_balance.csv')
bureau = pd.read_csv('../data/bureau.csv')
previous_app = pd.read_csv('../data/previous_application.csv')
subm = pd.read_csv("../data/sample_submission.csv")


print("Converting...")
le = LabelEncoder()
POS_CASH['NAME_CONTRACT_STATUS'] = \
    le.fit_transform(POS_CASH['NAME_CONTRACT_STATUS'].astype(str))
nunique_status = \
    POS_CASH[['SK_ID_CURR', 'NAME_CONTRACT_STATUS']].groupby('SK_ID_CURR')\
        .nunique()[['NAME_CONTRACT_STATUS']]\
        .rename(columns={'NAME_CONTRACT_STATUS': 'NUNIQUE_STATUS_POS_CASH'})
nunique_status.reset_index(inplace=True)
POS_CASH = POS_CASH.merge(nunique_status, how='left', on='SK_ID_CURR')
POS_CASH.drop(['SK_ID_PREV', 'NAME_CONTRACT_STATUS'], axis=1, inplace=True)

credit_card['NAME_CONTRACT_STATUS'] = \
    le.fit_transform(credit_card['NAME_CONTRACT_STATUS'].astype(str))
nunique_status = \
    credit_card[['SK_ID_CURR', 'NAME_CONTRACT_STATUS']].groupby('SK_ID_CURR')\
        .nunique()[['NAME_CONTRACT_STATUS']]\
        .rename(columns={'NAME_CONTRACT_STATUS': 'NUNIQUE_STATUS_credit_card'})
nunique_status.reset_index(inplace=True)
credit_card = credit_card.merge(nunique_status, how='left', on='SK_ID_CURR')
credit_card.drop(['SK_ID_PREV', 'NAME_CONTRACT_STATUS'], axis=1, inplace=True)

bureau_cat_features = [f for f in bureau.columns if bureau[f].dtype == 'object']
for f in bureau_cat_features:
    bureau[f] = le.fit_transform(bureau[f].astype(str))
    nunique = bureau[['SK_ID_CURR', f]].groupby('SK_ID_CURR').nunique()[[f]]\
        .rename(columns={f: 'NUNIQUE_'+f+'_bureau'})
    nunique.reset_index(inplace=True)
    bureau = bureau.merge(nunique, how='left', on='SK_ID_CURR')
    bureau.drop([f], axis=1, inplace=True)
bureau.drop(['SK_ID_BUREAU'], axis=1, inplace=True)

previous_app_cat_features = [f for f in previous_app.columns if previous_app[f].dtype == 'object']
for f in previous_app_cat_features:
    previous_app[f] = le.fit_transform(previous_app[f].astype(str))
    nunique = previous_app[['SK_ID_CURR', f]].groupby('SK_ID_CURR').nunique()[[f]]\
        .rename(columns={f: 'NUNIQUE_'+f+'_previous_app'})
    nunique.reset_index(inplace=True)
    previous_app = previous_app.merge(nunique, how='left', on='SK_ID_CURR')
    previous_app.drop([f], axis=1, inplace=True)
previous_app.drop(['SK_ID_PREV'], axis=1, inplace=True)

print("Merging...")
data_train = application_train.merge(POS_CASH.groupby('SK_ID_CURR').mean().reset_index(),
                                                             how='left', on='SK_ID_CURR')
data_test = application_test.merge(POS_CASH.groupby('SK_ID_CURR').mean().reset_index(),
                                                           how='left', on='SK_ID_CURR')

data_train = data_train.merge(credit_card.groupby('SK_ID_CURR').mean().reset_index(),
                                                         how='left', on='SK_ID_CURR')
data_test = data_test.merge(credit_card.groupby('SK_ID_CURR').mean().reset_index(),
                                                       how='left', on='SK_ID_CURR')
                                                       
data_train = data_train.merge(bureau.groupby('SK_ID_CURR').mean().reset_index(),
                                                    how='left', on='SK_ID_CURR')
data_test = data_test.merge(bureau.groupby('SK_ID_CURR').mean().reset_index(),
                                                  how='left', on='SK_ID_CURR')
                                                  
data_train = data_train.merge(previous_app.groupby('SK_ID_CURR').mean().reset_index(),
                                                          how='left', on='SK_ID_CURR')
data_test = data_test.merge(previous_app.groupby('SK_ID_CURR').mean().reset_index(),
                                                        how='left', on='SK_ID_CURR')

Converting...
Merging...


In [16]:
test_file_path = "Level_1_stack/test_catb-0.csv"
validation_file_path = 'Level_1_stack/validation_catb-0.csv'
num_folds = 5

In [7]:
train = pd.read_csv('/media/limbo/Home-Credit/data/application_train.csv.zip', nrows= num_rows)
n_train = train.shape[0]

In [None]:
cat_features = [f for f in data_train.columns if data_train[f].dtype == 'object']
def column_index(df, query_cols):
    cols = df.columns.values
    sidx = np.argsort(cols)
    return sidx[np.searchsorted(cols, query_cols, sorter=sidx)]
cat_features_inds = column_index(data_train, cat_features)    
print("Cat features are: %s" % [f for f in cat_features])
print(cat_features_inds)

for col in cat_features:
    data_train[col] = le.fit_transform(data_train[col].astype(str))
    data_test[col] = le.fit_transform(data_test[col].astype(str))
    
data_train.fillna(-1, inplace=True)
data_test.fillna(-1, inplace=True)
cols = data_train.columns

In [9]:
train_df = data_train
test_df = data_train

In [21]:
encoding = 'ohe'



print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
gc.collect()
# Cross validation model
folds = KFold(n_splits=num_folds, shuffle=True, random_state=1001)
# Create arrays and dataframes to store results
oof_preds = np.zeros(train_df.shape[0])
sub_preds = np.zeros(test_df.shape[0])

feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]

#feats = [col for col in feats_0 if df[col].dtype == 'object']

cat_features = [f for f in train_df[feats].columns if train_df[f].dtype == 'object']

print(train_df[feats].shape)
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        
      
        categorical_columns = [col for col in train_df.columns if train_df[col].dtype == 'object']
        
        if encoding == 'ohe':
            
            enc = ce.OneHotEncoder(impute_missing=True, cols=categorical_columns).fit(train_df[feats].iloc[train_idx],
                                                                                       train_df['TARGET'].iloc[train_idx])
            x_train = enc.transform(train_df[feats].iloc[train_idx])
            x_valid = enc.transform(train_df[feats].iloc[valid_idx])
            x_test = enc.transform(test_df[feats])
            print(x_train.shape, x_valid.shape, x_test.shape)

        print("\nCatBoost...")                                     
        clf = CatBoostClassifier(iterations=1000,
                              learning_rate=0.1,
                              depth=7,
                              l2_leaf_reg=40,
                              bootstrap_type='Bernoulli',
                              subsample=0.9,
                              rsm=0.75,
                              scale_pos_weight=5,
                              eval_metric='AUC',
                              metric_period=50,
                              od_type='Iter',
                              od_wait=45,
                              random_seed=17,
                              allow_writing_files=False)

        clf.fit(x_train, train_df['TARGET'].iloc[train_idx].values, eval_set=(x_valid, train_df['TARGET'].iloc[valid_idx].values)
                , cat_features=[], use_best_model=True, verbose=True)

        oof_preds[valid_idx] = clf.predict_proba(x_valid)[:, 1]
        sub_preds += clf.predict_proba(x_test)[:, 1] / folds.n_splits

        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(train_df['TARGET'].iloc[valid_idx].values, oof_preds[valid_idx])))
        del clf
        gc.collect()

# print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'].iloc[train_idx].values, oof_preds))
# # Write submission file and plot feature importance

sub_df = test_df[['SK_ID_CURR']].copy()
sub_df['TARGET'] = sub_preds
sub_df[['SK_ID_CURR', 'TARGET']].to_csv(test_file_path, index= False)


val_df = train_df[['SK_ID_CURR', 'TARGET']].copy()
val_df['TARGET'] = oof_preds
val_df[['SK_ID_CURR', 'TARGET']].to_csv(validation_file_path, index= False)        
            
gc.collect()

Starting LightGBM. Train shape: (307511, 199), test shape: (307511, 199)
(307511, 197)
(246008, 197) (61503, 197) (307511, 197)

CatBoost...
0:	test: 0.7084029	best: 0.7084029 (0)	total: 245ms	remaining: 4m 4s
50:	test: 0.7678743	best: 0.7678743 (50)	total: 12.8s	remaining: 3m 58s
100:	test: 0.7750154	best: 0.7750154 (100)	total: 25.4s	remaining: 3m 46s
150:	test: 0.7782168	best: 0.7782168 (150)	total: 38s	remaining: 3m 33s
200:	test: 0.7797442	best: 0.7797442 (200)	total: 50.6s	remaining: 3m 21s
250:	test: 0.7803331	best: 0.7803331 (250)	total: 1m 3s	remaining: 3m 8s
300:	test: 0.7810142	best: 0.7810142 (300)	total: 1m 15s	remaining: 2m 56s
350:	test: 0.7812221	best: 0.7812221 (350)	total: 1m 28s	remaining: 2m 43s
Stopped by overfitting detector  (45 iterations wait)

bestTest = 0.7812220604
bestIteration = 350

Shrink model to first 351 iterations.
Fold  1 AUC : 0.781222
(246009, 197) (61502, 197) (307511, 197)

CatBoost...
0:	test: 0.7083629	best: 0.7083629 (0)	total: 248ms	remainin

28

In [None]:
gc.collect()