# Gradient Boost Notebook

## 0. Loading Data & Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
from io import StringIO
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

## 1. Data Processing
### Notice: If this part has already been run, jump to Section 3 (Prediction Models) and then load the saved CSV file in Section 4.

In [None]:
# Load every raw CSV table from the local data/ directory.
application_train = pd.read_csv("data/application_train.csv")
application_test = pd.read_csv("data/application_test.csv")

bureau = pd.read_csv("data/bureau.csv")
bureau_balance = pd.read_csv("data/bureau_balance.csv")

previous_application = pd.read_csv("data/previous_application.csv")
pcb = pd.read_csv("data/POS_CASH_balance.csv")
installments_payments = pd.read_csv("data/installments_payments.csv")
credit_card_balance = pd.read_csv("data/credit_card_balance.csv")

In [None]:
# The test table has no label column; add an all-NaN TARGET so that both
# tables share the same columns and can be split apart again later with
# TARGET.isnull() / TARGET.notnull().
application_test['TARGET'] = np.nan

# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported way to stack the two tables.
app = pd.concat([application_train, application_test],
                ignore_index=True, sort=False)
app.shape

In [None]:
# Show up to 500 rows when a DataFrame/Series is displayed.
pd.options.display.max_rows = 500

## 2. Feature Engineering

Big thanks to Will Koehrsen and his [kernel](https://www.kaggle.com/willkoehrsen/clean-manual-feature-engineering) for the features and aggregation functions. 

In [None]:
def agg_numeric(df, parent_var, df_name):
    """
    Groups and aggregates the numeric values in a child dataframe
    by the parent variable.

    Parameters
    --------
        df (dataframe):
            the child dataframe to calculate the statistics on.
            It is not modified.
        parent_var (string):
            the parent variable used for grouping and aggregating
        df_name (string):
            the variable used to rename the columns

    Return
    --------
        agg (dataframe):
            a dataframe with count / mean / max / min / sum of every numeric
            column, aggregated by `parent_var`. Each observation of the parent
            variable has one row and `parent_var` is returned as a regular
            column. Statistic columns are named `<df_name>_<column>_<stat>`.
            Columns whose values duplicate another column are removed.

    Notes
    --------
        Only columns matched by `select_dtypes('number')` are aggregated;
        boolean columns are therefore ignored unless they are cast to int
        beforehand.

    """

    # Remove id variables other than grouping variable (single drop, so the
    # frame is not rebound while its columns are being iterated)
    other_ids = [col for col in df.columns
                 if col != parent_var and 'SK_ID' in col]
    df = df.drop(columns=other_ids)

    # Only want the numeric variables, plus the grouping variable itself
    numeric_df = df.select_dtypes('number').copy()
    numeric_df[parent_var] = df[parent_var]

    # Group by the specified variable and calculate the statistics
    agg = numeric_df.groupby(parent_var).agg(
        ['count', 'mean', 'max', 'min', 'sum'])

    # Flatten the (variable, stat) column tuples into single names. The names
    # are derived from the actual column tuples, so every label is guaranteed
    # to match the data in its column regardless of how the MultiIndex levels
    # happen to be ordered.
    agg.columns = ['%s_%s_%s' % (df_name, var, stat)
                   for var, stat in agg.columns]

    # Remove the columns with all redundant values. np.unique returns the
    # unique columns in sorted order, so the surviving columns are reordered.
    _, idx = np.unique(agg, axis=1, return_index=True)
    agg = agg.iloc[:, idx]

    return agg.reset_index()

In [None]:
def agg_categorical(df, parent_var, df_name):
    """
    Aggregates the categorical features in a child dataframe
    for each observation of the parent variable.

    Parameters
    --------
    df : dataframe
        The dataframe to calculate the value counts for.

    parent_var : string
        The variable by which to group and aggregate the dataframe. For each unique
        value of this variable, the final dataframe will have one row

    df_name : string
        Variable added to the front of column names to keep track of columns


    Return
    --------
    categorical : dataframe
        One row per value of `parent_var` (returned as a regular column) with
        the sum / count / mean of every one-hot encoded category, named
        `<df_name>_<dummy column>_<stat>`. Columns whose values duplicate
        another column are removed. If `df` has no object columns, only the
        sorted unique `parent_var` values are returned.

    """

    object_cols = df.select_dtypes('object')

    # Nothing to one-hot encode: return just the parent ids so that callers
    # can still merge on `parent_var` instead of failing on an empty groupby.
    if object_cols.shape[1] == 0:
        parent_ids = df[parent_var].dropna().drop_duplicates().sort_values()
        return parent_ids.to_frame().reset_index(drop=True)

    # One-hot encode the categorical columns
    categorical = pd.get_dummies(object_cols)

    # Make sure to put the identifying id on the column
    categorical[parent_var] = df[parent_var]

    # Groupby the group var and calculate the sum, count and mean
    categorical = categorical.groupby(parent_var).agg(['sum', 'count', 'mean'])

    # Flatten the (variable, stat) column tuples into single names, taken from
    # the actual columns so that each label always matches its data.
    categorical.columns = ['%s_%s_%s' % (df_name, var, stat)
                           for var, stat in categorical.columns]

    # Remove duplicate columns by values (np.unique keeps the first
    # occurrence and returns the survivors in sorted order)
    _, idx = np.unique(categorical, axis=1, return_index=True)
    categorical = categorical.iloc[:, idx]

    return categorical.reset_index()

In [None]:
def agg_child(df, parent_var, df_name):
    """
    Aggregate a child dataframe for each observation of the parent.

    The numeric aggregations (`agg_numeric`) and the categorical aggregations
    (`agg_categorical`) are outer-merged on `parent_var`, and columns whose
    values duplicate another column are dropped from the merged result.
    """

    # Numeric statistics first, then the categorical ones, joined on the id
    merged = agg_numeric(df, parent_var, df_name).merge(
        agg_categorical(df, parent_var, df_name),
        on=parent_var, how='outer')

    # Keep a single representative of every group of identical columns
    _, keep = np.unique(merged, axis=1, return_index=True)
    return merged.iloc[:, keep]

In [None]:
def agg_grandchild(df, parent_df, parent_var, grandparent_var, df_name):
    """
    Aggregate a grandchild dataframe at the grandparent level.

    The data is aggregated twice: first per `parent_var` (columns prefixed
    `<df_name>_LOAN`), then those results are aggregated again per
    `grandparent_var` (columns prefixed `<df_name>_CLIENT`).

    Parameters
    --------
        df : dataframe
            Data with each row representing one observation

        parent_df : dataframe
            Parent table of df; must contain `parent_var` and
            `grandparent_var`. Only these two columns are used, to attach
            the grandparent id to the parent-level aggregations.

        parent_var : string
            Variable identifying each observation in the parent.
            For example, `SK_ID_BUREAU` or `SK_ID_PREV`

        grandparent_var : string
            Variable identifying each observation in the grandparent.
            For example, `SK_ID_CURR`.

        df_name : string
            Prefix used when naming the resulting columns.

    Return
    --------
        df_info : dataframe
            One row per value of `grandparent_var`, which is returned as a
            regular column so the result can be merged onto the grandparent
            table. Columns whose values duplicate another column are removed.

    """

    loan_prefix = '%s_LOAN' % df_name
    client_prefix = '%s_CLIENT' % df_name

    # Lookup table from parent id to grandparent id
    id_map = parent_df[[parent_var, grandparent_var]].copy()

    # Numeric statistics per parent, tagged with the grandparent id, then
    # aggregated once more per grandparent
    numeric_by_parent = agg_numeric(df, parent_var, loan_prefix).merge(
        id_map, on=parent_var, how='left')
    df_info = agg_numeric(numeric_by_parent, grandparent_var, client_prefix)

    # One-hot encoding only applies when there are categorical columns
    if any(df.dtypes == 'object'):
        cat_by_parent = agg_categorical(df, parent_var, loan_prefix).merge(
            id_map, on=parent_var, how='left')

        # The parent-level category statistics are numeric, so they are
        # rolled up to the grandparent with the numeric aggregator
        cat_by_grandparent = agg_numeric(
            cat_by_parent, grandparent_var, client_prefix)
        df_info = df_info.merge(
            cat_by_grandparent, on=grandparent_var, how='outer')

    # Drop the columns with all duplicated values
    _, keep = np.unique(df_info, axis=1, return_index=True)
    return df_info.iloc[:, keep]

In [None]:
# BASELINE features

# Ratios between pairs of application columns
app['LOAN_RATE'] = app['AMT_ANNUITY'].div(app['AMT_CREDIT'])
app['CREDIT_INCOME_RATIO'] = app['AMT_CREDIT'].div(app['AMT_INCOME_TOTAL'])
app['EMPLOYED_BIRTH_RATIO'] = app['DAYS_EMPLOYED'].div(app['DAYS_BIRTH'])

# Pairwise and three-way products of the EXT_SOURCE columns
app['EXT_SOURCE_MULT_1'] = app['EXT_SOURCE_1'].mul(app['EXT_SOURCE_2'])
app['EXT_SOURCE_MULT_2'] = app['EXT_SOURCE_1'].mul(app['EXT_SOURCE_3'])
app['EXT_SOURCE_MULT_3'] = app['EXT_SOURCE_2'].mul(app['EXT_SOURCE_3'])
app['EXT_SOURCE_MULT_4'] = (
    app['EXT_SOURCE_1'].mul(app['EXT_SOURCE_2']).mul(app['EXT_SOURCE_3']))

# Cosine and sine of each EXT_SOURCE column
app['EXT_SOURCE_COS_1'] = np.cos(app['EXT_SOURCE_1'])
app['EXT_SOURCE_COS_2'] = np.cos(app['EXT_SOURCE_2'])
app['EXT_SOURCE_COS_3'] = np.cos(app['EXT_SOURCE_3'])

app['EXT_SOURCE_SIN_1'] = np.sin(app['EXT_SOURCE_1'])
app['EXT_SOURCE_SIN_2'] = np.sin(app['EXT_SOURCE_2'])
app['EXT_SOURCE_SIN_3'] = np.sin(app['EXT_SOURCE_3'])

# Row-wise sum and mean of the EXT_SOURCE columns
app['EXT_SOURCE_SUM'] = app.loc[
    :, ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].sum(axis=1)
app['EXT_SOURCE_MEAN'] = app.loc[
    :, ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)

# Row-wise sum over every column whose name contains 'AMT_REQ_'
app['AMT_REQ_SUM'] = app.filter(like='AMT_REQ_').sum(axis=1)

In [None]:
# Number of missing values per row. TARGET is excluded from the count: it was
# set to NaN for every test row when the train and test tables were combined,
# so counting it would add 1 to each test row and to no training row.
app['app missing'] = (
    app.isnull().sum(axis=1) - app['TARGET'].isnull().astype(int)).values
# Row-wise statistics of the three EXT_SOURCE columns
app['app EXT_SOURCE mean'] = app.loc[
    :, ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)
app['app EXT_SOURCE std'] = app.loc[
    :, ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].std(axis=1)
app['app EXT_SOURCE prod'] = (
    app['EXT_SOURCE_1'].mul(app['EXT_SOURCE_2']).mul(app['EXT_SOURCE_3']))

# EXT_SOURCE interactions with DAYS_EMPLOYED / DAYS_BIRTH
app['app EXT_SOURCE_1 * DAYS_EMPLOYED'] = app['EXT_SOURCE_1'].mul(app['DAYS_EMPLOYED'])
app['app EXT_SOURCE_2 * DAYS_EMPLOYED'] = app['EXT_SOURCE_2'].mul(app['DAYS_EMPLOYED'])
app['app EXT_SOURCE_3 * DAYS_EMPLOYED'] = app['EXT_SOURCE_3'].mul(app['DAYS_EMPLOYED'])

app['app EXT_SOURCE_1 / DAYS_BIRTH'] = app['EXT_SOURCE_1'].div(app['DAYS_BIRTH'])
app['app EXT_SOURCE_2 / DAYS_BIRTH'] = app['EXT_SOURCE_2'].div(app['DAYS_BIRTH'])
app['app EXT_SOURCE_3 / DAYS_BIRTH'] = app['EXT_SOURCE_3'].div(app['DAYS_BIRTH'])

# Differences and ratios built from AMT_CREDIT
app['app AMT_CREDIT - AMT_GOODS_PRICE'] = app['AMT_CREDIT'].sub(app['AMT_GOODS_PRICE'])
app['app AMT_CREDIT / AMT_GOODS_PRICE'] = app['AMT_CREDIT'].div(app['AMT_GOODS_PRICE'])
app['app AMT_CREDIT / AMT_ANNUITY'] = app['AMT_CREDIT'].div(app['AMT_ANNUITY'])
app['app AMT_CREDIT / AMT_INCOME_TOTAL'] = (
    app['AMT_CREDIT'].div(app['AMT_INCOME_TOTAL']))

# Differences and ratios built from AMT_INCOME_TOTAL
app['app AMT_INCOME_TOTAL / 12 - AMT_ANNUITY'] = (
    app['AMT_INCOME_TOTAL'].div(12.).sub(app['AMT_ANNUITY']))
app['app AMT_INCOME_TOTAL / AMT_ANNUITY'] = (
    app['AMT_INCOME_TOTAL'].div(app['AMT_ANNUITY']))
app['app AMT_INCOME_TOTAL - AMT_GOODS_PRICE'] = (
    app['AMT_INCOME_TOTAL'].sub(app['AMT_GOODS_PRICE']))
app['app AMT_INCOME_TOTAL / CNT_FAM_MEMBERS'] = (
    app['AMT_INCOME_TOTAL'].div(app['CNT_FAM_MEMBERS']))
# The divisor is CNT_CHILDREN + 1
app['app AMT_INCOME_TOTAL / CNT_CHILDREN'] = (
    app['AMT_INCOME_TOTAL'].div(app['CNT_CHILDREN'].add(1)))

# 0/1 indicators: AMT_GOODS_PRICE equals one of the listed values
app['app most popular AMT_GOODS_PRICE'] = (
    app['AMT_GOODS_PRICE']
    .isin([225000, 450000, 675000, 900000])
    .map({True: 1, False: 0}))
app['app popular AMT_GOODS_PRICE'] = (
    app['AMT_GOODS_PRICE']
    .isin([1125000, 1350000, 1575000, 1800000, 2250000])
    .map({True: 1, False: 0}))

# Ratios against DAYS_BIRTH / DAYS_EMPLOYED
app['app OWN_CAR_AGE / DAYS_BIRTH'] = app['OWN_CAR_AGE'].div(app['DAYS_BIRTH'])
app['app OWN_CAR_AGE / DAYS_EMPLOYED'] = app['OWN_CAR_AGE'].div(app['DAYS_EMPLOYED'])

app['app DAYS_LAST_PHONE_CHANGE / DAYS_BIRTH'] = (
    app['DAYS_LAST_PHONE_CHANGE'].div(app['DAYS_BIRTH']))
app['app DAYS_LAST_PHONE_CHANGE / DAYS_EMPLOYED'] = (
    app['DAYS_LAST_PHONE_CHANGE'].div(app['DAYS_EMPLOYED']))
app['app DAYS_EMPLOYED - DAYS_BIRTH'] = app['DAYS_EMPLOYED'].sub(app['DAYS_BIRTH'])
app['app DAYS_EMPLOYED / DAYS_BIRTH'] = app['DAYS_EMPLOYED'].div(app['DAYS_BIRTH'])
app['app CNT_CHILDREN / CNT_FAM_MEMBERS'] = (
    app['CNT_CHILDREN'].div(app['CNT_FAM_MEMBERS']))

In [None]:
# BASELINE features
bureau['LOAN_RATE'] = bureau['AMT_ANNUITY'].div(bureau['AMT_CREDIT_SUM'])

# AMT_CREDIT_SUM minus each of the other credit amount columns
bureau['bureau AMT_CREDIT_SUM - AMT_CREDIT_SUM_DEBT'] = (
    bureau['AMT_CREDIT_SUM'].sub(bureau['AMT_CREDIT_SUM_DEBT']))
bureau['bureau AMT_CREDIT_SUM - AMT_CREDIT_SUM_LIMIT'] = (
    bureau['AMT_CREDIT_SUM'].sub(bureau['AMT_CREDIT_SUM_LIMIT']))
bureau['bureau AMT_CREDIT_SUM - AMT_CREDIT_SUM_OVERDUE'] = (
    bureau['AMT_CREDIT_SUM'].sub(bureau['AMT_CREDIT_SUM_OVERDUE']))

# Differences between the day-offset columns
bureau['bureau DAYS_CREDIT - CREDIT_DAY_OVERDUE'] = (
    bureau['DAYS_CREDIT'].sub(bureau['CREDIT_DAY_OVERDUE']))
bureau['bureau DAYS_CREDIT - DAYS_CREDIT_ENDDATE'] = (
    bureau['DAYS_CREDIT'].sub(bureau['DAYS_CREDIT_ENDDATE']))
bureau['bureau DAYS_CREDIT - DAYS_ENDDATE_FACT'] = (
    bureau['DAYS_CREDIT'].sub(bureau['DAYS_ENDDATE_FACT']))
bureau['bureau DAYS_CREDIT_ENDDATE - DAYS_ENDDATE_FACT'] = (
    bureau['DAYS_CREDIT_ENDDATE'].sub(bureau['DAYS_ENDDATE_FACT']))
bureau['bureau DAYS_CREDIT_UPDATE - DAYS_CREDIT_ENDDATE'] = (
    bureau['DAYS_CREDIT_UPDATE'].sub(bureau['DAYS_CREDIT_ENDDATE']))

In [None]:
# Aggregate the bureau table to one row per SK_ID_CURR.
bureau_info = agg_child(df=bureau, parent_var='SK_ID_CURR', df_name='BUREAU')

In [None]:
# BASELINE features
# The flags are cast to int: agg_numeric() only aggregates columns selected by
# select_dtypes('number'), which excludes bool, so boolean flags would be
# silently left out of the aggregated features.
bureau_balance['PAST_DUE'] = bureau_balance['STATUS'].isin(
    ['1', '2', '3', '4', '5']).astype(int)
bureau_balance['ON_TIME'] = (bureau_balance['STATUS'] == '0').astype(int)

In [None]:
# Display the (rows, columns) shape of the aggregated bureau features.
bureau_info.shape

In [None]:
# Aggregate bureau_balance per SK_ID_BUREAU and then per SK_ID_CURR, using
# bureau to map one id onto the other.
bureau_balance_info = agg_grandchild(
    df=bureau_balance,
    parent_df=bureau,
    parent_var='SK_ID_BUREAU',
    grandparent_var='SK_ID_CURR',
    df_name='BB')

# The raw tables are not used after this point
del bureau_balance
del bureau
bureau_balance_info.head()

In [None]:
# Display the (rows, columns) shape of the aggregated bureau_balance features.
bureau_balance_info.shape

In [None]:
# Left-join the bureau aggregates onto the applications, then free them.
app = pd.merge(app, bureau_info, how='left', on='SK_ID_CURR')
del bureau_info
app.shape

In [None]:
# Left-join the bureau_balance aggregates onto the applications, then free them.
app = pd.merge(app, bureau_balance_info, how='left', on='SK_ID_CURR')
del bureau_balance_info

In [None]:
# Display the (rows, columns) shape of app after the bureau merges.
app.shape

In [None]:
# BASELINE features
previous_application['LOAN_RATE'] = (
    previous_application['AMT_ANNUITY'].div(previous_application['AMT_CREDIT']))
previous_application["AMT_DIFFERENCE"] = (
    previous_application['AMT_CREDIT'].sub(previous_application['AMT_APPLICATION']))

# Missing values per row; computed here, so it counts the columns that exist
# at this point (including the two features created just above).
previous_application['prev missing'] = (
    previous_application.isnull().sum(axis=1).values)

# Differences between the amount columns
previous_application['prev AMT_APPLICATION - AMT_CREDIT'] = (
    previous_application['AMT_APPLICATION'].sub(previous_application['AMT_CREDIT']))
previous_application['prev AMT_APPLICATION - AMT_GOODS_PRICE'] = (
    previous_application['AMT_APPLICATION'].sub(previous_application['AMT_GOODS_PRICE']))
previous_application['prev AMT_GOODS_PRICE - AMT_CREDIT'] = (
    previous_application['AMT_GOODS_PRICE'].sub(previous_application['AMT_CREDIT']))

previous_application['prev DAYS_FIRST_DRAWING - DAYS_FIRST_DUE'] = (
    previous_application['DAYS_FIRST_DRAWING'].sub(previous_application['DAYS_FIRST_DUE']))
# 0/1 indicator: DAYS_TERMINATION is below -500
previous_application['prev DAYS_TERMINATION less -500'] = (
    previous_application['DAYS_TERMINATION'].lt(-500).astype(int))

In [None]:
# Aggregate previous_application to one row per SK_ID_CURR.
previous_info = agg_child(
    df=previous_application, parent_var='SK_ID_CURR', df_name='PREVIOUS')
previous_info.shape

In [None]:
# Left-join the previous_application aggregates onto the applications.
app = pd.merge(app, previous_info, how='left', on='SK_ID_CURR')
del previous_info
app.shape

In [None]:
# BASELINE features
# The comparison flags are cast to int: agg_numeric() only aggregates columns
# selected by select_dtypes('number'), which excludes bool, so boolean flags
# would be silently left out of the aggregated features.
installments_payments['LATE'] = (
    installments_payments['DAYS_ENTRY_PAYMENT'] > installments_payments['DAYS_INSTALMENT']).astype(int)
installments_payments['LOW_PAYMENT'] = (
    installments_payments['AMT_PAYMENT'] < installments_payments['AMT_INSTALMENT']).astype(int)
installments_payments['ins DAYS_ENTRY_PAYMENT - DAYS_INSTALMENT'] = installments_payments['DAYS_ENTRY_PAYMENT'] - \
    installments_payments['DAYS_INSTALMENT']
# 0/1 indicator: the instalment number equals 100
installments_payments['ins NUM_INSTALMENT_NUMBER_100'] = (
    installments_payments['NUM_INSTALMENT_NUMBER'] == 100).astype(int)
# 0/1 indicator: DAYS_INSTALMENT lies above the line
# NUM_INSTALMENT_NUMBER * 50 / 3 - 11500 / 3
installments_payments['ins DAYS_INSTALMENT more NUM_INSTALMENT_NUMBER'] = (
    installments_payments['DAYS_INSTALMENT'] > installments_payments['NUM_INSTALMENT_NUMBER'] * 50 / 3 - 11500 / 3).astype(int)
installments_payments['ins AMT_INSTALMENT - AMT_PAYMENT'] = installments_payments['AMT_INSTALMENT'] - \
    installments_payments['AMT_PAYMENT']
installments_payments['ins AMT_PAYMENT / AMT_INSTALMENT'] = installments_payments['AMT_PAYMENT'] / \
    installments_payments['AMT_INSTALMENT']

In [None]:
# Aggregate installments_payments per SK_ID_PREV and then per SK_ID_CURR,
# using previous_application to map one id onto the other.
installments_info = agg_grandchild(
    df=installments_payments,
    parent_df=previous_application,
    parent_var='SK_ID_PREV',
    grandparent_var='SK_ID_CURR',
    df_name='IN')
del installments_payments
installments_info.shape

In [None]:
# Left-join the installments aggregates onto the applications.
app = pd.merge(app, installments_info, how='left', on='SK_ID_CURR')
del installments_info
app.shape

In [None]:
# BASELINE features
# LATE_PAYMENT is cast to int: agg_numeric() only aggregates columns selected
# by select_dtypes('number'), which excludes bool, so a boolean flag would be
# silently left out of the aggregated features.
pcb['LATE_PAYMENT'] = (pcb['SK_DPD'] > 0.0).astype(int)
pcb['INSTALLMENTS_PAID'] = pcb['CNT_INSTALMENT'] - pcb['CNT_INSTALMENT_FUTURE']
pcb['pos CNT_INSTALMENT more CNT_INSTALMENT_FUTURE'] = (
    pcb['CNT_INSTALMENT'] > pcb['CNT_INSTALMENT_FUTURE']).astype(int)

In [None]:
# Aggregate POS_CASH_balance per SK_ID_PREV and then per SK_ID_CURR,
# using previous_application to map one id onto the other.
cash_info = agg_grandchild(
    df=pcb,
    parent_df=previous_application,
    parent_var='SK_ID_PREV',
    grandparent_var='SK_ID_CURR',
    df_name='CASH')
del pcb
cash_info.shape

In [None]:
# Left-join the POS/cash aggregates onto the applications.
app = pd.merge(app, cash_info, how='left', on='SK_ID_CURR')
del cash_info
app.shape

In [None]:
# BASELINE features
# The comparison flags are cast to int: agg_numeric() only aggregates columns
# selected by select_dtypes('number'), which excludes bool, so boolean flags
# would be silently left out of the aggregated features.
credit_card_balance['OVER_LIMIT'] = (
    credit_card_balance['AMT_BALANCE'] > credit_card_balance['AMT_CREDIT_LIMIT_ACTUAL']).astype(int)
credit_card_balance['BALANCE_CLEARED'] = (
    credit_card_balance['AMT_BALANCE'] == 0.0).astype(int)
credit_card_balance['LOW_PAYMENT'] = (
    credit_card_balance['AMT_PAYMENT_CURRENT'] < credit_card_balance['AMT_INST_MIN_REGULARITY']).astype(int)
credit_card_balance['LATE'] = (credit_card_balance['SK_DPD'] > 0.0).astype(int)

# Missing values per row; computed here, so it counts the columns that exist
# at this point (the four flags above never contain missing values).
credit_card_balance['card missing'] = credit_card_balance.isnull().sum(
    axis=1).values

# Differences involving the SK_DPD columns
credit_card_balance['card SK_DPD - MONTHS_BALANCE'] = credit_card_balance['SK_DPD'] - \
    credit_card_balance['MONTHS_BALANCE']
credit_card_balance['card SK_DPD_DEF - MONTHS_BALANCE'] = credit_card_balance['SK_DPD_DEF'] - \
    credit_card_balance['MONTHS_BALANCE']
credit_card_balance['card SK_DPD - SK_DPD_DEF'] = credit_card_balance['SK_DPD'] - \
    credit_card_balance['SK_DPD_DEF']

# Differences between the receivable / balance amounts
# ('AMT_RECIVABLE' is the column name as spelled in the data)
credit_card_balance['card AMT_TOTAL_RECEIVABLE - AMT_RECIVABLE'] = credit_card_balance['AMT_TOTAL_RECEIVABLE'] - \
    credit_card_balance['AMT_RECIVABLE']
credit_card_balance['card AMT_TOTAL_RECEIVABLE - AMT_RECEIVABLE_PRINCIPAL'] = credit_card_balance['AMT_TOTAL_RECEIVABLE'] - \
    credit_card_balance['AMT_RECEIVABLE_PRINCIPAL']
credit_card_balance['card AMT_RECIVABLE - AMT_RECEIVABLE_PRINCIPAL'] = credit_card_balance['AMT_RECIVABLE'] - \
    credit_card_balance['AMT_RECEIVABLE_PRINCIPAL']
credit_card_balance['card AMT_BALANCE - AMT_RECIVABLE'] = credit_card_balance['AMT_BALANCE'] - \
    credit_card_balance['AMT_RECIVABLE']
credit_card_balance['card AMT_BALANCE - AMT_RECEIVABLE_PRINCIPAL'] = credit_card_balance['AMT_BALANCE'] - \
    credit_card_balance['AMT_RECEIVABLE_PRINCIPAL']
credit_card_balance['card AMT_BALANCE - AMT_TOTAL_RECEIVABLE'] = credit_card_balance['AMT_BALANCE'] - \
    credit_card_balance['AMT_TOTAL_RECEIVABLE']

# Total drawings minus each drawings sub-category
credit_card_balance['card AMT_DRAWINGS_CURRENT - AMT_DRAWINGS_ATM_CURRENT'] = credit_card_balance['AMT_DRAWINGS_CURRENT'] - \
    credit_card_balance['AMT_DRAWINGS_ATM_CURRENT']
credit_card_balance['card AMT_DRAWINGS_CURRENT - AMT_DRAWINGS_OTHER_CURRENT'] = credit_card_balance['AMT_DRAWINGS_CURRENT'] - \
    credit_card_balance['AMT_DRAWINGS_OTHER_CURRENT']
credit_card_balance['card AMT_DRAWINGS_CURRENT - AMT_DRAWINGS_POS_CURRENT'] = credit_card_balance['AMT_DRAWINGS_CURRENT'] - \
    credit_card_balance['AMT_DRAWINGS_POS_CURRENT']

In [None]:
# Aggregate credit_card_balance per SK_ID_PREV and then per SK_ID_CURR,
# using previous_application to map one id onto the other.
credit_info = agg_grandchild(
    df=credit_card_balance,
    parent_df=previous_application,
    parent_var='SK_ID_PREV',
    grandparent_var='SK_ID_CURR',
    df_name='CC')

# Neither raw table is used after this point
del credit_card_balance
del previous_application
credit_info.shape

In [None]:
# Left-join the credit card aggregates onto the applications.
app = pd.merge(app, credit_info, how='left', on='SK_ID_CURR')
del credit_info
app.shape

In [None]:
le = LabelEncoder()
le_count = 0

for col in app:
    # Only object columns with at most two distinct values are label
    # encoded; everything else is left untouched here.
    if app[col].dtype != 'object' or len(list(app[col].unique())) > 2:
        continue

    # Fit on the whole column (train and test rows together) and replace
    # it with the integer codes
    app[col] = le.fit_transform(app[col])

    # Keep track of how many columns were label encoded
    le_count += 1

print('%d columns were label encoded.' % le_count)

# Object columns that are still left after label encoding
list(app.select_dtypes('object'))

In [None]:
# Save the engineered feature table (without the row index) so that it can be
# reloaded from CSV later instead of being recomputed.
app.to_csv('data/full_features_v2.csv', index=False)

In [None]:
# Use SK_ID_CURR as the index, then one-hot encode the remaining
# categorical columns.
app = pd.get_dummies(app.set_index('SK_ID_CURR'))

In [None]:
# Count every column except the id and the label. SK_ID_CURR has already been
# moved to the index at this point, so subtracting a fixed 2 from the column
# count would under-report the number of features by one.
print('After manual feature engineering, there are {} features.'.format(
    sum(col not in ('SK_ID_CURR', 'TARGET') for col in app.columns)))

In [None]:
# Number of rows without a label (TARGET was set to NaN for the test rows).
app.TARGET.isnull().sum()

### 2.1 Train and Test Set

In [None]:
# Rows with a label form the training set; rows without one form the test set.
train = app.loc[app['TARGET'].notnull()].copy()
test = app.loc[app['TARGET'].isnull()].copy()

## 3. Prediction Models

In [None]:
import xgboost as xgb
import lightgbm as lgb

In [None]:
# Prepare submission file
def submission(y_pred, name, ids=None):
    """
    Write a submission CSV with the columns SK_ID_CURR and TARGET.

    Parameters
    --------
        y_pred : array-like
            Predicted values, one per id.
        name : string
            Path of the CSV file to write.
        ids : array-like, optional
            Values for the SK_ID_CURR column (for example `test.index`).
            When omitted, `app_test_align.SK_ID_CURR` is used, which requires
            a notebook-level `app_test_align` to exist.

    Notes
    --------
        If a notebook-level `files` object is defined, `files.download(name)`
        is called after writing; otherwise the download step is skipped.
        NOTE(review): a later cell rebinds the name `submission` to a
        DataFrame, after which this function is no longer reachable.
    """
    if ids is None:
        # Original behaviour: take the ids from the notebook-level frame
        ids = app_test_align.SK_ID_CURR

    my_submission = pd.DataFrame({'SK_ID_CURR': ids, 'TARGET': y_pred})
    my_submission.to_csv(name, index=False)

    # `files` is not defined anywhere in this notebook, so only trigger the
    # download when the environment actually provides it.
    downloader = globals().get('files')
    if downloader is not None:
        downloader.download(name)

    print('Done! :-)')

###  3.1 ROC function

In [None]:
def roc_plot(y_truth, y_pred, label='LR'):
    """
    Plot the ROC curve of a set of predicted scores.

    Parameters
    --------
        y_truth : array-like
            True binary labels.
        y_pred : array-like
            Predicted scores for the positive class.
        label : string, optional
            Legend entry for the curve. Defaults to 'LR' (the previously
            hard-coded value); pass the model name when plotting another model.
    """
    from sklearn.metrics import roc_curve

    # The thresholds returned by roc_curve are not needed for the plot
    fpr, tpr, _ = roc_curve(y_truth, y_pred)

    # Dashed diagonal as the reference line
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(fpr, tpr, label=label)
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(loc='best')
    plt.show()

In [None]:
def roc_score(y_truth, y_pred):
    """Print the ROC AUC of the predicted scores `y_pred` against `y_truth`."""
    from sklearn.metrics import roc_auc_score
    print('####AUC-SCORE### \n{}'.format(roc_auc_score(y_truth, y_pred)))

### 3.2 XGBoost

In [None]:
def xg_boost_r(trainset, testset, importance, final):
    """
    Train an XGBoost binary classifier and predict with it.

    Parameters
    --------
        trainset : dataframe
            Labelled data containing a `TARGET` column.
        testset : dataframe
            Data to predict on when `final == 1`; must contain a `TARGET`
            column, which is dropped before prediction. Unused otherwise.
        importance : int
            If 1 (and `final != 1`), plot the feature importances.
        final : int
            If 1, train on all of `trainset` and predict `testset`.
            Otherwise hold out 20% of `trainset` for evaluation.

    Return
    --------
        When `final == 1`: the predicted probabilities for `testset`.
        Otherwise: a tuple `(predicted probabilities for the hold-out set,
        trained booster)`; accuracy, AUC and the ROC curve are also reported.
    """
    # Imported locally so the function does not depend on a later notebook
    # cell having been executed first.
    import datetime
    import time

    start = time.time()

    if final == 1:
        x_train = trainset.drop('TARGET', axis=1)
        y_train = trainset['TARGET']
        x_test = testset.drop('TARGET', axis=1)
        y_test = None
        # No labels are needed for prediction, and the test TARGET is all NaN
        dtest = xgb.DMatrix(x_test)
    else:
        data_input = trainset.drop('TARGET', axis=1)
        data_output = trainset['TARGET']
        x_train, x_test, y_train, y_test = train_test_split(
            data_input, data_output, test_size=0.2, random_state=42)
        dtest = xgb.DMatrix(x_test, label=y_test)

    dtrain = xgb.DMatrix(x_train, label=y_train)

    # Best params after doing gridsearch
    param = {  # 'nthread':[4], #when use hyperthread, xgboost may become slower
        'objective': 'binary:logistic',
        'learning_rate': 0.3,  # so called `eta` value
        'max_depth': 4,
        'min_child_weight': 5,
        # NOTE(review): newer XGBoost releases use 'verbosity' instead of
        # 'silent' - confirm against the installed version.
        'silent': 1,
        'subsample': 0.6,
        'colsample_bytree': 0.5,
        'seed': 42, 'gamma': 0.4}

    # Number of trees. xgb.train() takes this as `num_boost_round` (default
    # 10); an 'n_estimators' entry in the params dict is not used by it.
    num_boost_round = 300

    bst = xgb.train(param, dtrain, num_boost_round=num_boost_round)
    y_pred_xgb = bst.predict(dtest)

    if final == 1:
        print('Time =', str(datetime.timedelta(seconds=time.time() - start)))
        return y_pred_xgb

    if importance == 1:
        # xgb.plot_importance has no `figsize` argument; size the figure
        # through the axes it draws on instead.
        _, ax = plt.subplots(figsize=(20, 20))
        xgb.plot_importance(bst, ax=ax, max_num_features=100)
        plt.show()

    # The predictions are probabilities: threshold them at 0.5 before
    # comparing with the 0/1 labels.
    y_pred_label = (y_pred_xgb >= 0.5).astype(int)
    print('####Accuracy### \n{}'.format(np.mean(y_pred_label == y_test)))
    roc_score(y_test, y_pred_xgb)
    roc_plot(y_test, y_pred_xgb)

    print('Time =', str(datetime.timedelta(seconds=time.time() - start)))
    return y_pred_xgb, bst

### 3.3 LightGBM

In [None]:
# LightGBM hyperparameters passed to lgb.train().
random_hyp = dict(
    is_unbalance=True,
    n_estimators=2673,
    num_leaves=77,
    learning_rate=0.00764,
    min_child_samples=460,
    boosting_type='goss',
    subsample_for_bin=240000,
    reg_lambda=0.20,
    reg_alpha=0.88,
    subsample=0.95,
    colsample_bytree=0.7,
    verbose=200,
    objective='binary',
)

## 4. Load Data/CSV with full features
### Notice: This step can only be done if the previous part of the notebook has already been run. If the dataset is still loaded in the variable app, this step can be skipped.

In [None]:
# Reload the saved feature table, using SK_ID_CURR as the index.
app = pd.read_csv('data/full_features_v2.csv', index_col='SK_ID_CURR')

In [None]:
# If any categorical (object) columns are left after loading, one-hot encode
# them; columns that are already numeric are passed through unchanged.
app = pd.get_dummies(app)

In [None]:
# Check whether any object-typed columns remain in the dataset; an empty list
# means every column has been encoded.
list(app.select_dtypes('object'))

In [None]:
# If this has already been done in the steps above you don't need to repeat that!
# Split the loaded data: rows with a label form the training set, rows
# without one form the test set.
train = app.loc[app['TARGET'].notnull()].copy()
test = app.loc[app['TARGET'].isnull()].copy()

In [None]:
# Compare the train and test column labels position by position; printing
# only [ True] means both frames have identical columns in the same order.
print(np.unique(train.columns == test.columns))

In [None]:
# Display the (rows, columns) shape of the training set.
train.shape

In [None]:
# Display the distinct label values present in the training set.
train.TARGET.unique()

### 4.1 LightGBM Prediction

In [None]:
# Wrap the training features and labels in a LightGBM Dataset.
train_lgb = lgb.Dataset(
    data=train.drop(columns='TARGET'),
    label=train['TARGET'])

In [None]:
import datetime
import time

# Train the LightGBM model and report the elapsed wall-clock time.
start = time.time()
model = lgb.train(params=random_hyp, train_set=train_lgb)
print('Time =', datetime.timedelta(seconds=time.time() - start))

In [None]:
# Plot the importance of the top 100 features of the trained model.
print('Plot feature importances...')
ax = lgb.plot_importance(
    booster=model,
    figsize=(20, 20),
    max_num_features=100)
plt.show()

In [None]:
# Persist the trained LightGBM model to disk.
model.save_model('./lgbm_bin.model')

In [None]:
# Predict on the test features (label column removed, passed as an array).
preds = model.predict(test.drop(columns='TARGET').to_numpy())

In [None]:
# Write the LightGBM predictions, keyed by the test ids held in the index.
submission = pd.DataFrame(
    data={'SK_ID_CURR': list(test.index), 'TARGET': preds},
    columns=['SK_ID_CURR', 'TARGET'])
submission.to_csv('submission_manual.csv', index=False)

### 4.2 XGBoost Prediction

In [None]:
# Train XGBoost on the full training set and predict the test set
# (final=1: no hold-out evaluation and no importance plot).
ypred = xg_boost_r(trainset=train, testset=test, importance=0, final=1)

In [None]:
# Write the XGBoost predictions, keyed by the test ids held in the index.
submission = pd.DataFrame(
    data={'SK_ID_CURR': list(test.index), 'TARGET': ypred},
    columns=['SK_ID_CURR', 'TARGET'])
submission.to_csv('submission_manual_xgb.csv', index=False)