Feature engineering for application_train.csv / application_test.csv

(1) feature_engineering_v0 (baseline): missing-value handling, anomaly handling, and one-hot encoding

(2)feature_engineering_v1: baseline + label_encode

(3)feature_engineering_v2: baseline + domain_feature

(4)feature_engineering_v3: baseline + age_feature

(5)feature_engineering_v4: baseline + region_feature

(6)feature_engineering_v5: baseline + EXT_SOURCE_feature

(7)feature_engineering_v6: baseline + document_phone_feature

(8) feature_engineering_v7: all of the above except region_feature (V4 is not applied in the code below)

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
def deal_anomalies(app_train, app_test):
    """Flag the DAYS_EMPLOYED sentinel and add missing-value indicator features.

    Both frames are modified in place and are also returned.

    Steps, applied to each frame:
      1. ``DAYS_EMPLOYED == 365243`` is recorded in a boolean
         ``DAYS_EMPLOYED_ANOM`` column and the sentinel is replaced by NaN.
      2. For every numeric column present in both frames whose missing rate
         in ``app_train`` is at least 0.5, an int8 ``MISS__<col>`` indicator
         is added to both frames.
      3. ``ROW_MISSING_RATIO`` stores the fraction of NaN cells in each row.

    Args:
        app_train: training DataFrame; must contain ``DAYS_EMPLOYED``.
        app_test: test DataFrame; must contain ``DAYS_EMPLOYED``.

    Returns:
        The tuple ``(app_train, app_test)``.
    """
    for frame in (app_train, app_test):
        # The flag must be computed before the sentinel is overwritten.
        frame['DAYS_EMPLOYED_ANOM'] = frame['DAYS_EMPLOYED'] == 365243
        # Assign the result back instead of calling replace(..., inplace=True)
        # on the column selection: with pandas Copy-on-Write that chained call
        # updates a temporary object and leaves the frame untouched.
        frame['DAYS_EMPLOYED'] = frame['DAYS_EMPLOYED'].replace({365243: np.nan})

    # Missing rates are measured on the training frame only, so the set of
    # indicator columns is the same for both frames.
    num_cols_common = [
        c for c in app_train.columns
        if c in app_test.columns and pd.api.types.is_numeric_dtype(app_train[c])
    ]
    miss_rates = app_train[num_cols_common].isna().mean().sort_values(ascending=False)

    selected_miss_cols = [c for c in miss_rates.index if miss_rates[c] >= 0.5]
    for c in selected_miss_cols:
        app_train[f"MISS__{c}"] = app_train[c].isna().astype('int8')
        app_test[f"MISS__{c}"] = app_test[c].isna().astype('int8')

    # NOTE(review): each ratio is taken over all columns of its own frame, so a
    # column that exists only in app_train changes the denominator for
    # training rows relative to test rows - confirm this is acceptable.
    app_train['ROW_MISSING_RATIO'] = app_train.isna().mean(axis=1)
    app_test['ROW_MISSING_RATIO'] = app_test.isna().mean(axis=1)
    return app_train, app_test

def onehot_encoder(app_train, app_test):
    """One-hot encode both frames and keep only the columns they share.

    ``pd.get_dummies`` is applied to each frame separately, the results are
    aligned on their common columns (inner join along axis 1), and the
    training ``TARGET`` column is then re-attached to the training frame.

    Args:
        app_train: training DataFrame; must contain a ``TARGET`` column.
        app_test: test DataFrame.

    Returns:
        ``(train, test)`` - the encoded, column-aligned frames; ``train``
        additionally carries ``TARGET``.
    """
    encoded_train = pd.get_dummies(app_train)
    encoded_test = pd.get_dummies(app_test)

    # Keep a handle on the labels: the inner alignment drops any column that
    # the test frame does not have.
    labels = encoded_train['TARGET']

    encoded_train, encoded_test = encoded_train.align(encoded_test, join='inner', axis=1)

    encoded_train['TARGET'] = labels
    return encoded_train, encoded_test


def label_encoder_1(app_train, app_test):
    """Ordinal-encode four categorical columns by their mean TARGET in training.

    Works on copies, so the input frames are left unchanged. For each of
    NAME_EDUCATION_TYPE, NAME_HOUSING_TYPE, NAME_INCOME_TYPE and
    NAME_TYPE_SUITE:
      * missing values in both frames are filled with the training mode;
      * categories are ranked by ascending mean ``TARGET`` in the training
        frame and replaced by that rank (0 = lowest mean).

    Categories that occur only in ``app_test`` have no rank and become NaN.

    Args:
        app_train: training DataFrame containing the four columns and ``TARGET``.
        app_test: test DataFrame containing the four columns.

    Returns:
        ``(train, test, cat_indices)`` where ``cat_indices`` holds, for each
        encoded column, its position in the training frame minus 2.
    """
    encoded_train = app_train.copy()
    encoded_test = app_test.copy()
    ordinal_columns = (
        'NAME_EDUCATION_TYPE',
        'NAME_HOUSING_TYPE',
        'NAME_INCOME_TYPE',
        'NAME_TYPE_SUITE',
    )

    cat_indices = []
    for col in ordinal_columns:
        encoded_train[col] = encoded_train[col].fillna(encoded_train[col].mode()[0])
        # The fill value for the test frame is read from the (already filled)
        # training column, not from the test frame itself.
        encoded_test[col] = encoded_test[col].fillna(encoded_train[col].mode()[0])

        ranked = (
            encoded_train[[col, 'TARGET']]
            .groupby([col], as_index=False)
            .mean()
            .sort_values(by='TARGET', ascending=True)
        )
        rank_of = {category: rank for rank, category in enumerate(ranked[col])}

        encoded_train[col] = encoded_train[col].map(rank_of)
        encoded_test[col] = encoded_test[col].map(rank_of)

        # NOTE(review): the -2 offset presumably compensates for two leading
        # columns (SK_ID_CURR and TARGET?) being removed by the caller before
        # the indices are used - confirm.
        cat_indices.append(encoded_train.columns.get_loc(col) - 2)

    return encoded_train, encoded_test, cat_indices

def domain_feature(app_train, app_test):
    """Add credit / income / annuity ratio features to both frames in place.

    Columns added to each frame (all plain element-wise ratios):
      * CREDIT_TO_INCOME   = AMT_CREDIT / AMT_INCOME_TOTAL (leverage)
      * ANNUITY_TO_INCOME  = AMT_ANNUITY / AMT_INCOME_TOTAL (repayment burden)
      * GOODS_TO_INCOME    = AMT_GOODS_PRICE / AMT_INCOME_TOTAL
      * ANNUITY_TO_CREDIT  = AMT_ANNUITY / AMT_CREDIT
      * TERM_MONTHS_EST    = AMT_CREDIT / AMT_ANNUITY, clipped to [1, 1200]
      * CREDIT_TO_GOODS    = AMT_CREDIT / AMT_GOODS_PRICE
      * INCOME_PER_PERSON  = AMT_INCOME_TOTAL / CNT_FAM_MEMBERS, where a family
        size of 0 is treated as missing

    Only INCOME_PER_PERSON guards its denominator against zero; the other
    ratios are left exactly as the division produces them.

    Args:
        app_train: training DataFrame with the AMT_* and CNT_FAM_MEMBERS columns.
        app_test: test DataFrame with the same columns.

    Returns:
        The tuple ``(app_train, app_test)``.
    """
    for frame in (app_train, app_test):
        income = frame['AMT_INCOME_TOTAL']
        credit = frame['AMT_CREDIT']
        annuity = frame['AMT_ANNUITY']
        goods_price = frame['AMT_GOODS_PRICE']

        # Amounts relative to income: leverage, repayment burden, purchase size.
        frame['CREDIT_TO_INCOME'] = credit / income
        frame['ANNUITY_TO_INCOME'] = annuity / income
        frame['GOODS_TO_INCOME'] = goods_price / income

        # Annuity per unit of credit, and its inverse as a rough term estimate.
        frame['ANNUITY_TO_CREDIT'] = annuity / credit
        frame['TERM_MONTHS_EST'] = (credit / annuity).clip(1, 1200)

        # Credit relative to the price of the financed goods.
        frame['CREDIT_TO_GOODS'] = credit / goods_price

        # Income per family member; zero family size becomes NaN, not inf.
        frame['INCOME_PER_PERSON'] = income / frame['CNT_FAM_MEMBERS'].replace(0, np.nan)

    return app_train, app_test

def age_employment_feature(app_train, app_test):
    """Add age-relative employment, car-age and children features in place.

    Columns added to each frame:
      * EMPLOY_TO_AGE     = DAYS_EMPLOYED / DAYS_BIRTH
      * OWN_CAR_AGE_NORM  = -OWN_CAR_AGE / (DAYS_BIRTH / 365)
      * CHILDREN_PER_10Y  = -CNT_CHILDREN / (DAYS_BIRTH / 10 / 365)

    The leading minus signs flip the sign contributed by DAYS_BIRTH
    (presumably stored as a negative day offset - confirm against the data).

    Args:
        app_train: training DataFrame with DAYS_EMPLOYED, DAYS_BIRTH,
            OWN_CAR_AGE and CNT_CHILDREN.
        app_test: test DataFrame with the same columns.

    Returns:
        The tuple ``(app_train, app_test)``.
    """
    for frame in (app_train, app_test):
        days_birth = frame['DAYS_BIRTH']

        # Share of the applicant's lifetime covered by the current employment.
        frame['EMPLOY_TO_AGE'] = frame['DAYS_EMPLOYED'] / days_birth

        # Car age expressed relative to the owner's age in years.
        frame['OWN_CAR_AGE_NORM'] = -frame['OWN_CAR_AGE'] / (days_birth / 365.0)

        # Number of children per decade of the applicant's age.
        frame['CHILDREN_PER_10Y'] = -frame['CNT_CHILDREN'] / ((days_birth / 10.0) / 365.0)

    return app_train, app_test

def region_score_feature(app_train, app_test):
    """Add region-rating difference and address-mismatch count in place.

    Columns added to each frame:
      * REGION_RATING_DIFF = REGION_RATING_CLIENT - REGION_RATING_CLIENT_W_CITY
      * ADDR_MISMATCH_SUM  = row-wise sum of the address mismatch flag columns
        that exist in BOTH frames (flags missing from either frame are skipped)

    Args:
        app_train: training DataFrame with the two REGION_RATING_* columns.
        app_test: test DataFrame with the same columns.

    Returns:
        The tuple ``(app_train, app_test)``.
    """
    mismatch_flags = (
        'REG_REGION_NOT_LIVE_REGION',
        'REG_REGION_NOT_WORK_REGION',
        'LIVE_REGION_NOT_WORK_REGION',
        'REG_CITY_NOT_LIVE_CITY',
        'REG_CITY_NOT_WORK_CITY',
        'LIVE_CITY_NOT_WORK_CITY',
    )
    # Restrict to flags available in both frames so the sums are comparable.
    shared_flags = [
        flag for flag in mismatch_flags
        if flag in app_train.columns and flag in app_test.columns
    ]

    for frame in (app_train, app_test):
        frame['REGION_RATING_DIFF'] = (
            frame['REGION_RATING_CLIENT'] - frame['REGION_RATING_CLIENT_W_CITY']
        )
    for frame in (app_train, app_test):
        frame['ADDR_MISMATCH_SUM'] = frame[shared_flags].sum(axis=1)

    return app_train, app_test

def EXT_SOURCE_feature(app_train, app_test):
    """Add row-wise statistics of the EXT_SOURCE_* scores to both frames in place.

    Only those of EXT_SOURCE_1/2/3 that exist in both frames are used.
    Columns added to each frame, always computed from that frame's own rows:
      * EXT_MEAN, EXT_STD, EXT_MIN, EXT_MAX - row-wise statistics (pandas
        defaults, i.e. NaN values are skipped)
      * EXT_MEAN_X_AGE        = EXT_MEAN * (-DAYS_BIRTH / 365)
      * EXT_MEAN_X_LOG_INCOME = EXT_MEAN * log1p(AMT_INCOME_TOTAL)
      * EXT_MISSING_CNT       = number of missing EXT_SOURCE_* values in the row

    Args:
        app_train: training DataFrame with DAYS_BIRTH and AMT_INCOME_TOTAL.
        app_test: test DataFrame with the same columns.

    Returns:
        The tuple ``(app_train, app_test)``.
    """
    candidates = ('EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3')
    ext_cols = [c for c in candidates if c in app_train.columns and c in app_test.columns]

    # One code path for both frames. The earlier duplicated train/test lines
    # built the test frame's EXT_MEAN_X_AGE from app_train['DAYS_BIRTH'], which
    # paired test rows with the ages of unrelated training rows (by index).
    for frame in (app_train, app_test):
        ext_scores = frame[ext_cols]

        frame['EXT_MEAN'] = ext_scores.mean(axis=1)
        frame['EXT_STD'] = ext_scores.std(axis=1)
        frame['EXT_MIN'] = ext_scores.min(axis=1)
        frame['EXT_MAX'] = ext_scores.max(axis=1)

        frame['EXT_MEAN_X_AGE'] = frame['EXT_MEAN'] * (-frame['DAYS_BIRTH'] / 365.0)
        frame['EXT_MEAN_X_LOG_INCOME'] = frame['EXT_MEAN'] * np.log1p(frame['AMT_INCOME_TOTAL'])

        frame['EXT_MISSING_CNT'] = ext_scores.isna().sum(axis=1)

    return app_train, app_test

def document_phone_feature(app_train, app_test):
    """Add document-count and contact-flag-count features to both frames in place.

    Columns added to each frame:
      * DOCS_SUBMITTED_CNT - row-wise sum of the ``FLAG_DOCUMENT_*`` columns
        that exist in both frames
      * DOCS_ANY           - int8 indicator, 1 when DOCS_SUBMITTED_CNT > 0
      * CONTACT_FLAGS_SUM  - row-wise sum of the phone / e-mail flag columns
        that exist in both frames

    Args:
        app_train: training DataFrame.
        app_test: test DataFrame.

    Returns:
        The tuple ``(app_train, app_test)``.
    """
    contact_candidates = (
        'FLAG_MOBIL',
        'FLAG_EMP_PHONE',
        'FLAG_WORK_PHONE',
        'FLAG_CONT_MOBILE',
        'FLAG_PHONE',
        'FLAG_EMAIL',
    )
    # Both column lists are resolved up front, before any feature is added.
    doc_cols = [
        c for c in app_train.columns
        if c.startswith('FLAG_DOCUMENT_') and c in app_test.columns
    ]
    phone_cols = [
        c for c in contact_candidates
        if c in app_train.columns and c in app_test.columns
    ]

    for frame in (app_train, app_test):
        frame['DOCS_SUBMITTED_CNT'] = frame[doc_cols].sum(axis=1)
    for frame in (app_train, app_test):
        has_any_doc = frame['DOCS_SUBMITTED_CNT'] > 0
        frame['DOCS_ANY'] = has_any_doc.astype('int8')
    for frame in (app_train, app_test):
        frame['CONTACT_FLAGS_SUM'] = frame[phone_cols].sum(axis=1)

    return app_train, app_test


def feature_engineering_v0(app_train, app_test):
    """Baseline pipeline: anomaly/missing-value handling, then one-hot encoding.

    Returns:
        ``(train, test, cat_indices)``; ``cat_indices`` is always an empty list.
    """
    train, test = deal_anomalies(app_train, app_test)
    train, test = onehot_encoder(train, test)
    return train, test, []

def feature_engineering_v1(app_train, app_test):
    """Baseline plus target-ordered label encoding of four categorical columns.

    Order of steps: anomaly/missing-value handling, label encoding, one-hot
    encoding of the remaining categorical columns.

    Returns:
        ``(train, test, cat_indices)`` with ``cat_indices`` exactly as produced
        by ``label_encoder_1`` (i.e. computed before the one-hot step).
    """
    train, test = deal_anomalies(app_train, app_test)
    train, test, label_encoded_indices = label_encoder_1(train, test)
    train, test = onehot_encoder(train, test)
    return train, test, label_encoded_indices


def feature_engineering_v2(app_train, app_test):
    """Baseline plus the credit/income ratio features from ``domain_feature``.

    Returns:
        ``(train, test, cat_indices)``; ``cat_indices`` is always an empty list.
    """
    train, test = deal_anomalies(app_train, app_test)
    train, test = domain_feature(train, test)
    train, test = onehot_encoder(train, test)
    return train, test, []


def feature_engineering_v3(app_train, app_test):
    """Baseline plus the age/employment features from ``age_employment_feature``.

    Returns:
        ``(train, test, cat_indices)``; ``cat_indices`` is always an empty list.
    """
    train, test = deal_anomalies(app_train, app_test)
    train, test = age_employment_feature(train, test)
    train, test = onehot_encoder(train, test)
    return train, test, []

def feature_engineering_v4(app_train, app_test):
    """Baseline plus the region features from ``region_score_feature``.

    Returns:
        ``(train, test, cat_indices)``; ``cat_indices`` is always an empty list.
    """
    train, test = deal_anomalies(app_train, app_test)
    train, test = region_score_feature(train, test)
    train, test = onehot_encoder(train, test)
    return train, test, []

def feature_engineering_v5(app_train, app_test):
    """Baseline plus the EXT_SOURCE statistics from ``EXT_SOURCE_feature``.

    Returns:
        ``(train, test, cat_indices)``; ``cat_indices`` is always an empty list.
    """
    train, test = deal_anomalies(app_train, app_test)
    train, test = EXT_SOURCE_feature(train, test)
    train, test = onehot_encoder(train, test)
    return train, test, []

def feature_engineering_v6(app_train, app_test):
    """Baseline plus the document/contact counts from ``document_phone_feature``.

    Returns:
        ``(train, test, cat_indices)``; ``cat_indices`` is always an empty list.
    """
    train, test = deal_anomalies(app_train, app_test)
    train, test = document_phone_feature(train, test)
    train, test = onehot_encoder(train, test)
    return train, test, []


def feature_engineering_v7(app_train, app_test):
    """Combined pipeline: baseline, label encoding and most feature groups.

    Order of steps: anomaly/missing-value handling (V0), label encoding (V1),
    domain ratios (V2), age/employment (V3), EXT_SOURCE statistics (V5),
    document/contact counts (V6), and finally one-hot encoding.

    NOTE(review): the region features (V4, ``region_score_feature``) are not
    applied here, and the indices returned by ``label_encoder_1`` are discarded
    in favour of an empty list - confirm that both are intentional.

    Returns:
        ``(train, test, cat_indices)``; ``cat_indices`` is always an empty list.
    """
    train, test = deal_anomalies(app_train, app_test)
    train, test, _ = label_encoder_1(train, test)

    feature_steps = (
        domain_feature,            # V2
        age_employment_feature,    # V3
        EXT_SOURCE_feature,        # V5
        document_phone_feature,    # V6
    )
    for add_features in feature_steps:
        train, test = add_features(train, test)

    train, test = onehot_encoder(train, test)
    return train, test, []