### Preamble

In [1]:
import numpy as np
import pandas as pd

### Processing Application Data

This is the main dataset which contains our response of interest (**TARGET**).

In [2]:
# Load the application table, using the SK_ID_CURR column as the row index.
df = pd.read_csv("./data/application_train.csv", index_col = "SK_ID_CURR")
# Report the (rows, columns) shape of the loaded frame.
print("Dataset Shape: {}".format(df.shape))

Dataset Shape: (307511, 121)


In [3]:
# Name of the response column, and every other column as a candidate feature
response = "TARGET"
features = [col for col in df.columns if col != response]

#### Add an average external source feature

In [12]:
# Row-wise mean of the three external-source scores. DataFrame.mean skips NaN
# by default, so the average is NaN only when all three scores are missing.
df['EXT_SOURCE_AVG'] = df[['EXT_SOURCE_1', 'EXT_SOURCE_2', "EXT_SOURCE_3"]].mean(axis=1)

# `features` was built before this column existed, so register the new column
# explicitly; the membership guard keeps this cell safe to re-run.
if 'EXT_SOURCE_AVG' not in features:
    features.append('EXT_SOURCE_AVG')

KeyboardInterrupt: 

#### Checking on missing data

In [6]:
# Number of missing values per column, most-incomplete columns first.
# sort_values returns a new Series, so its result has to be assigned.
df_missing = df.isna().sum().sort_values(ascending = False)
# Keep only the columns with MORE than 100 missing values, i.e. the columns
# considered too incomplete to use.
df_missing = df_missing.loc[lambda x: x > 100]

In [7]:
# Features that survive the missing-data screen: every entry of `features`
# whose name is not one of the labels kept in df_missing.
full_features = [name for name in features if name not in df_missing.index]
full_features

['NAME_CONTRACT_TYPE',
 'CODE_GENDER',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'CNT_CHILDREN',
 'AMT_INCOME_TOTAL',
 'AMT_CREDIT',
 'AMT_ANNUITY',
 'NAME_INCOME_TYPE',
 'NAME_EDUCATION_TYPE',
 'NAME_FAMILY_STATUS',
 'NAME_HOUSING_TYPE',
 'REGION_POPULATION_RELATIVE',
 'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'DAYS_REGISTRATION',
 'DAYS_ID_PUBLISH',
 'FLAG_MOBIL',
 'FLAG_EMP_PHONE',
 'FLAG_WORK_PHONE',
 'FLAG_CONT_MOBILE',
 'FLAG_PHONE',
 'FLAG_EMAIL',
 'CNT_FAM_MEMBERS',
 'REGION_RATING_CLIENT',
 'REGION_RATING_CLIENT_W_CITY',
 'WEEKDAY_APPR_PROCESS_START',
 'HOUR_APPR_PROCESS_START',
 'REG_REGION_NOT_LIVE_REGION',
 'REG_REGION_NOT_WORK_REGION',
 'LIVE_REGION_NOT_WORK_REGION',
 'REG_CITY_NOT_LIVE_CITY',
 'REG_CITY_NOT_WORK_CITY',
 'LIVE_CITY_NOT_WORK_CITY',
 'ORGANIZATION_TYPE',
 'DAYS_LAST_PHONE_CHANGE',
 'FLAG_DOCUMENT_2',
 'FLAG_DOCUMENT_3',
 'FLAG_DOCUMENT_4',
 'FLAG_DOCUMENT_5',
 'FLAG_DOCUMENT_6',
 'FLAG_DOCUMENT_7',
 'FLAG_DOCUMENT_8',
 'FLAG_DOCUMENT_9',
 'FLAG_DOCUMENT_10',
 'FLAG_DOCUM

#### Process data for a given feature set

In [None]:
def clean_app_data(x, feat, resp):
    """Select columns, drop incomplete rows, and one-hot encode the result.

    Parameters
    ----------
    x : DataFrame to clean.
    feat : list of feature column names (it is concatenated with ``[resp]``).
    resp : name of the response column.

    Returns
    -------
    DataFrame restricted to ``feat`` plus ``resp``, with every row that
    contains a missing value removed and the remaining frame passed
    through ``pd.get_dummies``.
    """
    wanted_columns = feat + [resp]
    complete_rows = x[wanted_columns].dropna(axis = 0)
    return pd.get_dummies(complete_rows)

In [None]:
# Build the processed table from the full_features columns plus the response.
df_proc = clean_app_data(df, full_features, response)
# Report the (rows, columns) shape of the processed frame.
print("Processed Shape: {}".format(df_proc.shape))

At this point all of the data is numerical, and every row containing a missing value has been dropped.

### Train - Test Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
def split_data(df, resp, train_prob, random_state = None):
    """Split a processed frame into train/test feature and response arrays.

    Parameters
    ----------
    df : DataFrame holding the feature columns and the response column.
    resp : name of the response column; every other column is a feature.
    train_prob : forwarded to ``train_test_split`` as ``train_size``.
    random_state : optional seed forwarded to ``train_test_split``. The
        default ``None`` leaves the split unseeded (different on every
        call); pass an int to make the split reproducible.

    Returns
    -------
    X_train, X_test, y_train, y_test : NumPy arrays, in that order.
    """

    feat = [col for col in df.columns if col != resp]
    X = np.array(df[feat])
    y = np.array(df[resp])
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size = train_prob, random_state = random_state
    )

    return X_train, X_test, y_train, y_test

In [None]:
# Split the processed table, passing 0.8 as the training fraction.
# NOTE(review): no seed is supplied here, so the split differs between runs.
X_train, X_test, y_train, y_test = split_data(df_proc, response, train_prob = 0.8)


In [None]:
# Display the three external-source columns for inspection.
df[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]]