In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import roc_auc_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


from sklearn.model_selection import GridSearchCV


import gc #Trash Collection
import re #Regular Expression
import joblib #Export Pre-Processor/Model 

# Load datasets

In [8]:
#Train Dataset
train = pd.read_csv('application_train.csv')
print(train.shape)
train.head()

(307511, 122)


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
#Test Dataset
test = pd.read_csv('application_test.csv')
print(test.shape)
test.head()

(48744, 121)


Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100001,Cash loans,F,N,Y,0,135000.0,568800.0,20560.5,450000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,100005,Cash loans,M,N,Y,0,99000.0,222768.0,17370.0,180000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
2,100013,Cash loans,M,Y,Y,0,202500.0,663264.0,69777.0,630000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0
3,100028,Cash loans,F,N,Y,2,315000.0,1575000.0,49018.5,1575000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
4,100038,Cash loans,M,Y,N,1,180000.0,625500.0,32067.0,625500.0,...,0,0,0,0,,,,,,


In [10]:
def onehot_encoder(df):
    cat_columns = [col for col in df.columns if df[col].dtype == 'object']
    num_columns = [col for col in df.columns if df[col].dtype != 'object']
    df = pd.get_dummies(df)    
    enc_columns = [c for c in df.columns if c not in num_columns]

    return df, enc_columns

In [11]:
def process_application(data_path):
    print('Processing application data.')
    
    train = pd.read_csv(f'{data_path}/application_train.csv')
    test = pd.read_csv(f'{data_path}/application_test.csv')
    
     
    # Dropping all columns with more than 60% missing data
     
    
    perc = 80.0 
    min_count =  int(((100-perc)/100)*train.shape[0] + 1)
    train = train.dropna( axis=1, 
                thresh=min_count)
    
     
    # Set Index
     
    train.set_index('SK_ID_CURR', inplace=True)
    test.set_index('SK_ID_CURR', inplace=True)
    
     
    # One-Hot Encoding *Train/Test*
     
    
    train['train'] = 1
    test['train'] = 0
    combined = pd.concat([train, test])
    combined = pd.get_dummies(combined)
    train = combined.query('train == 1').copy()
    test = combined.query('train == 0').copy()
    train.drop('train', axis='columns', inplace=True)
    test.drop(['TARGET', 'train'], axis='columns', inplace=True)

    del combined
    gc.collect()
    
     
    # Fill Missing Values for Numerical Columns w/ mean
     
    
    num_columns = [col for col in train.columns[2:] if train[col].dtype != 'object']
    for col in num_columns:
        if train[col].isna().values.sum() > 0:
            train[col].fillna(train[col].mean(), inplace=True)
        if test[col].isna().values.sum() > 0:
            test[col].fillna(train[col].mean(), inplace=True)

     
    # Fill Missing Values for Categorical Columns w/ mode
     
    
    train = train.fillna(train.mode().iloc[0])
    test = test.fillna(test.mode().iloc[0])
    
     
    # Align Columns Train/Test
    
     
    # Feature Engineering
     
    def add_app_features(df):
        df['DAYS_EMPLOYED_PERCENT'] = df['DAYS_EMPLOYED'] / train['DAYS_BIRTH']
        df['INCOME_TO_CREDIT_RATIO'] = df['AMT_INCOME_TOTAL'] / train['AMT_CREDIT']
        df['INCOME_PER_PERSON'] = df['AMT_INCOME_TOTAL'] / train['CNT_FAM_MEMBERS']
        df['ANNUITY_INCOME_RATIO'] = df['AMT_ANNUITY'] / train['AMT_INCOME_TOTAL']
        df['PAYMENT_RATE'] = df['AMT_ANNUITY'] / df['AMT_CREDIT']
        df['CREDIT_INCOME_PERCENT'] = df['AMT_CREDIT'] / df['AMT_INCOME_TOTAL']

    add_app_features(train)
    add_app_features(test)

    
     
    # End - process_application Function
     
    
    print('Data Preprocessed')
    
    print(train.shape)
    print(test.shape)
    
    return train, test

#Calling Function -------------------------

train, test = process_application('C:/Users/chams/Desktop/Ml_FLOW')

Processing application data.


  df['DAYS_EMPLOYED_PERCENT'] = df['DAYS_EMPLOYED'] / train['DAYS_BIRTH']
  df['INCOME_TO_CREDIT_RATIO'] = df['AMT_INCOME_TOTAL'] / train['AMT_CREDIT']
  df['INCOME_PER_PERSON'] = df['AMT_INCOME_TOTAL'] / train['CNT_FAM_MEMBERS']
  df['ANNUITY_INCOME_RATIO'] = df['AMT_ANNUITY'] / train['AMT_INCOME_TOTAL']
  df['PAYMENT_RATE'] = df['AMT_ANNUITY'] / df['AMT_CREDIT']
  df['CREDIT_INCOME_PERCENT'] = df['AMT_CREDIT'] / df['AMT_INCOME_TOTAL']
  df['DAYS_EMPLOYED_PERCENT'] = df['DAYS_EMPLOYED'] / train['DAYS_BIRTH']


Data Preprocessed
(307511, 251)
(48744, 250)


  df['INCOME_TO_CREDIT_RATIO'] = df['AMT_INCOME_TOTAL'] / train['AMT_CREDIT']
  df['INCOME_PER_PERSON'] = df['AMT_INCOME_TOTAL'] / train['CNT_FAM_MEMBERS']
  df['ANNUITY_INCOME_RATIO'] = df['AMT_ANNUITY'] / train['AMT_INCOME_TOTAL']
  df['PAYMENT_RATE'] = df['AMT_ANNUITY'] / df['AMT_CREDIT']
  df['CREDIT_INCOME_PERCENT'] = df['AMT_CREDIT'] / df['AMT_INCOME_TOTAL']


In [12]:
from sklearn.ensemble import GradientBoostingRegressor
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.datasets import load_boston
from sklearn.metrics import mean_absolute_error

In [13]:
train.fillna(0, inplace=True)
test.fillna(0, inplace=True)

train = train.rename(columns=lambda x: re.sub('[^A-Za-z0-9_]+', '_', x))
test = test.rename(columns=lambda x: re.sub('[^A-Za-z0-9_]+', '_', x))

In [14]:
y_train = train.TARGET
X_train = train.iloc [:, 1:]
X_test = test

print(y_train.shape)
print(X_train.shape)
print(X_test.shape)

(307511,)
(307511, 250)
(48744, 250)


In [16]:
#Creating Sample/Validation Sets from final_train set
X_sample, X_valid, y_sample, y_valid = train_test_split(X_train, y_train, test_size=0.8, stratify=y_train, random_state=1)
print(X_sample.shape)
print(X_valid.shape)

(61502, 250)
(246009, 250)


In [17]:
regressor = GradientBoostingRegressor(
    max_depth=2,
    n_estimators=3,
    learning_rate=1.0
)
regressor.fit(X_train, y_train)