In [24]:
import os, sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(''))))

import pandas as pd
import numpy as np
import gc
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from matplotlib import pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import classification_report

# classifiers  
from sklearn.linear_model import LogisticRegression

# sampling 
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline


from utils.data_extract_utils import get_home_credit_data, extract_zip, extract_features_from_bureau, EXTRACTRED_BUREAU_COLUMNS, extract_features_from_installments_payments

In [2]:
# Download, and extract data from dropbox into memory. 
data = get_home_credit_data()

In [3]:
def get_temp_data(data):
    ## combine train and test set
    train_test = data['application_train'].append(data['application_test']).reset_index()

    # join tables: bureau, bureau_balance - 1 min to execute
    bb = extract_features_from_bureau(data['bureau'], data['bureau_balance'])
    ip = extract_features_from_installments_payments(data['installments_payments'])

    # Join ID same datatype
    train_test['SK_ID_CURR'] = train_test['SK_ID_CURR'].astype('Int64')
    bb.index = bb.index.astype('Int64')
    ip.index = ip.index.astype('Int64')

    # Join Bureau(s) and Application tables
    df = train_test.join([bb, ip])
    
    # drop index column generated by groupby
    df.drop(['index'], axis=1, inplace=True)
    
    # clean memory 
    del bb
    gc.collect()
    
    # preprocess 
    ## DAYS_EMPLOYED
    df['DAYS_EMPLOYED'].replace(365243, np.nan, inplace=True)
    df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].apply(lambda x: abs(x))

    df['BIRTH_IN_YEARS'] = df['DAYS_BIRTH'].apply(lambda x: abs(x)/365)

    # remove 4 instances
    df = df.loc[df['CODE_GENDER'] != 'XNA', :]

    # add new features 
    df['ALL_EXT_SOURCE_MEAN']=df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis = 1)
    df['PAYMENT_RATE'] = df['AMT_ANNUITY']/df['AMT_CREDIT']
    df['INCOME_PER_PERSON'] = df['AMT_INCOME_TOTAL']/df['CNT_FAM_MEMBERS']
    df['INCOME_CREDIT_IN_PERCENTAGE '] = df['AMT_INCOME_TOTAL']/df['AMT_CREDIT']
    df['ANNUITY_INCOME_IN_PERCENTAGE'] = df['AMT_ANNUITY']/df['AMT_INCOME_TOTAL']

    ## label encoder for binary values
    bin_features = ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']

    replace_dict = {'CODE_GENDER': {'M': 0, 'F': 1},
                    'FLAG_OWN_CAR': {'Y': 0, 'N': 1}, 
                    'FLAG_OWN_REALTY':{'Y': 0, 'N': 1} }
    df.replace(replace_dict, inplace=True)
    
    # fix inf values
    df['AMT_CREDIT_DEBT_RATIO'] = df['AMT_CREDIT_DEBT_RATIO'].apply(lambda x: x if ~np.isinf(x) else 0)
    
    # test ids
    test_ids = df.loc[df['TARGET'].isnull(), 'SK_ID_CURR']
    
    # drop ids
    df.drop(columns=['SK_ID_CURR'], axis=1, inplace=True)
    
    return df, test_ids

In [4]:
df, test_ids = get_temp_data(data)

In [5]:
display(df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 356251 entries, 0 to 356254
Columns: 131 entries, TARGET to ANNUITY_INCOME_IN_PERCENTAGE
dtypes: float64(115), int64(3), object(13)
memory usage: 358.8+ MB


None

In [11]:
X, y = (df.loc[df['TARGET'].notnull(), :].drop(columns=['TARGET'], axis=1), 
        df.loc[df['TARGET'].notnull(), 'TARGET'])

# test set for kaggle
X_test = df.loc[df['TARGET'].isnull(), :].drop(columns=['TARGET'], axis=1)

# columns for pipeline
cat_cols = list(X.select_dtypes(include=object).columns)
num_cols = list(X.select_dtypes(include=[int, float]).columns)
len(cat_cols),len(num_cols)

(13, 114)

In [17]:
## pipelines

# one hot encoding 
categorical_pipe = Pipeline(steps=[
    ('cat_imp', SimpleImputer(strategy='most_frequent', add_indicator=False)),
  ('one_hot_encoder', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

# numerical 
numerical_pipe = Pipeline(steps=[
    ('num_imp', SimpleImputer(strategy='median', add_indicator=False)),
    ("scale", StandardScaler())  
])


# transform columns 
column_transformer = ColumnTransformer(transformers=[    
    ('num_pip', numerical_pipe, num_cols),
    ('cat_pipe', categorical_pipe, cat_cols)
])

In [18]:
# apply preprocessing to X
X_trans = column_transformer.fit_transform(X)

In [26]:
# smote sampling
smote = SMOTE(sampling_strategy='minority')
X_sm, y_sm = smote.fit_resample(X_trans,y)

# split data into k folds
# sss = StratifiedShuffleSplit(n_splits=3, test_size=0.3, random_state=42)
# score_val = []
# for train_index, val_index in sss.split(X_sm, y_sm):
#     # initalize model 
#     model = LogisticRegression(solver='lbfgs', random_state=42,max_iter=1000)
#     # fit model 
#     model.fit(X_sm[train_index], y_sm[train_index])
#     # test model
#     score_val.append(balanced_accuracy_score(y_sm[val_index], model.predict(X_sm[val_index])))
## takes about 6 min to execute ... 2 min per fold 

In [27]:
X_train, X_val, y_train, y_val = train_test_split(X_sm, y_sm, test_size=0.2, random_state= 42, stratify=y_sm)
logreg= LogisticRegression(solver='lbfgs', random_state=42,max_iter=1000)
logreg.fit(X_train, y_train)
y_pred_train = logreg.predict(X_train)
y_pred_val = logreg.predict(X_val)
print('classifaction report on training set')
print(classification_report(y_train, y_pred_train, labels=[0,1]))
print('--------------------------------------------------------------')
print('classifaction report on validation set')
print(classification_report(y_val, y_pred_val, labels=[0,1]))

classifaction report on training set
              precision    recall  f1-score   support

           0       0.71      0.69      0.70    226145
           1       0.70      0.71      0.70    226146

    accuracy                           0.70    452291
   macro avg       0.70      0.70      0.70    452291
weighted avg       0.70      0.70      0.70    452291

--------------------------------------------------------------
classifaction report on validation set
              precision    recall  f1-score   support

           0       0.71      0.70      0.70     56537
           1       0.70      0.71      0.71     56536

    accuracy                           0.70    113073
   macro avg       0.70      0.70      0.70    113073
weighted avg       0.70      0.70      0.70    113073

