# Libraries

In [1]:
import pandas as pd
import numpy as np
import pickle
from pathlib import Path
import dill
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import TargetEncoder, OrdinalEncoder
import os
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, log_loss, precision_score, recall_score
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.impute import KNNImputer
from xgboost import XGBClassifier
from helper_functions import *
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Input, BatchNormalization, Dropout
from keras.callbacks import EarlyStopping
import imblearn
from imblearn.under_sampling import RandomUnderSampler

# Load Data

In [2]:
df_test = load_data('application_test.csv')

In [3]:
df_test

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100001,Cash loans,F,N,Y,0,135000.0,568800.0,20560.5,450000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,100005,Cash loans,M,N,Y,0,99000.0,222768.0,17370.0,180000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
2,100013,Cash loans,M,Y,Y,0,202500.0,663264.0,69777.0,630000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0
3,100028,Cash loans,F,N,Y,2,315000.0,1575000.0,49018.5,1575000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
4,100038,Cash loans,M,Y,N,1,180000.0,625500.0,32067.0,625500.0,...,0,0,0,0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48739,456221,Cash loans,F,N,Y,0,121500.0,412560.0,17473.5,270000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
48740,456222,Cash loans,F,N,N,2,157500.0,622413.0,31909.5,495000.0,...,0,0,0,0,,,,,,
48741,456223,Cash loans,F,Y,Y,1,202500.0,315000.0,33205.5,315000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,3.0,1.0
48742,456224,Cash loans,M,N,N,0,225000.0,450000.0,25128.0,450000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0


# Drop Features

In [4]:
id_test = df_test['SK_ID_CURR']

In [5]:
df_test = drop_features(df_test, features_to_drop=['SK_ID_CURR'])

In [6]:
df_test

Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,Cash loans,F,N,Y,0,135000.0,568800.0,20560.5,450000.0,Unaccompanied,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,Cash loans,M,N,Y,0,99000.0,222768.0,17370.0,180000.0,Unaccompanied,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
2,Cash loans,M,Y,Y,0,202500.0,663264.0,69777.0,630000.0,,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0
3,Cash loans,F,N,Y,2,315000.0,1575000.0,49018.5,1575000.0,Unaccompanied,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
4,Cash loans,M,Y,N,1,180000.0,625500.0,32067.0,625500.0,Unaccompanied,...,0,0,0,0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48739,Cash loans,F,N,Y,0,121500.0,412560.0,17473.5,270000.0,Unaccompanied,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
48740,Cash loans,F,N,N,2,157500.0,622413.0,31909.5,495000.0,Unaccompanied,...,0,0,0,0,,,,,,
48741,Cash loans,F,Y,Y,1,202500.0,315000.0,33205.5,315000.0,Unaccompanied,...,0,0,0,0,0.0,0.0,0.0,0.0,3.0,1.0
48742,Cash loans,M,N,N,0,225000.0,450000.0,25128.0,450000.0,Family,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0


# Impute Data

In [7]:
df_test = clean_data(df_test)

In [8]:
df_test.isna().sum().sum()

0

# Encode Data

In [9]:
df_test = df_test.reset_index(drop=True)

In [10]:
# Get only categorical features:
categorical_cols = df_test.select_dtypes(include=['object']).columns.tolist()

df_test = encode_data(df_test, 'TARGET', categorical_cols, train=False,  model=TargetEncoder)


<class 'pandas.core.frame.DataFrame'>
<class 'numpy.ndarray'>


In [11]:
df_test

Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,0.083459,0.069993,0.085002,0.079616,0,135000.0,568800.0,20560.5,450000.0,0.081687,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.083459,0.101419,0.085002,0.079616,0,99000.0,222768.0,17370.0,180000.0,0.081687,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
2,0.083459,0.101419,0.072437,0.079616,0,202500.0,663264.0,69777.0,630000.0,0.081687,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0
3,0.083459,0.069993,0.085002,0.079616,2,315000.0,1575000.0,49018.5,1575000.0,0.081687,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
4,0.083459,0.101419,0.072437,0.083249,1,180000.0,625500.0,32067.0,625500.0,0.081687,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48739,0.083459,0.069993,0.085002,0.079616,0,121500.0,412560.0,17473.5,270000.0,0.081687,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
48740,0.083459,0.069993,0.085002,0.083249,2,157500.0,622413.0,31909.5,495000.0,0.081687,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0
48741,0.083459,0.069993,0.072437,0.079616,1,202500.0,315000.0,33205.5,315000.0,0.081687,...,0,0,0,0,0.0,0.0,0.0,0.0,3.0,1.0
48742,0.083459,0.101419,0.085002,0.083249,0,225000.0,450000.0,25128.0,450000.0,0.074946,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0


# Predictions

### Logistic Regression Model Predictions

In [12]:
with open('trained_logistic_model.pickle', 'rb') as f: 
    lr_model = dill.load(f)

y_new_pred_LR = predict_model(df_test, lr_model)

In [13]:
y_new_pred_LR

array([0.54130837, 0.56556341, 0.65831245, ..., 0.55868914, 0.44216256,
       0.53817157])

In [14]:
print(f"Length of id_test: {len(id_test)}")
print(f"Length of y_new_pred_LR: {len(y_new_pred_LR.flatten())}")

Length of id_test: 48744
Length of y_new_pred_LR: 48744


In [15]:
# Save predictions with corresponding IDs
model_LR = pd.DataFrame({'SK_ID_CURR': id_test, 'TARGET': y_new_pred_LR})
model_LR.to_csv('prediction_lr.csv', index=False)

### Random Forest Model Predictions

In [16]:
with open('trained_random_forest_model.pickle', 'rb') as f: 
    rf_model = dill.load(f)

with open('splitted_train_test.pickle', 'rb') as f: 
    _, _, _, y_test = dill.load(f)

y_test = y_test.reset_index(drop=True)

y_new_pred_RF = predict_model(df_test, rf_model)

In [17]:
y_new_pred_RF

array([0.48, 0.59, 0.29, ..., 0.36, 0.36, 0.66])

In [18]:
# Save predictions with corresponding IDs
model_RF = pd.DataFrame({'SK_ID_CURR': id_test, 'TARGET': y_new_pred_RF})
model_RF.to_csv('prediction_rf.csv', index=False)

### XGBoost Model Predictions

In [19]:
with open('trained_xgb_model.pickle', 'rb') as f: 
    xgb_model = dill.load(f)

y_new_pred_xgb = predict_model(df_test, xgb_model)

In [20]:
y_new_pred_xgb

array([0.32391846, 0.7181123 , 0.26786843, ..., 0.21535191, 0.22230947,
       0.7131505 ], dtype=float32)

In [21]:
# Save predictions with corresponding IDs
model_xgb = pd.DataFrame({'SK_ID_CURR': id_test, 'TARGET': y_new_pred_xgb})
model_xgb.to_csv('prediction_xgb.csv', index=False)

### Neural Network Model Predictions

In [22]:
with open('trained_nn_model.pickle', 'rb') as f:
    nn_model = dill.load(f)

with open('predict_model.pickle', 'rb') as f:
    predict_model = dill.load(f)


y_new_pred_nn = predict_model(df_test, nn_model)

[1m1524/1524[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 537us/step


In [23]:
y_new_pred_nn.flatten()

array([0.46537057, 0.46537057, 0.46537057, ..., 0.46537057, 0.46537057,
       0.46537057], dtype=float32)

In [24]:
# Save predictions with corresponding IDs
model_nn = pd.DataFrame({'SK_ID_CURR': id_test, 'TARGET': y_new_pred_nn.flatten()})
model_nn.to_csv('prediction_nn.csv', index=False)

# Some Testing

In [None]:
# Ensure y_test and y_new_pred are of the same length 
common_indices = y_test.index.intersection(df_test.index) 
y_test_aligned = y_test.loc[common_indices] 
y_new_pred_aligned = pd.Series(y_new_pred_LR, index=df_test.index).loc[common_indices]
y_new_pred_aligned1 = pd.Series(y_new_pred_RF, index=df_test.index).loc[common_indices]
y_new_pred_aligned2 = pd.Series(model_xgb, index=df_test.index).loc[common_indices]
# y_new_pred_aligned3 = pd.Series(y_new_pred_nn, index=df_test.index).loc[common_indices]


In [28]:
if len(y_test_aligned) == len(y_new_pred_aligned): 
    lr_accuracy = accuracy_score(y_test_aligned.values.flatten(), y_new_pred_aligned) 
    lr_f1 = f1_score(y_test_aligned.values.flatten(), y_new_pred_aligned)
    lr_auc = roc_auc_score(y_test_aligned.values.flatten(), y_new_pred_aligned)

    rf_accuracy = accuracy_score(y_test_aligned.values.flatten(), y_new_pred_aligned1) 
    rf_f1 = f1_score(y_test_aligned.values.flatten(), y_new_pred_aligned1)
    rf_auc = roc_auc_score(y_test_aligned.values.flatten(), y_new_pred_aligned1)


    xgb_accuracy = accuracy_score(y_test_aligned.values.flatten(), y_new_pred_aligned2) 
    xgb_f1 = f1_score(y_test_aligned.values.flatten(), y_new_pred_aligned2)
    xgb_auc = roc_auc_score(y_test_aligned.values.flatten(), y_new_pred_aligned2)

    print(f"LR: Accuracy Score: {lr_accuracy}, F1_score: {lr_f1}, AUC: {lr_auc}") 
    print(f"RF: Accuracy Score: {rf_accuracy}, F1_score: {rf_f1}, AUC: {rf_auc}") 
    print(f"xgb: Accuracy Score: {xgb_accuracy}, F1_score: {xgb_f1}, AUC: {xgb_auc}") 

else: 
    print("Length mismatch: y_test_aligned has length", len(y_test_aligned), "while y_new_pred_aligned has length", len(y_new_pred_aligned))

ValueError: Classification metrics can't handle a mix of binary and continuous targets