In [1]:
import pandas as pd
import numpy as np
import pickle
from pathlib import Path
import dill
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import TargetEncoder
import category_encoders as ce
import os
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, log_loss
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.impute import KNNImputer
from xgboost import XGBClassifier
import imblearn
from imblearn.under_sampling import RandomUnderSampler

In [2]:
# Restore the pickled `load_data` callable with dill, then use it to read the
# raw test CSV into `df_test`.
# NOTE(review): dill.load can execute arbitrary code while unpickling; only
# load pickle files that come from a trusted source.
with Path('read_file.pickle').open('rb') as f:
    load_data = dill.load(f)

df_test = load_data('application_test.csv')

In [3]:
# Restore the pickled `clean_data` callable and run it over the test frame.
# NOTE(review): dill.load can execute arbitrary code while unpickling; only
# load pickle files that come from a trusted source.
with Path('clean_data.pickle').open('rb') as f:
    clean_data = dill.load(f)

# `df_test` is rebound to the cleaned result, so re-running this cell feeds
# already-cleaned data back into clean_data.
df_test = clean_data(df_test)

In [4]:
# Grand total of missing cells across every column of the test frame
# (per-column null counts, summed again into a single number).
df_test.isnull().sum().sum()

np.int64(0)

In [5]:
# Restore the pickled `encode_data` callable.
# NOTE(review): dill.load can execute arbitrary code while unpickling; only
# load pickle files that come from a trusted source.
with Path('encode_data.pickle').open('rb') as f:
    encode_data = dill.load(f)

# Collect the names of all object-dtype columns; these are treated as the
# categorical features to encode.
categorical_cols = list(df_test.select_dtypes(include=['object']).columns)

# NOTE(review): 'TARGET' is passed as the target name and train=False is set;
# presumably this makes encode_data reuse encodings fitted on the training
# data rather than fitting on the test set -- confirm in encode_data's source.
df_test = encode_data(
    df_test,
    'TARGET',
    categorical_cols,
    train=False,
)


In [6]:
# Show the frame returned by encode_data. In the rendered output, columns such
# as NAME_CONTRACT_TYPE and CODE_GENDER now hold floating-point values.
df_test

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100001,0.083459,0.069993,0.085002,0.079616,0,135000.0,568800.0,20560.5,450000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,100005,0.083459,0.101419,0.085002,0.079616,0,99000.0,222768.0,17370.0,180000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
2,100013,0.083459,0.101419,0.072437,0.079616,0,202500.0,663264.0,69777.0,630000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0
3,100028,0.083459,0.069993,0.085002,0.079616,2,315000.0,1575000.0,49018.5,1575000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
4,100038,0.083459,0.101419,0.072437,0.083249,1,180000.0,625500.0,32067.0,625500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48739,456221,0.083459,0.069993,0.085002,0.079616,0,121500.0,412560.0,17473.5,270000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
48740,456222,0.083459,0.069993,0.085002,0.083249,2,157500.0,622413.0,31909.5,495000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0
48741,456223,0.083459,0.069993,0.072437,0.079616,1,202500.0,315000.0,33205.5,315000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,3.0,1.0
48742,456224,0.083459,0.101419,0.085002,0.083249,0,225000.0,450000.0,25128.0,450000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0


In [7]:
# Restore the pickled `scale_data` callable.
# NOTE(review): dill.load can execute arbitrary code while unpickling; only
# load pickle files that come from a trusted source.
with Path('scale_data.pickle').open('rb') as f:
    scale_data = dill.load(f)

# Scale the test frame with no target and no explicit feature list.
# NOTE(review): the frame displayed after this step shows SK_ID_CURR was
# standardized as well, so the original applicant IDs are no longer in
# `df_test` -- confirm that is intended.
# NOTE(review): whether scale_data reuses the scaler fitted on training data
# or refits on this test frame is not visible here -- verify.
df_test = scale_data(
    df_test,
    target=None,
    features_to_scale=None,
)

In [8]:
# Show the frame returned by scale_data. In the rendered output the shown
# columns, including the SK_ID_CURR identifier, hold different (partly negative) values.
df_test

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,-1.723353,0.095331,-0.701175,0.713154,-0.669488,-0.559988,-0.427809,0.142475,-0.553580,-0.037477,...,-0.039517,0.0,0.0,0.0,-0.042538,-0.036585,-0.048265,-0.078421,-0.711321,-1.153865
1,-1.723314,0.095331,1.426178,0.713154,-0.669488,-0.559988,-0.782413,-0.804537,-0.752831,-0.839362,...,-0.039517,0.0,0.0,0.0,-0.042538,-0.036585,-0.048265,-0.078421,-0.711321,0.589324
2,-1.723236,0.095331,1.426178,-1.402222,-0.669488,-0.559988,0.237075,0.401002,2.520066,0.497113,...,-0.039517,0.0,0.0,0.0,-0.042538,-0.036585,-0.048265,-0.078421,0.773588,1.170387
3,-1.723091,0.095331,-0.701175,0.713154,-0.669488,2.260729,1.345214,2.896221,1.223666,3.303709,...,-0.039517,0.0,0.0,0.0,-0.042538,-0.036585,-0.048265,-0.078421,-0.711321,0.589324
4,-1.722994,0.095331,1.426178,-1.402222,1.493678,0.850370,0.015447,0.297651,0.165019,0.483748,...,-0.039517,0.0,0.0,0.0,-0.042538,-0.036585,-0.048265,-0.078421,-0.711321,0.008261
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48739,1.729446,0.095331,-0.701175,0.713154,-0.669488,-0.559988,-0.560785,-0.285119,-0.746368,-0.572067,...,-0.039517,0.0,0.0,0.0,-0.042538,-0.036585,-0.048265,-0.078421,-0.711321,-0.572802
48740,1.729456,0.095331,-0.701175,0.713154,1.493678,2.260729,-0.206181,0.289202,0.155183,0.096170,...,-0.039517,0.0,0.0,0.0,-0.042538,-0.036585,-0.048265,-0.078421,-0.711321,0.008261
48741,1.729465,0.095331,-0.701175,-1.402222,-0.669488,0.850370,0.237075,-0.552119,0.236120,-0.438420,...,-0.039517,0.0,0.0,0.0,-0.042538,-0.036585,-0.048265,-0.078421,3.743406,-0.572802
48742,1.729475,0.095331,1.426178,0.713154,1.493678,-0.559988,0.458703,-0.182654,-0.268332,-0.037477,...,-0.039517,0.0,0.0,0.0,-0.042538,-0.036585,-0.048265,-0.078421,-0.711321,0.008261


In [9]:
# Restore the pickled prediction helper and the pickled trained model.
# NOTE(review): dill.load can execute arbitrary code while unpickling; only
# load pickle files that come from a trusted source.
with Path('predict_model.pickle').open('rb') as f:
    pred = dill.load(f)

with Path('trained_model.pickle').open('rb') as f:
    model = dill.load(f)

# Run the helper on the prepared test frame.
# NOTE(review): the meaning of `features=['SK_ID_CURR']` (columns to use vs.
# columns to exclude) is defined inside the pickled helper -- confirm there.
y_new_pred = pred(
    df_test,
    model,
    features=['SK_ID_CURR'],
)

In [11]:
# Show the value returned by `pred`; the rendered output is an array of 0/1
# values (presumably one predicted class per test row -- confirm against pred).
y_new_pred

array([0, 1, 0, ..., 0, 0, 1])