In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
sns.set(context='paper', style='darkgrid', rc={'figure.facecolor':'white'}, font_scale=1.2)

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression

from sklearn import metrics

from sklearn.metrics import confusion_matrix
import itertools

from sklearn.model_selection import GridSearchCV

from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier

import pickle

In [5]:
# Load the validation rows; the first CSV column becomes the row index.
validation_path = 'validation_set.csv'
df = pd.read_csv(validation_path, index_col=0)

In [2]:
# Load the pickled object whose `.columns` is used further down to choose
# and order the feature columns of df.
# NOTE(review): pickle.load can execute arbitrary code from the file -- only
# load artifacts that you produced yourself or otherwise trust.
features_path = 'Features_Selected_by_RF'
with open(features_path, 'rb') as features_file:
    X = pickle.load(features_file)

## Cleaning data to match the training set

In [7]:
# Replace the raw column headers with month-based names. Suffix 1 (and PAY_0)
# is mapped to September, counting back to April for suffix 6.
bill_names = {
    'BILL_AMT1': 'September_bill',
    'BILL_AMT2': 'August_bill',
    'BILL_AMT3': 'July_bill',
    'BILL_AMT4': 'June_bill',
    'BILL_AMT5': 'May_bill',
    'BILL_AMT6': 'April_bill',
}
status_names = {
    'PAY_0': 'September_status',
    'PAY_2': 'August_status',
    'PAY_3': 'July_status',
    'PAY_4': 'June_status',
    'PAY_5': 'May_status',
    'PAY_6': 'April_status',
}
payment_names = {
    'PAY_AMT1': 'September_payment',
    'PAY_AMT2': 'August_payment',
    'PAY_AMT3': 'July_payment',
    'PAY_AMT4': 'June_payment',
    'PAY_AMT5': 'May_payment',
    'PAY_AMT6': 'April_payment',
}

column_names = {'default payment next month': 'Default'}
column_names.update(bill_names)
column_names.update(status_names)
column_names.update(payment_names)

df = df.rename(columns=column_names)

In [None]:
# Recode SEX as a 0/1 flag: rows equal to 1 stay 1, every other value becomes 0.
# NOTE(review): this cell carries no execution count, and the df.head() preview
# further down still shows SEX == 2 -- confirm it is run before predicting.
df['SEX'] = (df['SEX'] == 1).astype(int)

In [8]:
# Collapse the EDUCATION codes: 1-3 are kept as they are, while 0, 4, 5 and 6
# all end up as 4. Codes not listed in the mapping are left untouched.
edu_dict = {1: 1, 2: 2, 3: 3, 4: 4,
            0: 4, 5: 4, 6: 4}

recoded_education = df['EDUCATION'].replace(edu_dict)
df['EDUCATION'] = recoded_education

In [9]:
# Collapse the MARRIAGE codes: 1 is kept, while 0, 2 and 3 all end up as 2.
# Codes not listed in the mapping are left untouched.
marriage_dict = {0: 2, 1: 1, 2: 2, 3: 2}

recoded_marriage = df['MARRIAGE'].replace(marriage_dict)
df['MARRIAGE'] = recoded_marriage

In [10]:
# Fold the -2 repayment-status code into -1 in each monthly status column.
status_dict = {-2: -1}

for status_col in ['September_status', 'August_status', 'July_status',
                   'June_status', 'May_status', 'April_status']:
    df[status_col] = df[status_col].replace(status_dict)

In [11]:
# Monthly balance = that month's bill minus that month's payment.
#
# The subtraction is done column-wise. The previous per-row form looked rows up
# with df[col][i] for i in range(len(...)), which is a *label* lookup: it only
# works when the index is exactly 0..n-1 and otherwise raises or pairs values
# with the wrong rows. Column arithmetic aligns on the index itself, so it is
# correct for any index and avoids a Python-level loop over rows.
for month in ('September', 'August', 'July', 'June', 'May', 'April'):
    df[month + '_balance'] = df[month + '_bill'] - df[month + '_payment']

In [16]:
# Min_payment_<mon> is 1 when nothing was owed (bill <= 0) or when the payment
# exceeded 3% of the bill, and 0 otherwise.
#
# Computed column-wise instead of row by row: the previous df[col][i] lookups
# were label-based and therefore only valid for an index of exactly 0..n-1.
# pandas division yields inf/NaN (rather than raising) where the bill is 0;
# those rows are already flagged 1 by the `bill <= 0` test, so the outcome of
# the ratio comparison does not matter for them.
min_payment_ratio = .03

for month, suffix in [('September', 'sep'), ('August', 'aug'), ('July', 'jul'),
                      ('June', 'jun'), ('May', 'may'), ('April', 'apr')]:
    bill = df[month + '_bill']
    payment = df[month + '_payment']
    paid_minimum = (bill <= 0) | ((payment / bill) > min_payment_ratio)
    df['Min_payment_' + suffix] = paid_minimum.astype(int)

In [17]:
def _one_hot(column, prefix):
    """Return indicator columns for df[column], named '<prefix>_<value>'.

    Every category present in this data keeps its own column (no drop_first).
    With drop_first=True the column that disappears is whichever category
    happens to be lowest in *this* frame, which need not be the category that
    was dropped when the feature list was built. Keeping them all only adds
    columns; the later `df[X.columns]` selection decides which ones survive.
    NOTE(review): assumes the reference category is absent from X.columns --
    confirm against the training notebook.
    """
    return pd.get_dummies(df[column], prefix=prefix)


edu_dummies = _one_hot('EDUCATION', 'edu')

marriage_dummies = _one_hot('MARRIAGE', 'marriage')

# One indicator frame per monthly repayment-status column.
status_dummies_sep = _one_hot('September_status', 'September_status')
status_dummies_aug = _one_hot('August_status', 'August_status')
status_dummies_jul = _one_hot('July_status', 'July_status')
status_dummies_jun = _one_hot('June_status', 'June_status')
status_dummies_may = _one_hot('May_status', 'May_status')
status_dummies_apr = _one_hot('April_status', 'April_status')

In [18]:
# Append every indicator frame to df as extra columns.
dummy_frames = [
    edu_dummies,
    marriage_dummies,
    status_dummies_sep,
    status_dummies_aug,
    status_dummies_jul,
    status_dummies_jun,
    status_dummies_may,
    status_dummies_apr,
]
df = pd.concat([df] + dummy_frames, axis=1)

In [20]:
# Interaction feature: the credit limit multiplied by each month's
# Min_payment_<mon> value.
#
# Computed column-wise: the previous df[col][i] lookups were label-based and
# only valid for an index of exactly 0..n-1; column arithmetic aligns on the
# index and needs no Python-level loop over rows.
for suffix in ('sep', 'aug', 'jul', 'jun', 'may', 'apr'):
    df['Limit_min_' + suffix] = df['LIMIT_BAL'] * df['Min_payment_' + suffix]

In [21]:
# Remove the raw bill, payment, status and categorical columns now that the
# balance, flag and indicator columns have been derived from them.
bill_columns = ['September_bill', 'August_bill', 'July_bill',
                'June_bill', 'May_bill', 'April_bill']
payment_columns = ['September_payment', 'August_payment', 'July_payment',
                   'June_payment', 'May_payment', 'April_payment']
status_columns = ['September_status', 'August_status', 'July_status',
                  'June_status', 'May_status', 'April_status']
categorical_columns = ['EDUCATION', 'MARRIAGE']

raw_columns = bill_columns + payment_columns + categorical_columns + status_columns
df = df.drop(columns=raw_columns)

In [25]:
# Keep only the columns listed in X.columns, in that order; a column missing
# from df raises a KeyError.
selected_columns = X.columns
df = df.loc[:, selected_columns]

In [29]:
# Preview the first rows of the frame after the column selection.
df.head()

Unnamed: 0,LIMIT_BAL,SEX,AGE,September_balance,August_balance,July_balance,June_balance,May_balance,April_balance,Min_payment_sep,...,May_status_0,May_status_2,May_status_3,May_status_4,May_status_7,April_status_0,April_status_2,April_status_3,April_status_4,April_status_7
0,300000,2,32,25254,17902,19111,8488,250,0,1,...,1,0,0,0,0,0,0,0,0,0
1,360000,2,47,-3249,-1492,-17731,17985,935,4999,1,...,0,0,0,0,0,0,0,0,0,0
2,120000,2,27,10179,8059,10861,8955,10490,9866,1,...,1,0,0,0,0,1,0,0,0,0
3,140000,2,40,133241,107485,81465,38202,38646,31279,1,...,1,0,0,0,0,1,0,0,0,0
4,180000,2,31,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


## Making Predictions

#### Bringing in the trained model and scaler with pickle

In [32]:
# Load the pickled model used below for prediction.
# NOTE(review): pickle.load can execute arbitrary code from the file -- only
# load artifacts that you produced yourself or otherwise trust.
model_path = 'Best_model'
with open(model_path, 'rb') as model_file:
    rfc_clf = pickle.load(model_file)

In [33]:
# Load the pickled scaler that is applied to df before prediction.
# NOTE(review): pickle.load can execute arbitrary code from the file -- only
# load artifacts that you produced yourself or otherwise trust.
scaler_path = 'Final_scaler'
with open(scaler_path, 'rb') as scaler_file:
    final_scaler = pickle.load(scaler_file)

### Applying scaler and making predictions

In [34]:
# Transform the features with the loaded scaler and wrap the result in a new
# DataFrame. No index or column labels are passed, so the new frame gets
# default labels rather than df's.
scaled_values = final_scaler.transform(df)
df_scaled = pd.DataFrame(data=scaled_values)

In [36]:
# Run the loaded model on the scaled features to get one prediction per row.
default_preds = rfc_clf.predict(df_scaled)

In [44]:
# Wrap the predictions in a DataFrame; with no labels supplied, the column
# is named 0.
default_pred_df = pd.DataFrame(data=default_preds)

In [46]:
# Give the prediction column (default label 0) a descriptive name.
prediction_column_names = {0: 'Default_status'}
default_pred_df = default_pred_df.rename(columns=prediction_column_names)

In [50]:
# Write the predictions to CSV. The header row is switched off, so the
# 'Default_status' column name does not appear in the file; the row index is
# still written as the first column.
predictions_path = 'Default_predictions_AH.csv'
default_pred_df.to_csv(predictions_path, header=False)