In [15]:
import pandas as pd
import numpy as np
np.random.seed(42)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
import shap

from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

import time

import keras as ke
import keras.backend as K
from keras.layers import Input, Dense, Dropout
from keras.models import Model

Using TensorFlow backend.


In [7]:
df = pd.read_csv('../../data/processed/cleaned_ohe_normalized.csv')

In [8]:
df.head(3)

Unnamed: 0.1,Unnamed: 0,Country_Afghanistan,Country_Armenia,Country_Azerbaijan,Country_Benin,Country_Bolivia,Country_Bulgaria,Country_Burkina Faso,Country_Burundi,Country_Cambodia,...,Town_gaza,Town_huvsgul,Town_kirkuk,Town_kyengera,Town_litein,Town_malindi,Loan Amount,Loan Term,Funded Time,Status
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.035088,0.039216,0.83675,0
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.010025,0.008403,0.87755,0
2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.077694,0.044818,0.41835,0


### model

In [16]:

def nn_classifier(n_features):
    inputs = Input(shape=(n_features,))
    dense1 = Dense(32, activation='relu')(inputs)
    dropout1 = Dropout(0.2)(dense1)
    dense2 = Dense(32, activation='relu')(dropout1)
    dropout2 = Dropout(0.2)(dense2)
    dense3 = Dense(32, activation="relu")(dropout2)
    dropout3 = Dropout(0.2)(dense3)
    outputs = Dense(1, activation='sigmoid')(dropout3)
    model = Model(inputs=[inputs], outputs=[outputs])
    model.compile(loss='binary_crossentropy', optimizer='adam')
    return model

def make_results_df(n_train):
    return pd.DataFrame(
        data = [[0.0] * 3] * len(metrics),
        index = metrics,
        columns=[int(n_train / 3), int(2 * n_train / 3), int(n_train)])

def train_classifier(clf, X_train, y_train):
    start = time.time()
    clf.fit(X_train.values, y_train.values, epochs=20, verbose=0)
    end = time.time()
    return end-start

def predict_labels(clf, X, y):
    start = time.time()
    y_pred = pd.Series(clf.predict(X).ravel(), index=y.index)
    end = time.time()
    return y_pred, end-start

def train_predict(clf, X_train, y_train, X_test, y_test, df):
    print(len(X_train), len(y_train), len(X_test), len(y_test))
    
    df.at['Training time', len(y_train)] = train_classifier(clf, X_train, y_train)
    y_pred, t_pred = predict_labels(clf, X_train, y_train)
    df.at['F1 score (train)' , len(y_train)] = f1_score(y_train.values, y_pred>0.5)
    df.at['Precision (train)', len(y_train)] = precision_score(y_train.values, y_pred>0.5)
    df.at['Recall (train)'   , len(y_train)] = recall_score(y_train.values, y_pred>0.5)
    df.at['Accuracy (train)' , len(y_train)] = accuracy_score(y_train.values, y_pred>0.5)
    df.at['ROC AUC (train)'  , len(y_train)] = roc_auc_score(y_train.values, y_pred)
    y_pred, t_pred = predict_labels(clf, X_test, y_test)
    df.at['F1 score (test)' , len(y_train)] = f1_score(y_test.values, y_pred>0.5)
    df.at['Precision (test)', len(y_train)] = precision_score(y_test.values, y_pred>0.5)
    df.at['Recall (test)'   , len(y_train)] = recall_score(y_test.values, y_pred>0.5)
    df.at['Accuracy (test)' , len(y_train)] = accuracy_score(y_test.values, y_pred>0.5)
    df.at['ROC AUC (test)'  , len(y_train)] = roc_auc_score(y_test.values, y_pred)
    df.at['Prediction time' , len(y_train)] = t_pred
    print(df.head())
    return y_pred

def make_training_and_test_sets(X, y, num_train):
    num_all = X.shape[0]
    num_test = num_all - num_train
    test_frac = float(num_test) / float(num_all)
    print(test_frac)

    (X_train, X_test,
     y_train, y_test) = train_test_split(X, y, test_size=test_frac,
                                                  stratify=y,
                                                  random_state=0)
    (X_train2, X_train1,
     y_train2, y_train1) = train_test_split(X_train, y_train,
                                            test_size=0.333333,
                                            stratify=y_train,
                                            random_state=0)

    X_train = X_train.reset_index(drop=True)
    X_train2 = X_train2.reset_index(drop=True)
    X_train1 = X_train1.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    y_train2 = y_train2.reset_index(drop=True)
    y_train1 = y_train1.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)

    return (X_train, X_train2, X_train1, X_test,
            y_train, y_train2, y_train1, y_test)


In [17]:
y_all = df['Status']
X_all = df.drop(columns=['Status'])

In [18]:

metrics = ['Training time',
           'Prediction time',
           'F1 score (train)',
           'F1 score (test)',
           'Precision (train)',
           'Precision (test)',
           'Recall (train)',
           'Recall (test)',
           'Accuracy (train)',
           'Accuracy (test)',
           'ROC AUC (train)',
           'ROC AUC (test)']


n_train =  int(X_all.shape[0] * 0.2)
print(n_train)
(X_train, X_train2, X_train1, X_test,
y_train, y_train2, y_train1, y_test) = make_training_and_test_sets(X_all, y_all, n_train)

clf_nn = nn_classifier(n_features=X_train.shape[1])
results_df = make_results_df(n_train)

# Train on different size training sets and predict on a separate test set
y_pred = train_predict(clf_nn, X_train1, y_train1, X_test, y_test, results_df)
y_pred = train_predict(clf_nn, X_train2, y_train2, X_test, y_test, results_df)
y_pred = train_predict(clf_nn, X_train, y_train, X_test, y_test, results_df)


27580
0.8000029006105785
9194 9194 110322 110322


F-score is ill-defined and being set to 0.0 due to no predicted samples.
Precision is ill-defined and being set to 0.0 due to no predicted samples.
F-score is ill-defined and being set to 0.0 due to no predicted samples.
Precision is ill-defined and being set to 0.0 due to no predicted samples.


                   9193   18386  27580      9194 
Training time        0.0    0.0    0.0  22.375350
Prediction time      0.0    0.0    0.0   3.902282
F1 score (train)     0.0    0.0    0.0   0.000000
F1 score (test)      0.0    0.0    0.0   0.000000
Precision (train)    0.0    0.0    0.0   0.000000
18386 18386 110322 110322


F-score is ill-defined and being set to 0.0 due to no predicted samples.
Precision is ill-defined and being set to 0.0 due to no predicted samples.
F-score is ill-defined and being set to 0.0 due to no predicted samples.
Precision is ill-defined and being set to 0.0 due to no predicted samples.


                   9193       18386  27580      9194 
Training time        0.0  44.505931    0.0  22.375350
Prediction time      0.0   4.762462    0.0   3.902282
F1 score (train)     0.0   0.000000    0.0   0.000000
F1 score (test)      0.0   0.000000    0.0   0.000000
Precision (train)    0.0   0.000000    0.0   0.000000
27580 27580 110322 110322


F-score is ill-defined and being set to 0.0 due to no predicted samples.
Precision is ill-defined and being set to 0.0 due to no predicted samples.


                   9193       18386      27580      9194 
Training time        0.0  44.505931  61.245789  22.375350
Prediction time      0.0   4.762462   3.111798   3.902282
F1 score (train)     0.0   0.000000   0.000000   0.000000
F1 score (test)      0.0   0.000000   0.000000   0.000000
Precision (train)    0.0   0.000000   0.000000   0.000000


F-score is ill-defined and being set to 0.0 due to no predicted samples.
Precision is ill-defined and being set to 0.0 due to no predicted samples.


In [None]:
print(y_pred)