In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.set_option('display.max_colwidth', None)

import tensorflow as tf

# Metrics and auxiliar libraries from sklearn.
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, f1_score, roc_auc_score

# Some auxiliary functions for scoring and tuning
import scoring_utils, tuning_utils

#DEV
import importlib as imp


In [22]:
# Import the dataframe cleaned during the feature importance process.
df = pd.read_csv('../data/data_clean.csv')
target = 'Default'

In [23]:
from sklearn.preprocessing import OneHotEncoder
# Add UrbanRural one-hot encoded version diretly to the dataframe. That encoding is straighforward.
encoder = OneHotEncoder(sparse_output=False)
encoded_urban_rural = encoder.fit_transform(df[['UrbanRural']])
encoded_urban_rural = pd.DataFrame(encoded_urban_rural, columns=encoder.get_feature_names_out(['UrbanRural']))

encoded_urban_rural.index = df.index

# Concatenating the encoded DataFrame with the original DataFrame
df = pd.concat([df, encoded_urban_rural], axis=1)

all_features = [feature for feature in df.columns if feature not in target]

In [24]:
# Add count encoded features
experimental_features = ['City', 'State', 'Bank', 'ApprovalFY', 'NAICS_i', 'FranchiseCode']
features = [f for f in all_features if f not in experimental_features]
# Count encoding
count_encoded_features = ['City', 'Bank', 'State']
features_count_encoding = features + count_encoded_features

for feature in count_encoded_features:
    df[feature + 'Loans'] = df.groupby(feature)[feature].transform('count')
    df[feature + 'Loans'].fillna(0, inplace=True)

    features_count_encoding.remove(feature)
    features_count_encoding.append(feature+'Loans')

In [28]:
X_total = df[features_count_encoding]
y_total = df[target]

X_train, X_, y_train, y_ = train_test_split(X_total, y_total, train_size=.8)
X_cv, X_test, y_cv, y_test = train_test_split(X_, y_, train_size=.5)

In [29]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_cv_scaled = scaler.transform(X_cv)
X_test_scaled = scaler.transform(X_test)
X_total = scaler.transform(X_total)

In [30]:
pd.DataFrame(X_total, columns=features_count_encoding).describe()

Unnamed: 0,Term,NoEmp,CreateJob,RetainedJob,UrbanRural,RevLineCr,LowDoc,GrAppv,SBA_Appv,isNewBusiness,isFranchise,SBARatio,InterestRate,UrbanRural_0,UrbanRural_1,UrbanRural_2,CityLoans,BankLoans,StateLoans
count,870514.0,870514.0,870514.0,870514.0,870514.0,870514.0,870514.0,870514.0,870514.0,870514.0,870514.0,870514.0,870514.0,870514.0,870514.0,870514.0,870514.0,870514.0,870514.0
mean,-0.00039,0.000156,0.000406,0.000309,8.7e-05,2.8e-05,-0.000349,-0.000151,-0.000105,-0.000472,0.000618,-0.000278,-8.4e-05,-0.000369,0.000599,-0.000377,-0.000368,0.000213,-0.000557
std,0.999668,1.007419,1.007053,1.00626,0.999677,1.000019,0.999603,0.999924,1.000333,0.999773,1.001165,1.000007,1.000288,0.999895,0.999976,0.999552,1.000386,1.000287,0.999605
min,-1.408263,-0.154891,-0.035737,-0.0456,-1.164794,-0.545535,-0.376277,-0.682837,-0.659975,-0.627502,-0.248748,-3.945579,-1.764203,-0.754264,-1.040714,-0.364785,-0.531032,-0.774806,-1.037924
25%,-0.650724,-0.127879,-0.035737,-0.0456,-1.164794,-0.545535,-0.376277,-0.563259,-0.564257,-0.627502,-0.248748,-1.221454,-0.968518,-0.754264,-1.040714,-0.364785,-0.495604,-0.753909,-0.713635
50%,-0.347708,-0.100867,-0.035737,-0.041415,0.378339,-0.545535,-0.376277,-0.355755,-0.383527,-0.627502,-0.248748,0.221837,0.25702,-0.754264,0.960879,-0.364785,-0.394311,-0.535774,-0.454755
75%,0.106816,-0.01983,-0.03154,-0.028859,0.378339,-0.545535,-0.376277,0.122557,0.111457,1.593621,-0.248748,0.799154,0.631998,1.325796,0.960879,-0.364785,-0.035542,0.521853,0.377974
max,5.775737,134.893409,36.894968,39.715138,1.921471,1.833062,2.657617,16.898667,19.006093,1.593621,4.020129,1.665128,6.938032,1.325796,0.960879,2.741338,5.016666,2.243311,2.173594


The features have been succesfully scaled!

In [31]:
neural_sizes = [
    [10],
    [16, 5],
    [32, 10, 20],
    [10, 20, 30, 10],
    [128, 64, 32, 64, 16, 4],
    [16, 64, 256, 1024, 512, 128, 32, 4],
    [190, 153, 120, 91, 66, 45, 28, 15, 6]
]

models = []

for size in neural_sizes:
    print(f"Creating neural network of size {size}")
    model_name = '_'.join(map(str, size))
    layers = []
    #input = tf.keras.layers.Input((1,))
    #layers.append(input)
    for layer in size:
        dense = tf.keras.layers.Dense(activation="relu", units=layer)
        layers.append(dense)
    output = tf.keras.layers.Dense(activation="sigmoid", units=1)
    layers.append(output)

    model = tf.keras.Sequential(layers, name=model_name)
    #print(f"Params: {model.count_params()}")
    models.append(model)
    

Creating neural network of size [10]
Creating neural network of size [16, 5]
Creating neural network of size [32, 10, 20]
Creating neural network of size [10, 20, 30, 10]
Creating neural network of size [128, 64, 32, 64, 16, 4]
Creating neural network of size [16, 64, 256, 1024, 512, 128, 32, 4]
Creating neural network of size [190, 153, 120, 91, 66, 45, 28, 15, 6]


In [36]:
f1s = {
    "train": [],
    "cv": []
}

for model in models:
    # Setup the loss and optimizer
    model.compile(
        loss='BinaryCrossentropy',
    
        optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=0.1),
    )
    # Train the model
    
    model.fit(
        X_train_scaled, y_train,
        epochs=10,
        verbose=2
    )
    print(f"Fitted the model {model.name} with {model.count_params()} params!")
    
    # Record the training MSEs
    yhat = (model.predict(X_train_scaled) > 0.5).astype(int)
    train_f1 = f1_score(y_train, yhat)
    f1s['train'].append(train_f1)
    
    # Record the cross validation MSEs 
    yhat = (model.predict(X_cv_scaled) > 0.5).astype(int)
    cv_ms = f1_score(y_cv, yhat)
    f1s['cv'].append(cv_ms)

# print results
print("RESULTS:")
for model_num in range(len(f1s['cv'])):
    print(
        f"Model {models[model_num].name}: Training Accuracy: {f1s['train'][model_num]:.2f}, " +
        f"CV Accuracy: {f1s['cv'][model_num]:.2f}"
        )

Epoch 1/10
21763/21763 - 9s - loss: 0.3508 - 9s/epoch - 405us/step
Epoch 2/10
21763/21763 - 9s - loss: 0.3513 - 9s/epoch - 404us/step
Epoch 3/10
21763/21763 - 9s - loss: 0.3508 - 9s/epoch - 403us/step
Epoch 4/10
21763/21763 - 8s - loss: 0.3508 - 8s/epoch - 374us/step
Epoch 5/10
21763/21763 - 8s - loss: 0.3517 - 8s/epoch - 374us/step
Epoch 6/10
21763/21763 - 8s - loss: 0.3506 - 8s/epoch - 373us/step
Epoch 7/10
21763/21763 - 8s - loss: 0.3509 - 8s/epoch - 383us/step
Epoch 8/10
21763/21763 - 8s - loss: 0.3512 - 8s/epoch - 372us/step
Epoch 9/10
21763/21763 - 8s - loss: 0.3508 - 8s/epoch - 378us/step
Epoch 10/10
21763/21763 - 8s - loss: 0.3508 - 8s/epoch - 383us/step
Fitted the model 10 with 211 params!
Epoch 1/10
21763/21763 - 9s - loss: 0.3491 - 9s/epoch - 394us/step
Epoch 2/10
21763/21763 - 9s - loss: 0.3527 - 9s/epoch - 399us/step
Epoch 3/10
21763/21763 - 9s - loss: 0.3551 - 9s/epoch - 398us/step
Epoch 4/10
21763/21763 - 9s - loss: 0.3539 - 9s/epoch - 406us/step
Epoch 5/10
21763/21763 -