In [1]:
#Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.set_option('display.max_colwidth', None)


# Metrics and auxiliary libraries from sklearn.
from sklearn.model_selection import train_test_split


# Some auxiliary functions for scoring and tuning
import scoring_utils, tuning_utils

#DEV
import importlib as imp


In [2]:
# Column of the dataframe that is used as the prediction label further down.
target = 'Default'

# Load the dataframe that was cleaned during the feature importance process.
df = pd.read_csv('../data/data_clean.csv')

In [3]:
from sklearn.preprocessing import OneHotEncoder

# Add a one-hot encoded version of UrbanRural directly to the dataframe.
# That encoding is straightforward.
encoder = OneHotEncoder(sparse_output=False)
encoded_values = encoder.fit_transform(df[['UrbanRural']])
encoded_urban_rural = pd.DataFrame(
    encoded_values,
    columns=encoder.get_feature_names_out(['UrbanRural']),
    index=df.index,  # share df's index so the column-wise concat lines rows up
)

# Concatenate the encoded DataFrame with the original DataFrame.
# Indicator columns left over from an earlier run of this cell are dropped
# first, so re-running the cell does not append duplicate columns.
df = pd.concat(
    [df.drop(columns=encoded_urban_rural.columns, errors='ignore'), encoded_urban_rural],
    axis=1,
)

# `target` is a string, so `feature not in target` would be a substring test
# (a column named e.g. 'De' would be silently excluded). Compare for equality.
all_features = [feature for feature in df.columns if feature != target]

In [4]:
# Add count encoded features
experimental_features = ['City', 'State', 'Bank', 'ApprovalFY', 'NAICS_i', 'FranchiseCode']
features = [f for f in all_features if f not in experimental_features]

# Count encoding: each listed column gets a companion '<name>Loans' column
# holding the number of rows that share the row's value in that column.
count_encoded_features = ['City', 'Bank', 'State']
loans_columns = [feature + 'Loans' for feature in count_encoded_features]

for feature, loans_column in zip(count_encoded_features, loans_columns):
    # Assign the filled result back instead of calling fillna(..., inplace=True)
    # on df[...]: the chained in-place form is deprecated in pandas and does
    # not modify df under Copy-on-Write.
    df[loans_column] = df.groupby(feature)[feature].transform('count').fillna(0)

# NOTE(review): the counts are computed on the full dataframe before the
# train/cv/test split, so the encoded values also reflect rows that end up in
# the held-out sets - confirm this is acceptable.

# Model inputs: the base features followed by the count-encoded columns.
# The membership filter keeps the list free of duplicates if `all_features`
# was rebuilt after the '<name>Loans' columns were already added to df.
features_count_encoding = [f for f in features if f not in loans_columns] + loans_columns

In [5]:
X_total = df[features_count_encoding]
y_total = df[target]

# Fixed seed so the train / cross-validation / test partitions are identical
# on every run of the notebook.
SEED = 42

# 80% of the rows go to training; the remaining 20% is halved into a
# cross-validation set and a test set.
X_train, X_, y_train, y_ = train_test_split(X_total, y_total, train_size=.8, random_state=SEED)
X_cv, X_test, y_cv, y_test = train_test_split(X_, y_, train_size=.5, random_state=SEED)

In [6]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only and reuse its statistics for the
# cross-validation, test and full feature tables.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_cv_scaled = scaler.transform(X_cv)
X_test_scaled = scaler.transform(X_test)

# Keep X_total's row index so the scaled table stays aligned with y_total
# (a DataFrame built from a bare array would otherwise get a fresh 0..n-1 index).
X_total_scaled = pd.DataFrame(
    scaler.transform(X_total),
    columns=features_count_encoding,
    index=X_total.index,
)

In [7]:
# Display 20 randomly drawn rows of the scaled feature table.
X_total_scaled.sample(n=20)

Unnamed: 0,Term,NoEmp,CreateJob,RetainedJob,UrbanRural,RevLineCr,LowDoc,DisbursementGross,GrAppv,SBA_Appv,isNewBusiness,isFranchise,SBARatio,InterestRate,UrbanRural_0,UrbanRural_1,UrbanRural_2,CityLoans,BankLoans,StateLoans
606223,1.622888,-0.033046,0.002524,-0.045527,0.377991,-0.545597,-0.376095,0.348958,0.382224,0.665597,1.594626,4.005736,1.666068,-1.223943,-0.753666,0.960058,-0.364679,-0.408771,-0.772989,-0.199842
697986,1.622888,-0.127831,-0.035574,-0.045527,1.921632,-0.545597,2.658899,-0.531931,-0.510358,-0.476222,1.594626,-0.249642,0.799992,-1.324563,-0.753666,-1.041604,2.742136,-0.527261,-0.774806,-0.880052
58065,-0.347483,-0.127831,0.070252,-0.037071,0.377991,-0.545597,-0.376095,0.161683,0.192463,0.156909,1.594626,-0.249642,0.222609,-0.139995,-0.753666,0.960058,-0.364679,-0.365274,-0.583996,-0.690608
651702,2.380723,-0.127831,-0.035574,-0.045527,-1.16565,-0.545597,-0.376095,0.50849,0.543873,0.407978,-0.627106,-0.249642,-0.066083,0.038377,1.326847,-1.041604,-0.364679,0.736637,1.559425,2.175992
821461,-0.347483,-0.019506,-0.035574,-0.045527,-1.16565,-0.545597,-0.376095,-0.531931,-0.510358,-0.487139,-0.627106,-0.249642,0.5113,0.632947,1.326847,-1.041604,-0.364679,-0.160791,-0.551519,2.175992
525320,2.380723,0.007576,-0.035574,-0.045527,0.377991,-0.545597,-0.376095,0.251852,0.28383,0.242054,-0.627106,-0.249642,0.222609,-1.004409,-0.753666,0.960058,-0.364679,-0.320278,-0.754101,2.175992
698106,1.622888,0.319013,0.112582,-0.045527,-1.16565,-0.545597,-0.376095,2.509566,2.57151,3.385879,1.594626,-0.249642,1.666068,0.248763,1.326847,-1.041604,-0.364679,-0.505763,-0.755274,-0.90704
96664,1.622888,0.021117,-0.018642,-0.045527,-1.16565,-0.545597,-0.376095,0.487681,0.522788,0.840254,-0.627106,-0.249642,1.666068,0.719846,1.326847,-1.041604,-0.364679,-0.473265,-0.770983,-0.764779
403335,-0.650617,-0.141372,-0.018642,-0.024388,0.377991,-0.545597,-0.376095,-0.566612,-0.545499,-0.504604,1.594626,-0.249642,1.088684,-1.704173,-0.753666,0.960058,-0.364679,-0.529261,-0.757924,-0.798938
465657,1.698672,-0.019506,-0.035574,-0.045527,-1.16565,-0.545597,-0.376095,0.590735,0.62721,0.643643,-0.627106,-0.249642,0.5113,0.587211,1.326847,-1.041604,-0.364679,-0.331277,-0.721359,2.175992


In [19]:
# Number of feature columns in the scaled table.
X_total_scaled.shape[1]

20

The features have been successfully scaled!

In [20]:
# import relevant tensorflow libraries
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Number of input features (columns of the scaled feature table).
cats = len(X_total_scaled.columns)

# Fully connected binary classifier: six ReLU hidden layers of decreasing
# width followed by a single sigmoid output unit.
model = Sequential([
    Dense(20, activation='relu', input_shape=(cats,)),  # First hidden layer, 20 neurons; input width = number of features
    Dense(19, activation='relu'),  # Second hidden layer, 19 neurons
    Dense(17, activation='relu'),  # Third hidden layer, 17 neurons
    Dense(14, activation='relu'),  # Fourth hidden layer, 14 neurons
    Dense(10, activation='relu'),  # Fifth hidden layer, 10 neurons
    Dense(5, activation='relu'),  # Sixth hidden layer, 5 neurons
    Dense(1, activation='sigmoid')  # Output layer with 1 neuron (for binary classification)
])




In [21]:
# Configure training: Adam optimizer, binary cross-entropy loss (binary
# classification) and accuracy reported as the tracked metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


In [22]:
# Train the model

history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)


Epoch 1/10
  297/17411 [..............................] - ETA: 3:01:31 - loss: 12754.1953 - accuracy: 0.7297

KeyboardInterrupt: 

In [None]:
# Evaluate model

loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test accuracy: {accuracy}")


Test accuracy: 0.8284932971000671


In [None]:
# Print ten random 5-row draws of the test labels to eyeball how the two
# classes are distributed.
# The loop index gets a real name: `_` conventionally marks an unused value,
# and rebinding it in IPython stops `_` from tracking the last cell output.
for sample_idx in range(10):
    print(f'Sample {sample_idx}: \n')
    print(f'{y_test.to_frame().sample(5)}')

Sample 0: 

        Default
150947        0
338401        0
743529        0
501323        0
286016        0
Sample 1: 

        Default
707727        0
704449        1
174622        0
600762        0
266210        0
Sample 2: 

        Default
264969        1
76948         0
163066        0
450309        1
639461        0
Sample 3: 

        Default
14367         1
594486        0
12819         0
123957        0
277986        1
Sample 4: 

        Default
804490        0
198316        0
818850        0
305591        1
253771        0
Sample 5: 

        Default
19900         0
83082         0
48250         0
258780        0
695272        0
Sample 6: 

        Default
600142        1
112835        0
286960        0
40852         0
860317        0
Sample 7: 

        Default
734599        1
492465        0
338658        1
606334        0
494653        0
Sample 8: 

        Default
181962        0
204303        1
264874        0
382719        0
716994        1
Sample 9: 

        Default
