In [74]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.set_option('display.max_colwidth', None)

import tensorflow as tf

# Metrics and auxiliar libraries from sklearn.
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, f1_score, roc_auc_score

# Some auxiliary functions for scoring and tuning
import scoring_utils, tuning_utils

#DEV
import importlib as imp


In [75]:
# Import the dataframe cleaned during the feature importance process.
df = pd.read_csv('../data/data_clean.csv')
target = 'Default'

In [76]:
from sklearn.preprocessing import OneHotEncoder
# Add UrbanRural one-hot encoded version diretly to the dataframe. That encoding is straighforward.
encoder = OneHotEncoder(sparse_output=False)
encoded_urban_rural = encoder.fit_transform(df[['UrbanRural']])
encoded_urban_rural = pd.DataFrame(encoded_urban_rural, columns=encoder.get_feature_names_out(['UrbanRural']))

encoded_urban_rural.index = df.index

# Concatenating the encoded DataFrame with the original DataFrame
df = pd.concat([df, encoded_urban_rural], axis=1)

all_features = [feature for feature in df.columns if feature not in target]

In [77]:
# Add count encoded features
experimental_features = ['City', 'State', 'Bank', 'ApprovalFY', 'NAICS_i', 'FranchiseCode']
features = [f for f in all_features if f not in experimental_features]
# Count encoding
count_encoded_features = ['City', 'Bank', 'State']
features_count_encoding = features + count_encoded_features

for feature in count_encoded_features:
    df[feature + 'Loans'] = df.groupby(feature)[feature].transform('count')
    df[feature + 'Loans'].fillna(0, inplace=True)

    features_count_encoding.remove(feature)
    features_count_encoding.append(feature+'Loans')

In [78]:
# Look for and remove outliers

def remove_outliers(df:pd.DataFrame, c:str):
    z_scores = np.abs((df[c] - df[c].mean())/df[c].std())
    no_outliers = df[z_scores < 3]
    return no_outliers

#remove outliers from features

have_outliers = ['Term','NoEmp','CreateJob','RetainedJob','GrAppv','SBA_Appv']

for col in have_outliers:
    # print(f"\n\nCol: {col}")
    # print("\nBefore:")
    # for s in df[col].describe().to_string().split('\n'):
    #     print(f'\t{s}')
    df_filtered = remove_outliers(df,col)
    # print("\nAfter:")
    # for s in df[col].describe().to_string().split('\n'):
    #     print(f'\t{s}')


In [80]:
# Apply logarithmic transform to selected columns
log_columns = ['Term', 'DisbursementGross', 'GrAppv', 'SBA_Appv','CreateJob']
for col in log_columns:
    df[col] = np.log1p(df[col])  # Using np.log1p to avoid log(0) issues

In [81]:
X_total = df_filtered[features_count_encoding]
y_total = df_filtered[target]

X_train, X_, y_train, y_ = train_test_split(X_total, y_total, train_size=.8)
X_cv, X_test, y_cv, y_test = train_test_split(X_, y_, train_size=.5)

In [82]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_cv_scaled = scaler.transform(X_cv)
X_test_scaled = scaler.transform(X_test)
X_total_scaled = pd.DataFrame(scaler.transform(X_total), columns=features_count_encoding)

In [83]:
X_total_scaled.sample(20)

Unnamed: 0,Term,NoEmp,CreateJob,RetainedJob,UrbanRural,RevLineCr,LowDoc,DisbursementGross,GrAppv,SBA_Appv,isNewBusiness,isFranchise,SBARatio,InterestRate,UrbanRural_0,UrbanRural_1,UrbanRural_2,CityLoans,BankLoans,StateLoans
111147,0.137865,-0.054494,-0.035296,-0.015807,0.387977,-0.552163,-0.380602,1.151649,1.362151,1.313584,1.58897,-0.244781,0.241124,0.47484,-0.763211,0.969311,-0.362837,-0.505395,-0.384514,-0.521525
550731,-0.325298,0.05363,-0.022887,-0.032331,0.387977,1.81106,-0.380602,0.320145,0.364542,-0.03126,-0.629338,-0.244781,-1.207336,-1.007206,-0.763211,0.969311,-0.362837,3.257084,-0.636427,0.387531
656850,-0.325298,-0.135587,-0.035296,-0.040593,0.387977,1.81106,-0.380602,-0.632714,-0.665101,-0.700706,-0.629338,-0.244781,-1.207336,-1.345698,-0.763211,0.969311,-0.362837,-0.209683,2.219625,-0.488894
166584,-0.325298,-0.122072,-0.035296,-0.036462,0.387977,1.81106,-0.380602,-0.767553,-0.742896,-0.751287,1.58897,-0.244781,-1.207336,0.59377,-0.763211,0.969311,-0.362837,1.160668,2.219625,0.703957
140237,-0.325298,-0.095041,-0.027024,-0.0282,0.387977,-0.552163,-0.380602,-0.758564,-0.733743,-0.724509,1.58897,-0.244781,0.820509,0.59377,-0.763211,0.969311,-0.362837,-0.231624,0.077539,-0.899749
238238,2.453678,-0.149103,-0.018751,-0.044724,0.387977,-0.552163,-0.380602,0.765112,0.817584,0.263297,1.58897,-0.244781,-1.207336,0.369633,-0.763211,0.969311,-0.362837,2.401363,1.006203,-0.30126
20123,2.453678,-0.095041,-0.035296,-0.044724,-1.153145,-0.552163,2.627415,-0.455177,-0.424851,-0.40615,-0.629338,-0.244781,0.530816,0.712699,1.310253,-1.03166,-0.362837,-0.400176,-0.774399,-0.521525
389210,-0.479686,-0.06801,-0.035296,-0.044724,0.387977,-0.552163,2.627415,-0.623725,-0.596458,-0.584669,-0.629338,-0.244781,0.530816,1.174695,-0.763211,0.969311,-0.362837,-0.225142,-0.778316,0.703957
659150,-0.325298,-0.135587,-0.035296,-0.040593,0.387977,1.81106,-0.380602,-0.758564,-0.733743,-0.745336,-0.629338,-0.244781,-1.207336,-1.345698,-0.763211,0.969311,-0.362837,-0.352303,1.006203,0.703957
789174,1.68174,-0.095041,-0.035296,-0.0282,0.387977,-0.552163,-0.380602,-0.138306,-0.102229,0.105605,-0.629338,-0.244781,1.689585,-0.417132,-0.763211,0.969311,-0.362837,-0.435581,-0.739143,2.193773


The features have been succesfully scaled!

In [90]:
network = tf.keras.Sequential([
    tf.keras.layers.Dense(activation='relu', units=16),
    tf.keras.layers.Dense(activation='relu', units=32),
    tf.keras.layers.Dense(activation='sigmoid', units=64),
    tf.keras.layers.Dense(activation='relu', units=8),
    tf.keras.layers.Dense(activation='sigmoid', units=1)
])

network.compile(
    loss='BinaryCrossentropy',
    optimizer=tf.keras.optimizers.legacy.Adam(.001)
)

In [91]:
network.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_cv_scaled, y_cv))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x2febfb210>

In [None]:
from tensorflow.keras.layers import Input, Dense, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers.legacy import Adam

# Assuming 'input_shape' is the number of features

# Wide part of the network (linear relationships)
input_layer = Input(shape=(20,))
wide_branch = Dense(128, activation='relu')(input_layer)
wide_branch = Dense(64, activation='relu')(wide_branch)

# Deep part of the network (non-linear relationships)
deep_branch = Dense(64, activation='relu')(input_layer)
deep_branch = Dense(128, activation='relu')(deep_branch)
deep_branch = Dense(64, activation='relu')(deep_branch)

# Concatenate wide and deep parts
concatenated = Concatenate()([wide_branch, deep_branch])

# Output layer
output = Dense(1, activation='sigmoid')(concatenated)

# Create model
model = Model(inputs=input_layer, outputs=output)

# Compile the model
model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])


In [None]:
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_cv_scaled, y_cv))

In [None]:
y_cv_df = pd.DataFrame(y_cv)
yhat_cv_df = pd.DataFrame(model.predict(X_cv_scaled), index=y_cv_df.index)

pd.concat([
    y_cv_df,
    yhat_cv_df
], axis=1).head(20)

In [None]:
scoring_utils.get_metrics(y_cv, yhat_cv_df>.5, "Neural Network")

In [None]:
y_cv

In [None]:
yhat_cv_df.shape

In [None]:
scoring_utils.get_confusion_matrix(y_cv, yhat_cv_df[0]>.5, "Neural Network")