In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Show full cell contents when displaying DataFrames (no column-width truncation).
pd.set_option('display.max_colwidth', None)

import tensorflow as tf

# Metrics and auxiliary utilities from sklearn.
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, f1_score, roc_auc_score

# Some auxiliary functions for scoring and tuning (project-local modules).
import scoring_utils, tuning_utils

# DEV: presumably used to reload the local helper modules while iterating on
# them (importlib.reload) — TODO confirm, and drop once the helpers are stable.
import importlib as imp


In [2]:
# Name of the column holding the label.
target = 'Default'

# Load the dataframe that was cleaned during the feature importance process.
df = pd.read_csv(filepath_or_buffer='../data/data_clean.csv')

In [3]:
from sklearn.preprocessing import OneHotEncoder

# Add the one-hot encoded version of UrbanRural directly to the dataframe.
# That encoding is straightforward: one indicator column per category value.
encoder = OneHotEncoder(sparse_output=False)
encoded_urban_rural = pd.DataFrame(
    encoder.fit_transform(df[['UrbanRural']]),
    columns=encoder.get_feature_names_out(['UrbanRural']),
    index=df.index,  # share df's index so the concat below aligns row by row
)

# Concatenate the encoded columns with the original DataFrame. Indicator
# columns left over from a previous run of this cell are dropped first, so
# re-running the cell does not append duplicate columns.
# NOTE(review): the original UrbanRural column is kept next to its indicator
# columns — confirm that feeding both to the model is intended.
df = pd.concat(
    [df.drop(columns=encoded_urban_rural.columns, errors='ignore'), encoded_urban_rural],
    axis=1,
)

# Every column except the target is a candidate feature.
# Compare with `!=`: since `target` is a string, `feature not in target` is a
# substring test, which would also discard any column whose name happens to be
# a substring of the target name and fails on non-string column labels.
all_features = [feature for feature in df.columns if feature != target]

In [4]:
# Add count encoded features
# Columns kept out of the base feature list; some of them are re-introduced
# below in count-encoded form.
experimental_features = ['City', 'State', 'Bank', 'ApprovalFY', 'NAICS_i', 'FranchiseCode']
features = [f for f in all_features if f not in experimental_features]

# Count encoding: each of these columns is represented by a new '<name>Loans'
# column holding the number of rows that share the same value.
count_encoded_features = ['City', 'Bank', 'State']

for feature in count_encoded_features:
    # Assign the filled result directly instead of calling
    # `df[col].fillna(0, inplace=True)`: that chained in-place call acts on a
    # temporary object under pandas Copy-on-Write and would leave NaNs in df.
    # Rows that end up without a count (NaN) are given a count of 0.
    df[feature + 'Loans'] = (
        df.groupby(feature)[feature].transform('count').fillna(0)
    )

# Final model inputs: the base features followed by the count-encoded columns,
# in the same order as count_encoded_features.
features_count_encoding = features + [name + 'Loans' for name in count_encoded_features]

In [5]:
# Full feature matrix and label vector.
X_total = df[features_count_encoding]
y_total = df[target]

# Fixed seed so the train / cv / test partitions (and every metric computed on
# them) are identical after Restart Kernel -> Run All.
SPLIT_SEED = 42

# 80% train; the remaining 20% is halved into cross-validation and test sets
# (80 / 10 / 10 overall).
# NOTE(review): if the target classes are imbalanced, consider passing
# stratify=y_total / stratify=y_ to keep class ratios equal across the splits.
X_train, X_, y_train, y_ = train_test_split(
    X_total, y_total, train_size=.8, random_state=SPLIT_SEED
)
X_cv, X_test, y_cv, y_test = train_test_split(
    X_, y_, train_size=.5, random_state=SPLIT_SEED
)

In [6]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply the same training
# statistics to every other split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_cv_scaled = scaler.transform(X_cv)
X_test_scaled = scaler.transform(X_test)

# Scale the full feature matrix from the unscaled columns of df rather than
# from X_total itself: `X_total = scaler.transform(X_total)` would scale the
# already-scaled array again every time this cell is re-run.
# After this line X_total is a NumPy array (no column labels).
X_total = scaler.transform(df[features_count_encoding])

In [7]:
# Per-column summary statistics of X_total, labelled with the feature names.
pd.DataFrame(data=X_total, columns=features_count_encoding).describe()

Unnamed: 0,Term,NoEmp,CreateJob,RetainedJob,UrbanRural,RevLineCr,LowDoc,GrAppv,SBA_Appv,isNewBusiness,isFranchise,SBARatio,InterestRate,UrbanRural_0,UrbanRural_1,UrbanRural_2,CityLoans,BankLoans,StateLoans
count,870514.0,870514.0,870514.0,870514.0,870514.0,870514.0,870514.0,870514.0,870514.0,870514.0,870514.0,870514.0,870514.0,870514.0,870514.0,870514.0,870514.0,870514.0,870514.0
mean,0.000542,-0.000971,0.000169,0.000188,-0.000131,2.8e-05,-0.000135,-0.00012,-7.8e-05,3.2e-05,-9.3e-05,0.000361,0.000107,-8e-06,0.000185,-0.000275,0.000659,-0.000611,3.9e-05
std,1.000278,0.953557,1.003488,1.00353,0.999841,1.000019,0.999846,1.000379,1.000275,1.000016,0.999825,0.999817,1.000204,0.999998,0.999993,0.999674,1.001738,0.999298,1.000095
min,-1.408191,-0.147729,-0.035846,-0.045597,-1.165203,-0.545535,-0.376155,-0.683117,-0.65991,-0.627151,-0.249126,-3.944193,-1.763864,-0.75398,-1.041145,-0.364727,-0.530723,-0.774864,-1.037837
25%,-0.650189,-0.122161,-0.035846,-0.045597,-1.165203,-0.545535,-0.376155,-0.563485,-0.564197,-0.627151,-0.249126,-1.220584,-0.968246,-0.75398,-1.041145,-0.364727,-0.495247,-0.753987,-0.713389
50%,-0.346988,-0.096592,-0.035846,-0.041423,0.378183,-0.545535,-0.376155,-0.355887,-0.383478,-0.627151,-0.249126,0.222434,0.257189,-0.75398,0.960481,-0.364727,-0.393816,-0.536068,-0.454383
75%,0.107813,-0.019888,-0.031664,-0.028901,0.378183,-0.545535,-0.376155,0.122643,0.111477,1.594513,-0.249126,0.799641,0.632136,1.326295,0.960481,-0.364727,-0.034563,0.520513,0.378755
max,5.780192,127.680157,36.764139,39.607255,1.921569,1.833062,2.658477,16.906392,19.005004,1.594513,4.01404,1.665451,6.93764,1.326295,0.960481,2.741775,5.024471,2.240268,2.175256


The features have been successfully scaled: every column now has a mean close to 0 and a standard deviation close to 1.