In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.set_option('display.max_colwidth', None)

import tensorflow as tf

# Metrics and auxiliary utilities from sklearn.
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, f1_score, roc_auc_score

# Some auxiliary functions for scoring and tuning
import scoring_utils, tuning_utils

#DEV
import importlib as imp


In [19]:
# Name of the label column to predict.
target = 'Default'
# Load the dataframe that was cleaned during the feature importance process.
df = pd.read_csv('../data/data_clean.csv')

In [20]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode UrbanRural and append the indicator columns directly to the dataframe.
# That encoding is straightforward.
encoder = OneHotEncoder(sparse_output=False)
encoded_urban_rural = pd.DataFrame(
    encoder.fit_transform(df[['UrbanRural']]),
    columns=encoder.get_feature_names_out(['UrbanRural']),
    index=df.index,  # share df's index so the concat below aligns row by row
)

# Concatenate the encoded columns with the original dataframe.
# NOTE(review): the raw UrbanRural column is kept next to its one-hot columns and
# therefore also ends up in all_features -- confirm that this is intended.
df = pd.concat([df, encoded_urban_rural], axis=1)

# Every column except the label. `target` is a plain string, so the comparison has to
# be `!=`: `feature not in target` is a substring test and would silently drop any
# column whose name happens to be contained in the target's name.
all_features = [feature for feature in df.columns if feature != target]

In [21]:
# Add count encoded features.
# Columns held out of the baseline feature list; some come back below as counts.
experimental_features = ['City', 'State', 'Bank', 'ApprovalFY', 'NAICS_i', 'FranchiseCode']
features = [f for f in all_features if f not in experimental_features]

# Count encoding: replace each categorical value by the number of rows sharing it.
# NOTE(review): the counts are computed over the whole dataframe, i.e. before it is
# split into train/cv/test -- confirm that this is acceptable for evaluation.
count_encoded_features = ['City', 'Bank', 'State']
features_count_encoding = features + count_encoded_features

for feature in count_encoded_features:
    loans_column = feature + 'Loans'
    # Assign the filled Series back instead of calling fillna(..., inplace=True) on
    # df[loans_column]: that chained in-place call acts on a temporary object and does
    # not update df once pandas Copy-on-Write is active, leaving missing counts as NaN.
    df[loans_column] = df.groupby(feature)[feature].transform('count').fillna(0)

    # Swap the raw categorical column for its count-encoded counterpart.
    features_count_encoding.remove(feature)
    features_count_encoding.append(loans_column)

In [22]:
# Fixed seed so both splits (and every metric derived from them) are reproducible
# after a kernel restart.
RANDOM_STATE = 42

X_total = df[features_count_encoding]
y_total = df[target]

# 80% of the rows go to training; the remaining 20% is halved into a
# cross-validation set and a test set.
X_train, X_, y_train, y_ = train_test_split(
    X_total, y_total, train_size=.8, random_state=RANDOM_STATE
)
X_cv, X_test, y_cv, y_test = train_test_split(
    X_, y_, train_size=.5, random_state=RANDOM_STATE
)

In [23]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then reuse it to transform every other set.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_cv_scaled = scaler.transform(X_cv)
X_test_scaled = scaler.transform(X_test)

# Scaled copy of the full feature matrix as a labelled dataframe. Carry over
# X_total's index so the rows stay aligned with df / y_total even if the index is
# not a default 0..n-1 range.
X_total_scaled = pd.DataFrame(
    scaler.transform(X_total),
    columns=features_count_encoding,
    index=X_total.index,
)

In [39]:
# Display 20 randomly sampled rows of the scaled feature frame as a visual sanity check.
X_total_scaled.sample(20)

Unnamed: 0,Term,NoEmp,CreateJob,RetainedJob,UrbanRural,RevLineCr,LowDoc,GrAppv,SBA_Appv,isNewBusiness,isFranchise,SBARatio,InterestRate,UrbanRural_0,UrbanRural_1,UrbanRural_2,CityLoans,BankLoans,StateLoans
843719,-0.346813,-0.098365,-0.036044,-0.045697,-1.16486,-0.54602,2.655809,-0.333997,-0.311917,-0.626955,-0.248921,0.51051,0.589958,1.325804,-1.040957,-0.364586,1.097805,-0.637978,2.175178
437577,-0.649898,-0.124602,-0.036044,-0.045697,0.378586,-0.54602,2.655809,-0.52046,-0.487105,1.59501,-0.248921,0.79909,0.631115,-0.754259,0.960655,-0.364586,-0.40118,-0.725138,0.378949
437375,-1.256067,0.360783,-0.036044,0.115252,1.922032,1.831433,-0.376533,-0.633042,-0.62905,-0.626955,-0.248921,-1.220974,-1.710258,-0.754259,-1.040957,2.742839,-0.430093,-0.772257,-0.529501
699760,-0.346813,-0.124602,-0.036044,-0.045697,-1.16486,-0.54602,2.655809,-0.545087,-0.504384,1.59501,4.017344,1.087671,0.246983,1.325804,-1.040957,-0.364586,-0.014345,-0.761584,-0.984536
408173,-0.346813,-0.124602,-0.011236,-0.037443,0.378586,-0.54602,-0.376533,-0.333997,-0.443144,1.59501,-0.248921,-1.220974,1.179874,-0.754259,0.960655,-0.364586,-0.238171,2.242443,-0.040369
27308,-0.536241,-0.124602,-0.036044,-0.045697,-1.16486,-0.54602,2.655809,-0.527497,-0.504384,1.59501,-0.248921,0.51051,0.718002,1.325804,-1.040957,-0.364586,-0.434081,-0.757118,0.693621
248099,-0.46047,-0.072128,-0.036044,-0.020935,0.378586,1.831433,-0.376533,-0.509906,-0.552501,-0.626955,-0.248921,-1.220974,0.603677,-0.754259,0.960655,-0.364586,-0.472465,1.023219,-0.492618
31747,-1.167668,-0.13772,-0.031909,-0.037443,1.922032,1.831433,-0.376533,-0.650633,-0.639986,-0.626955,-0.248921,-1.220974,-0.52128,-0.754259,-1.040957,2.742839,2.400879,0.451017,-0.306023
517160,-0.346813,0.059057,-0.036044,0.020333,0.378586,1.831433,-0.376533,0.19373,-0.115075,-0.626955,-0.248921,-1.220974,-1.723977,-0.754259,0.960655,-0.364586,1.209967,1.023219,-0.525068
762978,0.562441,-0.098365,-0.036044,-0.045697,-1.16486,-0.54602,2.655809,-0.349829,-0.28589,1.59501,-0.248921,1.087671,0.946651,1.325804,-1.040957,-0.364586,-0.37775,-0.761471,-0.803414


The features have been successfully scaled!