In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('../data/SBAnational.csv')
df.head(1)

  df = pd.read_csv('../data/SBAnational.csv')


Unnamed: 0,LoanNr_ChkDgt,Name,City,State,Zip,Bank,BankState,NAICS,ApprovalDate,ApprovalFY,...,RevLineCr,LowDoc,ChgOffDate,DisbursementDate,DisbursementGross,BalanceGross,MIS_Status,ChgOffPrinGr,GrAppv,SBA_Appv
0,1000014003,ABC HOBBYCRAFT,EVANSVILLE,IN,47711,FIFTH THIRD BANK,OH,451120,28-Feb-97,1997,...,N,Y,,28-Feb-99,"$60,000.00",$0.00,P I F,$0.00,"$60,000.00","$48,000.00"


In [3]:
# Drop the columns that are not needed for this first iteration.
# NOTE: 'Name' is part of the list below, so it is removed together with the rest.
drop_columns = [
    'Name', 'LoanNr_ChkDgt', 'City', 'State', 'Zip', 'Bank', 'BankState', 'ApprovalFY',
    'CreateJob', 'RetainedJob', 'ChgOffDate', 'DisbursementDate', 'DisbursementGross',
    'BalanceGross', 'ChgOffPrinGr',
]
df = df.drop(columns=drop_columns)
df.head(1)

Unnamed: 0,NAICS,ApprovalDate,Term,NoEmp,NewExist,FranchiseCode,UrbanRural,RevLineCr,LowDoc,MIS_Status,GrAppv,SBA_Appv
0,451120,28-Feb-97,84,4,2.0,1,0,N,Y,P I F,"$60,000.00","$48,000.00"


# Data cleaning

### Target
The target will be binary encoded: 1 represents a defaulted loan and 0 a loan that was paid in full.

In [4]:
# Drop data with null MIS_Status, as it is useless
df.dropna(subset=['MIS_Status'], inplace=True)
df['MIS_Status'].describe()

count     897167
unique         2
top        P I F
freq      739609
Name: MIS_Status, dtype: object

In [5]:
# Binary target: 1 when MIS_Status is 'CHGOFF' (a default), 0 for every other status.
# Putting the default on 1 keeps the positive class equal to the event of interest.
target = 'Default'
df[target] = (df['MIS_Status'] == 'CHGOFF').astype(int)
df[target].describe()

count    897167.000000
mean          0.175617
std           0.380494
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: Default, dtype: float64

In [6]:
# Now let's go over all of these columns
df.columns

Index(['NAICS', 'ApprovalDate', 'Term', 'NoEmp', 'NewExist', 'FranchiseCode',
       'UrbanRural', 'RevLineCr', 'LowDoc', 'MIS_Status', 'GrAppv', 'SBA_Appv',
       'Default'],
      dtype='object')


### NAICS
North American Industry Classification System code\
**Encoding:**\
It does not make sense as a float number. Needs to be changed to a string.\
One-Hot for tree-based models. See [census](https://www.census.gov/naics/?58967?yearbck=2012) for identifying the industries. Note: It's the 2002 naming for some reason, not the 2012 one. Probably they are even mixed.
For regression-based models we could do the same as for *bank* and just substitute it for its default ratio.\

As we're keeping things simple in this first iteration, we will only take a look at the first two digits of NAICS, which encode the broad industry.

In [7]:
## Convert NAICS to a string to make easier getting two digits.
df['NAICS'] = df['NAICS'].astype(str)

## Get only the first two digits of NAICS, and convert them back to an integer.
def NAICS_conversion(naics_string):
    """Return the leading two characters of a NAICS code as an int.

    The argument is passed through ``str`` first, so both the string form
    (``'451120'``) and a raw numeric code (``451120``) are accepted.
    Codes shorter than two characters, such as ``'0'``, are converted as they are.

    Raises:
        ValueError: if the leading characters do not form an integer.
    """
    return int(str(naics_string)[:2])

df['NAICS_i'] = df['NAICS'].apply(NAICS_conversion).astype(int)

In [8]:
df['NAICS_i'].value_counts().head(5)

NAICS_i
0     201667
44     84567
81     72395
54     67922
72     67511
Name: count, dtype: int64

In [9]:
# We are going to create an array of features to be considered in the model.
features = ['NAICS_i']

### ApprovalDate

Will get engineered to obtain the interest rate

### Term
We will use term in months as a numerical feature

In [10]:
# Term gets directly used as a numerical feature.
features.append('Term')

### NoEmp

Keep it for now

### NewExist
We will convert it to isNewBusiness, which is 1 when the business is newly created and 0 when it already existed.

In [11]:
# NewExist has samples with value 0.0 and nulls. Let's drop these.
# .copy() detaches the filtered frame from the original, so adding columns to it
# later does not raise pandas' SettingWithCopyWarning.
df = df[(df['NewExist'] != 0) & (df['NewExist'].notnull())].copy()
df['NewExist'].value_counts()

NewExist
1.0    643446
2.0    252559
Name: count, dtype: int64

In [12]:
# Make a column called isNewBusiness: 1 when NewExist is 2 (new business),
# 0 when NewExist is 1 (existing business).
# Comparing against 2 produces an integer flag (like isFranchise, RevLineCr and
# LowDoc) instead of the float that `NewExist - 1` yields; any value other than 2 maps to 0.
df['isNewBusiness'] = (df['NewExist'] == 2).astype(int)
df['isNewBusiness'].describe()

count    896005.000000
mean          0.281872
std           0.449912
min           0.000000
25%           0.000000
50%           0.000000
75%           1.000000
max           1.000000
Name: isNewBusiness, dtype: float64

In [13]:
# Gets added as a feature
features.append('isNewBusiness')

### FranchiseCode
For now, we will use it as isFranchise, being 1 when a business is a franchise and 0 otherwise.

In [14]:
# Helper that derives the binary 'isFranchise' flag from a franchise code
def is_franchise(code):
    """Return 0 when ``code`` is 0 or 1 (no franchise) and 1 for any other value."""
    return int(code not in (0, 1))

df['isFranchise'] = df['FranchiseCode'].apply(is_franchise)

In [15]:
df['isFranchise'].describe()

count    896005.000000
mean          0.057647
std           0.233075
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: isFranchise, dtype: float64

In [16]:
# Gets added to our features
features.append('isFranchise')

### UrbanRural
Each of the three options of UrbanRural (0, 1 and 2) are considered as valid inputs.

In [17]:
df['UrbanRural'] = df['UrbanRural'].astype(str)
df['UrbanRural'].describe()

count     896005
unique         3
top            1
freq      468811
Name: UrbanRural, dtype: object

In [18]:
# Gets added to our features
features.append('UrbanRural')

### RevLineCr
We will reduce it to 1, if it is a Revolving Line of Credit, or 0 otherwise.

In [19]:
df['RevLineCr'].value_counts()

RevLineCr
N    418229
0    257375
Y    200588
T     15232
1        22
R        14
`        11
2         6
C         2
3         1
,         1
7         1
A         1
5         1
.         1
4         1
-         1
Q         1
Name: count, dtype: int64

In [20]:
# Many of the observed values are junk. 'N' and '0' are both treated as "no" (0)
# and 'Y' as "yes" (1); every other value is left untouched here and is filtered
# out in the next step.
revline_codes = {'N': 0, '0': 0, 'Y': 1}
df['RevLineCr'] = df['RevLineCr'].replace(revline_codes)

In [21]:
# Keep only the rows whose RevLineCr is 0 or 1. .copy() makes the filtered frame
# independent so the column assignment below does not raise SettingWithCopyWarning.
df = df[df['RevLineCr'].isin([0, 1])].copy()
df['RevLineCr'] = df['RevLineCr'].astype(int)

In [22]:
df['RevLineCr'].describe()

count    876192.000000
mean          0.228932
std           0.420145
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: RevLineCr, dtype: float64

In [23]:
# Gets added to our features
features.append('RevLineCr')

### LowDoc
Similarly, will be 1 if the loan is part of the LowDoc program and 0 otherwise.

In [24]:
df['LowDoc'].value_counts()

LowDoc
N    762649
Y    107865
0      1238
C       752
S       599
A       491
R        73
1         1
Name: count, dtype: int64

In [25]:
# Map 'N' to 0 and 'Y' to 1. Every other value is left as it is here and is
# removed by the filter in the next step.
lowdoc_codes = {'N': 0, 'Y': 1}
df['LowDoc'] = df['LowDoc'].replace(lowdoc_codes)

In [26]:
# Keep only the rows whose LowDoc is 0 or 1. .copy() makes the filtered frame
# independent so the column assignment below does not raise SettingWithCopyWarning.
df = df[df['LowDoc'].isin([0, 1])].copy()
df['LowDoc'] = df['LowDoc'].astype(int)

In [27]:
df['LowDoc'].describe()

count    870514.000000
mean          0.123910
std           0.329479
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: LowDoc, dtype: float64

In [28]:
features.append('LowDoc')

### GrAppv
The size of the loan will be used as a feature after being converted to float numbers.

In [29]:
# Parse dollar values to float values
def dollar_to_float(dollar_string: "str | float") -> float:
    """Convert a dollar amount such as ``'$60,000.00'`` to a float.

    Strings have every ``$`` and ``,`` removed before conversion. Values that are
    not strings (for example a column that was already converted) are passed
    straight to ``float``, so applying the function twice is harmless.

    Raises:
        ValueError: if the cleaned string is not a valid number.
    """
    if isinstance(dollar_string, str):
        return float(dollar_string.replace('$', '').replace(',', ''))
    return float(dollar_string)

In [30]:
df['GrAppv'] = df['GrAppv'].apply(dollar_to_float)

In [31]:
df['GrAppv'].describe()

count    8.705140e+05
mean     1.951101e+05
std      2.843112e+05
min      1.000000e+03
25%      3.500000e+04
50%      9.400000e+04
75%      2.300000e+05
max      5.000000e+06
Name: GrAppv, dtype: float64

In [32]:
# Gets added to our features
features.append('GrAppv')

### SBA_Appv

In [33]:
# Convert SBA_Appv to float numbers
df['SBA_Appv'] = df['SBA_Appv'].apply(dollar_to_float)

In [34]:
df['SBA_Appv'].describe()

count    8.705140e+05
mean     1.514752e+05
std      2.288713e+05
min      5.000000e+02
25%      2.240000e+04
50%      6.375000e+04
75%      1.770000e+05
max      4.500000e+06
Name: SBA_Appv, dtype: float64

In [35]:
# Gets added to our features
features.append('SBA_Appv')

# Feature Exploration

In [None]:
# Encode the target variable to binary values
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df['PaidInFull'] = label_encoder.fit_transform(df['MIS_Status'])
df.drop(['MIS_Status'], inplace=True, axis=1)
print(df['PaidInFull'].describe())
# Paid in Full is encoded as 1

In [None]:
print(f'Our ratio of target variable is {df["PaidInFull"].mean()} ')

## Direct Inclusion

Can be directly included or with minimum manipulation.

### NAICS
North American Industry Classification System code\
**Encoding:**\
It does not make sense as a float number. Needs to be changed to a string.\
One-Hot for tree-based models. We can do a similar encoding as in the *Bank* feature. See [census](https://www.census.gov/naics/?58967?yearbck=2012) for identifying the industries. Note: It's the 2002 naming for some reason, not the 2012 one. Probably they are even mixed.
<mark>TODO: Identify most common industries.</mark> \
For regression-based models we could do the same as for *bank* and just substitute it for its default ratio.\
**Cleaning:**\
A bunch of them have 0 value.
<mark>TODO: Decide what to do with the zeros.</mark>

In [None]:
# Change to a string to get categorical descriptions.
df['NAICS'] = df['NAICS'].astype(str)
df['NAICS'].describe()

In [None]:
top_naics = df['NAICS'].value_counts().head()
print(f"\nThe top 5 most common NAICS:")
top_naics

### Term
Loan term in months\
**Encoding:**\
For regression models it is immediate, since it is a numeric feature.\
For tree-based models we may have to build bins. An idea would be to divide it into short, medium and long terms.
<mark>TODO: Design bins to divide the terms.</mark>

In [None]:
df['Term'].describe()

In [None]:
# Helper function to draw histograms
def draw_histogram(column: pd.Series, name: str, nbins: int = 50, log: bool = False) -> None:
    """Show a histogram of ``column`` in a new 8x6 figure.

    Args:
        column: the values to bin; the calls in this notebook pass a single
            DataFrame column.
        name: text used for the x-axis label and inside the figure title.
        nbins: number of bins passed to ``hist``.
        log: forwarded to ``hist``'s ``log`` argument (logarithmic count axis).
    """
    _, ax = plt.subplots(figsize=(8, 6))
    ax.hist(column, bins=nbins, alpha=0.7, color='skyblue', edgecolor='black', log=log)
    ax.set_title(f'Histogram of {name}')
    ax.set_xlabel(name)
    ax.set_ylabel('Frequency')
    ax.grid(True)
    plt.show()

In [None]:
draw_histogram(df['Term'], 'Term, in months', 40)

### NoEmp
Number of Business Employees\
**Encoding:**\
For linear models it makes sense to use directly as a feature.\
For tree based models we might want to make bins for different business sizes.
<mark>TODO: Design bins to divide the business sizes.</mark>

In [None]:
df['NoEmp'].describe()

In [None]:
# These are some of the biggest ones. They could maybe be considered as outliers.
df[df['NoEmp'] >= 9800]

In [None]:
# What should we do with the fellas with 0 employees?
print(f"{len(df[df['NoEmp'] < 1])} have 0 employees")

In [None]:
# I'll only pull the ones under 50 to see with bigger detail.
#draw_histogram(df[df['NoEmp']<=50]['NoEmp'], "Number of employees", 50)
draw_histogram(df['NoEmp'], "Number of employees", 50, log=True)

### NewExist
1 = Existing Business, 2 = New Business\
**Encoding:**\
For both trees and regressions, the encoding is straightforward binary, 1 = existing, 0 = new.\
**Cleaning:**\
1028 values with 0.0.\
We can fill them with the mode (Existing) or toss the samples.
<mark>TODO: Decide what to do with the nulls.</mark>

In [None]:
df['NewExist'] = df['NewExist'].astype(str)
df['NewExist'].describe()

In [None]:
# 136 nulls.
df['NewExist'].value_counts()

### FranchiseCode
Franchise Code 00000 or 00001 = No Franchise\
**Encoding:**\
Has to be changed to strings.
For both tree and regression models it would be good to feature engineer it into a division between *no-franchise, major-franchise and other-franchise*. We could have one bin for Subway, Quiznos, etc; one for other franchises and one for no franchise.\
Some values included are:\
78760: Subway\
68020: Quiznos\
50564: Mail Boxes Etc\
21780: Dairy Queen\
25650: Dunkin\
79140: Super 8\
<mark>TODO: Design bins to divide the franchises.</mark>

In [None]:
df['FranchiseCode'] = df['FranchiseCode'].astype(str)
df['FranchiseCode'].describe()

In [None]:
top_franchises = df['FranchiseCode'].value_counts().head(10)
print(f"\nThe top 10 most common franchises:")
top_franchises

### UrbanRural
1= Urban, 2= Rural, 0 = Undefined\
**Encoding:**\
Changed to string. It has to be one-hot encoded for both trees and regression. 105343 are undefined, so "undefined" should be its own category.

In [None]:
df['UrbanRural'] = df['UrbanRural'].astype(str)
df['UrbanRural'].describe()

In [None]:
df['UrbanRural'].value_counts()

### RevLineCr
Revolving Line of Credit: Y = Yes\
**Encoding:**\
Makes sense to binary-encode it for both trees and regression. \
**Cleaning:**\
There is a significant number of values which don't have a clear meaning (i.e. 0, T, 1, R, 2, C). There are also nulls.
<mark>TODO: Clean this data. </mark>

In [None]:
df['RevLineCr'].describe()

In [None]:
df['RevLineCr'].value_counts()

In [None]:
print(f"{len(df[df['RevLineCr'].isnull()])} nulls")

### LowDoc
LowDoc Loan Program: Y = Yes, N = No\
**Encoding:**\
Makes sense to binary-encode it for both trees and regression. \
**Cleaning:**\
There is a significant number of values which don't have a clear meaning (i.e. 0, C, S, A, 1). There are also nulls.
<mark>TODO: Clean this data. </mark>

In [None]:
df['LowDoc'].describe()

In [None]:
df['LowDoc'].value_counts()

In [None]:
print(f"{len(df[df['LowDoc'].isnull()])} nulls")

In [None]:
# Convert dollar values to float values
def dollar_to_float(dollar_string: "str | float") -> float:
    """Convert a dollar amount such as ``'$60,000.00'`` to a float.

    Strings have every ``$`` and ``,`` removed before conversion. Values that are
    not strings (for example a column that has already been converted to floats)
    are passed straight to ``float`` instead of failing on ``.replace``.

    Raises:
        ValueError: if the cleaned string is not a valid number.
    """
    if isinstance(dollar_string, str):
        return float(dollar_string.replace('$', '').replace(',', ''))
    return float(dollar_string)

### GrAppv
Gross Amount of Loan Approved by Bank\
**Encoding:**\
Has to be translated into floats.

In [None]:
df['GrAppv'] = df['GrAppv'].apply(dollar_to_float)

# Scientific notation is useless for now
pd.set_option('display.float_format', lambda x: '%.3f' % x)

df['GrAppv'].describe()


In [None]:
# Histogram of the gross approved amount over all loans, with a logarithmic count axis.
# Alternative view limited to loans under $1 million, kept for reference:
#draw_histogram(df[df['GrAppv'] < 1000000]['GrAppv'], 'Gross Amount Approved, $', 50)
draw_histogram(df['GrAppv'], name='Gross Amount Approved, $', nbins=50, log=True)

### SBA_Appv
SBA’s Guaranteed Amount of Approved Loan\
**Encoding:**\
Has to be translated into floats.

In [None]:
df['SBA_Appv'] = df['SBA_Appv'].apply(dollar_to_float)
df['SBA_Appv'].describe()

In [None]:
# Histogram of the SBA-guaranteed amount over all loans, with a logarithmic count axis.
# The label names the SBA guarantee; the previous text was the gross-amount label
# copied from the GrAppv plot.
# Alternative view limited to guarantees under $200k, kept for reference:
#draw_histogram(df[df['SBA_Appv'] < 200000]['SBA_Appv'], 'SBA Guaranteed Amount Approved, $', 50)
draw_histogram(df['SBA_Appv'], 'SBA Guaranteed Amount Approved, $', 50, log=True)

## Engineered Features

### SBARatio
The percentage of loan insured by SBA

In [None]:
df['SBARatio'] = df['SBA_Appv'] / df['GrAppv']
df['SBARatio'].describe()

In [None]:
draw_histogram(df['SBARatio'], 'SBA ratio of insurance', 20)
# Most are located in round numbers, like 50%, 75%, 80% etc.

### NAICS_i
NAICS as an integer to perform correlation analysis

In [None]:
def NAICS_conversion(naics_string):
    """Return the leading two characters of a NAICS code as an int.

    The argument is passed through ``str`` first, so both the string form
    (``'451120'``) and a raw numeric code (``451120``) are accepted.
    Codes shorter than two characters, such as ``'0'``, are converted as they are.

    Raises:
        ValueError: if the leading characters do not form an integer.
    """
    return int(str(naics_string)[:2])

In [None]:

#df['NAICS_i'] = df['NAICS'].astype(int)
df['NAICS_i'] = df['NAICS'].apply(NAICS_conversion).astype(int)
draw_histogram(df['NAICS_i'], "NAICS as an integer", 50)

In [None]:
print(f"{len(df['NAICS_i'].value_counts())} cathegories left")

### FranchiseCode_i
FranchiseCode as an integer to perform correlation analysis

In [None]:
df['FranchiseCode_i'] = df['FranchiseCode'].astype(int)
draw_histogram(df['FranchiseCode_i'], "Franchise code as an integer", 50, True)

### isFranchise
Binary version of FranchiseCode

In [None]:
# Helper that derives the binary 'isFranchise' flag from a franchise code
def is_franchise(code):
    """Return 0 when ``code`` is 0 or 1 (no franchise) and 1 for any other value."""
    return int(code not in (0, 1))

df['isFranchise'] = df['FranchiseCode_i'].apply(is_franchise)

In [None]:
df['isFranchise'].describe()

In [None]:
df['isFranchise'].value_counts()

## Correlation study

In [None]:
df.columns

In [None]:
# The exploration below uses 'PaidInFull' as its target. Note that this rebinds
# the `target` and `features` names that the cleaning section set earlier.
target = 'PaidInFull'
# Compare column names for equality: `feature not in target` would be a substring
# test against the string 'PaidInFull', silently excluding any column whose name
# happens to be contained in it.
features = [feature for feature in df.columns if feature != target]
num_features = ['Term', 'NoEmp', 'GrAppv', 'SBA_Appv', 'SBARatio', 'NAICS_i', 'FranchiseCode_i', 'isFranchise']
# Everything that is not listed as numeric is treated as categorical.
cat_features = [feature for feature in features if feature not in num_features]

print(f"Cathegorical features: {cat_features}")
print(f"Numeric features: {num_features}")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Calculate correlation matrix
numeric_corr = df[num_features + [target]].corr()

# Generate heatmap
sns.heatmap(numeric_corr, annot=True, cmap='coolwarm', fmt='.2f', vmin=-1, vmax=1)
plt.title('Correlation Heatmap (Numerical)')
plt.show()
# The correlation between the number of employees and the target seems to be minimal.

In [None]:
import matplotlib.pyplot as plt
# One stacked histogram per numeric feature: rows with target == 1 in green,
# rows with target == 0 in red.
# The grid is derived from the number of features instead of a hand-incremented
# three-digit subplot code (241, 242, ...), which only works for exactly eight
# panels in a 2x4 layout.
n_cols = 4
n_rows = -(-len(num_features) // n_cols)  # ceiling division
plt.figure(figsize=(15, 10))
for position, num_feature in enumerate(num_features, start=1):
    plt.subplot(n_rows, n_cols, position)
    plt.hist(x=[df[df[target] == 1][num_feature], df[df[target] == 0][num_feature]],
             stacked=True, color=['g', 'r'])
    plt.title(num_feature)
    plt.ylabel('# of loans')


# plt.subplot(235)
# plt.hist(x = [df[df[target]==1]['NoEmp'], df[df[target]==0]['NoEmp']], 
#          stacked=True, color = ['g','r'])
# plt.title('NoEmp')
# plt.xlabel('NoEmp')
# plt.ylabel('# of loans')

# plt.subplot(236)
# plt.hist(x = [df[df[target]==1]['SBARatio'], df[df[target]==0]['SBARatio']], 
#          stacked=True, color = ['g','r'])
# plt.title('SBARatio')
# plt.xlabel('SBARatio')
# plt.ylabel('# of loans')


## Crosstab analysis

In [None]:

# Share of target == 1 rows overall, and among rows whose NAICS prefix is not 0.
is_positive = df[target] == 1
has_naics = df['NAICS_i'] != 0
NAICS_not_zero = df[has_naics]
NAICS_not_zero_ratio = int((is_positive & has_naics).sum()) / len(NAICS_not_zero)
whole_ratio = int(is_positive.sum()) / len(df)
print(whole_ratio)
print(NAICS_not_zero_ratio)
# Counts of each target value per NAICS prefix, plus the fraction with target == 1.
cross_tab = pd.crosstab(df['NAICS_i'], df[target])
row_totals = cross_tab.sum(axis=1)
cross_tab['Ratio'] = cross_tab[1] / row_totals
print(cross_tab)

In [None]:
cross_tab = pd.crosstab(df['NewExist'], df[target])
cross_tab['Ratio'] = cross_tab[1] / cross_tab.sum(axis=1)

cross_tab

In [None]:
cross_tab = pd.crosstab(df['UrbanRural'], df[target])
cross_tab['Ratio'] = cross_tab[1] / cross_tab.sum(axis=1)

cross_tab

In [None]:
cross_tab = pd.crosstab(df['RevLineCr'], df[target])
cross_tab['Ratio'] = cross_tab[1] / cross_tab.sum(axis=1)

cross_tab

In [None]:
cross_tab = pd.crosstab(df['LowDoc'], df[target])
cross_tab['Ratio'] = cross_tab[1] / cross_tab.sum(axis=1)

cross_tab

In [None]:
cross_tab = pd.crosstab(df['isFranchise'], df[target])
cross_tab['Ratio'] = cross_tab[1] / cross_tab.sum(axis=1)

cross_tab

In [None]:
cross_tab = pd.crosstab(df['FranchiseCode'], df[target])
cross_tab['Ratio'] = cross_tab[1] / cross_tab.sum(axis=1)

cross_tab = cross_tab.sort_values(by=0, ascending=False)
cross_tab.head(20)

In [None]:
df[df['FranchiseCode_i'] == 34845].head(5)

In [None]:
# NOTE(review): 'Bank' is listed in drop_columns and removed from df in the
# column-dropping cell near the top of this notebook, so on a fresh
# Restart & Run All this lookup is expected to raise KeyError. Confirm, then
# either keep 'Bank' through cleaning or remove this cell.
# Counts of each target value per bank, plus the fraction of rows with target == 1.
cross_tab = pd.crosstab(df['Bank'], df[target])
cross_tab['Ratio'] = cross_tab[1] / cross_tab.sum(axis=1)

# Order by the number of target == 0 rows, largest first, and show the top 20.
cross_tab = cross_tab.sort_values(by=0, ascending=False)
cross_tab.head(20)

## Density study of numerical vars

In [None]:
a = sns.FacetGrid( df, hue = target, aspect=4 )
a.map(sns.kdeplot, 'Term', fill= True )
a.set(xlim=(0 , df['Term'].max()))
a.add_legend()

In [None]:
a = sns.FacetGrid( df, hue = target, aspect=4 )
a.map(sns.kdeplot, 'NoEmp', fill= True )
a.set(xlim=(0 , 100))
a.add_legend()

In [None]:
a = sns.FacetGrid( df, hue = target, aspect=4 )
a.map(sns.kdeplot, 'GrAppv', fill= True )
a.set(xlim=(0, 100000))
a.add_legend()

In [None]:
a = sns.FacetGrid( df, hue = target, aspect=4 )
a.map(sns.kdeplot, 'SBA_Appv', fill= True )
a.set(xlim=(0, 100000))
a.add_legend()

In [None]:
a = sns.FacetGrid( df, hue = target, aspect=4 )
a.map(sns.kdeplot, 'SBARatio', fill= True )
a.set(xlim=(0, df['SBARatio'].max()))
a.add_legend()

In [None]:
a = sns.FacetGrid( df, hue = target, aspect=4 )
a.map(sns.kdeplot, 'NAICS_i', fill= True )
a.set(xlim=(0, df['NAICS_i'].max()))
a.add_legend()

In [None]:
a = sns.FacetGrid( df, hue = target, aspect=4)
a.map(sns.kdeplot, 'FranchiseCode_i', fill= True)
a.set(xlim=(0, df['FranchiseCode_i'].max()))
a.add_legend()