In [1]:
import math
import numpy as np
import h5py
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import scipy.io
import sklearn
import sklearn.datasets

from tensorflow.python.framework import ops
%matplotlib inline

## Data Preprocessing

In [2]:
rawdata = pd.read_excel('SBA_Loan_data_.xlsx')

In [3]:
rawdata.head()

Unnamed: 0,Program,BorrName,BorrStreet,BorrCity,BorrState,BorrZip,CDC_Name,CDC_Street,CDC_City,CDC_State,...,InitialInterestRate,TermInMonths,NaicsCode,NaicsDescription,ProjectCounty,ProjectState,BusinessType,LoanStatus,ChargeOffDate,GrossChargeOffAmount
0,504,ROBERT G. FIELDS AND MARY D. F,55 & RICHLAND,KANSAS CITY,KS,66106,Avenue Area Incorporated,3324 Emerald Lane,Jefferson City,MO,...,,12,,,WYANDOTTE,KS,INDIVIDUAL,PIF,NaT,0.0
1,504,CANDLELAMP COMPANY,3454 NIKI WAY,RIVERSIDE,CA,92507,CDC Small Business Finance Cor,2448 Historic Decatur,San Diego,CA,...,,240,,,RIVERSIDE,CA,INDIVIDUAL,PIF,NaT,0.0
2,504,"NATIONAL COMPOSITES, INC.",2303 N. BENDIX DRIVE,SOUTH BEND,IN,46628,Business Development Corporati,218 W. Washington Street,South Bend,IN,...,,120,,,ST JOSEPH,IN,CORPORATION,PIF,NaT,0.0
3,504,WENDY'S OF HARRISBURG,OUTLOT A. TR. 45N & SMALL ST,HARRISBURG,IL,62946,Small Business Growth Corporat,2401 West White Oaks Drive,Springfield,IL,...,,240,,,SALINE,IL,CORPORATION,CHGOFF,2003-03-28,0.0
4,504,"RON SAUNORIS GARDEN CTR, INC.",13747 W. 159TH ST.,LOCKPORT,IL,60441,MISSNG/INACTV LENDER,,,,...,,240,,,WILL,IL,CORPORATION,CANCLD,NaT,0.0


In [4]:
rawdata.columns

Index(['Program', 'BorrName', 'BorrStreet', 'BorrCity', 'BorrState', 'BorrZip',
       'CDC_Name', 'CDC_Street', 'CDC_City', 'CDC_State', 'CDC_Zip',
       'ThirdPartyLender_Name', 'ThirdPartyLender_City',
       'ThirdPartyLender_State', 'ThirdPartyDollars', 'GrossApproval',
       'ApprovalDate', 'ApprovalFiscalYear', 'DeliveryMethod', 'subpgmdesc',
       'InitialInterestRate', 'TermInMonths', 'NaicsCode', 'NaicsDescription',
       'ProjectCounty', 'ProjectState', 'BusinessType', 'LoanStatus',
       'ChargeOffDate', 'GrossChargeOffAmount'],
      dtype='object')

In [5]:
# drop the first column
print(np.unique(rawdata['Program']))
#checkCol(rawdata, 'Program', numRow, isCata = False)
rawdata = rawdata.drop(['Program'],axis = 1)

[504]


In [6]:
# remove cancelled and exempt
rawdata = rawdata[(rawdata['LoanStatus'] == 'PIF') | (rawdata['LoanStatus'] == 'CHGOFF')]
rawdata['target'] = [int(i) for i in rawdata['LoanStatus'] == 'PIF']
rawdata.head()

Unnamed: 0,BorrName,BorrStreet,BorrCity,BorrState,BorrZip,CDC_Name,CDC_Street,CDC_City,CDC_State,CDC_Zip,...,TermInMonths,NaicsCode,NaicsDescription,ProjectCounty,ProjectState,BusinessType,LoanStatus,ChargeOffDate,GrossChargeOffAmount,target
0,ROBERT G. FIELDS AND MARY D. F,55 & RICHLAND,KANSAS CITY,KS,66106,Avenue Area Incorporated,3324 Emerald Lane,Jefferson City,MO,65109.0,...,12,,,WYANDOTTE,KS,INDIVIDUAL,PIF,NaT,0.0,1
1,CANDLELAMP COMPANY,3454 NIKI WAY,RIVERSIDE,CA,92507,CDC Small Business Finance Cor,2448 Historic Decatur,San Diego,CA,92106.0,...,240,,,RIVERSIDE,CA,INDIVIDUAL,PIF,NaT,0.0,1
2,"NATIONAL COMPOSITES, INC.",2303 N. BENDIX DRIVE,SOUTH BEND,IN,46628,Business Development Corporati,218 W. Washington Street,South Bend,IN,46601.0,...,120,,,ST JOSEPH,IN,CORPORATION,PIF,NaT,0.0,1
3,WENDY'S OF HARRISBURG,OUTLOT A. TR. 45N & SMALL ST,HARRISBURG,IL,62946,Small Business Growth Corporat,2401 West White Oaks Drive,Springfield,IL,62704.0,...,240,,,SALINE,IL,CORPORATION,CHGOFF,2003-03-28,0.0,0
6,"ROCKY MTN EMPLOYE BENEFTS, INC",3200 S. 700 E.,SALT LAKE CITY,UT,84106,Mountain West Small Business F,2595 East 3300 South,Salt Lake City,UT,84109.0,...,240,,,SALT LAKE,UT,CORPORATION,PIF,NaT,0.0,1


In [7]:
# add feature GSP in the borrower state and year
state = pd.read_csv('state.csv',header = None)
state.set_index(0,inplace = True)
statedict = state.to_dict()[1]
gsp = pd.read_excel('GSP.xls')
gsp['state'] = gsp['GeoName'].map(lambda x: statedict[x] if x in statedict else '')
gsp.set_index('state',inplace = True)

In [8]:
Borrgsp = []
BorrState = list(rawdata['BorrState'])
year = list(rawdata['ApprovalFiscalYear'])
for i in range(len(year)):
    Borrgsp.append(gsp.loc[BorrState[i],str(year[i])] if BorrState[i] in statedict.values() else 0)

In [9]:
Borrgsp

[51873.7,
 773460.0,
 110859.6,
 279019.1,
 31249.3,
 31249.3,
 227412.9,
 227412.9,
 144970.6,
 100235.8,
 0,
 493191.9,
 773460.0,
 100235.8,
 70631.7,
 23767.8,
 30980.0,
 773460.0,
 112834.8,
 51873.7,
 773460.0,
 773460.0,
 773460.0,
 31249.3,
 773460.0,
 773460.0,
 100235.8,
 103565.7,
 94086.7,
 31249.3,
 103565.7,
 68412.1,
 110859.6,
 71610.0,
 95177.1,
 493191.9,
 140646.1,
 31249.3,
 227412.9,
 103565.7,
 256589.3,
 71610.0,
 56121.5,
 100235.8,
 25039.5,
 773460.0,
 12770.3,
 75570.5,
 23256.1,
 56121.5,
 773460.0,
 100235.8,
 140646.1,
 140646.1,
 31249.3,
 493191.9,
 71610.0,
 227412.9,
 773460.0,
 773460.0,
 18004.2,
 56565.6,
 71610.0,
 139658.4,
 139658.4,
 773460.0,
 227412.9,
 71610.0,
 102756.6,
 773460.0,
 773460.0,
 103565.7,
 110859.6,
 103565.7,
 21663.9,
 38757.0,
 95177.1,
 773460.0,
 159504.9,
 100235.8,
 23767.8,
 31249.3,
 103565.7,
 144970.6,
 30980.0,
 30980.0,
 100235.8,
 773460.0,
 493191.9,
 31249.3,
 140646.1,
 140646.1,
 773460.0,
 23767.8,
 31249.3,

In [37]:
# add feature unemployment rate in the borrower state and year





In [None]:
# add feature unemployment rate in the project state and year




In [20]:
rawdata.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 54805 entries, 0 to 140496
Data columns (total 30 columns):
BorrName                  54805 non-null object
BorrStreet                54797 non-null object
BorrCity                  54805 non-null object
BorrState                 54805 non-null object
BorrZip                   54805 non-null int64
CDC_Name                  54805 non-null object
CDC_Street                54613 non-null object
CDC_City                  54613 non-null object
CDC_State                 54613 non-null object
CDC_Zip                   54613 non-null float64
ThirdPartyLender_Name     16953 non-null object
ThirdPartyLender_City     16953 non-null object
ThirdPartyLender_State    17068 non-null object
ThirdPartyDollars         16881 non-null float64
GrossApproval             54805 non-null int64
ApprovalDate              54805 non-null datetime64[ns]
ApprovalFiscalYear        54805 non-null int64
DeliveryMethod            54805 non-null object
subpgmdesc         

In [23]:
numRow = len(rawdata)
def checkCol(df, colName, numRow, isCata = True):
    print(colName, ':')
    print('Any NA?: ', df[colName].isnull().values.any())
    if df[colName].isnull().values.any():
        numNA = sum(df[colName].isnull().values)
        print ('    # of NA: ', numNA)
        print ('    NA%:     ', numNA/numRow)
    if isCata:
        levelList = df[colName].unique()
        print('Different levels: ', len(levelList), levelList)
        #speList = ['(not set)', '(not provided)', '(none)', '(Other)']
        #for spe in speList:
            #if spe in levelList:
                #print(spe, 'percentage: ', sum(df[colName] == spe)/numRow)
        #matplotlib.rcParams['figure.figsize'] = (12, 6)
        #sns.catplot(y=colName, kind="count", data=dfTrain)
        
    else:
        print('range: ', min(df[colName].astype(float).dropna()), max(df[colName].astype(float).dropna()))
        
def fillna(df, colName, isCata = True):
    if isCata:
        df[colName] = df[colName].fillna(0)
    else:
        df[colName] = df[colName].fillna(0)
    return df


In [37]:
catCols = ['BorrName', 'BorrStreet', 'BorrCity', 'BorrState','CDC_Name', 'CDC_Street', 'CDC_City', 'CDC_State', 'ThirdPartyLender_Name', 'ThirdPartyLender_City',
       'ThirdPartyLender_State','ApprovalDate', 'ApprovalFiscalYear', 'DeliveryMethod', 'subpgmdesc','NaicsCode', 'NaicsDescription',
       'ProjectCounty', 'ProjectState', 'BusinessType', 'LoanStatus',
       'ChargeOffDate']
numCols = ['BorrZip','CDC_Zip','ThirdPartyDollars', 'GrossApproval','InitialInterestRate', 'TermInMonths','GrossChargeOffAmount', 'target']
for col in catCols:
    checkCol(rawdata, col, numRow, isCata = True)
for col in numCols:
    checkCol(rawdata, col, numRow, isCata = False)

BorrName :
Any NA?:  False
Different levels:  52068 ['ROBERT G. FIELDS AND MARY D. F' 'CANDLELAMP COMPANY'
 'NATIONAL COMPOSITES, INC.' ... 'Master Precision Global, LLC'
 'Precision Electric Group, Inc.' 'Material Innovations, Incorpor']
BorrStreet :
Any NA?:  True
    # of NA:  8
    NA%:      0.00014597208283915702
Different levels:  53510 ['55 & RICHLAND' '3454 NIKI WAY' '2303 N. BENDIX DRIVE' ...
 '1212 Fairplains Street.' '7710 185th Avenue NE.' '17611 Metzler Lane.']
BorrCity :
Any NA?:  False
Different levels:  7442 ['KANSAS CITY' 'RIVERSIDE' 'SOUTH BEND' ... 'Sonoma' 'Vicksburg' 'Redmond']
BorrState :
Any NA?:  False
Different levels:  54 ['KS' 'CA' 'IN' 'IL' 'UT' 'OH' 'VA' 'WI' 'PR' 'NY' 'AZ' 'NH' 'NV' 'MD'
 'MO' 'TN' 'KY' 'AL' 'LA' 'GA' 'FL' 'IA' 'AK' 'SD' 'CO' 'ME' 'ID' 'OR'
 'NC' 'MN' 'RI' 'MS' 'MA' 'TX' 'WA' 'VT' 'MI' 'CT' 'ND' 'MT' 'HI' 'NE'
 'OK' 'SC' 'PA' 'AR' 'NJ' 'WV' 'DE' 'WY' 'GU' 'NM' 'DC' 'VI']
CDC_Name :
Any NA?:  False
Different levels:  296 ['Avenue Area Incor

    # of NA:  192
    NA%:      0.0035033299881397683
Different levels:  334 ['3324 Emerald Lane' '2448 Historic Decatur' '218 W. Washington Street'
 '2401 West White Oaks Drive' '2595 East 3300 South' '900 Michigan Avenue'
 '130 W. Second St.' '1964 Wakefield Street' '100 River Place'
 '239 Arterial Hostos Avenue' '16 James Street' '410 Hemsted Drive'
 '335 N. Wilmot Road' 'One Cate Street' '626 South 9th Street'
 '426 D Street' nan '200 W. Douglas' '441 East Whittier Boulevard'
 '300 East State Street,' '5333 South Adams Ave.' '1100 14th Street'
 '5217 Hwy. B' '111 St. James Court' '151 N. Delaware'
 '8132 Old Federal Rd.' '5210 Hollywood Avenue' '50 Beaver Street'
 '6445 Powers Ferry Road' '1100 Walnut' '6801 Lake Worth Road'
 '1500 1st Avenue North' '5409 NW 88th Street' '809 North Broadway'
 '619 Warehouse Ave' '711 E. Wells Avenue' '1175 Osage Street'
 '40 Harlow Street' '400 Robert D Ray Drive' '175-C Emory Highway'
 '7370 Liberty One Drive' '1631 17th Street' '1161 W River Stre

Any NA?:  True
    # of NA:  37853
    NA%:      0.6906851564638263
Different levels:  2386 [nan 'Ocean Bank' 'Zions First National Bank' ... 'Currie State Bank'
 'Postal CU' 'KALSEE CU']
ThirdPartyLender_City :
Any NA?:  True
    # of NA:  37853
    NA%:      0.6906851564638263
Different levels:  1582 [nan 'MIAMI' 'SALT LAKE CITY' ... 'LOS ALTOS' 'MODESTO' 'CURRIE']
ThirdPartyLender_State :
Any NA?:  True
    # of NA:  37738
    NA%:      0.6885868077730134
Different levels:  62 [nan 'FL' 'UT' 'NJ' 'CA' 'NC' 'TX' 'DE' 'NH' 'RI' 'NY' 'VA' 'CT' 'IL' 'OH'
 'MA' 'SD' 'MN' 'MO' 'WA' 'AZ' 'CO' 'GA' 'SC' 'ND' 'OR' 'NV' 'IA' 'KS'
 'AL' 'TN' 'MI' 'PA' 'NE' 'HI' 'IN' 'WV' 'PR' 'NM' 'OK' 'MT' 'MS' 'KY'
 'ID' '#N' 'MD' 'WI' 'DC' 'VT' 'ME' 'AK' 'LA' 'AR' 'Ba' '0' 'WY' 'St' 'AB'
 'BC' 'D.' 'Ro' 'GU']
ApprovalDate :
Any NA?:  False
Different levels:  5246 ['1990-01-02T00:00:00.000000000' '1990-01-03T00:00:00.000000000'
 '1990-01-04T00:00:00.000000000' ... '2012-09-25T00:00:00.000000000'
 '2012-10-10