In [1]:
#Setup Notebook
# Import 3rd party libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


# Configure Notebook
# Render matplotlib figures inline in the cell output.
%matplotlib inline
# Apply the 'fivethirtyeight' matplotlib style sheet first, then seaborn's
# "notebook" plotting context on top of it.
plt.style.use('fivethirtyeight')
sns.set_context("notebook")
# NOTE(review): filterwarnings('ignore') with no category silences every
# Python warning for the rest of the session -- consider narrowing it to the
# specific warning categories that are actually noisy.
import warnings
warnings.filterwarnings('ignore')



In [31]:
# Load the raw SBA national loan file into a DataFrame.
# low_memory=False makes pandas parse the file in a single pass rather than
# in chunks, so each column's dtype is inferred from the whole column.
SBA_Loan = pd.read_csv(
    'SBAnational.csv',
    low_memory=False,
)


In [32]:
# Preview the first five rows to sanity-check the load.
SBA_Loan.head(n=5)

Unnamed: 0,LoanNr_ChkDgt,Name,City,State,Zip,Bank,BankState,NAICS,ApprovalDate,ApprovalFY,...,RevLineCr,LowDoc,ChgOffDate,DisbursementDate,DisbursementGross,BalanceGross,MIS_Status,ChgOffPrinGr,GrAppv,SBA_Appv
0,1000014003,ABC HOBBYCRAFT,EVANSVILLE,IN,47711,FIFTH THIRD BANK,OH,451120,28-Feb-97,1997,...,N,Y,,28-Feb-99,"$60,000.00",$0.00,P I F,$0.00,"$60,000.00","$48,000.00"
1,1000024006,LANDMARK BAR & GRILLE (THE),NEW PARIS,IN,46526,1ST SOURCE BANK,IN,722410,28-Feb-97,1997,...,N,Y,,31-May-97,"$40,000.00",$0.00,P I F,$0.00,"$40,000.00","$32,000.00"
2,1000034009,"WHITLOCK DDS, TODD M.",BLOOMINGTON,IN,47401,GRANT COUNTY STATE BANK,IN,621210,28-Feb-97,1997,...,N,N,,31-Dec-97,"$287,000.00",$0.00,P I F,$0.00,"$287,000.00","$215,250.00"
3,1000044001,"BIG BUCKS PAWN & JEWELRY, LLC",BROKEN ARROW,OK,74012,1ST NATL BK & TR CO OF BROKEN,OK,0,28-Feb-97,1997,...,N,Y,,30-Jun-97,"$35,000.00",$0.00,P I F,$0.00,"$35,000.00","$28,000.00"
4,1000054004,"ANASTASIA CONFECTIONS, INC.",ORLANDO,FL,32801,FLORIDA BUS. DEVEL CORP,FL,0,28-Feb-97,1997,...,N,N,,14-May-97,"$229,000.00",$0.00,P I F,$0.00,"$229,000.00","$229,000.00"


In [33]:
# Inspect the dtype pandas inferred for each column.
# The dollar-amount columns (DisbursementGross, BalanceGross, ChgOffPrinGr,
# GrAppv, SBA_Appv), the date columns and ApprovalFY all come back as object,
# so they will need parsing before any numeric or date work.

SBA_Loan.dtypes

LoanNr_ChkDgt          int64
Name                  object
City                  object
State                 object
Zip                    int64
Bank                  object
BankState             object
NAICS                  int64
ApprovalDate          object
ApprovalFY            object
Term                   int64
NoEmp                  int64
NewExist             float64
CreateJob              int64
RetainedJob            int64
FranchiseCode          int64
UrbanRural             int64
RevLineCr             object
LowDoc                object
ChgOffDate            object
DisbursementDate      object
DisbursementGross     object
BalanceGross          object
MIS_Status            object
ChgOffPrinGr          object
GrAppv                object
SBA_Appv              object
dtype: object

In [42]:
# Remove the rows that have no State value.
# State is used further down as the stratification key for train_test_split,
# so rows without it are dropped here. Per-column null counts for the
# filtered frame are displayed by the following cell.
# NOTE(review): Bank, BankState and DisbursementDate also contain many NaNs
# and are left untouched here -- decide how to handle them before modeling.

SBA_Loan = SBA_Loan.dropna(subset=['State'])

In [43]:
SBA_Loan.isnull().sum()

LoanNr_ChkDgt             0
Name                     14
City                     30
State                     0
Zip                       0
Bank                   1558
BankState              1565
NAICS                     0
ApprovalDate              0
ApprovalFY                0
Term                      0
NoEmp                     0
NewExist                136
CreateJob                 0
RetainedJob               0
FranchiseCode             0
UrbanRural                0
RevLineCr              4528
LowDoc                 2582
ChgOffDate           736454
DisbursementDate       2367
DisbursementGross         0
BalanceGross              0
MIS_Status             1996
ChgOffPrinGr              0
GrAppv                    0
SBA_Appv                  0
dtype: int64

In [44]:
from sklearn.model_selection import train_test_split

# Draw a 5% working sample, stratified on State.
# train_test_split returns (train, test) in that order, so `working_data`
# receives the 5% split and `test` receives the remaining 95% -- the name
# `test` is kept only because later cells refer to it.
test, working_data = train_test_split(
    SBA_Loan,
    test_size=0.05,
    stratify=SBA_Loan['State'],
    random_state=0,
)


In [45]:
# (rows, columns) of the 5% stratified working sample.
working_data.shape

(44958, 27)

In [46]:
# (rows, columns) of the remaining 95% of the rows; despite its name,
# `test` is the larger of the two splits.
test.shape

(854192, 27)

In [47]:
# Display the working sample; the row labels are the original SBA_Loan
# index values carried over by the split.
# NOTE(review): consider working_data.head() to keep the notebook output short.
working_data

Unnamed: 0,LoanNr_ChkDgt,Name,City,State,Zip,Bank,BankState,NAICS,ApprovalDate,ApprovalFY,...,RevLineCr,LowDoc,ChgOffDate,DisbursementDate,DisbursementGross,BalanceGross,MIS_Status,ChgOffPrinGr,GrAppv,SBA_Appv
774010,8176773007,RAFAEL'S RESTAURANT,EDINBURG,TX,78539,TEXAS ST. BANK A DIVISION OF,TX,0,16-Mar-95,1995,...,N,Y,,30-Apr-95,"$20,000.00",$0.00,P I F,$0.00,"$20,000.00","$17,400.00"
210377,2485365001,JULIE L DAVIS,YORBA LINDA,CA,92886,WELLS FARGO BANK NATL ASSOC,SD,541211,30-May-07,2007,...,Y,N,,30-Jun-07,"$67,401.00",$0.00,P I F,$0.00,"$30,000.00","$15,000.00"
830534,8919103007,"CHAPTER III, INC.",GOWEN,MI,49326,THE HUNTINGTON NATIONAL BANK,MI,453220,11-Jan-96,1996,...,N,Y,,30-Apr-96,"$100,000.00",$0.00,P I F,$0.00,"$100,000.00","$80,000.00"
340408,3374925005,Richard M. Adams D.P.M.P.A.,Granbury,TX,76048,COMMUNITY BANK,TX,621391,10-Apr-09,2009,...,N,N,,30-Apr-09,"$160,000.00",$0.00,P I F,$0.00,"$160,000.00","$144,000.00"
565809,5574173006,RMTS ASSOCIATES INC.,NEW YORK,NY,10013,"CITIBANK, N.A.",NY,0,9-Mar-93,1993,...,N,N,,14-Jun-93,"$325,000.00",$0.00,P I F,$0.00,"$325,000.00","$276,250.00"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
356265,3481896000,KIDDIE KOLLEGE INSTITUTE,CAMDEN,DE,19934,WILMINGTON TRUST COMPANY,DE,624410,17-Oct-08,2009,...,0,N,,31-Oct-08,"$25,000.00",$0.00,P I F,$0.00,"$25,000.00","$12,500.00"
783211,8303661010,MACDUFFS RESTAURANT INC,JAMESTOWN,NY,14701,FIRST NIAGARA BANK NATL ASSOC,NY,0,20-Sep-79,1979,...,N,N,17-Jul-95,11-Oct-79,"$61,000.00",$0.00,CHGOFF,"$7,759.00","$61,000.00","$54,900.00"
880913,9570023008,SHADES OF SUMMER,ST. SHARLES,MO,63366,BANK OF AMERICA NATL ASSOC,MO,0,24-Sep-96,1996,...,0,N,,31-May-98,"$40,000.00",$0.00,P I F,$0.00,"$40,000.00","$20,000.00"
414985,4010164010,FINN CORP,KALAMAZOO,MI,49008,BANK OF AMERICA NATL ASSOC,MI,722211,17-Oct-00,2001,...,0,N,3-Apr-06,30-Nov-00,"$309,000.00",$0.00,CHGOFF,"$119,385.00","$309,000.00","$231,750.00"


In [48]:
# Save the 5% working sample to disk for downstream work.
# index=False leaves the original row labels out of the file.
working_data.to_csv(
    'mini_SBA_data.csv',
    index=False,
)