In [5]:
# load necessary libraries
from pathlib import Path
import pandas as pd
import numpy as np

In [4]:
# set up directories 
ROOT = Path.cwd()
DATA_DIR = ROOT/ "data"
DATA_DIR.mkdir(exist_ok= True)

In [13]:
# load the data
dat_path = Path("./data/SBAnational.csv")
df = pd.read_csv(dat_path)
df_copy = df.copy()

  df = pd.read_csv(dat_path)


The original data contains almost 90k rows and 27 columns and it is not cleaned, which means some rows might have missing values. In addition, some of the important features (e.g., GrAppv, Gross amount of loan approved by bank is an object rather than numeric). The following part demonstrates how I cleaned the data.

In [22]:
# examine counts of missing values in each column
df_copy.isna().sum()

LoanNr_ChkDgt             0
Name                     14
City                     30
State                    14
Zip                       0
Bank                   1559
BankState              1566
NAICS                     0
ApprovalDate              0
ApprovalFY                0
Term                      0
NoEmp                     0
NewExist                136
CreateJob                 0
RetainedJob               0
FranchiseCode             0
UrbanRural                0
RevLineCr              4528
LowDoc                 2582
ChgOffDate           736465
DisbursementDate       2368
DisbursementGross         0
BalanceGross              0
MIS_Status             1997
ChgOffPrinGr              0
GrAppv                    0
SBA_Appv                  0
dtype: int64

In [24]:
# drop rows that contains any missing value
df_copy.dropna(subset= ["NewExist", "RevLineCr", "LowDoc", "DisbursementDate", "MIS_Status"],
               inplace= True)
df_copy.shape

(887797, 27)

In [None]:
# convert some columns to its correct data type
# they are object prior to the transformation 
df_copy[["DisbursementGross", "BalanceGross", "ChgOffPrinGr", "GrAppv", "SBA_Appv"]] = \
    df_copy[["DisbursementGross", "BalanceGross", "ChgOffPrinGr", "GrAppv", "SBA_Appv"]].map(lambda x:
        x.strip().replace("$", "").replace(",", "")).astype("float64")

# change the dtype of ApprovalFY to int
# it's a mix of str and int 
def str_cleaner(x):
    if isinstance(x, str):
        return x.replace("A", "") # there's one row with "A"
    return x
df_copy["ApprovalIFY"] = df_copy["Approval"].apply(str_cleaner).astype("int64")

# change the dtype of other cols
# NewExist -> int; Zip, UrbanRural -> str(categorical)
df_copy = df_copy.astype({"Zip": "str", "NewExist": "int8", "UrbanRural": "str"})

# extract industry information and convert it to NACE 
# create a dict for the corresponding values 

naics_2_to_nace = {
    '11': 'A', # Agriculture, Forestry and Fishing
    '21': 'B', # Mining and Quarrying
    '22': 'D', # Electricity, Gas, Steam and Air Conditioning
    '23': 'F', # Construction
    '31': 'C', # Manufacturing
    '32': 'C', # Manufacturing
    '33': 'C', # Manufacturing
    '42': 'G', # Wholesale Trade
    '44': 'G', # Retail Trade
    '45': 'G', # Retail Trade
    '48': 'H', # Transportation and Storage
    '49': 'H', # Transportation and Storage
    '51': 'J', # Information and Communication
    '52': 'K', # Financial and Insurance Activities
    '53': 'L', # Real Estate Activities
    '54': 'M', # Professional, Scientific and Technical
    '55': 'M', # Management of Companies
    '56': 'N', # Administrative and Support Service
    '61': 'P', # Education
    '62': 'Q', # Human Health and Social Work
    '71': 'R', # Arts, Entertainment and Recreation
    '72': 'I', # Accommodation and Food Service
    '81': 'S', # Other Service Activities
    '92': 'O'  # Public Administration and Defence
}

df_copy["NAICS"] = df_copy["NAICS"].astype("str").apply(lambda x: x[:2])
df_copy["NACE"] = df_copy['NAICS'].map(naics_2_to_nace)

df_copy.info()

<class 'pandas.core.frame.DataFrame'>
Index: 887797 entries, 0 to 899163
Data columns (total 27 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   LoanNr_ChkDgt      887797 non-null  int64  
 1   Name               887783 non-null  object 
 2   City               887767 non-null  object 
 3   State              887784 non-null  object 
 4   Zip                887797 non-null  int64  
 5   Bank               886302 non-null  object 
 6   BankState          886295 non-null  object 
 7   NAICS              887797 non-null  int64  
 8   ApprovalDate       887797 non-null  object 
 9   ApprovalFY         887797 non-null  object 
 10  Term               887797 non-null  int64  
 11  NoEmp              887797 non-null  int64  
 12  NewExist           887797 non-null  float64
 13  CreateJob          887797 non-null  int64  
 14  RetainedJob        887797 non-null  int64  
 15  FranchiseCode      887797 non-null  int64  
 16  UrbanRu

In [None]:
df_copy["NAICS"].dtype

array([451120, 722410, 621210, ..., 315280, 922140, 221121], shape=(1311,))

In [None]:
print("hello")