# Interest Rate

Modified values from `interest_rates.csv`, where the date is split into separate year and month columns for search and matching purposes.

In [None]:
# import pandas as pd

# interest_rates_expanded = pd.read_csv('../data/interest_rates.csv')

# # DATE info in format: YYYY-MM-DD
# def get_year(date:str):
#     return date.split('-')[0]

# def get_month(date:str):
#     return date.split('-')[1]

# # add the YEAR and MONTH columns
# interest_rates_expanded['YEAR'] = interest_rates_expanded['DATE'].apply(get_year)
# interest_rates_expanded['MONTH'] = interest_rates_expanded['DATE'].apply(get_month)

# # remove the redundant DATE column
# interest_rates_expanded = interest_rates_expanded.drop(columns='DATE')

# # sort by YEAR, MONTH so that the indexes are meaningful (higher index = later)
# interest_rates_expanded.sort_values(by=['YEAR','MONTH'])

# # convert the generated dataframe to a readable .csv file for future use
# interest_rates_expanded.to_csv('../data/interest_rates_expanded.csv')

# # superficially verify output
# print(interest_rates_expanded)

In [17]:
import pandas as pd

# Build the expanded interest-rate table in one pipeline:
#   1. read the CSV, letting pandas parse the DATE column into datetimes
#   2. derive integer YEAR and MONTH columns from DATE
#   3. discard DATE, which is now redundant
#   4. order the rows chronologically by YEAR, then MONTH
interest_rates_expanded = (
    pd.read_csv('../data/interest_rates.csv', parse_dates=['DATE'])
    .assign(
        YEAR=lambda rates: rates['DATE'].dt.year,
        MONTH=lambda rates: rates['DATE'].dt.month,
    )
    .drop(columns='DATE')
    .sort_values(by=['YEAR', 'MONTH'])
)

# Persist the result for later use; the DataFrame index is not written to the file
interest_rates_expanded.to_csv(
    '../data/interest_rates_expanded.csv',
    index=False,
)

# Quick visual sanity check of the generated table
print(interest_rates_expanded)


     FEDFUNDS  YEAR  MONTH
0        0.80  1954      7
1        1.22  1954      8
2        1.07  1954      9
3        0.85  1954     10
4        0.83  1954     11
..        ...   ...    ...
827      5.08  2023      6
828      5.12  2023      7
829      5.33  2023      8
830      5.33  2023      9
831      5.33  2023     10

[832 rows x 3 columns]


### Implement and incorporate Interest Rate Feature
Process the base loan data and look up the corresponding interest rate for each loan based on its approval date.

In [18]:
import pandas as pd

# read the base values
SBA_vals = pd.read_csv('../data/df_clean.csv')

# Parse 'ApprovalDate' (format DD-mmm-YY, e.g. '28-Feb-97') once and reuse the
# result for both YEAR and MONTH instead of parsing the whole column twice.
approval_dates = pd.to_datetime(SBA_vals['ApprovalDate'], format='%d-%b-%y')

# '%y' expands two-digit years 00-68 to 2000-2068, which would place loans
# approved in 1968 or earlier in the future (and leave them without a matching
# interest rate). Apply the same pivot the rest of this notebook uses for
# ApprovalDate (yy > 30 means 19yy): any parsed year after 2030 is moved back
# one century.
approval_years = approval_dates.dt.year
approval_years = approval_years.where(approval_years <= 2030, approval_years - 100)

# Store 'YEAR' and 'MONTH' as strings; MONTH is zero-padded to two digits
SBA_vals['YEAR'] = approval_years.astype(str)
SBA_vals['MONTH'] = approval_dates.dt.month.astype(str).str.zfill(2)

# Combine 'YEAR' and 'MONTH' into a single 'YYYY-MM' key for efficient comparison
SBA_vals['YEAR_MONTH'] = SBA_vals['YEAR'] + '-' + SBA_vals['MONTH']

# Build the same 'YYYY-MM' key on the interest_rates_expanded DataFrame
interest_rates_expanded['YEAR_MONTH'] = (
    interest_rates_expanded['YEAR'].astype(str)
    + '-'
    + interest_rates_expanded['MONTH'].astype(str).str.zfill(2)
)

# Attach the 'FEDFUNDS' rate to each loan via the shared 'YEAR_MONTH' key.
# A left join keeps every loan; loans whose approval month has no entry in the
# rate table end up with NaN in 'FEDFUNDS'.
SBA_vals = pd.merge(
    SBA_vals,
    interest_rates_expanded[['YEAR_MONTH', 'FEDFUNDS']],
    how='left',
    on='YEAR_MONTH',
)

print(SBA_vals.sample(20))


  SBA_vals = pd.read_csv('../data/df_clean.csv')


        LoanNr_ChkDgt                            Name           City State  \
377282     3758314002       ANATOMICAL SERVICES, INC.  SCHILLER PARK    IL   
686688     7356464007         OLSEN CUSTOM FARMS, LLC      HENDRICKS    MN   
73957      1579335010  RODNEY S. BINGHAM DBA DR. RODN           ERIE    PA   
844770     9429803007                 THE GRILL, INC.     NAPERVILLE    IL   
121699     1925535004  Mayhall's Benchmaster Jewelers       Florence    AL   
93077      1720515005                  LYNN C. KYRISS      MANZANITA    OR   
530505     5363643005                  PFAFF GRAPHICS          IRWIN    PA   
278553     2994314010     HALL OF  FAME  BOOK COMPANY     STILLWATER    OK   
754020     8271893006           ANDY'S DISCOUNT STORE     LONG BEACH    CA   
477208     4751553010  BINFORD GROCERY; BINFORD HARDW        BINFORD    ND   
507313     5068603001   CHALLENGER PALLET & SUPPLY, I    IDAHO FALLS    ID   
740970     8010064002                    NANCY UNISEX     OZONE 

In [None]:
# NOTE: plain assignment binds a second name to the same DataFrame object; it does
# not copy it, so in-place changes made through SBA1 are also visible via SBA_vals.
SBA1 = SBA_vals

In [19]:
# Preview the first 10 rows of SBA1
SBA1.head(10)

Unnamed: 0,LoanNr_ChkDgt,Name,City,State,Zip,Bank,BankState,NAICS,ApprovalDate,ApprovalFY,...,SBA_Appv,Default,NAICS_i,isNewBusiness,isFranchise,SBARatio,YEAR,MONTH,YEAR_MONTH,FEDFUNDS
0,1000014003,ABC HOBBYCRAFT,EVANSVILLE,IN,47711,FIFTH THIRD BANK,OH,451120,28-Feb-97,1997,...,48000.0,0,45,1.0,0,0.8,1997,2,1997-02,5.19
1,1000024006,LANDMARK BAR & GRILLE (THE),NEW PARIS,IN,46526,1ST SOURCE BANK,IN,722410,28-Feb-97,1997,...,32000.0,0,72,1.0,0,0.8,1997,2,1997-02,5.19
2,1000034009,"WHITLOCK DDS, TODD M.",BLOOMINGTON,IN,47401,GRANT COUNTY STATE BANK,IN,621210,28-Feb-97,1997,...,215250.0,0,62,0.0,0,0.75,1997,2,1997-02,5.19
3,1000044001,"BIG BUCKS PAWN & JEWELRY, LLC",BROKEN ARROW,OK,74012,1ST NATL BK & TR CO OF BROKEN,OK,0,28-Feb-97,1997,...,28000.0,0,0,0.0,0,0.8,1997,2,1997-02,5.19
4,1000054004,"ANASTASIA CONFECTIONS, INC.",ORLANDO,FL,32801,FLORIDA BUS. DEVEL CORP,FL,0,28-Feb-97,1997,...,229000.0,0,0,0.0,0,1.0,1997,2,1997-02,5.19
5,1000084002,"B&T SCREW MACHINE COMPANY, INC",PLAINVILLE,CT,6062,"TD BANK, NATIONAL ASSOCIATION",DE,332721,28-Feb-97,1997,...,387750.0,0,33,0.0,0,0.75,1997,2,1997-02,5.19
6,1000093009,MIDDLE ATLANTIC SPORTS CO INC,UNION,NJ,7083,WELLS FARGO BANK NATL ASSOC,SD,0,2-Jun-80,1980,...,499998.0,1,0,1.0,0,0.83333,1980,6,1980-06,9.47
7,1000094005,WEAVER PRODUCTS,SUMMERFIELD,FL,34491,REGIONS BANK,AL,811118,28-Feb-97,1997,...,36000.0,0,81,1.0,0,0.8,1997,2,1997-02,5.19
8,1000104006,TURTLE BEACH INN,PORT SAINT JOE,FL,32456,CENTENNIAL BANK,FL,721310,28-Feb-97,1997,...,228750.0,0,72,1.0,0,0.75,1997,2,1997-02,5.19
9,1000124001,INTEXT BUILDING SYS LLC,GLASTONBURY,CT,6073,WEBSTER BANK NATL ASSOC,CT,0,28-Feb-97,1997,...,56000.0,0,0,1.0,0,0.8,1997,2,1997-02,5.19


In [None]:
# Preview the first 10 rows of SBA_vals
SBA_vals.head(10)

In [None]:
# # read the base values
# SBA_vals = pd.read_csv('../data/df_clean.csv')

# # the format for a ApprovalDate is DD-mmm-YY where 'mmm' is a 3-char string that represents the month (e.g.: 'Jan', 'Feb', 'Oct',etc.)
# # in order to find the interest rate at time of Approval, this format will need to be changed to be consistent with the YEAR MONTH 
# # values above

# months = {'Jan':'01','Feb':'02','Mar':'03','Apr':'04','May':'05','Jun':'06','Jul':'07','Aug':'08','Sep':'09','Oct':'10','Nov':'11','Dec':'12'}
# def pull_fixed_YEAR_MONTH_SBAdata(date:str):
#     day,month,year = date.split('-')

#     #fix the month
#     month = months[month]

#     #fix the year
#     year = '19' + year if year > '30' else '20' + year

#     return month, year

# # now that we can extract appropriate/standardized month/year information
# # we will add a new column that is the interest rate based on the information
# # in the SBA_vals['ApprovalDate'] column

# def get_interest_rate_from_ApprovalDate(date:str):
#     month,year = pull_fixed_YEAR_MONTH_SBAdata(date)
    
#     interest_rate = interest_rates_expanded[(interest_rates_expanded['YEAR'] == year) & (interest_rates_expanded['MONTH'] == month)]

#     return interest_rate['FEDFUNDS'].iloc[0] if not interest_rate.empty else None


# # sample set and demonstrate it works for a given sample
# # SBA_sample = SBA_vals.sample(20)
# # SBA_sample['InterestRate'] = SBA_sample['ApprovalDate'].apply(get_interest_rate_from_ApprovalDate)

# # print(SBA_sample[['ApprovalDate','InterestRate']])

# SBA_vals['InterestRate'] = SBA_vals['ApprovalDate'].apply(get_interest_rate_from_ApprovalDate)

# print(SBA_vals.sample(20))

In [None]:
# SBA_vals[['ApprovalDate', 'InterestRate']].sample(10)
# Write the current SBA_vals DataFrame to CSV; index=False keeps the DataFrame
# index out of the file.
SBA_vals.to_csv('../data/df_clean_with_interest_rate.csv',index=False)

# Engineering ApprovalDate-Derived Features
## Outlier Years (number of loans)

In some years the SBA was still being established, and the number of loans approved in those years is not representative of the data as a whole. For clarity, these years will be extracted to a CSV file for later reference, so that loans made in these atypical years can be removed.

In [None]:
# read the SBA data
# low_memory=False is a pandas read_csv option affecting chunked parsing / dtype
# inference; presumably set here to avoid mixed-dtype columns - see the pandas docs.
SBA_data = pd.read_csv('../data/SBAnational.csv', low_memory=False)

# format for a date: DD-mmm-YY where 'mmm' is a 3-char string representing the month (ex: Sep, Aug, Jun)
# the year needs to be fixed, because as a 2-digit string '99' comes after '06'; needs to be expanded to 4-digit version

def extract_year(date: str, pivot: int = 30) -> int:
    """Return the 4-digit year of a 'DD-mmm-YY' date string.

    Two-digit years are expanded around ``pivot``: values greater than
    ``pivot`` are placed in the 1900s, all others in the 2000s
    (with the default pivot, '97' -> 1997 and '06' -> 2006).
    A year field that already has more than two digits is returned unchanged.

    Args:
        date: Date text made of exactly three '-'-separated fields
            (day, month, year), e.g. '28-Feb-97'.
        pivot: Largest two-digit year that is still mapped to the 2000s.

    Returns:
        The year as an integer.

    Raises:
        ValueError: If ``date`` does not consist of exactly three
            '-'-separated fields, or the year field is not an integer.
    """
    # Only the year field is needed; unpacking still enforces the 3-field shape.
    _, _, year_text = date.split('-')
    year_text = year_text.strip()
    year = int(year_text)

    # Already a full year (e.g. '1997'): nothing to expand.
    if len(year_text) > 2:
        return year

    # Compare numerically, not as strings, so that an unpadded year such as
    # '5' is handled the same way as '05'.
    century = 1900 if year > pivot else 2000
    return century + year


import matplotlib.pyplot as plt

# apply the cleaning function to the dataframe: one 4-digit approval year per loan
approval_years = SBA_data['ApprovalDate'].apply(extract_year)

print('loans_in_year check')
print(f'min year: {approval_years.min()}\tmax year: {approval_years.max()}')

# number of loans approved in each year (index = year, value = loan count)
loans_in_year = approval_years.value_counts()

# describe() only returns the summary statistics; because this is not the last
# expression of the cell the notebook would not display them, so print explicitly
print(loans_in_year.describe())

# distribution of the per-year loan counts, to spot years with unusual volumes
pd.DataFrame(loans_in_year).boxplot()
plt.show()



# outlier_years = pd.DataFrame(columns=['YEAR','LOANS'])

# some of the ApprovalDates are strings - specifically in 1976, where there was a lot of fraud;
# since these will automatically be removed later, these values should be automatically added to the list of years