# All States
----
This contains the merged DataFrames of California, Texas, New York and Florida.

Data is limited to:
 - Years 2014-2017
 - People over the 20<sup>th</sup> percentile and under the 90<sup>th</sup> percentile by state
 - People in the few major Metro Areas by state

In [1]:
# import pandas as pd
# import numpy as np
# import matplotlib.pyplot as plt
# import seaborn as sns
# import plotly.graph_objects as go
# import plotly.express as px
# from plotly.subplots import make_subplots

# import json
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import train_test_split, GridSearchCV
# from sklearn.impute import SimpleImputer
# from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
# from sklearn.pipeline import Pipeline
# from sklearn.compose import ColumnTransformer


In [2]:
# # Remap the values of selected columns through per-column lookup dicts.
# # Despite the name, this helper is direction-agnostic: further down it is called
# # with `decompression_dictionary` to *decode* the numeric frame.
# def compress_dataframe(df, dictionary):
#     # `dictionary` maps column name -> {old value: new value}.
#     # Work on a deep copy so the caller's frame is not modified.
#     df = df.copy(deep=True)
    
#     # Only columns that appear as keys of `dictionary` are touched.
#     # NOTE(review): Series.map with a dict yields NaN for any value that has no
#     # entry in the mapping -- confirm every code in the data is covered.
#     for col in dictionary.keys():
#         df[col] = df[col].map(dictionary[col])
#     return df

In [3]:
# # Replace three verbose category labels with shorter equivalents in every
# # object-dtype column of `df`.
# def reduce_large_strings(df):
#     # Parallel lists: to_replace[i] is rewritten to replace_with[i].
#     to_replace = ["Information not provided by applicant in mail, Internet, or telephone application",
#                   "Native Hawaiian or Other Pacific Islander",
#                   "One-to-four family dwelling (other than manufactured housing)"]
#     replace_with = ["Information not Provided",
#                     "Native Hawaiian/Pacific Islander",
#                     "1-4 Family House"]
#     # Matching is exact, whole-cell equality; non-object columns are skipped.
#     # NOTE(review): columns are assigned on the frame that was passed in, so the
#     # caller's `df` is modified in place as well as being returned.
#     for col in df.columns:
#         if df[col].dtype == object:
#             for i, _ in enumerate(to_replace):
#                 df[col] = np.where(df[col] == to_replace[i], replace_with[i], df[col])            
#     return df

In [4]:
# # Code from Murmel on Stack Exchange
# # https://stackoverflow.com/questions/1450957/pythons-json-module-converts-int-dictionary-keys-to-strings
# # Used as the `object_hook` for json.load when reading the decompression
# # dictionary: converts the string keys of each decoded JSON object back to ints.
# def jsonKeys2int(x):
#     # Non-dict inputs are returned unchanged.
#     if isinstance(x, dict):
#         try:
#             return {int(k):v for k,v in x.items()}
#         except:
#             # All-or-nothing: if any key cannot be cast to int, the original
#             # dict (string keys intact) is returned by the final `return x`.
#             # NOTE(review): the bare `except` also hides unrelated errors;
#             # `except ValueError:` would be the narrower choice.
#             pass #bad form
#     return x

### Read In Merged DF and Decompression Dictionary
----
To reduce file size, all categorical values were encoded as numbers and the resulting CSV was additionally compressed with GZip.

The cells below perform three steps:

1. Read in the numerical DataFrame.
2. Read the JSON file that contains the key:value pairs used to decode the DataFrame.
3. Execute a function that uses the JSON mapping to decompress/decode the numerical DataFrame.

In [5]:
# df=pd.read_csv('./merged_df/merged_df.csv.gz', low_memory=False)

In [6]:
# # Writes an uncompressed copy of the frame that was just loaded, next to the
# # .gz file.  NOTE(review): this is not one of the read/decode steps listed in
# # the markdown above -- confirm it is still needed before re-enabling.
# df.to_csv("./merged_df/merged_df.csv", index=False)

In [7]:
# # Load the code -> label lookup.  `jsonKeys2int` is passed as object_hook so
# # that the keys, which JSON stores as strings, come back as ints.
# # (The `with` line was commented one level deeper than its body, which would
# # leave an orphaned indented statement if the cell were re-enabled.)
# with open("./json_files/decompression_dictionary.json", "r") as json_file:
#     decompression_dictionary = json.load(json_file, object_hook=jsonKeys2int)

In [8]:
# # Decode step: `compress_dataframe` simply applies whatever per-column mapping
# # it is given, so passing the decompression dictionary turns codes into labels.
# df = compress_dataframe(df, decompression_dictionary)

### Quick Clean

In [9]:
# df = reduce_large_strings(df)

In [10]:
# #cast numerical features back to correct values
# # `numerical_features` and `dtypes` are parallel lists (29 entries each) that
# # are paired positionally by zip() below; keep them in sync when editing.
# numerical_features = ['as_of_year', 'agency_code', 'loan_type', 'loan_purpose',
#                       'owner_occupancy', 'loan_amount_000s', 'preapproval',
#                       'action_taken', 'msamd', 'census_tract_number', 'applicant_ethnicity',
#                       'co_applicant_ethnicity', 'applicant_race_1', 'co_applicant_race_1',
#                       'applicant_income_000s', 'purchaser_type', 'denial_reason_1',
#                       'denial_reason_2', 'denial_reason_3', 'hoepa_status', 'lien_status',
#                       'population', 'minority_population', 'hud_median_family_income',
#                       'tract_to_msamd_income', 'number_of_owner_occupied_units',
#                       'number_of_1_to_4_family_units', 'latino', 'approve_bin']
# # NOTE(review): the float entries presumably mark columns that can still hold
# # NaN at this point (the dropna cell runs later); astype(int) would raise on
# # missing values -- confirm before changing any float to int.
# dtypes = [int, int, int, int, int, float, int, int, float, float, int, int, int,int,
#           float, int, float, float, float, int, int, float, float, float, float, float,
#           float, int, int]
# for col, dtype in zip(numerical_features, dtypes):
#     df[col] = df[col].astype(dtype)

In [11]:
# df.head()

**Baseline Score**

The baseline for the models in this study is the mean approval rate of the dataset.

In [12]:
# df.isnull().sum().sort_values(ascending=False).head(15)

In [13]:
# df.shape

In [14]:
# # Drop every row that is missing a value in any of the seven listed columns.
# # `inplace=True` mutates `df` itself, so the `df.shape` cells before and after
# # this one show the row count before and after the drop.
# df.dropna(axis=0,subset=['number_of_1_to_4_family_units','number_of_owner_occupied_units',\
#                          'tract_to_msamd_income','minority_population','population',\
#                          'hud_median_family_income','census_tract_number'], inplace=True)

Rows with NaN values in columns other than the denial reasons are dropped, as they make up only a small fraction of the dataset. NaN values in the denial-reason columns are kept.

In [15]:
# df.shape

In [16]:
# # Keep only rows whose loan_purpose code is 1 (the 'Home purchase' category
# # per the markdown below); the deep copy detaches the result from the
# # unfiltered frame.
# df=df[df['loan_purpose']==1].copy(deep=True)

Filtering by loan purpose (`loan_purpose == 1`): we only wish to look at the loans that were categorized as 'Home purchase'.

In [17]:
# df.shape

## Plotly!

In [18]:
# import plotly.graph_objects as go
# import plotly.express as px
# from plotly.subplots import make_subplots

In [19]:
# # Stacked histogram of loan amount (in $000s) for applicants whose race is not
# # 'White', coloured by approve_bin.
# # NOTE(review): "not White" also keeps the 'Information not Provided' and
# # 'Not applicable' rows that later cells exclude via `race_to_remove`, so
# # "Minority" in the title is broader than it reads.
# # NOTE(review): this cell uses loan_amount_000s < 600, nbins=200 and
# # barmode='stack', while the White-applicant counterpart uses < 500,
# # nbins=100 and the default barmode -- align them before comparing the plots.
# fig=px.histogram(df[~(df['applicant_race_name_1']=='White') &\
#                    (df['loan_amount_000s']<600)],
#                  x='loan_amount_000s',
#                  color='approve_bin',
#                  nbins=200,
#                  color_discrete_sequence=["blue", "red"],barmode='stack',
#                 title='Minority Applicant Loan Amount by Approved/Denied Count')
# fig.update_layout(
# #     xaxis = dict(
# #     tickmode = 'linear',
# #     tick0=0,
# #     dtick=1
# #     ),
#     width=1400,
#     height=450)
# fig.show()

In [20]:
# fig=px.histogram(df[(df['applicant_race_name_1']=='White') &\
#                    (df['loan_amount_000s']<500)],\
#                  x='loan_amount_000s',
#                  color='approve_bin',
#                  nbins=100,
#                  color_discrete_sequence=["blue", "red"],
#                  title='White Applicant Loan Amount by Approved/Denied Count')
# fig.update_layout(
# #     xaxis = dict(
# #     tickmode = 'linear',
# #     tick0=0,
# #     dtick=1
# #     ),
#     width=1400,
#     height=450)
# fig.show()

In [21]:
# fig=px.histogram(df[~(df['applicant_race_name_1']=='White') &\
#                    (df['loan_amount_000s']<500)],\
#                  x='minority_population',
#                  color='approve_bin',
#                  nbins=100,
#                  color_discrete_sequence=["blue", "red"],
#                  title='Minority Applicant Population Tract Percentage by Approved/Denied Count'
#                  )
# fig.update_layout(
# #     xaxis = dict(
# #     tickmode = 'linear',
# #     tick0=0,
# #     dtick=1
# #     ),
#     width=1400,
#     height=450)
# fig.show()

In [22]:
# fig=px.histogram(df[(df['applicant_race_name_1']=='White') &\
#                    (df['loan_amount_000s']<500)],\
#                  x='minority_population',
#                  color='approve_bin',
#                  nbins=100,
#                  color_discrete_sequence=["blue", "red"],
#                  title='White Applicant Minority Population Tract Percentage by Approved/Denied Count'
#                  )
# fig.update_layout(
# #     xaxis = dict(
# #     tickmode = 'linear',
# #     tick0=0,
# #     dtick=1
# #     ),
#     width=1400,
#     height=450)
# fig.show()

In [23]:
# # Histogram of applicant income (in $000s, capped below 300) for applicants
# # whose race is not 'White', coloured by approve_bin.
# fig=px.histogram(df[~(df['applicant_race_name_1']=='White') &\
#                    (df['applicant_income_000s']<300)],\
#                  x='applicant_income_000s',
#                  color='approve_bin',
#                  nbins=100,
#                  color_discrete_sequence=["blue", "red"],
#                  title='Minority Applicant Income < $300,000 by Approved/Denied Count'
#                  )
# # NOTE(review): the four lines below are a stray fragment.  Because of the
# # trailing comma they bind `xaxis` to a one-element tuple that is never passed
# # to fig.update_layout, so the tick settings (tick0=40, dtick=40) have no
# # effect.  To apply them, pass `xaxis=dict(...)` inside update_layout instead.
# xaxis = dict(
# tickmode = 'linear',
# tick0=40,
# dtick=40),
# fig.update_layout(
# #     xaxis = dict(
# #     tickmode = 'linear',
# #     tick0=0,
# #     dtick=1
# #     ),
#     width=1400,
#     height=450)
# fig.show()

In [24]:
# fig=px.histogram(df[(df['applicant_race_name_1']=='White') &\
#                    (df['applicant_income_000s']<300)],\
#                  x='applicant_income_000s',
#                  color='approve_bin',
#                  nbins=100,
#                  color_discrete_sequence=["blue", "red"],
#                  title='White Applicant Income < $300,000 by Approved/Denied Count'
#                  )
# fig.update_layout(
# #     xaxis = dict(
# #     tickmode = 'linear',
# #     tick0=0,
# #     dtick=1
# #     ),
#     width=1400,
#     height=450)
# fig.show()

In [25]:
#df['latino'].value_counts(normalize=True)

In [26]:
# # Approval / non-approval percentage per applicant race, drawn as two line
# # traces.  No race categories are filtered out in this cell.
# df_race=pd.crosstab(df['applicant_race_name_1'], df['approve_bin'])
# df_race=df_race.reset_index()
# # The positional rename assumes approve_bin has exactly two values and that the
# # crosstab orders them 0 then 1 (0 -> 'not approved', 1 -> 'approved').
# df_race.columns=['race','not approved','approved']
# # Row-by-row share of approved applications; equivalent to the vectorised
# # df_race['approved'] / (df_race['approved'] + df_race['not approved']).
# l=[]
# for x in range(df_race.shape[0]):
#     l.append(df_race['approved'][x]/(df_race['approved'][x]+ df_race['not approved'][x]))
# df_race['percent approved']=[x*100 for x in np.array(l)]
# df_race['percent not approved']=[100-x for x in df_race['percent approved']]

# fig = go.Figure(data=[
# go.Scatter(x=df_race['race'],
#                  y=df_race['percent approved'],
#                  name='percent approved'),
# go.Scatter(x=df_race['race'],
#                  y=df_race['percent not approved'],
#                  name='percent not approved')
# ], layout={'title':'Loan Approval Rate by Race','xaxis_title':'Applicant Race', 'yaxis_title':'Approval Rate','title_x':0.5})
# fig.update_layout(xaxis_tickangle = -45, height=600, width=1200)  
# fig.show()

In [27]:
# # Approved vs denied percentage per race as grouped bars, excluding rows whose
# # race is undisclosed or not applicable.
# # `race_to_remove` defined here is reused by several later cells, so this cell
# # must run before them.
# race_to_remove = ['Information not Provided', 'Not applicable']
# tmp = df[~df['applicant_race_name_1'].isin(race_to_remove)]

# # One row per (race, approve_bin) pair with its row count.
# tmp = tmp[['applicant_race_name_1','approve_bin']]\
#              .groupby(['applicant_race_name_1','approve_bin'])\
#              .size().reset_index().rename(columns={0:'count'})
# #MIN-MAX Normalization
# #tmp['count'] = (tmp['count']-tmp['count'].min())/(tmp['count'].max()-tmp['count'].min())

# #Filtering
# # Share of each approve_bin value within its race (sums to 100 per race).
# # NOTE(review): this repeats the filter + groupby above and is attached via
# # `.values`, i.e. purely by position -- it relies on both groupbys producing
# # rows in the same order.  Deriving it from tmp['count'] would avoid that.
# tmp['percentage']= df[~df['applicant_race_name_1'].isin(race_to_remove)][['applicant_race_name_1','approve_bin']]\
#                   .groupby(['applicant_race_name_1','approve_bin'])\
#                   .size().groupby(level=0).apply(lambda x: 100 * x / float(x.sum())).values
# #Figure Object
# fig = go.Figure(data=[
#     go.Bar(name='Approved', x=tmp[tmp['approve_bin']==1]['applicant_race_name_1'], y=tmp[tmp['approve_bin']==1]['percentage'],text=tmp[tmp['approve_bin']==1]['percentage'].apply(lambda x: '{:.1f}%'.format(x)), textposition = 'auto'),
#     go.Bar(name='Denied', x=tmp[tmp['approve_bin']==0]['applicant_race_name_1'], y=tmp[tmp['approve_bin']==0]['percentage'], text=tmp[tmp['approve_bin']==0]['percentage'].apply(lambda x: '{:.1f}%'.format(x)))
# ], layout={'title':'Loan Approval Rate by Race','xaxis_title':'Applicant Race', 'yaxis_title':'Approval Rate (%)', 'title_x':0.5})
# fig.update_layout(xaxis_tickangle = -45, height=600, width=1200)  
# fig.show()

In [28]:
# # Among denied applications (approve_bin == 0), the share of each primary
# # denial reason within every race, as stacked bars.
# # NOTE(review): depends on `race_to_remove` from an earlier cell; on a fresh
# # kernel that cell must run first.
# tmp = df[(df['approve_bin']==0) & (~df['applicant_race_name_1'].isin(race_to_remove))]\
#               [['applicant_race_name_1','denial_reason_name_1']]\
#              .groupby(['applicant_race_name_1','denial_reason_name_1'])\
#              .size().reset_index().rename(columns={0:'count'})
# # Same filter + groupby repeated; percentages are attached by position via
# # `.values`, so both results must come out in the same row order.
# tmp['percentage']= df[(df['approve_bin']==0) & (~df['applicant_race_name_1'].isin(race_to_remove))]\
#                   [['applicant_race_name_1','denial_reason_name_1']]\
#                   .groupby(['applicant_race_name_1','denial_reason_name_1'])\
#                   .size().groupby(level=0).apply(lambda x: 100 * x / float(x.sum())).values
# fig = px.bar(tmp,
#              x = 'applicant_race_name_1',
#              y = 'percentage',
#              color = 'denial_reason_name_1',
#              hover_data = ['count','percentage','denial_reason_name_1'],
#              title = 'Denial Reasons Rate by Race',
#              labels = {'applicant_race_name_1': 'Applicant Race', 'denial_reason_name_1':'Denial Reason'})
# fig.update_layout(xaxis_tickangle = -45, height=700, width=1200, title_x=0.5)             
# fig.show()

In [29]:
# tmp = df[['loan_type_name','approve_bin']]\
#              .groupby(['loan_type_name','approve_bin'])\
#              .size().reset_index().rename(columns={0:'count'})
# tmp['percentage'] = df[['loan_type_name','approve_bin']]\
#                   .groupby(['loan_type_name','approve_bin'])\
#                   .size().groupby(level=0).apply(lambda x: 100 * x / float(x.sum())).values
# fig = go.Figure(data=[
#     go.Bar(name='Approved', x=tmp[tmp['approve_bin']==1]['loan_type_name'], y=tmp[tmp['approve_bin']==1]['percentage'],text=tmp[tmp['approve_bin']==1]['percentage'].apply(lambda x: '{:.1f}%'.format(x)), textposition = 'auto'),
#     go.Bar(name='Denied', x=tmp[tmp['approve_bin']==0]['loan_type_name'], y=tmp[tmp['approve_bin']==0]['percentage'], text=tmp[tmp['approve_bin']==0]['percentage'].apply(lambda x: '{:.1f}%'.format(x)))
# ], layout={'title':'Loan Approval Percentage by Loan Type','xaxis_title':'Loan Type', 'yaxis_title':'Percentage', 'title_x':0.5})
# fig.update_layout(xaxis_tickangle = -45, height=600, width=1200) 
# fig.show()

In [30]:
# # Approved vs denied percentage for each value of the `latino` flag.
# tmp = df[['latino','approve_bin']]\
#              .groupby(['latino','approve_bin'])\
#              .size().reset_index().rename(columns={0:'count'})
# # Share of each approve_bin value within its `latino` group; attached by
# # position via `.values`.
# tmp['percentage']= df[['latino','approve_bin']]\
#                   .groupby(['latino','approve_bin'])\
#                   .size().groupby(level=0).apply(lambda x: 100 * x / float(x.sum())).values
# # NOTE(review): unlike the sibling bar charts, the bar labels here show the
# # raw `count` while the bar height is the percentage -- confirm this is
# # intended.  The x axis carries both `latino` groups, although the title
# # mentions Hispanics only.
# fig = go.Figure(data=[
#     go.Bar(name='Approved', x=tmp[tmp['approve_bin']==1]['latino'], y=tmp[tmp['approve_bin']==1]['percentage'],text=tmp[tmp['approve_bin']==1]['count'], textposition = 'auto'),
#     go.Bar(name='Denied', x=tmp[tmp['approve_bin']==0]['latino'], y=tmp[tmp['approve_bin']==0]['percentage'], text=tmp[tmp['approve_bin']==0]['count'])
# ], layout={'title':'Loan Approval Rate for Hispanics','xaxis_title':'Ethnicity', 'yaxis_title':'percentage', 'title_x':0.5})
# fig.update_layout(height=600, width=1200)
# fig.show()

In [31]:
# # Mean loan amount and mean income per (race, approve_bin, latino) group, then
# # a ratio of those two group means, plotted as four bar series.
# # Depends on `race_to_remove` from an earlier cell.
# tmp = df[~df.applicant_race_name_1.isin(race_to_remove)][['applicant_race_name_1','approve_bin','loan_amount_000s','applicant_income_000s','latino']]\
#              .groupby(['applicant_race_name_1','approve_bin','latino'])\
#              .mean().reset_index()
# # NOTE(review): the column name and the y-axis title both say loan-to-income,
# # but the expression below is income / loan (the inverse).  To match the
# # labels it should be
# #     tmp['loan_amount_000s'] / tmp['applicant_income_000s']
# # Also note this is a ratio of group means, not the mean of per-row ratios.
# tmp['loan_to_income_ratio']= tmp['applicant_income_000s']/tmp['loan_amount_000s']

# fig = go.Figure(data=[
#     go.Bar(name='Approved-Latino', x=tmp[(tmp['approve_bin']==1) & (tmp.latino==1)]['applicant_race_name_1'], y=tmp[(tmp['approve_bin']==1) & (tmp.latino==1)]['loan_to_income_ratio']),
#     go.Bar(name='Denied-Latino', x=tmp[(tmp['approve_bin']==0) & (tmp.latino==1)]['applicant_race_name_1'], y=tmp[(tmp['approve_bin']==0) & (tmp.latino==1)]['loan_to_income_ratio']),
#     go.Bar(name='Approved-Non-Latino', x=tmp[(tmp['approve_bin']==1) & (tmp.latino==0)]['applicant_race_name_1'], y=tmp[(tmp['approve_bin']==1) & (tmp.latino==0)]['loan_to_income_ratio']),
#     go.Bar(name='Denied-Non-Latino', x=tmp[(tmp['approve_bin']==0) & (tmp.latino==0)]['applicant_race_name_1'], y=tmp[(tmp['approve_bin']==0) & (tmp.latino==0)]['loan_to_income_ratio'])
    
# ], layout={'title':'Loan Amount to Income by Race','xaxis_title':'Race', 'yaxis_title':'Loan-amount-to-income-ratio', 'title_x':0.5})
# fig.update_layout(xaxis_tickangle = -45, height=600, width=1400, title_x=0.5)
# fig.show()

In [32]:
# # Per metro area: percentage of non-White applicants vs. the mean
# # `minority_population` value of the rows in that area.
# # Depends on `race_to_remove` from an earlier cell.
# tmp_minority = df[df.applicant_race_name_1 != 'White'][['msamd_name']].groupby(['msamd_name']).size().reset_index().rename(columns={0:'minority'})
# # NOTE(review): the rename here is a no-op -- after .mean() the column is still
# # called 'minority_population' (which is what the figure reads), not 0.
# tmp = df[['msamd_name','minority_population']].groupby(['msamd_name']).mean().reset_index().rename(columns={0:'minority_avg'})

# # NOTE(review): numerator and denominator use different populations.  The
# # numerator counts every non-White row, including the `race_to_remove`
# # categories, while the denominator excludes those categories -- so the
# # percentage is inflated.  Apply the same `race_to_remove` filter to both.
# # The division also aligns the two results on their default integer index, so
# # it assumes both groupbys return the same metro areas in the same order.
# tmp_minority['minority_percentage'] = tmp_minority['minority']/df[~(df.applicant_race_name_1.isin(race_to_remove))].groupby(['msamd_name']).size().reset_index().rename(columns={0:'count'})['count']*100
# #.apply(lambda x: 100 * x / df[~(df.applicant_race_name_1.isin(race_to_remove))].shape[0]).values

# fig = go.Figure(data=[
#     go.Bar(name='Minority Applicants', y=tmp_minority['msamd_name'], x=tmp_minority['minority_percentage'], orientation='h'),
#     go.Bar(name='Minority Population', y=tmp['msamd_name'], x=tmp['minority_population'], orientation='h')
# ], 
# layout={'title':'Minority Applicants vs Minority Population','xaxis_title':'Minority Percentage', 'yaxis_title':'MSA/MD','title_x':0.5}
# )
# fig.update_layout(xaxis_tickangle = -45, height=600, width=1400, title_x=0.5)
# fig.show()

In [33]:
# # Count of rows per primary denial reason, bars ordered by descending frequency
# # (the order comes from value_counts()).
# plt.figure(figsize=(15,5))
# ax = sns.countplot(x=df['denial_reason_name_1'], order = df['denial_reason_name_1'].value_counts().index)

# plt.title('Distribution of Denial Reason', size=15)
# plt.xlabel('Denial Reasons', size=12)
# plt.ylabel('Count', size=12)
# # NOTE(review): the tick positions are hard-coded for nine categories; this
# # must be kept in step with the number of distinct denial reasons in the data.
# ax.set(xticks=[0,1,2,3,4,5,6,7,8])
# # Trailing ';' suppresses the list of Text objects the call would otherwise echo.
# ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right');

In [34]:
# # Box plots of applicant income (in $000s) per race, one "Approved" and one
# # "Denied" box for each race.
# # NOTE(review): the import and `race_to_remove` below duplicate an earlier
# # import cell and an earlier definition with the same value; they make this
# # cell self-contained but should live in one place.
# import plotly.graph_objects as go

# race_to_remove = ['Information not Provided', 'Not applicable']
# tmp = df[~df['applicant_race_name_1'].isin(race_to_remove)]

# fig = go.Figure()

# # Two traces are added per distinct race, in the order unique() returns them.
# for race in tmp['applicant_race_name_1'].unique():
#   fig.add_trace(go.Box(y=tmp[(tmp['applicant_race_name_1']==race) & (tmp.approve_bin==1)]['applicant_income_000s'], name ="Approved " +race, line=dict(color='blue')))
#   fig.add_trace(go.Box(y=tmp[(tmp['applicant_race_name_1']==race) & (tmp.approve_bin==0)]['applicant_income_000s'], name="Denied " +race, line=dict(color='palevioletred')))
# fig.update_layout(height = 800, width= 1500)
# fig.show()