# All States
----
This contains the merged DataFrames of California, Texas, New York and Florida.

Data is limited to:
 - Years 2014-2017
 - People over the 20<sup>th</sup> percentile and under the 90<sup>th</sup> percentile by state
 - People in the few major Metro Areas by state

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


In [2]:
def compress_dataframe(df, dictionary):
    """Return a copy of ``df`` with selected columns re-coded through lookup tables.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to translate. It is deep-copied first, so the caller's frame is
        left untouched.
    dictionary : dict
        Maps a column name to the ``{old_value: new_value}`` table that is
        handed to ``Series.map`` for that column.

    Returns
    -------
    pandas.DataFrame
        The re-coded copy. Columns not named in ``dictionary`` are unchanged.
    """
    recoded = df.copy(deep=True)
    for column, lookup in dictionary.items():
        recoded[column] = recoded[column].map(lookup)
    return recoded

In [3]:
def reduce_large_strings(df):
    """Shorten three verbose category labels in every object-dtype column.

    The columns are reassigned on ``df`` itself, so the caller's frame is
    modified, and that same frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose object-dtype columns are scanned for the long labels.

    Returns
    -------
    pandas.DataFrame
        ``df`` with every exact match of a long label replaced by its short form.
    """
    # (long label, short label) pairs; only exact, whole-cell matches are replaced.
    shortened_labels = (
        ("Information not provided by applicant in mail, Internet, or telephone application",
         "Information not Provided"),
        ("Native Hawaiian or Other Pacific Islander",
         "Native Hawaiian/Pacific Islander"),
        ("One-to-four family dwelling (other than manufactured housing)",
         "1-4 Family House"),
    )
    for name in df.columns:
        if not (df[name].dtype == object):
            continue
        for long_label, short_label in shortened_labels:
            df[name] = np.where(df[name] == long_label, short_label, df[name])
    return df

In [4]:
# Code from Murmel on Stack Exchange
# https://stackoverflow.com/questions/1450957/pythons-json-module-converts-int-dictionary-keys-to-strings
def jsonKeys2int(x):
    """Convert the keys of a decoded JSON object to ``int`` where possible.

    Intended as the ``object_hook`` of ``json.load``: JSON object keys are
    always strings, so integer keys have to be restored after decoding.

    Parameters
    ----------
    x : object
        A decoded JSON value. Only ``dict`` instances are processed.

    Returns
    -------
    object
        A new dict with every key passed through ``int`` when all keys convert;
        otherwise ``x`` itself, unchanged (conversion is all-or-nothing per dict).
    """
    if isinstance(x, dict):
        try:
            return {int(k): v for k, v in x.items()}
        except (TypeError, ValueError, OverflowError):
            # At least one key is not an integer literal: keep the dict as decoded.
            # Only conversion errors are caught so that unrelated failures
            # (including KeyboardInterrupt) are no longer silently swallowed.
            return x
    return x

### Read In Merged DF and Decompression Dictionary
----
In order to reduce file size all categorical cells were converted into numbers and additionally compressed into a GZip file.

The next four cells

1. Read in the numerical DataFrame
2. Write an uncompressed CSV copy of that DataFrame to disk
3. Read the JSON file which contains the key:value pairs to decode the DataFrame
4. Execute a function that uses the JSON file to decompress/decode the numerical DataFrame.

In [5]:
# Load the numerically encoded, gzip-compressed merged frame.
# low_memory=False makes pandas read the file in one pass instead of chunk-by-chunk type inference.
df=pd.read_csv('./merged_df/merged_df.csv.gz', low_memory=False)

In [6]:
# Write an uncompressed CSV copy of the frame just loaded (index omitted).
# NOTE(review): no visible cell reads this file back, and it is rewritten on
# every run — confirm it is still needed.
df.to_csv("./merged_df/merged_df.csv", index=False)

In [7]:
# Load the decoding tables. jsonKeys2int is applied to every decoded JSON
# object so that keys which look like integers become real ints again.
with open("./json_files/decompression_dictionary.json") as dictionary_file:
    decompression_dictionary = json.load(
        dictionary_file,
        object_hook=jsonKeys2int,
    )

In [8]:
# Despite the helper's name, this call decodes: every column listed in the
# decompression dictionary is mapped from its numeric code back to its value.
df = compress_dataframe(df, decompression_dictionary)

### Quick Clean

In [9]:
# Replace the three longest category labels with short forms (keeps tables and plot labels readable).
df = reduce_large_strings(df)

In [10]:
# Cast the numeric columns back to their intended dtypes after decoding.
# One mapping keeps each column next to its dtype, so the two cannot drift apart.
# The denial-reason codes are cast to float rather than int because they
# contain missing values (see the null counts further down).
column_dtypes = {
    'as_of_year': int,
    'agency_code': int,
    'loan_type': int,
    'loan_purpose': int,
    'owner_occupancy': int,
    'loan_amount_000s': float,
    'preapproval': int,
    'action_taken': int,
    'msamd': float,
    'census_tract_number': float,
    'applicant_ethnicity': int,
    'co_applicant_ethnicity': int,
    'applicant_race_1': int,
    'co_applicant_race_1': int,
    'applicant_income_000s': float,
    'purchaser_type': int,
    'denial_reason_1': float,
    'denial_reason_2': float,
    'denial_reason_3': float,
    'hoepa_status': int,
    'lien_status': int,
    'population': float,
    'minority_population': float,
    'hud_median_family_income': float,
    'tract_to_msamd_income': float,
    'number_of_owner_occupied_units': float,
    'number_of_1_to_4_family_units': float,
    'latino': int,
    'approve_bin': int,
}
numerical_features = list(column_dtypes.keys())
dtypes = list(column_dtypes.values())
for col, dtype in column_dtypes.items():
    df[col] = df[col].astype(dtype)

In [11]:
# Preview the first rows of the decoded, type-cast frame.
df.head()

Unnamed: 0,as_of_year,respondent_id,agency_name,agency_abbr,agency_code,loan_type_name,loan_type,loan_purpose_name,loan_purpose,owner_occupancy_name,...,lien_status_name,lien_status,population,minority_population,hud_median_family_income,tract_to_msamd_income,number_of_owner_occupied_units,number_of_1_to_4_family_units,latino,approve_bin
0,2014,0000451965,Consumer Financial Protection Bureau,CFPB,9,Conventional,1,Refinancing,3,Owner-occupied as a principal dwelling,...,Secured by a first lien,1,2126.0,85.0,60600.0,109.699997,433.0,528.0,0,1
1,2014,0000451965,Consumer Financial Protection Bureau,CFPB,9,Conventional,1,Refinancing,3,Owner-occupied as a principal dwelling,...,Secured by a first lien,1,13748.0,42.93,72700.0,206.259995,2406.0,2882.0,0,1
2,2014,86-0860478,Department of Housing and Urban Development,HUD,7,Conventional,1,Home purchase,1,Owner-occupied as a principal dwelling,...,Secured by a first lien,1,2995.0,20.030001,72700.0,135.589996,952.0,1128.0,0,1
3,2014,0000480228,Consumer Financial Protection Bureau,CFPB,9,Conventional,1,Refinancing,3,Owner-occupied as a principal dwelling,...,Secured by a first lien,1,5266.0,17.26,72700.0,152.929993,1614.0,1799.0,0,1
4,2014,7197000003,Department of Housing and Urban Development,HUD,7,Conventional,1,Refinancing,3,Owner-occupied as a principal dwelling,...,Secured by a first lien,1,4887.0,75.510002,60600.0,103.040001,778.0,1060.0,0,1


**Baseline Score**
The baseline for the modeling of this study is the mean approval rate of the set.

In [12]:
# Count missing values per column and show the 15 columns with the most.
df.isnull().sum().sort_values(ascending=False).head(15)

denial_reason_3                   4698559
denial_reason_name_3              4698559
denial_reason_name_2              4617303
denial_reason_2                   4617303
denial_reason_name_1              4245896
denial_reason_1                   4245896
number_of_1_to_4_family_units        2138
number_of_owner_occupied_units       1351
tract_to_msamd_income                 850
minority_population                   159
population                            152
hud_median_family_income               56
census_tract_number                    56
applicant_income_000s                   0
purchaser_type_name                     0
dtype: int64

In [13]:
# Frame dimensions (rows, columns) before rows with missing values are dropped.
df.shape

(4715850, 52)

In [14]:
# Drop the rows that are missing any of the sparsely-null columns. The
# denial-reason columns are deliberately not listed: they are empty for most
# rows and would remove almost the whole dataset.
# Rebinding `df` instead of using inplace=True keeps the step a plain
# expression; line continuations are not needed inside the brackets.
required_columns = [
    'number_of_1_to_4_family_units',
    'number_of_owner_occupied_units',
    'tract_to_msamd_income',
    'minority_population',
    'population',
    'hud_median_family_income',
    'census_tract_number',
]
df = df.dropna(axis=0, subset=required_columns)

Rows with missing values in any column other than the denial-reason columns are dropped, as they make up only a small fraction of the dataset (about 3,500 of roughly 4.7 million rows).

In [15]:
# Frame dimensions after the dropna step, to confirm how many rows were removed.
df.shape

(4712341, 52)

In [16]:
# Row counts per loan purpose, largest first.
df['loan_purpose_name'].value_counts(ascending=False)

Home purchase       2327216
Refinancing         2104057
Home improvement     281068
Name: loan_purpose_name, dtype: int64

In [17]:
# Keep only rows with loan_purpose code 1, which is paired with the
# 'Home purchase' label in the frame preview; .copy() detaches the result
# from the unfiltered frame.
df=df[df['loan_purpose']==1].copy(deep=True)

Filtering by loan purpose: we only want to look at the loans that were categorized as 'Home purchase' (`loan_purpose == 1`).

In [18]:
# Baseline: the share of rows with approve_bin == 1, shown as a percentage.
baseline_pct = round(df['approve_bin'].mean() * 100, 2)
print(f"Baseline mean approval of {baseline_pct!s} %")

Baseline mean approval of 78.22 %


In [19]:
# Approval rate for New York rows reported for 2017.
is_new_york = df['state_name'] == 'New York'
is_2017 = df['as_of_year'] == 2017
df.loc[is_new_york & is_2017, 'approve_bin'].mean()

0.7898265768253617

## EDA Descriptive/Numerical Metrics

In [20]:
# List every column of the filtered frame.
df.columns

Index(['as_of_year', 'respondent_id', 'agency_name', 'agency_abbr',
       'agency_code', 'loan_type_name', 'loan_type', 'loan_purpose_name',
       'loan_purpose', 'owner_occupancy_name', 'owner_occupancy',
       'loan_amount_000s', 'preapproval_name', 'preapproval',
       'action_taken_name', 'action_taken', 'msamd_name', 'msamd',
       'state_name', 'county_name', 'census_tract_number',
       'applicant_ethnicity_name', 'applicant_ethnicity',
       'co_applicant_ethnicity_name', 'co_applicant_ethnicity',
       'applicant_race_name_1', 'applicant_race_1', 'co_applicant_race_name_1',
       'co_applicant_race_1', 'applicant_sex_name', 'co_applicant_sex_name',
       'applicant_income_000s', 'purchaser_type_name', 'purchaser_type',
       'denial_reason_name_1', 'denial_reason_1', 'denial_reason_name_2',
       'denial_reason_2', 'denial_reason_name_3', 'denial_reason_3',
       'hoepa_status_name', 'hoepa_status', 'lien_status_name', 'lien_status',
       'population', 'minority

List of the features we have narrowed down to, with the addition of the engineered features `latino` and `approve_bin`.

In [21]:
# Collect the names of all numeric-dtype columns and report how many there are.
numerical_features = df.select_dtypes(include='number').columns.tolist()
print(f'There are {len(numerical_features)} numerical features:', '\n')
print(numerical_features)

There are 29 numerical features: 

['as_of_year', 'agency_code', 'loan_type', 'loan_purpose', 'owner_occupancy', 'loan_amount_000s', 'preapproval', 'action_taken', 'msamd', 'census_tract_number', 'applicant_ethnicity', 'co_applicant_ethnicity', 'applicant_race_1', 'co_applicant_race_1', 'applicant_income_000s', 'purchaser_type', 'denial_reason_1', 'denial_reason_2', 'denial_reason_3', 'hoepa_status', 'lien_status', 'population', 'minority_population', 'hud_median_family_income', 'tract_to_msamd_income', 'number_of_owner_occupied_units', 'number_of_1_to_4_family_units', 'latino', 'approve_bin']


In [22]:
# Collect the names of all non-numeric columns (the name/label columns).
# These are stored under their own name so the numeric column list built in
# the previous cell is not overwritten, and the message says what is listed.
categorical_features = df.select_dtypes(exclude='number').columns.tolist()
print(f'There are {len(categorical_features)} categorical features:', '\n')
print(categorical_features)

There are 23 numerical features: 

['respondent_id', 'agency_name', 'agency_abbr', 'loan_type_name', 'loan_purpose_name', 'owner_occupancy_name', 'preapproval_name', 'action_taken_name', 'msamd_name', 'state_name', 'county_name', 'applicant_ethnicity_name', 'co_applicant_ethnicity_name', 'applicant_race_name_1', 'co_applicant_race_name_1', 'applicant_sex_name', 'co_applicant_sex_name', 'purchaser_type_name', 'denial_reason_name_1', 'denial_reason_name_2', 'denial_reason_name_3', 'hoepa_status_name', 'lien_status_name']


Because we were unfamiliar with the dataset, we chose to keep both the name (label) columns and their numerical encodings for each category.

In [23]:
# Fraction of rows per as_of_year value, largest first.
df['as_of_year'].value_counts(normalize=True,ascending=False)

2017    0.286068
2016    0.270182
2015    0.242829
2014    0.200922
Name: as_of_year, dtype: float64

After the filtering applied to reduce outliers/edge cases, this is the normalized share of records per year: the share rises steadily from about 20% in 2014 to about 29% in 2017.

In [24]:
# Fraction of rows per state, largest first.
df['state_name'].value_counts(normalize=True,ascending=False)

Texas         0.394129
Florida       0.243131
California    0.220412
New York      0.142328
Name: state_name, dtype: float64

Florida and Texas together make up ~64% of the dataset

In [25]:
# Fraction of rows per metro area (msamd_name), largest first.
df['msamd_name'].value_counts(normalize=True,ascending=False)

Houston, The Woodlands, Sugar Land - TX         0.173953
Dallas, Plano, Irving - TX                      0.141893
Los Angeles, Long Beach, Glendale - CA          0.108053
New York, Jersey City, White Plains - NY, NJ    0.091554
Tampa, St. Petersburg, Clearwater - FL          0.085355
Austin, Round Rock - TX                         0.078283
Orlando, Kissimmee, Sanford - FL                0.070349
San Diego, Carlsbad - CA                        0.061470
Oakland, Hayward, Berkeley - CA                 0.050889
Nassau County, Suffolk County - NY              0.050774
Jacksonville - FL                               0.046914
Miami, Miami Beach, Kendall - FL                0.040513
Name: msamd_name, dtype: float64

A quick look at the percentage of samples based on location.

In [26]:
# Fraction of rows per recorded applicant sex.
df['applicant_sex_name'].value_counts(normalize=True)

Male                        0.639681
Female                      0.270836
Information not Provided    0.059391
Not applicable              0.030091
Name: applicant_sex_name, dtype: float64

Majority of applicants are male.

In [27]:
# Fraction of rows per recorded applicant race.
df['applicant_race_name_1'].value_counts(normalize=True)

White                               0.653445
Asian                               0.118726
Information not Provided            0.112333
Black or African American           0.075130
Not applicable                      0.030219
Native Hawaiian/Pacific Islander    0.005092
American Indian or Alaska Native    0.005054
Name: applicant_race_name_1, dtype: float64

Majority applicant race is white with 65% of the samples falling in that category and additionally 'Information not Provided' is the third highest category by sample number.

In [28]:
# Approval rate (mean of approve_bin) per applicant race, California rows only.
(
    df.loc[df['state_name'].eq('California')]
      .groupby('applicant_race_name_1')['approve_bin']
      .mean()
)

applicant_race_name_1
American Indian or Alaska Native    0.794494
Asian                               0.812797
Black or African American           0.755984
Information not Provided            0.769643
Native Hawaiian/Pacific Islander    0.803544
Not applicable                      0.998907
White                               0.816982
Name: approve_bin, dtype: float64

In [29]:
# Approval rate (mean of approve_bin) per applicant race, Texas rows only.
(
    df.loc[df['state_name'].eq('Texas')]
      .groupby('applicant_race_name_1')['approve_bin']
      .mean()
)

applicant_race_name_1
American Indian or Alaska Native    0.726911
Asian                               0.774367
Black or African American           0.721438
Information not Provided            0.700442
Native Hawaiian/Pacific Islander    0.786033
Not applicable                      0.994961
White                               0.802314
Name: approve_bin, dtype: float64

In [30]:
# Approval rate (mean of approve_bin) per applicant race, Florida rows only.
(
    df.loc[df['state_name'].eq('Florida')]
      .groupby('applicant_race_name_1')['approve_bin']
      .mean()
)

applicant_race_name_1
American Indian or Alaska Native    0.698212
Asian                               0.731054
Black or African American           0.692121
Information not Provided            0.673995
Native Hawaiian/Pacific Islander    0.722169
Not applicable                      0.997956
White                               0.761812
Name: approve_bin, dtype: float64

In [31]:
# Approval rate (mean of approve_bin) per applicant race, New York rows only.
(
    df.loc[df['state_name'].eq('New York')]
      .groupby('applicant_race_name_1')['approve_bin']
      .mean()
)

applicant_race_name_1
American Indian or Alaska Native    0.675223
Asian                               0.761723
Black or African American           0.708471
Information not Provided            0.699794
Native Hawaiian/Pacific Islander    0.707952
Not applicable                      0.999043
White                               0.789247
Name: approve_bin, dtype: float64

Grouping by race within each state shows some trends.  In all four states, applicants recorded as Black/African American have lower approval rates than the majority class (White applicants).  A significant number of applicants have no race information provided, and because this category's approval rate is lower than the majority class's in every state, it is quite possible that it contains more minority applicants than majority-class applicants.  It should be noted that applicants recorded as Asian in California are nearly at parity with White applicants.

In [32]:
# Approval rate (mean of approve_bin) for each purchaser type.
df.groupby('purchaser_type_name').approve_bin.mean()

purchaser_type_name
Affiliate institution                                                           1.000000
Commercial bank, savings bank or savings association                            1.000000
Fannie Mae (FNMA)                                                               1.000000
Farmer Mac (FAMC)                                                               1.000000
Freddie Mac (FHLMC)                                                             1.000000
Ginnie Mae (GNMA)                                                               1.000000
Life insurance company, credit union, mortgage bank, or finance company         1.000000
Loan was not originated or was not sold in calendar year covered by register    0.424595
Other type of purchaser                                                         1.000000
Private securitization                                                          1.000000
Name: approve_bin, dtype: float64

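This feature will not be useful in modeling: every loan with an actual purchaser is approved (rate of 1.0), so the purchaser type is effectively known only after the decision and would leak the outcome. It could still be interesting to look at the demographic splits of the loan purchasers.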

In [33]:
# Approval rate (mean of approve_bin) for each lien status.
df.groupby('lien_status_name').approve_bin.mean()

lien_status_name
Not applicable                   1.000000
Secured by a first lien          0.754223
Secured by a subordinate lien    0.707178
Name: approve_bin, dtype: float64

In [57]:
# Per-metro-area means of the approval flag, tract minority share, applicant
# income and HUD median family income, ordered from highest to lowest approval rate.
metro_columns = [
    'approve_bin',
    'minority_population',
    'applicant_income_000s',
    'hud_median_family_income',
]
(
    df.groupby('msamd_name')[metro_columns]
      .mean()
      .sort_values(by='approve_bin', ascending=False)
)

Unnamed: 0_level_0,approve_bin,minority_population,applicant_income_000s,hud_median_family_income
msamd_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"Oakland, Hayward, Berkeley - CA",0.825592,58.090199,136.044643,93787.305474
"San Diego, Carlsbad - CA",0.818685,46.570498,123.73506,75042.66401
"Nassau County, Suffolk County - NY",0.81065,29.801468,115.486459,107929.220054
"Los Angeles, Long Beach, Glendale - CA",0.802034,64.710707,122.361282,62899.148185
"Dallas, Plano, Irving - TX",0.794837,38.509134,104.831316,71334.193177
"Austin, Round Rock - TX",0.787037,40.546846,103.894638,78044.039477
"Houston, The Woodlands, Sugar Land - TX",0.774528,48.212212,102.109012,69768.306136
"New York, Jersey City, White Plains - NY, NJ",0.767156,50.262382,116.144294,71743.614859
Jacksonville - FL,0.759816,26.001307,86.963519,64006.2338
"Tampa, St. Petersburg, Clearwater - FL",0.759238,30.248869,87.027925,58988.255135


Florida, and Miami Beach in particular, lags behind in loan approval rate and also in mean applicant income.  Miami Beach has the lowest approval rate of the metro areas we are looking at and also the highest mean percentage of minority population.  It should be noted that the metro areas with the next two highest mean minority-population percentages are both in California, so applicant income is clearly a strong deciding factor as well; however, one must consider that the standard of living in these locations is very different.  Adjustments would need to be made to reflect the effect of applicant income more accurately.  It is interesting to see how the Housing and Urban Development (HUD) median family income numbers stack up against the approval rates (per the data dictionary, the median family income is reported per MSA/MD).  Overall, applicant income looks to have the stronger correlation with approval.

In [51]:
# Row counts for every (loan type, owner occupancy) combination, largest first.
df[['loan_type_name','owner_occupancy_name']].value_counts()

loan_type_name      owner_occupancy_name                      
Conventional        Owner-occupied as a principal dwelling        1433730
FHA-insured         Owner-occupied as a principal dwelling         501905
VA-guaranteed       Owner-occupied as a principal dwelling         198002
Conventional        Not owner-occupied as a principal dwelling     173587
FSA/RHS-guaranteed  Owner-occupied as a principal dwelling          18580
Conventional        Not applicable                                    727
FHA-insured         Not owner-occupied as a principal dwelling        343
                    Not applicable                                    209
VA-guaranteed       Not owner-occupied as a principal dwelling         53
                    Not applicable                                     52
FSA/RHS-guaranteed  Not applicable                                     16
                    Not owner-occupied as a principal dwelling         12
dtype: int64

Loan type is plotted further below. Most of the loans are taken out for a home the owner will occupy as a principal dwelling.

In [52]:
# Row counts per recorded co-applicant ethnicity, largest first.
df['co_applicant_ethnicity_name'].value_counts()

No co-applicant             1217173
Not Hispanic or Latino       729422
Hispanic or Latino           195289
Information not Provided     126348
Not applicable                58984
Name: co_applicant_ethnicity_name, dtype: int64

Slightly more than half of the applications have no co-applicant, and among those that do, most co-applicants do not fall under the Hispanic or Latino designation.

In [53]:
# Row counts per recorded co-applicant race, largest first.
df['co_applicant_race_name_1'].value_counts()

No co-applicant                     1217173
White                                730160
Information not Provided             134935
Asian                                114283
Black or African American             60584
Not applicable                        58634
Native Hawaiian/Pacific Islander       6557
American Indian or Alaska Native       4890
Name: co_applicant_race_name_1, dtype: int64

Again, most applications are filed without a co-applicant.  Among the applications that do have a co-applicant, the majority of co-applicants are recorded as White.

In [54]:
# Row counts per HOEPA status.
df['hoepa_status_name'].value_counts()

Not a HOEPA loan    2327016
HOEPA loan              200
Name: hoepa_status_name, dtype: int64

Very few loans fall under the Home Ownership and Equity Protection Act (HOEPA).  The 200 entries are all approved, so if a further study were done with a larger number of samples in this category, it might be worth investigating the applicant parameters for loans with this designation.

## Plots

In [None]:
# Counts of applications per applicant race, coloured by the approval flag.
plt.figure(figsize=(10, 7), facecolor="w")

sns.histplot(data=df, x="applicant_race_name_1", hue="approve_bin")
# Rotate the long race labels so they do not overlap.
plt.xticks(rotation=45)

plt.title("Count of Applicants by Race:\nApproved, Not Approved", size=24)
plt.xlabel("Applicant Race", size=14)
plt.ylabel("Count", size=14);

In [None]:
# Counts of non-approved applications (approve_bin == 0) per applicant race,
# stacked by the first recorded denial reason.
plt.figure(figsize=(10, 7), facecolor="w")


sns.histplot(data=df[(df["approve_bin"] == 0)], x="applicant_race_name_1", hue="denial_reason_name_1",multiple='stack')
# Rotate the long race labels so they do not overlap.
plt.xticks(rotation=45)

plt.title("Count of Applicants Denied by Race:\nSplit by Denied Reason", size=24)
plt.xlabel("Applicant Race", size=14)
plt.ylabel("Count", size=14);

In [None]:
# Approved / not-approved counts per loan type, with each bar annotated by
# its share of that loan type's total.
plt.figure(figsize=(10, 10), facecolor="w")

# Per-loan-type totals, most frequent first. The same order is passed to
# countplot so every bar is divided by the total of its own category; by
# default seaborn orders string categories by first appearance, not by frequency.
totals = df["loan_type_name"].value_counts()
order = totals.index.tolist()

ax = sns.countplot(data=df, x="loan_type_name", hue="approve_bin", order=order)

for patch in ax.patches:
    height = patch.get_height()
    # Skip rectangles that are not real bars (empty combinations, zero-size artists).
    if not np.isfinite(height) or height <= 0 or patch.get_width() == 0:
        continue
    # Categories sit at integer x positions 0..n-1 and the hue bars are drawn
    # next to that position, so rounding the bar centre recovers the category.
    position = int(round(patch.get_x() + patch.get_width() / 2))
    if not 0 <= position < len(order):
        continue
    percent = f"{(height / totals.iloc[position]):.2%}"
    x = patch.get_x() + patch.get_width()/6
    y = height / 2
    # The least frequent loan type has bars too short to hold a label, so put it above.
    if position == len(order) - 1:
        y = height + 1000
    ax.text(x, y, percent, color="k")

plt.title("Loan Approval by Type", size=24)
plt.xlabel("Loan Type", size=14)
plt.ylabel("Count", size=14);

In [None]:
# Approved / not-approved counts per applicant race, with each bar annotated
# by its share of that race's total.
fig = plt.figure(figsize=(10, 10), facecolor="w")

# Per-race totals, most frequent first. The same order is passed to countplot
# so every bar is divided by the total of its own category; by default seaborn
# orders string categories by first appearance, not by frequency.
totals = df["applicant_race_name_1"].value_counts()
order = totals.index.tolist()

ax = sns.countplot(data=df, x="applicant_race_name_1", hue="approve_bin", order=order);
plt.xticks(rotation=45)

for patch in ax.patches:
    height = patch.get_height()
    # Skip rectangles that are not real bars (empty combinations, zero-size artists).
    if not np.isfinite(height) or height <= 0 or patch.get_width() == 0:
        continue
    # Categories sit at integer x positions 0..n-1 and the hue bars are drawn
    # next to that position, so rounding the bar centre recovers the category.
    position = int(round(patch.get_x() + patch.get_width() / 2))
    if not 0 <= position < len(order):
        continue
    percent = f"{(height / totals.iloc[position]):.2%}"
    x = patch.get_x() + patch.get_width()/3
    y = height + 2000
    ax.text(x, y, percent, color="k", rotation=90)

# Title and axis label describe what is plotted here (race, not loan type).
plt.title("Loan Approval by Applicant Race", size=24)
plt.xlabel("Applicant Race", size=14)
plt.ylabel("Count", size=14);

In [None]:
# Counts per loan type, split by the recorded action taken on the application.
plt.figure(figsize=(10, 10), facecolor="w")

sns.countplot(data=df, x="loan_type_name", hue="action_taken_name");
plt.xticks(rotation=45)
plt.title("Loans by Type: Split by Result", size=24)
plt.xlabel("Loan Type", size=14);

| Race Name | Race Number  |
| --------- | ------------ |
| American Indian or Alaska Native | 1 |
| Asian | 2 |
| Black or African American | 3 |
| Native Hawaiian or Other Pacific Islander | 4 |
| White | 5 |
| Information not provided by applicant | 6 |
| N/A | 7 |
| No co-applicant | 8 |

In [None]:
# 2x3 grid of univariate distributions: loan amount, applicant race code and
# first denial reason on the top row; applicant income, tract minority share
# and second denial reason on the bottom row.
fig, axes = plt.subplots(2,3, figsize=(18,10), facecolor="w")

fig.suptitle('Plots Set 1')

# `data` must be the frame itself; indexing df with the loan-amount Series
# would treat the amounts as column labels and raise a KeyError.
sns.histplot(ax=axes[0,0], data=df,x='loan_amount_000s')
sns.histplot(ax=axes[0,1], data=df,x='applicant_race_1')
sns.histplot(ax=axes[0,2], data=df,x='denial_reason_1')
sns.kdeplot(ax=axes[1,0], data=df,x='applicant_income_000s')
sns.kdeplot(ax=axes[1,1], data=df,x='minority_population')
sns.histplot(ax=axes[1,2], data=df,x='denial_reason_2');

In [None]:
# Distribution of applicant income per applicant race code, split by the
# approval flag. (The function is `violinplot`; the misspelled name raised
# AttributeError.)
plt.figure(figsize=(15,8), facecolor="w")
sns.violinplot(data=df,x='applicant_race_1',y='applicant_income_000s',
               hue='approve_bin')
plt.title('Violin Plots of Applicant Income by Race and Approval', size=20);

In [None]:
# applicant race : 1 = American Indian or Alaska Native, 2 = Asian, 3 = Black or African American, 4 = Native Hawaiian or Other Pacific Islander
# applicant race : 5 = White, 6 = Information not provided by applicant, 7 = N/A, 8 = No co-applicant

# denial reasons : 1 = debt-to-income ratio, 2 = employment history, 3 = credit history, 4 = collateral, 5 = insufficient cash, 6 = unverifiable information
# denial reasons : 7 = credit application incomplete, 8 = mortgage insurance denied, 9 = other

In [None]:
# Histogram (raw counts, 50 bins) of applicant income.
# NOTE: sns.set changes the global plotting style for every later figure too.
sns.set(style='whitegrid', palette="deep", font_scale=1.1, rc={"figure.figsize": [20, 10]})
# histplot replaces the deprecated distplot; it draws counts without a KDE by default.
ax = sns.histplot(df['applicant_income_000s'], bins=50, alpha=1)
ax.set(xlabel='Applicant Income Levels (Thousands)', ylabel='Count')
# One tick every 50 (thousand) between the smallest and largest income.
plt.xticks(list(range(int(df['applicant_income_000s'].min()),
                      int(df['applicant_income_000s'].max()),
                      50)))
plt.title('Application Income Levels');

## Plotly!

In [None]:
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

In [None]:
# Loan-amount histogram for applicants whose recorded race is not 'White',
# restricted to loan_amount_000s < 600 and stacked by the approval flag.
is_non_white = df['applicant_race_name_1'] != 'White'
below_loan_cap = df['loan_amount_000s'] < 600

fig = px.histogram(
    df[is_non_white & below_loan_cap],
    x='loan_amount_000s',
    color='approve_bin',
    nbins=200,
    color_discrete_sequence=["red", "blue"],
    barmode='stack',
    title='Non-White Applicant Loan Amount Coded by Approved/Disapproved Count',
)
fig.update_layout(width=1400, height=450)
fig.show()

In [None]:
# Loan-amount histogram for applicants recorded as 'White', restricted to
# loan_amount_000s < 500 and coloured by the approval flag.
is_white = df['applicant_race_name_1'] == 'White'
below_loan_cap = df['loan_amount_000s'] < 500

fig = px.histogram(
    df[is_white & below_loan_cap],
    x='loan_amount_000s',
    color='approve_bin',
    nbins=100,
    color_discrete_sequence=["red", "blue"],
    title='White Applicant Loan Amount Coded by Approved/Disapproved Count',
)
fig.show()

In [None]:
# Histogram of the tract minority-population percentage for applicants whose
# recorded race is not 'White' (loan_amount_000s < 500), coloured by approval flag.
is_non_white = df['applicant_race_name_1'] != 'White'
below_loan_cap = df['loan_amount_000s'] < 500

fig = px.histogram(
    df[is_non_white & below_loan_cap],
    x='minority_population',
    color='approve_bin',
    nbins=100,
    color_discrete_sequence=["red", "blue"],
    title='Non-White Applicant Minority Population Tract % Coded by Approved/Disapproved Count',
)
fig.update_layout(width=1400, height=450)
fig.show()

In [None]:
# Histogram of the tract minority-population percentage for applicants
# recorded as 'White' (loan_amount_000s < 500), coloured by approval flag.
is_white = df['applicant_race_name_1'] == 'White'
below_loan_cap = df['loan_amount_000s'] < 500

fig = px.histogram(
    df[is_white & below_loan_cap],
    x='minority_population',
    color='approve_bin',
    nbins=100,
    color_discrete_sequence=["red", "blue"],
    title='White Applicant Minority Population Tract % Coded by Approved/Disapproved Count',
)
fig.show()

In [None]:
# Applicant-income histogram for applicants whose recorded race is not
# 'White', restricted to applicant_income_000s < 300, coloured by approval flag.
fig=px.histogram(df[~(df['applicant_race_name_1']=='White') &
                   (df['applicant_income_000s']<300)],
                 x='applicant_income_000s',
                 color='approve_bin',
                 nbins=100,
                 color_discrete_sequence=["red", "blue"],
                 title='Non-White Applicant Income Coded by Approved/Disapproved Count'
                 )
# Evenly spaced x ticks every 40, starting at 40. The settings were previously
# only stored in a variable and never applied to the figure.
xaxis = dict(
    tickmode = 'linear',
    tick0=40,
    dtick=40)
fig.update_layout(xaxis=xaxis)
fig.show()

In [None]:
# Applicant-income histogram for applicants recorded as 'White', restricted
# to applicant_income_000s < 300, coloured by approval flag.
# A title is set so the figure can be read on its own.
fig=px.histogram(df[(df['applicant_race_name_1']=='White') &
                   (df['applicant_income_000s']<300)],
                 x='applicant_income_000s',
                 color='approve_bin',
                 nbins=100,
                 color_discrete_sequence=["red", "blue"],
                 title='White Applicant Income Coded by Approved/Disapproved Count'
                 )
fig.show()

In [None]:
# Fraction of rows per value of the engineered `latino` flag.
df['latino'].value_counts(normalize=True)