# All States
----
This contains the merged DataFrames of California, Texas, New York and Florida.

Data is limited to:
 - Years 2014-2017
 - People over the 20<sup>th</sup> percentile and under the 90<sup>th</sup> percentile by state
 - People in the few major Metro Areas by state

In [1]:
import json
import math
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold

In [2]:
def compress_dataframe(df, dictionary):
    df = df.copy(deep=True)
    
    for col in dictionary.keys():
        df[col] = df[col].map(dictionary[col])
    return df

In [3]:
def reduce_large_strings(df):
    to_replace = ["Information not provided by applicant in mail, Internet, or telephone application",
                  "Native Hawaiian or Other Pacific Islander",
                  "One-to-four family dwelling (other than manufactured housing)"]
    replace_with = ["Information not Provided",
                    "Native Hawaiian/Pacific Islander",
                    "1-4 Family House"]
    for col in df.columns:
        if df[col].dtype == object:
            for i, _ in enumerate(to_replace):
                df[col] = np.where(df[col] == to_replace[i], replace_with[i], df[col])            
    return df

In [4]:
# Code from Murmel on Stack Exchange
# https://stackoverflow.com/questions/1450957/pythons-json-module-converts-int-dictionary-keys-to-strings
def jsonKeys2int(x):
    if isinstance(x, dict):
        try:
            return {int(k):v for k,v in x.items()}
        except:
            pass #bad form
    return x

### Read In Merged DF and Decompression Dictionary
----
In order to reduce file size all categorical cells were converted into numbers and additionally compressed into a GZip file.

The next three cells

1. Reads in the numerical Dataframe
2. Reads the JSON file which contains the key:value pairs to decode the DataFrame
3. Executes a function that uses the JSON file to decompress/decode the numerical DataFrame.

In [6]:
df=pd.read_csv('./merged_df/merged_df.csv.gz', low_memory=False)

In [7]:
df.to_csv("./merged_df/merged_df.csv", index=False)

In [8]:
with open("./json_files/decompression_dictionary.json", "r") as json_file:
    decompression_dictionary = json.load(json_file, object_hook=jsonKeys2int)

In [9]:
df = compress_dataframe(df, decompression_dictionary)

### Quick Clean

In [10]:
df = reduce_large_strings(df)

In [11]:
#cast numerical features back to correct values
numerical_features = ['as_of_year', 'agency_code', 'loan_type', 'loan_purpose',
                      'owner_occupancy', 'loan_amount_000s', 'preapproval',
                      'action_taken', 'msamd', 'census_tract_number', 'applicant_ethnicity',
                      'co_applicant_ethnicity', 'applicant_race_1', 'co_applicant_race_1',
                      'applicant_income_000s', 'purchaser_type', 'denial_reason_1',
                      'denial_reason_2', 'denial_reason_3', 'hoepa_status', 'lien_status',
                      'population', 'minority_population', 'hud_median_family_income',
                      'tract_to_msamd_income', 'number_of_owner_occupied_units',
                      'number_of_1_to_4_family_units', 'latino', 'approve_bin']
dtypes = [int, int, int, int, int, float, int, int, float, float, int, int, int,int,
          float, int, float, float, float, int, int, float, float, float, float, float,
          float, int, int]
for col, dtype in zip(numerical_features, dtypes):
    df[col] = df[col].astype(dtype)

In [14]:
df.isnull().sum().sort_values(ascending=False).head(15)

denial_reason_3                   4698559
denial_reason_name_3              4698559
denial_reason_name_2              4617303
denial_reason_2                   4617303
denial_reason_name_1              4245896
denial_reason_1                   4245896
number_of_1_to_4_family_units        2138
number_of_owner_occupied_units       1351
tract_to_msamd_income                 850
minority_population                   159
population                            152
hud_median_family_income               56
census_tract_number                    56
applicant_income_000s                   0
purchaser_type_name                     0
dtype: int64

In [15]:
df.shape

(4715850, 52)

In [16]:
df.dropna(axis=0,subset=['number_of_1_to_4_family_units','number_of_owner_occupied_units',\
                         'tract_to_msamd_income','minority_population','population',\
                         'hud_median_family_income','census_tract_number'], inplace=True)

Dropping the NAN entries that are not the denial reasons, as they make up only a small fraction of the dataset

In [17]:
df.shape

(4712341, 52)

| Race Name | Race Number  |
| --------- | ------------ |
| American Indian or Alaska Native | 1 |
| Asian | 2 |
| Black or African American | 3 |
| Native Hawaiian or Other Pacific Islander | 4 |
| White | 5 |
| Information not provided by applicant | 6 |
| N/A | 7 |
| No co-applicant | 8 |

#### in this model we are looking at home purchases

In [26]:
df = df[df["loan_purpose_name"] == "Home purchase"].copy(deep=True)

#### the not applicable sex name for applicant had almost 100% loan rate so this was deemed to be not good for the model

In [27]:
df=df[~(df['applicant_sex_name']=='Not applicable')].copy(deep=True)

In [28]:
### initial test run for new york - uncomment if you want smaller sample (NY Metro)
### df = df[(df["as_of_year"] == 2017) & (df["state_name"] == "New York")]

In [29]:
df.shape

(2257187, 52)

In [30]:
df.isna().sum()

as_of_year                              0
respondent_id                           0
agency_name                             0
agency_abbr                             0
agency_code                             0
loan_type_name                          0
loan_type                               0
loan_purpose_name                       0
loan_purpose                            0
owner_occupancy_name                    0
owner_occupancy                         0
loan_amount_000s                        0
preapproval_name                        0
preapproval                             0
action_taken_name                       0
action_taken                            0
msamd_name                              0
msamd                                   0
state_name                              0
county_name                             0
census_tract_number                     0
applicant_ethnicity_name                0
applicant_ethnicity                     0
co_applicant_ethnicity_name       

In [31]:
#### a copy of the dataset just in case
df_copy = df.copy()

In [32]:
#### hard coded drops of columns we don't need for sure
df.drop(columns = ['respondent_id',
                   'denial_reason_name_1','denial_reason_name_2','denial_reason_name_3',
                   'denial_reason_1','denial_reason_2','denial_reason_3','action_taken_name','action_taken',
                  'purchaser_type_name','lien_status_name','county_name','agency_abbr',
                  'agency_code','msamd','owner_occupancy','loan_purpose_name','lien_status','purchaser_type','hoepa_status_name'],
        inplace = True)

In [33]:
#### grab the columns that have "name" in the ... name and then drop those if there is a numerical analog
name_analog = [x.replace("_name","") for x in df.columns if x.find("name") > 0]
name_analog_clean = [x for x in name_analog if x in df.columns]

In [34]:
df.drop(name_analog_clean, axis=1, inplace=True)

In [35]:
df.columns

Index(['as_of_year', 'agency_name', 'loan_type_name', 'loan_purpose',
       'owner_occupancy_name', 'loan_amount_000s', 'preapproval_name',
       'msamd_name', 'state_name', 'census_tract_number',
       'applicant_ethnicity_name', 'co_applicant_ethnicity_name',
       'applicant_race_name_1', 'co_applicant_race_name_1',
       'applicant_sex_name', 'co_applicant_sex_name', 'applicant_income_000s',
       'hoepa_status', 'population', 'minority_population',
       'hud_median_family_income', 'tract_to_msamd_income',
       'number_of_owner_occupied_units', 'number_of_1_to_4_family_units',
       'latino', 'approve_bin'],
      dtype='object')

In [36]:
#purchaser type -> leakage -> since it implies a loan was originated
#df.groupby('purchaser_type').approve_bin.mean()

Purchaser_type_name and lien_status_name are features that leak into our target variable so we needed to drop them from our model

In [37]:
print('Baseline mean approval of', round((df['approve_bin'].mean())*100,2),'%')

Baseline mean approval of 77.54 %


In [84]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns = 'approve_bin', axis=1), 
                                                    df.approve_bin, test_size=0.75, random_state=42)

In [85]:
numerical_features = X_train.select_dtypes(include='number').columns.tolist()
print(f'There are {len(numerical_features)} numerical features:', '\n')
print(numerical_features)

There are 13 numerical features: 

['as_of_year', 'loan_purpose', 'loan_amount_000s', 'census_tract_number', 'applicant_income_000s', 'hoepa_status', 'population', 'minority_population', 'hud_median_family_income', 'tract_to_msamd_income', 'number_of_owner_occupied_units', 'number_of_1_to_4_family_units', 'latino']


In [86]:
categorical_features = X_train.select_dtypes(exclude='number').columns.tolist()
print(f'There are {len(categorical_features)} categorical features:', '\n')
print(categorical_features)

There are 12 categorical features: 

['agency_name', 'loan_type_name', 'owner_occupancy_name', 'preapproval_name', 'msamd_name', 'state_name', 'applicant_ethnicity_name', 'co_applicant_ethnicity_name', 'applicant_race_name_1', 'co_applicant_race_name_1', 'applicant_sex_name', 'co_applicant_sex_name']


In [87]:
numeric_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', StandardScaler())
])

categorical_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('one-hot', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

In [88]:
full_processor = ColumnTransformer(transformers=[
    ('number', numeric_pipeline, numerical_features),
    ('category', categorical_pipeline, categorical_features)
])


#### Logistic : First Pass

In [46]:
logreg = LogisticRegression(random_state=42)

logreg_pipeline = Pipeline(steps=[
    ('preprocess', full_processor),
    ('clf', logreg)
])

logreg_grid = [{'clf__penalty': ['l2'],
                'clf__C': [1.0,0.5,0.1],
                'clf__solver': ['sag'],
                'clf__max_iter': [1000]
               }] 

logreg_cv = GridSearchCV(logreg_pipeline, logreg_grid, 
                      cv=10, 
                      scoring='accuracy',
                        n_jobs=-1)

logreg_cv.fit(X_train,y_train)

print("tuned hpyerparameters :(best parameters) ",logreg_cv.best_params_)
print("accuracy :",logreg_cv.best_score_)

tuned hpyerparameters :(best parameters)  {'clf__C': 0.5, 'clf__max_iter': 1000, 'clf__penalty': 'l2', 'clf__solver': 'sag'}
accuracy : 0.7756354822652106




In [47]:
coeff_val = logreg_cv.best_estimator_.named_steps.clf.coef_


In [48]:
best_cat = logreg_cv.best_estimator_.named_steps['preprocess'].transformers_[1][1].named_steps['one-hot'].get_feature_names(categorical_features)

In [49]:
coeff = numerical_features

In [50]:
coeff.extend(best_cat)

In [51]:
coeff_zip = zip(coeff,coeff_val[0])

In [52]:
c_l = list(coeff_zip)

In [53]:
df_fin =  pd.DataFrame(c_l,columns = ['Coefficient', 'Value'])

In [54]:
df_fin['abs_Val'] = df_fin['Value'].map(lambda x : abs(x))

In [55]:
df_fin.sort_values(by = 'abs_Val', ascending  = False)[:50]

Unnamed: 0,Coefficient,Value,abs_Val
35,"msamd_name_Nassau County, Suffolk County - NY",0.671963,0.671963
16,agency_name_Federal Reserve System,0.591762,0.591762
37,"msamd_name_Oakland, Hayward, Berkeley - CA",0.569985,0.569985
42,state_name_Florida,-0.390041,0.390041
36,"msamd_name_New York, Jersey City, White Plains...",-0.387501,0.387501
17,agency_name_National Credit Union Administration,-0.370783,0.370783
47,applicant_ethnicity_name_Not Hispanic or Latino,0.332194,0.332194
34,"msamd_name_Miami, Miami Beach, Kendall - FL",-0.328223,0.328223
41,state_name_California,0.327349,0.327349
48,applicant_ethnicity_name_Not applicable,-0.301469,0.301469


## PCA

In [None]:
#### reseting the numerical and categorical features and pipelines because the PCA pipline below did not originally like the 
#### coefficient exploration above 
numerical_features = X_train.select_dtypes(include='number').columns.tolist()
print(f'There are {len(numerical_features)} numerical features:', '\n')
print(numerical_features)

categorical_features = X_train.select_dtypes(exclude='number').columns.tolist()
print(f'There are {len(categorical_features)} categorical features:', '\n')
print(categorical_features)

numeric_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', StandardScaler())
])

categorical_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('one-hot', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

full_processor = ColumnTransformer(transformers=[
    ('number', numeric_pipeline, numerical_features),
    ('category', categorical_pipeline, categorical_features)
])

In [89]:
pca = PCA()
log_reg = LogisticRegression()

log_reg_pipe = Pipeline(steps=[
    ('preprocess', full_processor),
    ('pca',pca),
    ('model', log_reg)
])

In [90]:
param_grid = {
    "pca__n_components": [5, 10, 15, 20],
    "model__C": np.logspace(-4, 4, 4)
}
search = GridSearchCV(log_reg_pipe, param_grid, n_jobs=2)
search.fit(X_train, y_train)
print("Best score (CV score=%0.3f):" % search.best_score_)
print("Best params:", search.best_params_)


Best score (CV score=0.776):
Best params: {'model__C': 0.046415888336127774, 'pca__n_components': 20}


### PCA quick analysis

In [91]:
# ~80 % explained by the first 10 components!
pca.fit(full_processor.fit_transform(X_train_2))
sum(pca.explained_variance_ratio_[:15])

0.8522885291833828

In [92]:
pca1_evec = pca.components_[0]
for weight, event in zip(pca1_evec, df.iloc[:,1:].columns):
    print(event, weight)

agency_name -0.014187994775458089
loan_type_name -5.551115123125783e-17
loan_purpose 0.07039225701378193
owner_occupancy_name -0.14570020304589204
loan_amount_000s 0.027244202537898277
preapproval_name -0.0005677568505217667
msamd_name -0.5397214088680111
state_name 0.09397167331730324
census_tract_number 0.042429569671753306
applicant_ethnicity_name -0.16738264794326269
co_applicant_ethnicity_name -0.5611213332658109
applicant_race_name_1 -0.551365442984059
co_applicant_race_name_1 0.02674417640752844
applicant_sex_name 0.012439813397180154
co_applicant_sex_name -0.013225489718138068
applicant_income_000s -0.0009816439573668298
hoepa_status -0.0009268282399198638
population 0.001115719155925229
minority_population 0.001578429362319375
hud_median_family_income 0.01191413857798993
tract_to_msamd_income -0.002230513321053439
number_of_owner_occupied_units -0.0014160927894623914
number_of_1_to_4_family_units -0.008267532467474118
latino 4.06670851848809e-05
approve_bin 0.00623754162553021

In [93]:
X = df.drop(columns = 'approve_bin', axis=1)
pca_df = pd.DataFrame(pca.transform(full_processor.fit_transform(X)),
                      columns=['pr_comp_' + str(i) for i in range(full_processor.fit_transform(X).shape[1])])
pca_df.head()

Unnamed: 0,pr_comp_0,pr_comp_1,pr_comp_2,pr_comp_3,pr_comp_4,pr_comp_5,pr_comp_6,pr_comp_7,pr_comp_8,pr_comp_9,...,pr_comp_67,pr_comp_68,pr_comp_69,pr_comp_70,pr_comp_71,pr_comp_72,pr_comp_73,pr_comp_74,pr_comp_75,pr_comp_76
0,1.095432,0.968579,-1.10329,0.996589,-1.076046,-0.989158,0.055898,0.048601,0.727745,0.801911,...,-9.921459e-07,-1.1e-05,7.706636e-07,-8.623413e-07,2e-06,-1e-06,-1e-06,-4.631126e-07,1e-06,7.614373e-07
1,1.031035,1.923122,-0.839787,0.116356,1.598051,-0.435288,0.017411,0.254199,1.619205,0.007144,...,-9.921459e-07,-1.1e-05,7.706636e-07,-8.623413e-07,2e-06,-1e-06,-1e-06,-4.631126e-07,1e-06,7.614373e-07
2,0.185112,-0.398179,0.974169,-1.957079,-0.094478,-1.400309,0.07925,0.537291,1.208401,0.374513,...,-9.921459e-07,-1.1e-05,7.706636e-07,-8.623413e-07,2e-06,-1e-06,-1e-06,-4.631126e-07,1e-06,7.614373e-07
3,0.577284,0.370584,-1.077228,-0.757118,0.426208,-0.447451,0.004721,0.360895,1.60281,-0.05638,...,-9.921459e-07,-1.1e-05,7.706636e-07,-8.623413e-07,2e-06,-1e-06,-1e-06,-4.631126e-07,1e-06,7.614373e-07
4,1.789032,-3.16614,1.747354,0.059937,1.158152,-1.836405,0.096266,0.186461,0.564993,-0.102784,...,-4.559521e-06,-5.1e-05,3.541674e-06,-3.962989e-06,1e-05,-7e-06,-5e-06,-2.128287e-06,5e-06,3.499273e-06


In [94]:
pca_df['approve_bin'] = list(df.approve_bin)

In [95]:
lr = LogisticRegression(random_state=42)

scores = cross_val_score(lr,
                         pca_df.iloc[:, :15],
                         pca_df.iloc[:, -1],
                         cv=KFold(n_splits=5,
                                  shuffle=True,
                                  random_state=42)   
                        )
round(scores.mean(), 2), '+-', round(2 * scores.std(), 2)

(0.78, '+-', 0.0)

In [96]:
knn = KNeighborsClassifier()
scores = cross_val_score(knn,
                         pca_df.iloc[:, :15],
                         pca_df.iloc[:, -1],
                         cv=KFold(n_splits=5,
                                  shuffle=True,
                                  random_state=73))
round(scores.mean(), 2), '+-', round(2 * scores.std(), 2)

(0.74, '+-', 0.0)

## Random Forrest 

In [97]:
pca = PCA()
rf = RandomForestClassifier()

rf_pipe = Pipeline(steps=[
    ('preprocess', full_processor),
    ('pca',pca),
    ('model', rf)
])

In [98]:
param_grid = [{"pca__n_components": [15],
               "model__n_estimators": [100]
               }] 

search = GridSearchCV(rf_pipe, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

search.fit(X_train,y_train)

print("tuned hpyerparameters :(best parameters) ",search.best_params_)
print("accuracy :",search.best_score_)

tuned hpyerparameters :(best parameters)  {'model__n_estimators': 100, 'pca__n_components': 15}
accuracy : 0.7731562860086575


In [99]:
#https://towardsdatascience.com/grid-search-for-model-tuning-3319b259367e
#https://towardsdatascience.com/polynomial-regression-with-a-machine-learning-pipeline-7e27d2dedc87
#https://scikit-learn.org/stable/auto_examples/compose/plot_digits_pipe.html
#https://medium.com/@andymdc31/using-pca-in-a-machine-learning-pipeline-b6fe3492b1b9