In [8]:
## Adding libraries
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

#
import seaborn as sns
sns.set_style('whitegrid')

# sklearn 
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.metrics import roc_curve, auc, roc_curve, confusion_matrix
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier 
from sklearn import metrics

from sklearn import tree

# avoid warning signs
import warnings
warnings.filterwarnings("ignore")

In [9]:
# Load the mortgage loan decision dataset from a CSV file (path is relative
# to the notebook's working directory).
df = pd.read_csv('df_mortgage_loan_decision.csv')
# Print a structural summary of the frame: row count, column names,
# non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 184956 entries, 0 to 184955
Data columns (total 76 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   loan_amounts                    184956 non-null  float64
 1   applicant_incomes               184956 non-null  float64
 2   population                      184956 non-null  float64
 3   minority_population             184956 non-null  float64
 4   hud_median_family_income        184956 non-null  float64
 5   tract_to_msamd_income           184956 non-null  float64
 6   number_of_owner_occupied_units  184956 non-null  float64
 7   number_of_1_to_4_family_units   184956 non-null  float64
 8   denial                          184956 non-null  int64  
 9   minority_population_            184956 non-null  float64
 10  loan_type_name_1                184956 non-null  int64  
 11  loan_type_name_2                184956 non-null  int64  
 12  loan_type_name_3

## Data preparation

We have determined that our experiment falls into the category of a supervised binary classification task. This post focuses on testing multiple models at once to isolate the most appropriate model to optimize.

In [10]:
## Create training and test sets

# Separate the features (X) from the target (y). `denial` is the dependent
# variable we want to predict; every remaining column is kept as a feature.
y = df['denial']
X = df.drop(columns=['denial'])

# Split the data into a training and a test set with a 75/25 split.
# random_state is fixed so the split is reproducible across runs.
# train_test_split returns (X_train, X_test, y_train, y_test) in that order,
# so each of the four outputs must be bound to its own name.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Sanity checks: the split must partition the data and keep X/y aligned.
assert len(X_train) + len(X_test) == len(X)
assert len(X_train) == len(y_train) and len(X_test) == len(y_test)

## Models

In this experiment we will be examining 7 different models.
1. Logistic Regression: Basic Linear Classifier (baseline)
2. Decision Tree: Ensemble(Light) bagging Classifier
3. Random Forest: Ensemble(Intermediate) bagging Classifier
4. XGBoost: Ensemble(Extreme) Boosting Classifier
5. K-Nearest Neighbors: Instance Based Classifier
6. Support Vector Machines: Maximum Margin Classifier
7. Gaussian Naive Bayes: Probabilistic Classifier

Initially, I will start by using the default parameters for each model.

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

from sklearn import model_selection
from sklearn.utils import class_weight

from sklearn.metrics import auc, roc_curve, confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

In [None]:
def run_models(X_train: pd.DataFrame, y_train: pd.DataFrame, X_test: pd.DataFrame, y_test: pd.DataFrame) -> pd.DataFrame:
    """ Function to test all models and find the best performing model
        :param X_train: training split
        :param y_train: training target
        :param X_test:  test split
        :param y_test:  training target """
    df = []
    
    models = [
              ('LogReg', LogisticRegression()),
              ('DT', DecisionTreeClassifier(class_weight='balanced')),
              ('RF', RandomForestClassifier(class_weight='balanced_subsample')),
              ('XGB', XGBClassifer()),
              ('KNN', KNeighborsClassifier(weights='distance')),
              ('SVC', SVC()),
              ('GBN', GaussianNB())
    ]
    
    results = []
    name = []
    scoring = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted', 'roc_auc']
    target = ['not_denied', 'denied']
    
    