# Loan Denial Prediction to look for discrimination

In [62]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [63]:
# Load the loan-application records from the CSV next to the notebook and preview the first rows.
csv_path = 'loan_denials_all.csv'
loan_denials = pd.read_csv(csv_path)
loan_denials.head()

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [None]:
# Print a concise summary of the loaded frame (columns, non-null counts, dtypes).
loan_denials.info()


In [None]:
# Drop the State and Application_Date columns.
# The frame is rebound instead of mutated in place, and errors='ignore' is used,
# so re-running this cell is a no-op; an in-place drop raises KeyError on the
# second run because the columns are already gone.
loan_denials = loan_denials.drop(columns=['State','Application_Date'], errors='ignore')

In [None]:
# Acceptance rate per Loan_Title: accepted applications divided by all
# applications (non-null 'Accepted' entries) in each group.
title_groups = loan_denials.groupby('Loan_Title')
total_title_count = title_groups.count()
accepted_title_count = title_groups[['Accepted']].sum()
frac_accepted_title_count = accepted_title_count['Accepted'] / total_title_count['Accepted']

# Bar chart of the acceptance rate, one bar per loan title.
fig, ax = plt.subplots(figsize=(10,5))
ax.bar(frac_accepted_title_count.index, frac_accepted_title_count)
ax.set_title('Fraction of Accepted Loans by Loan Title')
ax.set_xlabel('Loan Title')
ax.set_ylabel('Fraction of Accepted Loans')
# rotate the x tick labels so long titles do not overlap
ax.tick_params(axis='x', labelrotation=90)
plt.show()


In [None]:
# Acceptance rate per State_Category: accepted applications divided by all
# applications (non-null 'Accepted' entries) in each group.
cat_groups = loan_denials.groupby('State_Category')
total_cat_count = cat_groups.count()
accepted_cat_count = cat_groups[['Accepted']].sum()
frac_accepted_cat_count = accepted_cat_count['Accepted'] / total_cat_count['Accepted']

# Bar chart of the acceptance rate, one bar per state category.
fig, ax = plt.subplots(figsize=(10,5))
ax.bar(frac_accepted_cat_count.index, frac_accepted_cat_count)
ax.set_title('Fraction of Accepted Loans by State Category')
ax.set_xlabel('State Category')
ax.set_ylabel('Fraction of Accepted Loans')
# rotate the x tick labels so category names do not overlap
ax.tick_params(axis='x', labelrotation=90)
plt.show()

## Feature Engineering

In [None]:
# Map the textual employment-length buckets to whole years.
# The nested layout (column name -> value mapping) is kept so the dict can still
# be handed to DataFrame.replace as a whole.
emp_length_dict = {
    "Employment_Length": {
        "10+ years": 10,
        "9 years": 9,
        "8 years": 8,
        "7 years": 7,
        "6 years": 6,
        "5 years": 5,
        "4 years": 4,
        "3 years": 3,
        "2 years": 2,
        "1 year": 1,
        "< 1 year": 0,
        "n/a": 0
    }
}
# pandas.read_csv turns the literal "n/a" into NaN by default, so for a frame
# loaded that way the "n/a" entry never matches and astype(int) fails on the
# NaNs. Missing values are therefore filled with 0, the value the mapping
# assigns to "n/a". Unrecognised strings are left untouched by replace and
# still make astype(int) raise, so unexpected labels are not silently hidden.
loan_denials['Employment_Length'] = (
    loan_denials['Employment_Length']
    .replace(emp_length_dict['Employment_Length'])
    .fillna(0)
    .astype(int)
)




In [None]:
# Store State_Category as a pandas categorical, then re-check the column dtypes.
loan_denials['State_Category'] = pd.Categorical(loan_denials['State_Category'])
loan_denials.info()

In [None]:
# One-hot encode the categorical columns; the encoded columns replace the originals.
# drop_first=True leaves out the first level of each encoded column.
loan_denials_dummies = pd.get_dummies(data=loan_denials, drop_first=True)
loan_denials_dummies.head()

In [None]:
# Heatmap of the absolute pairwise correlations between all columns.
abs_corr = loan_denials_dummies.corr().abs()
sns.heatmap(abs_corr)


In [None]:
# Based on the heatmap, drop Policy_Code and Year, then list every remaining
# column's correlation with the target.
# The frame is rebound with errors='ignore' so the cell can be re-run: an
# in-place drop raises KeyError the second time because the columns are gone.
loan_denials_dummies = loan_denials_dummies.drop(columns=['Policy_Code','Year'], errors='ignore')
loan_denials_dummies.corr()['Accepted']

In [None]:
# Heatmap of the absolute pairwise correlations for the current set of columns.
abs_corr_reduced = loan_denials_dummies.corr().abs()
sns.heatmap(abs_corr_reduced)

## Train and Test a Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Split the data into train and test sets (80/20, fixed seed).
# The target column is removed from the feature matrix: leaving 'Accepted'
# among the features lets the model read the label directly (target leakage)
# and makes every downstream score meaningless.
X_train, X_test, y_train, y_test = train_test_split(
    loan_denials_dummies.drop(columns=['Accepted']),
    loan_denials_dummies['Accepted'],
    test_size=0.2,
    random_state=42,
)


In [None]:
# Standardize the features: the scaler is fitted on the training split only,
# and the same fitted transform is then applied to both splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)



In [None]:
# Show (rows, columns) of the scaled training matrix.
print(X_train_scaled.shape)

In [None]:
#train and test the logistic regression model
def train_test_logistic_regression(X_train,y_train,X_test,y_test):
    """Fit a logistic regression and print its ROC AUC on the train and test sets.

    Parameters
    ----------
    X_train, y_train
        Features and binary labels used to fit the model.
    X_test, y_test
        Held-out features and binary labels used only for scoring.

    Returns
    -------
    The fitted LogisticRegression instance (default hyper-parameters).
    """
    logreg = LogisticRegression()
    logreg.fit(X_train, y_train)

    # ROC AUC measures how well the model *ranks* examples, so it needs a
    # continuous score. Hard 0/1 labels from predict() reduce the curve to a
    # single threshold and misreport the AUC; use the probability of the
    # positive class (second column of predict_proba) instead.
    train_scores = logreg.predict_proba(X_train)[:, 1]
    test_scores = logreg.predict_proba(X_test)[:, 1]

    print('AUC of logistic regression classifier on train set: {:.2f}'.format(roc_auc_score(y_train, train_scores)))
    print('AUC of logistic regression classifier on test set: {:.2f}'.format(roc_auc_score(y_test, test_scores)))
    return logreg

In [None]:
# Fit on the scaled training split and report AUC on both splits; the returned model is the cell's output.
train_test_logistic_regression(X_train_scaled,y_train,X_test_scaled,y_test)