![Credit card being held in hand](credit_card.jpg)

Commercial banks receive _a lot_ of applications for credit cards. Many of them get rejected for many reasons, like high loan balances, low income levels, or too many inquiries on an individual's credit report, for example. Manually analyzing these applications is mundane, error-prone, and time-consuming (and time is money!). Luckily, this task can be automated with the power of machine learning and pretty much every commercial bank does so nowadays. In this workbook, you will build an automatic credit card approval predictor using machine learning techniques, just like real banks do.

### The Data

The data is a small subset of the Credit Card Approval dataset from the UCI Machine Learning Repository showing the credit card applications a bank receives. This dataset has been loaded as a `pandas` DataFrame called `cc_apps`. The last column in the dataset is the target value.

In [20]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
import plotly.graph_objects as go
# Load the dataset
cc_apps = pd.read_csv("cc_approvals.data", header=None) 
cc_apps.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,g,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,g,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,g,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,g,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,s,0,+


In [21]:
#data preprocessing
cc_apps_clean = cc_apps.replace(np.nan)
#we check that we got rid of ?
cc_apps_clean.isna().sum()


0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
dtype: int64

In [22]:
#now will replace nans with values
for col in cc_apps_clean.columns:
    if cc_apps_clean[col].dtypes == 'object':
        #here it wont be a number so will use the most frequent value
        cc_apps_clean[col] = cc_apps_clean[col].fillna(cc_apps_clean[col].value_counts().index[0])
    else:
        #here they will be numbers so we can use mean
            cc_apps_clean[col] = cc_apps_clean[col].fillna(cc_apps_clean[col].mean())


In [23]:
#This will encode categorical variables so that we can use them in our model
#which means 1 if it belongs to the category or 0 if it doesnt
encoded_clean = pd.get_dummies(cc_apps_clean, drop_first=True)
encoded_clean

Unnamed: 0,2,7,10,12,0_a,0_b,1_15.17,1_15.75,1_15.83,1_15.92,1_16.00,1_16.08,1_16.17,1_16.25,1_16.33,1_16.50,1_16.92,1_17.08,1_17.25,1_17.33,1_17.42,1_17.50,1_17.58,1_17.67,1_17.83,1_17.92,1_18.00,1_18.08,1_18.17,1_18.25,1_18.33,1_18.42,1_18.50,1_18.58,1_18.67,1_18.75,1_18.83,1_18.92,1_19.00,1_19.17,...,1_71.58,1_73.42,1_74.83,1_76.75,1_80.25,1_?,3_l,3_u,3_y,4_g,4_gg,4_p,5_aa,5_c,5_cc,5_d,5_e,5_ff,5_i,5_j,5_k,5_m,5_q,5_r,5_w,5_x,6_bb,6_dd,6_ff,6_h,6_j,6_n,6_o,6_v,6_z,8_t,9_t,11_p,11_s,13_-
0,0.000,1.25,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0
1,4.460,3.04,6,560,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0
2,0.500,1.50,0,824,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
3,1.540,3.75,5,3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0
4,5.625,1.71,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,10.085,1.25,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
686,0.750,2.00,2,394,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1
687,13.500,2.00,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1
688,0.205,0.04,0,750,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1


In [24]:
#Now will separate the target variable
#everything except last column
X = encoded_clean.iloc[:,:-1].values
#last column
y = encoded_clean.iloc[:, [-1]].values

In [25]:
#will use sklearn train test split to separate into trainging and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

#will scale the data, standard scaler will give us a 0 mean and standard deviation of 1
scaler = StandardScaler()
rescaledX_train = scaler.fit_transform(X_train)
rescaledX_test = scaler.fit_transform(X_test)

#will use a Logistic Regression Model
logreg = LogisticRegression()

#we fit the data into the model
logreg.fit(rescaledX_train, y_train)

#we predict from the train data
y_train_pred = logreg.predict(rescaledX_train)

cm = confusion_matrix(y_train, y_train_pred)
#we can check results with a confusion matrix
print(cm)

[[203   1]
 [  1 257]]


In [26]:
#here its a yes or no but lets add a plotly confusion matrix so its prettier
import plotly.graph_objects as go
labels = np.unique(y_train)

# Heatmap
fig = go.Figure(
    data=go.Heatmap(
        z=cm,
        x=labels,  # Predicted
        y=labels,  # True
        colorscale="Blues",
        text=cm,   # Show numbers inside
        texttemplate="%{text}",
        hovertemplate="True: %{y}<br>Pred: %{x}<br>Count: %{z}<extra></extra>"
    )
)

fig.update_layout(
    title="Confusion Matrix",
    xaxis_title="Predicted Label",
    yaxis_title="True Label"
)

fig.show()

In [27]:
#now will perform grid search to find the best model
#tolerance means that the algorithm will stop if the improvement between models is smaller than the established tolerance
tol = [0.01, 0.001, 0.0001]

#max iterations will stop the model after the determined runs if the model is unable to converge
max_iter = [100, 150, 200]

param_grid = dict(tol=tol, max_iter=max_iter)

#we perform the grid search using the model, our established params and cv, cv is into how many parts our data will be split
grid_model = GridSearchCV(estimator=logreg, param_grid=param_grid, cv=5)

#will fit the best resulting model
grid_model_result = grid_model.fit(rescaledX_train, y_train)

#will want to obtain what the highest score and parameters were for future reference
best_train_score, best_train_params = grid_model_result.best_score_, grid_model_result.best_params_

print(f"Best score: {best_train_score} using params: {best_train_params}")
#in this case the lowest iterations combined with the 2nd established tolerance yielded best results

Best score: 0.8225105189340814 using params: {'max_iter': 100, 'tol': 0.001}


In [28]:
#now will extract the best model and evaluate test data
best_model = grid_model_result.best_estimator_
best_score = best_model.score(rescaledX_test, y_test)

print(f"Accuracy of logistic regresion model: {best_score}")

Accuracy of logistic regresion model: 0.8377192982456141
