# Predicting Credit Card Approvals

### Automation for the process of approving credit card applications using a logistic regression model.
### We will be utilising the Credit Card Approval dataset from the UCI Machine Learning Repository, available here http://archive.ics.uci.edu/ml/datasets/credit+approval . Unfortunately feature names are hidden but we can infer from this blog (http://rstudio-pubs-static.s3.amazonaws.com/73039_9946de135c0a49daa7a0a9eda4a67a72.html) their meanings.

### *Importing necessary modules*

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV

### *Inspection of the data*

In [2]:
#Load in the dataset (header=None, because no feature names present)
credit_data = pd.read_csv('credit_data.data', header=None)
display(credit_data.head())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


In [3]:
#Learn more about the dataset as currently it is not easily comprehendable

#Summary statistics
credit_data_description = credit_data.describe()
print(credit_data_description)
print('\n')

#DF info
credit_data_info = credit_data.info()
print(credit_data_info)

               2           7          10             14
count  690.000000  690.000000  690.00000     690.000000
mean     4.758725    2.223406    2.40000    1017.385507
std      4.978163    3.346513    4.86294    5210.102598
min      0.000000    0.000000    0.00000       0.000000
25%      1.000000    0.165000    0.00000       0.000000
50%      2.750000    1.000000    0.00000       5.000000
75%      7.207500    2.625000    3.00000     395.500000
max     28.000000   28.500000   67.00000  100000.000000


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       690 non-null    object 
 1   1       690 non-null    object 
 2   2       690 non-null    float64
 3   3       690 non-null    object 
 4   4       690 non-null    object 
 5   5       690 non-null    object 
 6   6       690 non-null    object 
 7   7       690 non-null    float64
 8   8       690 no

### *Finding missing values and handling them*

In [4]:
#No NaN values present, but what about '?' as a substitute
print(credit_data.isin(['?']).any())

0      True
1      True
2     False
3      True
4      True
5      True
6      True
7     False
8     False
9     False
10    False
11    False
12    False
13     True
14    False
15    False
dtype: bool


In [5]:
#Many columns with '?'
credit_data = credit_data.replace('?', np.nan)

#Check
print(credit_data.isin(['?']).any())

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
dtype: bool


In [6]:
#Going to use mean imputation to replace the NaN values
credit_data.fillna(credit_data.mean(), inplace=True)

#Check
print(credit_data.isnull().sum())

0     12
1     12
2      0
3      6
4      6
5      9
6      9
7      0
8      0
9      0
10     0
11     0
12     0
13    13
14     0
15     0
dtype: int64


In [7]:
#Still NaN values present in the non numeric features

#Iterate over each column of credit_data
for col in credit_data:
    # Check if the column is of object type
    if credit_data[col].dtypes == 'object':
        # Impute with the most frequent value
        credit_data = credit_data.fillna(credit_data[col].value_counts().index[0])

#Count the number of NaNs in the dataset and print the counts to verify
print(credit_data.isnull().sum())

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
dtype: int64


### *Preprocessing the data*

In [8]:
#Conversion of all data to numeric

le = LabelEncoder()

# Iterate over every value of each column and extract the dtypes
for col in credit_data.columns.values:
    # Check if the dtype is object
    if credit_data[col].dtypes=='object':
    # Use LabelEncoder to do the numeric transformation
        credit_data[col]=le.fit_transform(credit_data[col])
        
print(credit_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       690 non-null    int64  
 1   1       690 non-null    int64  
 2   2       690 non-null    float64
 3   3       690 non-null    int64  
 4   4       690 non-null    int64  
 5   5       690 non-null    int64  
 6   6       690 non-null    int64  
 7   7       690 non-null    float64
 8   8       690 non-null    int64  
 9   9       690 non-null    int64  
 10  10      690 non-null    int64  
 11  11      690 non-null    int64  
 12  12      690 non-null    int64  
 13  13      690 non-null    int64  
 14  14      690 non-null    int64  
 15  15      690 non-null    int64  
dtypes: float64(2), int64(14)
memory usage: 86.4 KB
None


In [9]:
#By infering from the article in the description we will drop features 11 and 13
#as they would not impact credit card approval chances
credit_data = credit_data.drop([11, 13], axis=1)

#Conversion to a numpy array in order for ML algorithm to work
credit_data = credit_data.values

In [10]:
#Seperation of features and labels
X, y = credit_data[:, 0:13], credit_data[:, 13]

# Seperation into test and training sets
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=40)

### *Scaling*

In [11]:
#Instantiate a MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX_train = scaler.fit_transform(X_train)
rescaledX_test = scaler.fit_transform(X_test)

### *Fitting the training set to a logistic regression*

In [12]:
# Instantiate a LogisticRegression classifier with default parameter values
logreg = LogisticRegression(max_iter=150)

# Fit logreg to the train set
logreg.fit(rescaledX_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=150,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

### *Evaluating performance of the model*

In [13]:
#predicted values
y_pred = logreg.predict(rescaledX_test)

#accuracy score
print('The accuracy of this model on the test data: ' + str(logreg.score(rescaledX_test, y_test)))
print('\n')

#confusion matrix
print(confusion_matrix(y_test, y_pred))

The accuracy of this model on the test data: 0.821256038647343


[[89 10]
 [27 81]]


### *Tuning of the model for Optimisation*

In [14]:
#Tuning of the model with GridSearchCV

#Paramaters to tune
tol = [0.01, 0.001, 0.0001]
max_iter = [100, 200, 300]

#dictionary for the paramaters
param_grid = dict(tol=tol, max_iter=max_iter)

#Instantiate GridSearchCV
grid_model = GridSearchCV(estimator=logreg, param_grid=param_grid, cv=5)

#scale X
scaled_X = scaler.fit_transform(X)

#fit grid model to the data
grid_model_result = grid_model.fit(scaled_X, y)

#Compute results
best_score, best_params = grid_model_result.best_score_, grid_model_result.best_params_

print('The best accuracy score was %f , using paramaters: %s' % (best_score, best_params) )

The best accuracy score was 0.850725 , using paramaters: {'max_iter': 100, 'tol': 0.01}


### This model achieved an accuracy score of 85% when utilising logistic regression with paramaters holding 'max_iter' constant at 100, and increasing tolerance to 0.01