In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV

# Load the dataset
cc_apps = pd.read_csv("cc_approvals.data", header=None) 
cc_apps.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,g,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,g,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,g,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,g,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,s,0,+


Preprocessing Data:

- Replacing missing values with NaN
- Imputing missing values in dataset

In [4]:

cc_apps = cc_apps.replace("?",np.NaN)
cc_apps_copy = cc_apps.copy()

                          
for column in cc_apps_copy.columns:
    if cc_apps_copy[column].dtypes ==  "object":
          cc_apps_copy[column] = cc_apps_copy[column].fillna(cc_apps_copy[column].value_counts().index[0])  
    else:
        cc_apps_copy[column] = cc_apps_copy[column].fillna(cc_apps_copy[column].mean())
cc_apps_encod =pd.get_dummies(cc_apps_copy, drop_first = "True")   




Preparing data for modelling:

- Defining feature and target variables
- Splitting data
- Scaling data

In [5]:
X = cc_apps_encod.iloc[:, :-1].values
print(X[:5])
y = cc_apps_encod.iloc[:, [-1]].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 21)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

[[0.0 1.25 1 ... True False False]
 [4.46 3.04 6 ... True False False]
 [0.5 1.5 0 ... False False False]
 [1.54 3.75 5 ... True False False]
 [5.625 1.71 0 ... False False True]]


- Defining Model
- Training and testing model without Hyperparameter Tuning

In [41]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
prediction = logreg.predict(X_test)


print(accuracy_score(y_test, prediction))
print(confusion_matrix(y_test, prediction))

0.7971014492753623
[[46 16]
 [12 64]]


Using GridSearchCV to improve the parameters and get a better model accuracy

In [42]:
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100]

}
grid_cv = GridSearchCV(logreg, param_grid, cv = 5)
grid_cv.fit(X_train, y_train)
predictions_2 = grid_cv.predict(X_test)

print(confusion_matrix(y_test, predictions_2))

best_score = grid_cv.best_score_
best_parameter = grid_cv.best_params_

print(f"Best Score: {best_score} and Parameters used: {best_parameter}")

[[45 17]
 [10 66]]
Best Score: 0.8478296478296479 and Parameters used: {'C': 0.01, 'penalty': 'l2'}


Accuracy increased from 79.7% to 84.7% after Hyper parameter tuning was done on the logistic regression model