# Import Lib

In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings("ignore")

# Import Data

In [8]:
cc_apps=pd.read_csv(r'C:\Users\kkdhi\Downloads\cc_approvals.csv')
cc_apps.head()

Unnamed: 0,b,30.83,0,u,g,w,v,1.25,t,t.1,01,f,g.1,00202,0.1,+
0,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
1,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
2,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
3,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+
4,b,32.08,4.0,u,g,m,v,2.5,t,f,0,t,g,360,0,+


# Inspecting the applications

In [9]:
# Print summary statistics
cc_apps_description = cc_apps.describe()
print(cc_apps_description)

print("\n")

# Print DataFrame information
cc_apps_info = cc_apps.info()
print(cc_apps_info)

print("\n")

# Inspect missing values in the dataset
print(cc_apps.tail())

                0        1.25          01            0.1
count  689.000000  689.000000  689.000000     689.000000
mean     4.765631    2.224819    2.402032    1018.862119
std      4.978470    3.348739    4.866180    5213.743149
min      0.000000    0.000000    0.000000       0.000000
25%      1.000000    0.165000    0.000000       0.000000
50%      2.750000    1.000000    0.000000       5.000000
75%      7.250000    2.625000    3.000000     396.000000
max     28.000000   28.500000   67.000000  100000.000000


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 689 entries, 0 to 688
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   b       689 non-null    object 
 1   30.83   689 non-null    object 
 2   0       689 non-null    float64
 3   u       689 non-null    object 
 4   g       689 non-null    object 
 5   w       689 non-null    object 
 6   v       689 non-null    object 
 7   1.25    689 non-null    float64
 8   t    

# Handling the missing values (part i)

In [10]:
# Inspect missing values in the dataset
print(cc_apps.tail(17))

# Replace the '?'s with NaN
cc_apps = cc_apps.replace("?", np.NaN )

# Inspect the missing values again
print(cc_apps.tail(17))

     b  30.83       0  u  g   w   v   1.25  t t.1  01  f g.1  00202  0.1  +
672  ?  29.50   2.000  y  p   e   h  2.000  f   f   0  f   g  00256   17  -
673  a  37.33   2.500  u  g   i   h  0.210  f   f   0  f   g  00260  246  -
674  a  41.58   1.040  u  g  aa   v  0.665  f   f   0  f   g  00240  237  -
675  a  30.58  10.665  u  g   q   h  0.085  f   t  12  t   g  00129    3  -
676  b  19.42   7.250  u  g   m   v  0.040  f   t   1  f   g  00100    1  -
677  a  17.92  10.210  u  g  ff  ff  0.000  f   f   0  f   g  00000   50  -
678  a  20.08   1.250  u  g   c   v  0.000  f   f   0  f   g  00000    0  -
679  b  19.50   0.290  u  g   k   v  0.290  f   f   0  f   g  00280  364  -
680  b  27.83   1.000  y  p   d   h  3.000  f   f   0  f   g  00176  537  -
681  b  17.08   3.290  u  g   i   v  0.335  f   f   0  t   g  00140    2  -
682  b  36.42   0.750  y  p   d   v  0.585  f   f   0  f   g  00240    3  -
683  b  40.58   3.290  u  g   m   v  3.500  f   f   0  t   s  00400    0  -
684  b  21.0

# Handling the missing values (part ii)

In [11]:
# Impute the missing values with mean imputation
cc_apps.fillna(cc_apps.mean(), inplace=True)

# Count the number of NaNs in the dataset to verify
print(cc_apps.isnull())

         b  30.83      0      u      g      w      v   1.25      t    t.1  \
0    False  False  False  False  False  False  False  False  False  False   
1    False  False  False  False  False  False  False  False  False  False   
2    False  False  False  False  False  False  False  False  False  False   
3    False  False  False  False  False  False  False  False  False  False   
4    False  False  False  False  False  False  False  False  False  False   
..     ...    ...    ...    ...    ...    ...    ...    ...    ...    ...   
684  False  False  False  False  False  False  False  False  False  False   
685  False  False  False  False  False  False  False  False  False  False   
686  False  False  False  False  False  False  False  False  False  False   
687  False  False  False  False  False  False  False  False  False  False   
688  False  False  False  False  False  False  False  False  False  False   

        01      f    g.1  00202    0.1      +  
0    False  False  False  F

# Handling the missing values (part iii)

In [12]:
# Iterate over each column of cc_apps
for col in cc_apps:
    # Check if the column is of object type
    if cc_apps[col].dtype == 'object':
        # Impute with the most frequent value
        cc_apps = cc_apps.fillna(cc_apps[col].value_counts().index[0])

# Count the number of NaNs in the dataset and print the counts to verify
print (cc_apps.isnull().values.sum())

0


# Preprocessing the data (part i)

In [13]:
le= LabelEncoder()

# Iterate over all the values of each column and extract their dtypes
for col in cc_apps:
    # Compare if the dtype is object
    if cc_apps[col].dtype=='object':
    # Use LabelEncoder to do the numeric transformation
        cc_apps[col]=le.fit_transform(cc_apps[col])

# Splitting the dataset into train and test sets

In [19]:
# Drop the features 11 and 13 and convert the DataFrame to a NumPy array
cc_apps = cc_apps.drop(['f' , '00202'], axis=1)
cc_apps = cc_apps.values

# Segregate features and labels into separate variables
X,y = cc_apps[:,0:13] , cc_apps[:,13]

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X,
                                y,
                                test_size=0.33,
                                random_state=42)

# Preprocessing the data (part ii)

In [20]:
# Instantiate MinMaxScaler and use it to rescale X_train and X_test
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX_train = scaler.fit_transform(X_train)
rescaledX_test = scaler.fit_transform(X_test)

# Fitting a logistic regression model to the train set

In [21]:
# Instantiate a LogisticRegression classifier with default parameter values
logreg = LogisticRegression()

# Fit logreg to the train set
logreg.fit(X_train, y_train)

# Making predictions and evaluating performance

In [22]:
# Use logreg to predict instances from the test set and store it
y_pred = logreg.predict(rescaledX_test)

# Get the accuracy score of logreg model and print it
print("Accuracy of logistic regression classifier: ", logreg.score(rescaledX_test, y_test))

# Print the confusion matrix of the logreg model
print(confusion_matrix(y_test, y_pred))

Accuracy of logistic regression classifier:  0.8157894736842105
[[ 80  20]
 [ 22 106]]


# Grid searching and making the model perform better

In [23]:
# Define the grid of values for tol and max_iter
tol = [0.01, 0.001, 0.0001]
max_iter = [100, 150, 200]

# Create a dictionary where tol and max_iter are keys and the lists of their values are corresponding values
param_grid = dict(tol=tol, max_iter=max_iter)

# Finding the best performing model

In [24]:
# Instantiate GridSearchCV with the required parameters
grid_model = GridSearchCV(estimator=logreg, param_grid=param_grid, cv=5)

# Use scaler to rescale X and assign it to rescaledX
rescaledX = scaler.fit_transform(X)

# Fit data to grid_model
grid_model_result = grid_model.fit(rescaledX, y)

# Summarize results
best_score, best_params = grid_model_result.best_score_, grid_model_result.best_params_
print("Best: %f using %s" % (best_score, best_params))

Best: 0.850640 using {'max_iter': 100, 'tol': 0.01}
