In [15]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV

# Load the dataset
cc_apps = pd.read_csv("cc_approvals.data", header=None) 
# Replace missing values (represented as characters, e.g., "?") with np.nan
cc_apps_nans_replaced = cc_apps.replace("?", np.NaN)
# Create a copy of the NaN replacement DataFrame
cc_apps_imputed = cc_apps_nans_replaced.copy()
# Iterate over each column of cc_apps_nans_replaced and impute the most frequent value for object data types and the mean for numeric data types
for col in cc_apps_imputed.columns:
    #checking if the column is of object type
    if cc_apps_imputed[col].dtypes=="object":
        #impute with the most frequent value
        cc_apps_imputed[col] = cc_apps_imputed[col].fillna(
        cc_apps_imputed[col].value_counts().index[0]
        )
    else:
        cc_apps_nans_imputed[col] = cc_apps_nans_imputed[col].fillna(
        cc_apps_nans_imputed[col].mean())
# Dummify the categorical features
cc_apps_encoded = pd.get_dummies(cc_apps_imputed, drop_first=True)
# Extract the last column as your target variable
X = cc_apps_encoded.iloc[:, :-1].values
y = cc_apps_encoded.iloc[:, -1].values
print(f"1D Target Shape: {X.shape}")
print(f"2D Target Shape: {y.shape}")
#splitting the data into train and test sets
X_train , X_test , y_train ,y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42
)
#instanciate a scaler and use it to rescale X_train and X_test
scaler = StandardScaler()
rescaledX_train=scaler.fit_transform(X_train)
rescaledX_test=scaler.fit_transform(X_test)
#instanciate a logistic regression classidfier and fit it to the training test
logreg = LogisticRegression()
logreg.fit(rescaledX_train,y_train)
#predict instances from the training set 
y_train_pred=logreg.predict(rescaledX_train)
#print the confusion matrix 
print(confusion_matrix(y_train,y_train_pred))

#define the grid of values for tol and max_iter
tol=[0.01,0.001,1.0001]
max_iter=[100,150,200]
# Create a dictionary where tol and max_iter are keys and the lists of their values are the corresponding values
param_grid = dict(tol=tol, max_iter=max_iter)
print(param_grid)
# Instantiate GridSearchCV with the required parameters
grid_model = GridSearchCV(
    estimator=logreg,
    param_grid=param_grid,
    cv=5
)
#fit it to the data
grid_model_result = grid_model.fit(rescaledX_train,y_train)
#summarize results
best_train_score,best_train_params=grid_model_result.best_score_,grid_model_result.best_params_
print("Best: %f using %s" % (best_train_score, best_train_params))
#extract the best model and evaluate it on the testing set
best_model = grid_model_result.best_estimator_
best_score = best_model.score(rescaledX_test,y_test)
print("Accuracy of logistic regression classifier: ", best_score)



1D Target Shape: (690, 382)
2D Target Shape: (690,)
Logistic Regression Training Confusion Matrix:
[[203   1]
 [  1 257]]
{'tol': [0.01, 0.001, 1.0001], 'max_iter': [100, 150, 200]}
Best: 0.818163 using {'max_iter': 100, 'tol': 0.01}
Accuracy of logistic regression classifier:  0.793859649122807
