In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the dataset
cc_apps = pd.read_csv("cc_approvals.data", header=None)
summary = pd.DataFrame({
    "Column": cc_apps.columns,
    "DataType": cc_apps.dtypes,
    "NonNullCount": cc_apps.notnull().sum()
})
print(summary)

    Column DataType  NonNullCount
0        0   object           690
1        1   object           690
2        2  float64           690
3        3   object           690
4        4   object           690
5        5   object           690
6        6   object           690
7        7  float64           690
8        8   object           690
9        9   object           690
10      10    int64           690
11      11   object           690
12      12    int64           690
13      13   object           690


In [3]:
cc_apps.replace('?', np.nan,inplace=True)
for col in cc_apps.columns:
    if cc_apps[col].dtype == "object":
        most_freq = cc_apps[col].value_counts().index[0]
        cc_apps[col] = cc_apps[col].fillna(most_freq)
    else:
        cc_apps[col] = cc_apps[col].fillna(cc_apps[col].astype(float).mean())
cc = pd.get_dummies(cc_apps, drop_first=True)
print(cc)

          2     7  10   12    0_b  1_15.17  1_15.75  1_15.83  1_15.92  \
0     0.000  1.25   1    0   True    False    False    False    False   
1     4.460  3.04   6  560  False    False    False    False    False   
2     0.500  1.50   0  824  False    False    False    False    False   
3     1.540  3.75   5    3   True    False    False    False    False   
4     5.625  1.71   0    0   True    False    False    False    False   
..      ...   ...  ..  ...    ...      ...      ...      ...      ...   
685  10.085  1.25   0    0   True    False    False    False    False   
686   0.750  2.00   2  394  False    False    False    False    False   
687  13.500  2.00   1    1  False    False    False    False    False   
688   0.205  0.04   0  750   True    False    False    False    False   
689   3.375  8.29   0    0   True    False    False    False    False   

     1_16.00  ...    6_j    6_n    6_o    6_v    6_z    8_t    9_t   11_p  \
0      False  ...  False  False  False   True 

In [4]:
X = cc.iloc[:, :-1].values
y = cc.iloc[:, -1].values
print(X.shape, y.shape)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

(690, 382) (690,)


In [5]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [6]:
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_test)
print(log_reg.score(X_test, y_test))
print(confusion_matrix(y_test, y_pred))

0.782608695652174
[[52 18]
 [12 56]]


In [7]:
params = {
    "C": [0.1, 1, 10],
    "max_iter": [100, 200, 500],
    "tol": [1e-3, 1e-4]
}
grid = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=params,
    cv=5,
    scoring="accuracy"
)
grid.fit(X_train, y_train)

In [8]:
best_model = grid.best_estimator_
best_score = best_model.score(X_test, y_test)
print("Best hyperparameters:", grid.best_params_)
print("Test set accuracy:", best_score)

Best hyperparameters: {'C': 0.1, 'max_iter': 100, 'tol': 0.0001}
Test set accuracy: 0.782608695652174
