In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression


In [58]:
# Load the raw file: it has no header row (columns are numbered 0..N) and
# '?' entries are parsed as missing values.
data = pd.read_csv(
    'Data/cc_approvals.data',
    header=None,
    na_values='?',
)
print(data.head())

# Discard every row that has at least one missing value.
data = data.dropna(axis=0, how='any')
print("Missing values dropped")

  0      1      2  3  4  5  6     7  8  9   10 11   12 13
0  b  30.83  0.000  u  g  w  v  1.25  t  t   1  g    0  +
1  a  58.67  4.460  u  g  q  h  3.04  t  t   6  g  560  +
2  a  24.50  0.500  u  g  q  h  1.50  t  f   0  g  824  +
3  b  27.83  1.540  u  g  w  v  3.75  t  t   5  g    3  +
4  b  20.17  5.625  u  g  w  v  1.71  t  f   0  s    0  +
Missing values dropped


In [59]:
# Build the feature matrix and target vector, then hold out 20% for testing.
#
# The listed columns are one-hot encoded. dtype=float makes the indicator
# columns 0.0/1.0 so the result is a proper float matrix; without it, recent
# pandas versions emit bool indicators and the mixed bool/number frame turns
# into an object-dtype array that only works because scikit-learn coerces it
# later. Column 13 is the target, so it is removed from the features.
X = (
    pd.get_dummies(data, columns=[0, 3, 4, 5, 6, 8, 9, 11], dtype=float)
    .drop(columns=[13])
    .to_numpy(dtype=float)
)
y = data[13].to_numpy()

# Fixed random_state keeps the train/test partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [60]:
# Chain feature standardisation with a k-nearest-neighbours classifier so that
# scaling is fitted together with the model on whatever data the pipeline sees.
pipeline = Pipeline(
    steps=[
        ("Scaler", StandardScaler()),
        ("KNN", KNeighborsClassifier()),
    ]
)

# Search neighbour counts 1..19 with 5-fold cross-validation on the training split.
parameters = {"KNN__n_neighbors": range(1, 20)}
grid_search = GridSearchCV(estimator=pipeline, param_grid=parameters, cv=5)
grid_search.fit(X_train, y_train)

# Report the selected k and the cross-validation score it achieved.
k = grid_search.best_params_['KNN__n_neighbors']
print("Best k value: ", k)
print("Best score: ", grid_search.best_score_)

# Predict on the held-out split with the tuned search object and score it.
y_pred = grid_search.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Test accuracy:", accuracy)


Best k value:  18
Best score:  0.853854447439353
Test accuracy: 0.8257575757575758


In [62]:
# Second candidate model: the same standardisation step feeding a logistic
# regression with a fixed seed and an iteration cap of 1000.
lr_pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("log_reg", LogisticRegression(max_iter=1000, random_state=42)),
    ]
)
lr_pipeline.fit(X_train, y_train)

# Score the fitted pipeline on the held-out split.
y_pred_lr = lr_pipeline.predict(X_test)
accuracy_lr = accuracy_score(y_true=y_test, y_pred=y_pred_lr)
print("Logistic Regression Test accuracy:", accuracy_lr)

Logistic Regression Test accuracy: 0.8484848484848485


In [64]:
# Keep whichever model scored higher on the held-out split.
# KNN is chosen only when it is strictly better, so a tie goes to
# logistic regression.
# NOTE(review): the selection is made on test accuracy, so the reported
# "best score" is an optimistic estimate -- consider selecting on a
# validation/cross-validation score instead.
if not (accuracy > accuracy_lr):
    print("Logistic Regression performs better")
    best_model, best_score = lr_pipeline, accuracy_lr
else:
    print("KNN performs better")
    best_model, best_score = grid_search, accuracy

print("Best score:", best_score)


Logistic Regression performs better
Best score: 0.8484848484848485
