In [None]:
#Aryan Roy
# CSI Week 6 Assignment

In [2]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Load scikit-learn's bundled breast-cancer dataset and wrap it in pandas
# objects so the feature names travel with the feature matrix.
data = load_breast_cancer()
X = pd.DataFrame(data=data.data, columns=data.feature_names)
y = pd.Series(data=data.target)

# Hold out 20% of the rows for testing; stratifying on the label keeps the
# class proportions the same in both splits, and the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

def evaluate(model, X_test, y_test):
    """Score a fitted classifier on held-out data.

    Args:
        model: Any fitted object exposing ``predict(X)``.
        X_test: Features to predict on.
        y_test: True labels corresponding to ``X_test``.

    Returns:
        dict mapping "Accuracy", "Precision", "Recall" and "F1 Score"
        (in that order) to the value of the matching metric function,
        each called as ``metric(y_test, predictions)``.
    """
    predictions = model.predict(X_test)
    # (label, metric function) pairs; order fixes the order of the result keys.
    scorers = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
    )
    return {label: scorer(y_test, predictions) for label, scorer in scorers}

# Baseline ("before tuning") classifiers, left at their constructor defaults
# apart from a raised iteration cap for logistic regression.
# The random forest is seeded with the same value (42) used for the train/test
# split so the baseline row is reproducible on every Restart & Run All; an
# unseeded forest gives slightly different metrics on each run.
base_models = {
    "Logistic Regression": LogisticRegression(max_iter=10000),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier()
}

# Fit every baseline inside a scaler -> classifier pipeline on the training
# split, then record its test-set metrics keyed by model name.
base_results = {}
for name, model in base_models.items():
    pipe = Pipeline([("scaler", StandardScaler()), ("clf", model)])
    pipe.fit(X_train, y_train)
    base_results[name] = evaluate(pipe, X_test, y_test)

# Logistic regression: exhaustive search over the regularisation strength C
# (seven log-spaced values from 1e-3 to 1e3) with an L2 penalty and the
# lbfgs solver held fixed.
pipe_lr = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(max_iter=10000)),
    ]
)
param_grid_lr = {
    'clf__C': np.logspace(start=-3, stop=3, num=7),
    'clf__penalty': ['l2'],
    'clf__solver': ['lbfgs'],
}
# 5-fold cross-validation, candidates ranked by F1, n_jobs=-1 for parallelism.
grid_lr = GridSearchCV(
    estimator=pipe_lr,
    param_grid=param_grid_lr,
    cv=5,
    scoring='f1',
    n_jobs=-1,
)
grid_lr.fit(X_train, y_train)

# Random forest: exhaustive search over tree count, maximum depth and the
# minimum samples required to split a node. The forest is seeded so the
# search is repeatable.
pipe_rf = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("clf", RandomForestClassifier(random_state=42)),
    ]
)
param_grid_rf = {
    'clf__n_estimators': [50, 100, 150],
    'clf__max_depth': [None, 10, 20],
    'clf__min_samples_split': [2, 5],
}
# 5-fold cross-validation, candidates ranked by F1, n_jobs=-1 for parallelism.
grid_rf = GridSearchCV(
    estimator=pipe_rf,
    param_grid=param_grid_rf,
    cv=5,
    scoring='f1',
    n_jobs=-1,
)
grid_rf.fit(X_train, y_train)

# k-nearest neighbours: exhaustive search over neighbourhood size, vote
# weighting scheme and the Minkowski power parameter p.
pipe_knn = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("clf", KNeighborsClassifier()),
    ]
)
param_grid_knn = {
    'clf__n_neighbors': [3, 5, 7, 9],
    'clf__weights': ['uniform', 'distance'],
    'clf__p': [1, 2],
}
# 5-fold cross-validation, candidates ranked by F1, n_jobs=-1 for parallelism.
grid_knn = GridSearchCV(
    estimator=pipe_knn,
    param_grid=param_grid_knn,
    cv=5,
    scoring='f1',
    n_jobs=-1,
)
grid_knn.fit(X_train, y_train)

# SVM, stage 1: coarse randomized search. C is drawn from 20 log-spaced
# values between 1e-3 and 1e2, combined with two gamma settings and two
# kernels; only 15 of the possible combinations are sampled.
pipe_svm = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("clf", SVC()),
    ]
)
param_dist_svm = {
    'clf__C': np.logspace(start=-3, stop=2, num=20),
    'clf__gamma': ['scale', 'auto'],
    'clf__kernel': ['linear', 'rbf'],
}
# The fixed random_state makes the sampled candidates repeatable.
random_svm = RandomizedSearchCV(
    estimator=pipe_svm,
    param_distributions=param_dist_svm,
    cv=5,
    scoring='f1',
    n_iter=15,
    random_state=42,
    n_jobs=-1,
)
random_svm.fit(X_train, y_train)

# SVM, stage 2: refine around the randomized-search winner. gamma and kernel
# are pinned to the winning values, and C is tried at half, exactly, and
# double the winning value.
best_random_params = random_svm.best_params_
refined_grid_svm = {
    'clf__C': [best_random_params['clf__C'] * factor for factor in (0.5, 1, 2)],
    'clf__gamma': [best_random_params['clf__gamma']],
    'clf__kernel': [best_random_params['clf__kernel']],
}
# 5-fold cross-validation, candidates ranked by F1, n_jobs=-1 for parallelism.
grid_svm = GridSearchCV(
    estimator=pipe_svm,
    param_grid=refined_grid_svm,
    cv=5,
    scoring='f1',
    n_jobs=-1,
)
grid_svm.fit(X_train, y_train)

# Collect the best pipeline found by each search, keyed by the same display
# names (and in the same order) as the baseline models.
tuned_models = dict(
    [
        ("Logistic Regression", grid_lr.best_estimator_),
        ("Random Forest", grid_rf.best_estimator_),
        ("SVM", grid_svm.best_estimator_),
        ("KNN", grid_knn.best_estimator_),
    ]
)

# Score every tuned pipeline on the held-out test split.
tuned_results = {
    name: evaluate(model, X_test, y_test)
    for name, model in tuned_models.items()
}

# Report one row per model and one column per metric, rounded to 4 decimals.
# Each heading and its table are emitted on consecutive lines.
print(
    "Before Hyperparameter Tuning:",
    pd.DataFrame(base_results).transpose().round(4),
    sep="\n",
)

print(
    "\nAfter Hyperparameter Tuning:",
    pd.DataFrame(tuned_results).transpose().round(4),
    sep="\n",
)


Before Hyperparameter Tuning:
                     Accuracy  Precision  Recall  F1 Score
Logistic Regression    0.9825     0.9861  0.9861    0.9861
Random Forest          0.9561     0.9589  0.9722    0.9655
SVM                    0.9825     0.9861  0.9861    0.9861
KNN                    0.9561     0.9589  0.9722    0.9655

After Hyperparameter Tuning:
                     Accuracy  Precision  Recall  F1 Score
Logistic Regression    0.9737     0.9726  0.9861    0.9793
Random Forest          0.9561     0.9589  0.9722    0.9655
SVM                    0.9825     0.9861  0.9861    0.9861
KNN                    0.9737     0.9600  1.0000    0.9796
