In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
import warnings
warnings.filterwarnings('ignore')

In [2]:
cancer = load_breast_cancer()
df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
df['target'] = pd.Series(cancer.target)
df['target'] = 1 - df['target']

<h1>(a) Simple Logistic Regression<h1>

In [3]:
X = df[['worst concave points','worst radius','worst texture']]
y = df['target']

log_reg = LogisticRegression()

k_fold = KFold(n_splits=5, shuffle=True)
scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision',
    'recall': 'recall',
    'f1': 'f1'
}

# Perform CV for each metric
results = {}
for metric in scoring:
    results[metric] = cross_val_score(log_reg, X, y, cv=k_fold, scoring=scoring[metric])

# Report mean of each metric across the 5 folds
mean_accuracy = np.mean(results['accuracy'])
mean_precision = np.mean(results['precision'])
mean_recall = np.mean(results['recall'])
mean_f1 = np.mean(results['f1'])

# Print the results
print(f"Mean Accuracy: {mean_accuracy}")
print(f"Mean Precision: {mean_precision}")
print(f"Mean Recall: {mean_recall}")
print(f"Mean F1-score: {mean_f1}")

Mean Accuracy: 0.9420431609998448
Mean Precision: 0.939346405228758
Mean Recall: 0.9006361754187842
Mean F1-score: 0.9298841334344268


<h1>(b) Full Logistic Regression<h1>

In [4]:
X = df.drop(columns=['target'])
y = df['target']

# Perform CV for each metric
results = {}
for metric in scoring:
    results[metric] = cross_val_score(log_reg, X, y, cv=k_fold, scoring=scoring[metric])

# Report mean of each metric across the 5 folds
mean_accuracy = np.mean(results['accuracy'])
mean_precision = np.mean(results['precision'])
mean_recall = np.mean(results['recall'])
mean_f1 = np.mean(results['f1'])

# Print the results
print(f"Mean Accuracy: {mean_accuracy}")
print(f"Mean Precision: {mean_precision}")
print(f"Mean Recall: {mean_recall}")
print(f"Mean F1-score: {mean_f1}")

Mean Accuracy: 0.9473063188945815
Mean Precision: 0.9424027570222911
Mean Recall: 0.9311556887717172
Mean F1-score: 0.9203166691872081


<h1>(c) Decision Tree<h1>

In [5]:
param_clf = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 3, 5, 10, 20, 30],
    'min_samples_split': [2, 5, 10, 20]
}

clf = DecisionTreeClassifier()

grid_search = GridSearchCV(clf, param_clf, cv=5, scoring=scoring, refit='recall', return_train_score=True)
grid_search.fit(X, y)

results = grid_search.cv_results_

# Print metrics for each hyperparameter set
for i in range(len(results['params'])):
    print(f"Hyperparameters: {results['params'][i]}")
    print(f"Mean F1 socres: {results['mean_test_f1'][i]}")
    print(f"Mean Precision: {results['mean_test_precision'][i]}")
    print(f"Mean Recall: {results['mean_test_recall'][i]}")
    print(f"Mean Accuracy: {results['mean_test_accuracy'][i]}")

best_param = grid_search.best_params_
best_recall = grid_search.best_score_

# Get the best results
print(f"Best parameters: {best_param}")
print(f"Best recall: {best_recall}")


Hyperparameters: {'criterion': 'gini', 'max_depth': 2, 'min_samples_split': 2}
Mean F1 socres: 0.9011572180197321
Mean Precision: 0.923279950024136
Mean Recall: 0.882392026578073
Mean Accuracy: 0.9279614966620089
Hyperparameters: {'criterion': 'gini', 'max_depth': 2, 'min_samples_split': 5}
Mean F1 socres: 0.9022552811583626
Mean Precision: 0.9138355055796916
Mean Recall: 0.8916943521594684
Mean Accuracy: 0.9279614966620089
Hyperparameters: {'criterion': 'gini', 'max_depth': 2, 'min_samples_split': 10}
Mean F1 socres: 0.9011572180197321
Mean Precision: 0.923279950024136
Mean Recall: 0.882392026578073
Mean Accuracy: 0.9279614966620089
Hyperparameters: {'criterion': 'gini', 'max_depth': 2, 'min_samples_split': 20}
Mean F1 socres: 0.9022552811583626
Mean Precision: 0.9138355055796916
Mean Recall: 0.8916943521594684
Mean Accuracy: 0.9279614966620089
Hyperparameters: {'criterion': 'gini', 'max_depth': 3, 'min_samples_split': 2}
Mean F1 socres: 0.8975533618150371
Mean Precision: 0.9162997616

<h1>(d) KNN<h1>

In [6]:
param_knn = {
    'n_neighbors': [1, 3, 5, 8, 10, 15, 20, 50, 80],
    'weights': ['uniform', 'distance']
}

knn = KNeighborsClassifier()

grid_search = GridSearchCV(knn, param_knn, cv=5, scoring=scoring, refit='recall', return_train_score=True)
grid_search.fit(X, y)

results = grid_search.cv_results_

# Print the metrics for each hyperparameter set
for i in range(len(results['params'])):
    if np.isnan(results['mean_test_f1'][i]) != 1 and np.isnan(results['mean_test_precision'][i]) != 1 and np.isnan(results['mean_test_recall'][i]) != 1 and np.isnan(results['mean_test_accuracy'][i]) != 1 :
        print(f"Hyperparameters: {results['params'][i]}")
        print(f"Mean F1 socres: {results['mean_test_f1'][i]}")
        print(f"Mean Precision: {results['mean_test_precision'][i]}")
        print(f"Mean Recall: {results['mean_test_recall'][i]}")
        print(f"Mean Accuracy: {results['mean_test_accuracy'][i]}")


best_param = grid_search.best_params_
best_recall = grid_search.best_score_

# Get the best results
print(f"Best parameters: {best_param}")
print(f"Best recall: {best_recall}")

Hyperparameters: {'n_neighbors': 1, 'weights': 'distance'}
Mean F1 socres: 0.8684785133565622
Mean Precision: 0.895097515097515
Mean Recall: 0.8447397563676633
Mean Accuracy: 0.9051079024996118
Hyperparameters: {'n_neighbors': 3, 'weights': 'distance'}
Mean F1 socres: 0.8940139444738101
Mean Precision: 0.9257031132981218
Mean Recall: 0.8685492801771872
Mean Accuracy: 0.9244216736531593
Hyperparameters: {'n_neighbors': 5, 'weights': 'distance'}
Mean F1 socres: 0.9018140640826505
Mean Precision: 0.9348896631823461
Mean Recall: 0.8730897009966778
Mean Accuracy: 0.9297003570874087
Hyperparameters: {'n_neighbors': 8, 'weights': 'distance'}
Mean F1 socres: 0.8925832948518814
Mean Precision: 0.9338182346109175
Mean Recall: 0.8591362126245847
Mean Accuracy: 0.924437199192672
Hyperparameters: {'n_neighbors': 10, 'weights': 'distance'}
Mean F1 socres: 0.8974025719603151
Mean Precision: 0.9386962833914053
Mean Recall: 0.8638981173864895
Mean Accuracy: 0.9279459711224967
Hyperparameters: {'n_neigh

<h1>(e) Miscellaneous<h1>

We care much more about false negative rate since incorrectly classify "Malignant" to "Benign" is really dangerous to one's life. And recall rate can give us information on how serious is false negative in this algorithm.

<h1>(f) Comparison <h1>

In [7]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Normalization
X_train = StandardScaler().fit_transform(X_train)
X_test = StandardScaler().fit_transform(X_test)

param_svm = {
    'kernel': ['rbf','linear'],
    'C': [0.1, 1, 10]
}

svm = SVC()

grid_search = GridSearchCV(svm, param_svm, cv=5, scoring=scoring, refit='recall', return_train_score=True)
grid_search.fit(X_train, y_train)
y_pred = grid_search.best_estimator_.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Scores:", f1_score(y_test, y_pred))

Accuracy: 0.9473684210526315
Precision: 1.0
Recall: 0.8636363636363636
F1 Scores: 0.9268292682926829


No. I would prefer logistic model because it is much simpler and the estimation accurarcy is quite similar.