# Importing necessary libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Loading dataset

In [2]:
# Load dataset
cancer = datasets.load_breast_cancer()
X = cancer.data
y = cancer.target

# Exploring dataset

In [3]:
# print the names of the 13 features
print("Features: ", cancer.feature_names)

# print the label type of cancer('malignant' 'benign')
print("Labels: ", cancer.target_names)


Features:  ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']
Labels:  ['malignant' 'benign']


In [6]:
# print data(feature)shape
print(X.shape)
print(y.shape)


(569, 30)
(569,)


# Splitting Data
To understand model performance, dividing the dataset into a training set and a test set is a good strategy.

Split the dataset by using the function train_test_split(). we need to pass 3 parameters features, target, and test_set size.
Additionally, we can use random_state to select records randomly.



In [36]:
# Import train_test_split function
from sklearn.model_selection import train_test_split

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=109) # 70% training and 30% test


# Scaling

In [37]:
#Standardize features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# K-Fold Cross-Validation

In [43]:

# Define the model without fixing the solver
model = LogisticRegression(max_iter=200)

# Perform K-Fold Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=109)
accuracy_scores = cross_val_score(model, X_train, y_train, cv=kf, scoring='accuracy')

print(f'Cross-validation accuracy scores: {accuracy_scores}')
print(f'Mean cross-validation accuracy: {accuracy_scores.mean()}')


Cross-validation accuracy scores: [0.9625     0.975      0.9875     0.93670886 0.98734177]
Mean cross-validation accuracy: 0.9698101265822785


# Performance of the Model

In [44]:
# Train and evaluate on test data
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Final Test Accuracy: {accuracy}')
print(f'Final Test Precision: {precision}')
print(f'Final Test Recall: {recall}')
print(f'Final Test F1-Score: {f1}')



Final Test Accuracy: 0.9824561403508771
Final Test Precision: 0.972972972972973
Final Test Recall: 1.0
Final Test F1-Score: 0.9863013698630136


# Hyperparameter Tuning (GridSearchCV)

In [45]:
from sklearn.model_selection import GridSearchCV

# Define hyperparameter grid
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],  # Regularization strength
    'solver': ['liblinear', 'lbfgs', 'saga'],  # Let GridSearchCV pick the best solver
    'max_iter': [200, 500, 1000]
}

# Perform Grid Search
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print best hyperparameters
print(f'Best Parameters: {grid_search.best_params_}')
print(f'Best Cross-Validation Accuracy: {grid_search.best_score_}')



Best Parameters: {'C': 0.1, 'max_iter': 200, 'solver': 'liblinear'}
Best Cross-Validation Accuracy: 0.9647784810126583


# Evaluation on Test Dataset Using the Best Model

In [31]:
# Get the best model
best_model = grid_search.best_estimator_

# Evaluate on test dataset
y_pred = best_model.predict(X_test)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Final Test Accuracy: {accuracy}')
print(f'Final Test Precision: {precision}')
print(f'Final Test Recall: {recall}')
print(f'Final Test F1-Score: {f1}')


Final Test Accuracy: 0.9824561403508771
Final Test Precision: 0.972972972972973
Final Test Recall: 1.0
Final Test F1-Score: 0.9863013698630136
