In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import warnings
warnings.filterwarnings('ignore')

In [5]:
data = load_breast_cancer()
X = data.data
y = data.target

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [8]:
results = {}

In [12]:
def evaluate_model(model, model_name, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    results[model_name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1
    }
    print(f"\n{model_name} Performance:")
    print(classification_report(y_test, y_pred, target_names=data.target_names))


In [13]:
param_grid_lr = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'lbfgs']
}


In [14]:
lr = LogisticRegression(random_state=42)
grid_lr = GridSearchCV(lr, param_grid_lr, cv=5, scoring='f1', n_jobs=-1)
grid_lr.fit(X_train_scaled, y_train)
print("Best Logistic Regression Parameters:", grid_lr.best_params_)
evaluate_model(grid_lr.best_estimator_, "Logistic Regression", X_test_scaled, y_test)

Best Logistic Regression Parameters: {'C': 0.1, 'solver': 'liblinear'}

Logistic Regression Performance:
              precision    recall  f1-score   support

   malignant       1.00      0.98      0.99        43
      benign       0.99      1.00      0.99        71

    accuracy                           0.99       114
   macro avg       0.99      0.99      0.99       114
weighted avg       0.99      0.99      0.99       114



In [15]:
param_dist_svm = {
    'C': np.logspace(-3, 3, 100),
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto', 0.1, 0.01]
}

In [16]:
svm = SVC(random_state=42)
random_svm = RandomizedSearchCV(svm, param_dist_svm, n_iter=20, cv=5, scoring='f1', n_jobs=-1, random_state=42)
random_svm.fit(X_train_scaled, y_train)
print("Best SVM Parameters:", random_svm.best_params_)
evaluate_model(random_svm.best_estimator_, "SVM", X_test_scaled, y_test)

Best SVM Parameters: {'kernel': 'linear', 'gamma': 0.01, 'C': np.float64(0.13219411484660287)}

SVM Performance:
              precision    recall  f1-score   support

   malignant       0.98      0.95      0.96        43
      benign       0.97      0.99      0.98        71

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114



In [17]:
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}
rf = RandomForestClassifier(random_state=42)
grid_rf = GridSearchCV(rf, param_grid_rf, cv=5, scoring='f1', n_jobs=-1)
grid_rf.fit(X_train_scaled, y_train)
print("Best Random Forest Parameters:", grid_rf.best_params_)
evaluate_model(grid_rf.best_estimator_, "Random Forest", X_test_scaled, y_test)

Best Random Forest Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}

Random Forest Performance:
              precision    recall  f1-score   support

   malignant       0.98      0.93      0.95        43
      benign       0.96      0.99      0.97        71

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114



In [18]:

# Analyzing results to select the best model
results_df = pd.DataFrame(results).T
print("\nModel Comparison:")
print(results_df)


Model Comparison:
                     Accuracy  Precision    Recall  F1-Score
Logistic Regression  0.991228   0.986111  1.000000  0.993007
SVM                  0.973684   0.972222  0.985915  0.979021
Random Forest        0.964912   0.958904  0.985915  0.972222


In [19]:
# Selecting
#  the best model based on F1-Score
best_model = results_df['F1-Score'].idxmax()
print(f"\nBest Performing Model: {best_model}")
print(f"Best Model Metrics:\n{results_df.loc[best_model]}")


Best Performing Model: Logistic Regression
Best Model Metrics:
Accuracy     0.991228
Precision    0.986111
Recall       1.000000
F1-Score     0.993007
Name: Logistic Regression, dtype: float64
