In [4]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

In [6]:
# Load the dataset into a DataFrame (path is relative to the notebook's working directory)
data = pd.read_csv(filepath_or_buffer='cancer_data.csv')  # Replace with your dataset path

In [8]:
# Step 1: Remove the ID column
# NOTE(review): this step is currently disabled (the statement below is commented out),
# so any identifier column present in the CSV stays in `data` and ends up among the
# features built in Step 3 — confirm whether the file actually contains such a column.
#data.drop(columns=['ID'], inplace=True)

In [10]:
# Step 2: Encode the Diagnosis column as integers
# LabelEncoder numbers the sorted class labels starting from 0, so labels 'B' and 'M'
# become B = 0 and M = 1 (assumes those are the only two values in the column — confirm).
label_encoder = LabelEncoder()
label_encoder.fit(data['diagnosis'])
data['diagnosis'] = label_encoder.transform(data['diagnosis'])

In [12]:
# Step 3: Use Diagnosis as the target variable; every other column is a feature
y = data['diagnosis']  # Target
X = data.drop(labels='diagnosis', axis='columns')  # Features

In [14]:
# Step 4: Hold out 20% of the rows as a test set and keep 80% for training.
# The fixed random_state makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Step 5: Standardize the features (important for models like SVM, Logistic Regression)
# The scaler is fitted on the training split only; the same fitted statistics are
# then applied to both splits so the test set does not influence the scaling.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [18]:
# Step 6: Initialize models
# Candidate classifiers keyed by display name, all with default hyperparameters.
# random_state is pinned on every estimator that accepts one here so runs are repeatable.
# Insertion order matters: it is the order in which results are printed in Step 7.
models = {
    "Logistic Regression": LogisticRegression(random_state=42),  # linear baseline
    "SVM": SVC(kernel='rbf', probability=True, random_state=42),  # probability=True enables predict_proba
    "Random Forest": RandomForestClassifier(random_state=42),  # bagged trees
    # NOTE(review): labelled "XGBoost", but this is scikit-learn's GradientBoostingClassifier,
    # not the xgboost library — consider renaming the key once all users of it are known.
    "XGBoost": GradientBoostingClassifier(random_state=42),
    "k-NN": KNeighborsClassifier(),  # distance-based
    "Naive Bayes": GaussianNB(),  # Gaussian likelihood per feature
    "Decision Tree": DecisionTreeClassifier(random_state=42),  # single tree
    "AdaBoost": AdaBoostClassifier(random_state=42),  # boosted ensemble
    "LDA": LinearDiscriminantAnalysis(),  # linear discriminant
    "QDA": QuadraticDiscriminantAnalysis(),  # quadratic discriminant
}

In [20]:
# Step 7: Evaluate models using 10-fold cross-validation on the training set
# Each candidate is wrapped in a StandardScaler -> model pipeline and cross-validated
# on the *unscaled* training features. cross_val_score clones and refits the whole
# pipeline per fold, so the scaler only ever sees that fold's training portion.
# Cross-validating on features that were scaled with statistics from the full
# training set would leak information from each validation fold into the
# preprocessing and make the scores slightly optimistic.
# Because the pipeline is cloned, the estimators stored in `models` stay unfitted.
results = {}  # model name -> mean accuracy over the 10 folds
for name, model in models.items():
    fold_pipeline = make_pipeline(StandardScaler(), model)
    scores = cross_val_score(fold_pipeline, X_train, y_train, cv=10, scoring='accuracy')
    mean_accuracy = np.mean(scores)
    results[name] = mean_accuracy
    print(f"{name}: Mean CV Accuracy = {mean_accuracy:.4f}")

Logistic Regression: Mean CV Accuracy = 0.9736
SVM: Mean CV Accuracy = 0.9714
Random Forest: Mean CV Accuracy = 0.9604
XGBoost: Mean CV Accuracy = 0.9627
k-NN: Mean CV Accuracy = 0.9670
Naive Bayes: Mean CV Accuracy = 0.9252
Decision Tree: Mean CV Accuracy = 0.9276




AdaBoost: Mean CV Accuracy = 0.9648
LDA: Mean CV Accuracy = 0.9517
QDA: Mean CV Accuracy = 0.9537


In [24]:
# Step 8: Print the best model
# Select the name whose mean CV accuracy is highest; on a tie, the model that was
# inserted into `results` first wins. Then look up the matching estimator object.
best_model_name = max(results.keys(), key=lambda model_name: results[model_name])
best_model = models[best_model_name]
print(f"\nBest Model: {best_model_name} with Mean CV Accuracy = {results[best_model_name]:.4f}")


Best Model: Logistic Regression with Mean CV Accuracy = 0.9736


In [26]:
# Step 9: Refit the selected model on the full (scaled) training set
best_model.fit(X=X_train_scaled, y=y_train)

In [28]:
# Step 10: Evaluate the best model on the test set
# Hard class predictions feed the threshold-based metrics and the confusion matrix;
# the probability of class 1 (second predict_proba column) feeds ROC-AUC.
y_test_pred = best_model.predict(X_test_scaled)
y_test_pred_prob = best_model.predict_proba(X_test_scaled)[:, 1]  # Probabilities for ROC-AUC

test_accuracy = accuracy_score(y_test, y_test_pred)
test_precision = precision_score(y_test, y_test_pred)
test_recall = recall_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)
test_roc_auc = roc_auc_score(y_test, y_test_pred_prob)

# Report every metric with four decimals, in a fixed order.
print("\nTest Set Performance:")
for metric_label, metric_value in (
    ("Accuracy", test_accuracy),
    ("Precision", test_precision),
    ("Recall", test_recall),
    ("F1-Score", test_f1),
    ("ROC-AUC", test_roc_auc),
):
    print(f"{metric_label}: {metric_value:.4f}")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))


Test Set Performance:
Accuracy: 0.9737
Precision: 0.9762
Recall: 0.9535
F1-Score: 0.9647
ROC-AUC: 0.9974
Confusion Matrix:
 [[70  1]
 [ 2 41]]


### Key Observations:

- **Naive Bayes** performed the worst, with a mean cross-validation accuracy of **0.9252**. This is likely because Gaussian Naive Bayes assumes the features are conditionally independent given the class, which may not hold for this dataset, leading to suboptimal performance.

- **Logistic Regression** performed the best among all models with default parameters (mean cross-validation accuracy of **0.9736**). On the test set it achieved an accuracy of **0.9737**, precision of **0.9762**, recall of **0.9535**, and ROC-AUC of **0.9974**, indicating strong performance.
- The confusion matrix (rows = actual class, columns = predicted class) shows:
  - **70 True Negatives (TN)**: benign cases correctly predicted as benign.
  - **1 False Positive (FP)**: a benign case predicted as malignant.
  - **2 False Negatives (FN)**: malignant cases predicted as benign.
  - **41 True Positives (TP)**: malignant cases correctly predicted as malignant.

### Next Steps:
- While the default parameters yielded good results, we will further explore **hyperparameter tuning** for each model to optimize performance.
- Specifically, we will investigate how **Logistic Regression**, **SVM**, and **Random Forest** perform with tuned hyperparameters.
- This will help us identify the best model and configuration for the breast cancer diagnosis prediction task.