In [6]:
# Importing required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, classification_report, f1_score
import matplotlib.pyplot as plt

# 1. Load the dataset
file_path = "Breast_Cancer_Dataset.csv"  # Update the path if needed
try:
    df = pd.read_csv(file_path)
    print("Dataset loaded successfully!")
except FileNotFoundError:
    print("Error: File not found. Please check the file path.")
    exit()

# 2. Inspect the dataset
print("\nDataset preview:")
print(df.head())
print("\nDataset information:")
print(df.info())
print("\nNull values in each column:")
print(df.isnull().sum())

# 3. Handle missing values
if df.isnull().sum().sum() > 0:
    print("\nHandling missing values...")
    numeric_columns = df.select_dtypes(include=[np.number]).columns
    df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].mean())  # Replace numeric nulls with the mean
    print("Missing values replaced in numeric columns.")
else:
    print("No missing values found.")

# 4. Verify dataset is not empty
if df.empty:
    print("Error: Dataset is empty after cleaning. Please check the data.")
    exit()

# 5. Encode the target variable
if 'diagnosis' not in df.columns:
    print("Error: 'diagnosis' column not found in the dataset.")
    exit()
    
le = LabelEncoder()
df['diagnosis'] = le.fit_transform(df['diagnosis'])

# 6. Separate features and target variable
X = df.drop(columns=['diagnosis'])  # Features
y = df['diagnosis']  # Target
print("\nFeature matrix (X) shape:", X.shape)
print("Target variable (y) shape:", y.shape)

# 7. Split the dataset into training and test sets
try:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    print("\nData split into training and test sets:")
    print("Training set shape:", X_train.shape)
    print("Test set shape:", X_test.shape)
except ValueError as e:
    print(f"Error during train-test split: {e}")
    exit()

# 8. Initialize models
models = [
    ("Logistic Regression", LogisticRegression()),
    ("Naive Bayes", GaussianNB()),
    ("Decision Tree (CART)", DecisionTreeClassifier()),
    ("K-NN", KNeighborsClassifier()),
    ("SVM", SVC(probability=True)),
    ("Random Forest", RandomForestClassifier())
]

# 9. Train and evaluate models
print("\nModel performance:")
for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.4f}")

    # Calculate and display ROC AUC score
    y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else model.decision_function(X_test)
    roc_auc = roc_auc_score(y_test, y_prob)
    print(f"{name} ROC AUC: {roc_auc:.4f}")

    # Plot ROC curve
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    plt.plot(fpr, tpr, label=f"{name} (AUC = {roc_auc:.2f})")

# 10. Plot ROC Curve
plt.title("ROC Curve")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()
plt.grid()
plt.show()

# 11. Calculate precision, recall, F1-score
print("\nClassification Report and Metrics:")
for name, model in models:
    y_pred = model.predict(X_test)
    print(f"\n{name} Classification Report:")
    print(classification_report(y_test, y_pred))
    print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")


Dataset loaded successfully!

Dataset preview:
         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   
4  84358402         M        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   ...  texture

ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values