In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

# 1. Load the dataset
file_path = 'Breast_Cancer_Dataset.csv'  # Replace with your correct file path
df = pd.read_csv(file_path)

# 1. Identify and remove null values
print("Null values before dropping:\n", df.isnull().sum())
df = df.dropna()
print("\nNull values after dropping:\n", df.isnull().sum())

# 2. Encode the 'diagnosis' column using LabelEncoder
label_encoder = LabelEncoder()
df['diagnosis'] = label_encoder.fit_transform(df['diagnosis'])
print("\nUnique values in 'diagnosis' column after encoding:", df['diagnosis'].unique())

# 3. Separate target variable and features
X = df.drop(columns=['diagnosis'])  # Features (independent variables)
y = df['diagnosis']                # Target variable

# 4. Split the dataset into training and test sets (70:30 ratio)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 5. List of models to train and evaluate
models = [
    ('Logistic Regression', LogisticRegression(max_iter=1000)),
    ('Naive Bayes', GaussianNB()),
    ('Decision Tree', DecisionTreeClassifier()),
    ('K-NN', KNeighborsClassifier()),
    ('SVM', SVC()),
    ('Random Forest', RandomForestClassifier())
]

# 5. Train and evaluate each model
accuracies = {}
for name, model in models:
    model.fit(X_train, y_train)  # Train the model
    y_pred = model.predict(X_test)  # Predict on test data
    
    # Calculate accuracy
    acc = accuracy_score(y_test, y_pred)
    accuracies[name] = acc
    
    # Print confusion matrix
    print(f"\n{name} - Accuracy: {acc:.4f}")
    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_encoder.classes_)
    disp.plot()
    plt.title(f'Confusion Matrix for {name}')
    plt.show()

# 6. Print the final accuracies
print("\nFinal Model Accuracies:")
for model, acc in accuracies.items():
    print(f"{model}: {acc:.4f}")


Null values before dropping:
 id                           0
diagnosis                    0
radius_mean                  0
texture_mean                 0
perimeter_mean               0
area_mean                    0
smoothness_mean              0
compactness_mean             0
concavity_mean               0
concave points_mean          0
symmetry_mean                0
fractal_dimension_mean       0
radius_se                    0
texture_se                   0
perimeter_se                 0
area_se                      0
smoothness_se                0
compactness_se               0
concavity_se                 0
concave points_se            0
symmetry_se                  0
fractal_dimension_se         0
radius_worst                 0
texture_worst                0
perimeter_worst              0
area_worst                   0
smoothness_worst             0
compactness_worst            0
concavity_worst              0
concave points_worst         0
symmetry_worst               0
fractal_d

ValueError: With n_samples=0, test_size=0.3 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.