# Import required libraries

In [79]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Data Preprocessing

In [80]:
# Load dataset
data = pd.read_csv("Breast_Cancer.csv")

# Display basic information about the dataset
print("Dataset shape:", data.shape)
print("Columns:", data.columns)

# Check for missing data
print("\nMissing data:")
print(data.isnull().sum())

# Handle categorical variables using Label Encoding
categorical_cols = data.select_dtypes(include=['object']).columns
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Select features and target
X = data.drop(columns=['Status'])  # Features
y = data['Status']  # Target

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Dataset shape: (4024, 16)
Columns: Index(['Age', 'Race', 'Marital Status', 'T Stage ', 'N Stage', '6th Stage',
       'differentiate', 'Grade', 'A Stage', 'Tumor Size', 'Estrogen Status',
       'Progesterone Status', 'Regional Node Examined',
       'Regional Node Positive', 'Survival Months', 'Status'],
      dtype='object')

Missing data:
Age                       0
Race                      0
Marital Status            0
T Stage                   0
N Stage                   0
6th Stage                 0
differentiate             0
Grade                     0
A Stage                   0
Tumor Size                0
Estrogen Status           0
Progesterone Status       0
Regional Node Examined    0
Regional Node Positive    0
Survival Months           0
Status                    0
dtype: int64


# Model Selection

In [81]:
# Build Naive Bayes model
model = CategoricalNB()

# Model Training

In [82]:
model.fit(X_train, y_train)

# Prediction

In [83]:
# Make predictions on testing set
y_test_pred = model.predict(X_test)

# Make predictions on training set
y_train_pred = model.predict(X_train)

# Model Evaluation

In [84]:
# Evaluate the model on testing set
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Testing Accuracy:", test_accuracy)
print("Testing Classification Report:\n", classification_report(y_test, y_test_pred))
print("Testing Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))

# Evaluate the model on training set
train_accuracy = accuracy_score(y_train, y_train_pred)
print("\nTraining Accuracy:", train_accuracy)
print("Training Classification Report:\n", classification_report(y_train, y_train_pred))
print("Training Confusion Matrix:\n", confusion_matrix(y_train, y_train_pred))

Testing Accuracy: 0.8621118012422361
Testing Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.92      0.92       685
           1       0.54      0.53      0.53       120

    accuracy                           0.86       805
   macro avg       0.73      0.72      0.73       805
weighted avg       0.86      0.86      0.86       805

Testing Confusion Matrix:
 [[631  54]
 [ 57  63]]

Training Accuracy: 0.8592730661696178
Training Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.92      0.92      2723
           1       0.54      0.54      0.54       496

    accuracy                           0.86      3219
   macro avg       0.73      0.73      0.73      3219
weighted avg       0.86      0.86      0.86      3219

Training Confusion Matrix:
 [[2496  227]
 [ 226  270]]
