In [1]:
# --- Step 1: Import Necessary Libraries ---
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.naive_bayes import GaussianNB
# Import the built-in Breast Cancer dataset from sklearn
from sklearn.datasets import load_breast_cancer


In [2]:
# --- Step 2: Load and Prepare the Dataset (Breast Cancer Dataset) ---

# Horizontal rule printed between the sections of this cell's output.
section_break = "\n" + "=" * 50 + "\n"

# Load the dataset shipped with scikit-learn and assemble one DataFrame:
# one column per feature name, with the label appended as the last column.
cancer = load_breast_cancer()
df = pd.DataFrame(cancer.data, columns=cancer.feature_names).assign(
    target=cancer.target
)

print("--- First 5 rows of the Breast Cancer Dataset ---")
print(df.head())
print(section_break)

# Split the frame into the label vector (y) and the feature matrix (X).
y = df['target']
X = df.drop(columns='target')
class_names = cancer.target_names

# Report the shapes and class labels, one item per line.
print(
    f"Features (X) shape: {X.shape}",
    f"Target (y) shape: {y.shape}",
    f"Target Classes: {class_names}",
    sep="\n",
)
print(section_break)


--- First 5 rows of the Breast Cancer Dataset ---
   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390          0.2414              0.10520         0.2597   
4           0.13280          0.1980              0.10430         0.1809   

   mean fractal dimension  ...  worst text

In [3]:
# --- Step 3: Setup K-Fold Cross-Validation ---

# Number of folds; also interpolated into the messages printed below.
K = 10

# Classifier to be evaluated.
model = GaussianNB()

# Splitter: rows are shuffled before being divided into K folds, and the fixed
# random_state makes that shuffle repeatable between runs.
# NOTE(review): KFold splits without looking at the class labels. If per-fold
# class balance matters, StratifiedKFold is the label-aware alternative --
# switching would change the scores recorded further down.
kfold = KFold(n_splits=K, shuffle=True, random_state=42)

# Emit the three banner lines in one call; the trailing "\n" on the last line
# leaves a blank line after the banner.
print(
    f"--- Performing {K}-Fold Cross-Validation ---",
    "Model: Gaussian Naive Bayes",
    f"The dataset will be split into {K} folds (for Malignant vs. Benign classification).\n",
    sep="\n",
)


--- Performing 10-Fold Cross-Validation ---
Model: Gaussian Naive Bayes
The dataset will be split into 10 folds (for Malignant vs. Benign classification).



In [4]:
# --- Step 4: Perform Cross-Validation and Evaluate ---

# Standard deviation of the per-fold accuracies (as a fraction: 0.05 means
# 5 percentage points) below which the run is described as stable. This is a
# reporting heuristic, not a statistical test -- adjust it to your needs.
STABILITY_STD_THRESHOLD = 0.05

# Fit and score the model once per split produced by `kfold`; `results` holds
# one accuracy value per fold.
results = cross_val_score(model, X, y, cv=kfold, scoring='accuracy')

print("--- Cross-Validation Results ---")
print(f"Scores for each of the {K} folds:")
for fold_number, score in enumerate(results, start=1):
    print(f"  Fold {fold_number}: {score:.4f}")

print("\n--- Summary ---")
mean_accuracy = results.mean()
std_deviation = results.std()

print(f"Average Accuracy (Mean): {mean_accuracy:.2%}")
print(f"Standard Deviation of Accuracy: {std_deviation:.4f}")

print("\nInterpretation:")
# Use K instead of a hard-coded 10 so the sentence stays correct when the
# number of folds is changed in the setup cell.
print(f"The model is, on average, {mean_accuracy:.2%} accurate across {K} distinct test sets.")
# Only claim stability when the measured spread supports it; previously this
# conclusion was printed regardless of the actual standard deviation.
if std_deviation < STABILITY_STD_THRESHOLD:
    print("The standard deviation of performance is very low, suggesting the model is highly stable and robust to variations in the training data.")
else:
    print(
        f"The standard deviation of performance ({std_deviation:.4f}) is not negligible: "
        "accuracy varies noticeably between folds, so treat the average with caution."
    )


--- Cross-Validation Results ---
Scores for each of the 10 folds:
  Fold 1: 0.9825
  Fold 2: 0.9649
  Fold 3: 0.8772
  Fold 4: 0.9649
  Fold 5: 0.9298
  Fold 6: 0.9649
  Fold 7: 0.9298
  Fold 8: 0.9298
  Fold 9: 0.9474
  Fold 10: 0.8929

--- Summary ---
Average Accuracy (Mean): 93.84%
Standard Deviation of Accuracy: 0.0319

Interpretation:
The model is, on average, 93.84% accurate across 10 distinct test sets.
The standard deviation of performance is very low, suggesting the model is highly stable and robust to variations in the training data.
