In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# Location of the raw dataset file, given relative to the directory the notebook runs in
file_path = (
    "breast+cancer+wisconsin+original/"
    "breast-cancer-wisconsin.data"
)

In [3]:
# Column labels applied when the file is read, in file order:
# an identifier column, nine feature columns, then the class label
column_names = [
    "Sample_Code_Number",
    "Clump_Thickness",
    "Uniformity_of_Cell_Size",
    "Uniformity_of_Cell_Shape",
    "Marginal_Adhesion",
    "Single_Epithelial_Cell_Size",
    "Bare_Nuclei",
    "Bland_Chromatin",
    "Normal_Nucleoli",
    "Mitoses",
    "Class",
]

In [4]:
# Load the raw file into a DataFrame. The file is read without a header row
# (header=None), so the labels come from column_names.
df = pd.read_csv(filepath_or_buffer=file_path, header=None, names=column_names)

In [5]:
# Remove rows whose "Bare_Nuclei" value is the missing-data marker '?'.
# A column containing '?' is loaded as object dtype (digit strings), and filtering
# alone leaves it that way, so cast the surviving values to integers here. Every
# feature is then genuinely numeric and downstream code no longer depends on
# implicit string-to-number coercion.
df = df[df['Bare_Nuclei'] != '?'].astype({'Bare_Nuclei': 'int64'})

In [6]:
# Renumber the rows 0..n-1 after the filtering step above left gaps in the index.
# drop=True discards the old index labels instead of keeping them as a new column.
df = df.reset_index(drop=True)

In [7]:
# Remove the "Sample_Code_Number" identifier column so it is not carried into the
# feature matrix built from this frame.
# Rebinding `df` (rather than inplace=True) together with errors="ignore" keeps the
# cell re-runnable: executing it a second time is a no-op instead of a KeyError.
df = df.drop(columns="Sample_Code_Number", errors="ignore")

In [8]:
# Recode the 'Class' labels to a binary 0/1 target: original value 2 -> 0, 4 -> 1.
# NOTE(review): presumably 2 = benign and 4 = malignant per the dataset
# description; confirm against the data documentation.
class_recoding = {2: 0, 4: 1}
df['Class'] = df['Class'].replace(class_recoding)

In [9]:
# Split the cleaned frame into the model target and the model inputs
y = df["Class"]                  # target: the recoded class label
X = df.drop(columns=["Class"])   # features: every column except the label

In [10]:
# Create a Gaussian Naive Bayes classifier with the library's default settings.
# This one estimator object is the model passed to every cross_val_score call
# later in the notebook.
naive_bayes = GaussianNB()

In [11]:
# Cross-validation plan: K stratified folds (K = 5 here), shuffled before
# splitting with a fixed seed so the exact split can be reproduced
n_splits = 5
cv = StratifiedKFold(
    n_splits=n_splits,
    random_state=4,
    shuffle=True,
)

In [12]:
# Evaluate the classifier on every fold of the plan above; one accuracy value per fold
scores = cross_val_score(estimator=naive_bayes, X=X, y=y, scoring='accuracy', cv=cv)

In [13]:
# Report each fold's accuracy, numbering the folds from 1
print("Cross-Validation Accuracy Scores:")
for fold_number, fold_accuracy in enumerate(scores, start=1):
    print(f"Fold {fold_number}: {fold_accuracy:.6f}")

Cross-Validation Accuracy Scores:
Fold 1: 0.970803
Fold 2: 0.941606
Fold 3: 0.963504
Fold 4: 0.970588
Fold 5: 0.963235


In [14]:
# Summarise the fold scores: their average and their spread
# (standard deviation with NumPy's default ddof=0)
mean_accuracy = scores.mean()
std_accuracy = scores.std()
print(f"Mean Accuracy: {mean_accuracy:.6f}")
print(f"Standard Deviation: {std_accuracy:.6f}")

Mean Accuracy: 0.961947
Standard Deviation: 0.010686


#### Random State Loop

In [15]:
# Repeat the stratified K-fold cross-validation once per shuffle seed to show how
# much the accuracy estimate depends on the particular random split.
random_states = list(range(1001))  # seeds 0..1000; widen the range to try more

# Number of folds is loop-invariant, so it is assigned once rather than per iteration
n_splits = 5

# Per-seed results, index-aligned with random_states
mean_accuracies = []
std_accuracies = []

# NOTE(review): this prints ten lines per seed plus two full per-seed summaries
# (roughly 12,000 lines for 1001 seeds); consider trimming if the log is not needed.
for random_state in random_states:
    # Seed-specific names keep the loop from overwriting `cv`, `scores`,
    # `mean_accuracy` and `std_accuracy` produced by the single-seed cells earlier
    # in the notebook, so re-running those cells afterwards still reports their
    # own results instead of the last seed's.
    seed_cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # One accuracy value per fold for this seed
    seed_scores = cross_val_score(naive_bayes, X, y, cv=seed_cv, scoring='accuracy')

    # Average and spread of the fold accuracies for this seed
    seed_mean = np.mean(seed_scores)
    seed_std = np.std(seed_scores)
    mean_accuracies.append(seed_mean)
    std_accuracies.append(seed_std)

    # Detailed report for the current seed
    print(f"Random State {random_state}:")
    print("Cross-Validation Accuracy Scores:")
    for fold_number, fold_accuracy in enumerate(seed_scores, start=1):
        print(f"Fold {fold_number}: {fold_accuracy:.6f}")
    print(f"Mean Accuracy: {seed_mean:.6f}")
    print(f"Standard Deviation: {seed_std:.6f}")
    print()

# Compact per-seed summaries; zip walks the index-aligned lists together
print("Mean Accuracy Across Random States:")
for random_state, seed_mean in zip(random_states, mean_accuracies):
    print(f"Random State {random_state}: {seed_mean:.6f}")
print()
print("Standard Deviation Across Random States:")
for random_state, seed_std in zip(random_states, std_accuracies):
    print(f"Random State {random_state}: {seed_std:.6f}")

Random State 0:
Cross-Validation Accuracy Scores:
Fold 1: 0.985401
Fold 2: 0.948905
Fold 3: 0.948905
Fold 4: 0.955882
Fold 5: 0.970588
Mean Accuracy: 0.961936
Standard Deviation: 0.014154

Random State 1:
Cross-Validation Accuracy Scores:
Fold 1: 0.970803
Fold 2: 0.970803
Fold 3: 0.927007
Fold 4: 0.977941
Fold 5: 0.941176
Mean Accuracy: 0.957546
Standard Deviation: 0.019839

Random State 2:
Cross-Validation Accuracy Scores:
Fold 1: 0.956204
Fold 2: 0.970803
Fold 3: 0.970803
Fold 4: 0.963235
Fold 5: 0.948529
Mean Accuracy: 0.961915
Standard Deviation: 0.008620

Random State 3:
Cross-Validation Accuracy Scores:
Fold 1: 0.927007
Fold 2: 0.941606
Fold 3: 0.956204
Fold 4: 0.977941
Fold 5: 0.992647
Mean Accuracy: 0.959081
Standard Deviation: 0.023769

Random State 4:
Cross-Validation Accuracy Scores:
Fold 1: 0.970803
Fold 2: 0.941606
Fold 3: 0.963504
Fold 4: 0.970588
Fold 5: 0.963235
Mean Accuracy: 0.961947
Standard Deviation: 0.010686

Random State 5:
Cross-Validation Accuracy Scores:
Fold 

Random State 9:
Cross-Validation Accuracy Scores:
Fold 1: 0.948905
Fold 2: 0.970803
Fold 3: 0.948905
Fold 4: 0.955882
Fold 5: 0.977941
Mean Accuracy: 0.960487
Standard Deviation: 0.011837

Random State 10:
Cross-Validation Accuracy Scores:
Fold 1: 0.956204
Fold 2: 0.978102
Fold 3: 0.941606
Fold 4: 0.941176
Fold 5: 0.992647
Mean Accuracy: 0.961947
Standard Deviation: 0.020404

Random State 11:
Cross-Validation Accuracy Scores:
Fold 1: 0.948905
Fold 2: 0.963504
Fold 3: 0.978102
Fold 4: 0.941176
Fold 5: 0.963235
Mean Accuracy: 0.958985
Standard Deviation: 0.012827

Random State 12:
Cross-Validation Accuracy Scores:
Fold 1: 0.970803
Fold 2: 0.956204
Fold 3: 0.970803
Fold 4: 0.970588
Fold 5: 0.948529
Mean Accuracy: 0.963386
Standard Deviation: 0.009319

Random State 13:
Cross-Validation Accuracy Scores:
Fold 1: 0.963504
Fold 2: 0.956204
Fold 3: 0.948905
Fold 4: 0.963235
Fold 5: 0.970588
Mean Accuracy: 0.960487
Standard Deviation: 0.007364

Random State 14:
Cross-Validation Accuracy Scores:


In [16]:
# Persist the per-seed mean accuracies to disk as a NumPy .npy file
np.save('NBC_accuracies.npy', np.asarray(mean_accuracies))

In [17]:
# NOTE: this entire cell is commented out and does not run. It sketches how to rank
# the random states by ascending mean accuracy and print the sorted mean accuracies
# and standard deviations. Uncomment every line below to enable it.

# # Sort the mean_accuracies array in ascending order and get the corresponding indices
# sorted_indices = np.argsort(mean_accuracies)

# # Initialize lists to store sorted results
# sorted_mean_accuracies = []
# sorted_std_accuracies = []
# sorted_random_states = []

# # Iterate through sorted indices and collect sorted results
# for index in sorted_indices:
#     sorted_mean_accuracies.append(mean_accuracies[index])
#     sorted_std_accuracies.append(std_accuracies[index])
#     sorted_random_states.append(random_states[index])

# # Print the sorted mean accuracies, standard deviations, and corresponding random states
# print("Sorted Mean Accuracy Across Random States:")
# for i in range(len(sorted_indices)):
#     print(f"Random State {sorted_random_states[i]}: {sorted_mean_accuracies[i]:.6f}")

# print("\nSorted Standard Deviation Across Random States:")
# for i in range(len(sorted_indices)):
#     print(f"Random State {sorted_random_states[i]}: {sorted_std_accuracies[i]:.6f}")
