In [20]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [21]:
# Location of the previously downloaded dataset file on the local machine.
# The path is relative, so it is resolved against the current working directory.
file_path = "breast+cancer+wisconsin+original/breast-cancer-wisconsin.data"

In [22]:
# Labels to give the dataset's columns, listed in column order:
# the sample identifier first, then nine measurement columns, then the class label.
column_names = [
    "Sample_Code_Number",
    "Clump_Thickness",
    "Uniformity_of_Cell_Size",
    "Uniformity_of_Cell_Shape",
    "Marginal_Adhesion",
    "Single_Epithelial_Cell_Size",
    "Bare_Nuclei",
    "Bland_Chromatin",
    "Normal_Nucleoli",
    "Mitoses",
    "Class",
]

In [23]:
# Load the data file into a DataFrame. header=None makes explicit that the
# first line of the file is treated as data and the labels come from column_names.
df = pd.read_csv(file_path, header=None, names=column_names)

In [24]:
# Missing values in "Bare_Nuclei" are encoded as '?', which makes pandas load
# the whole column as text. Filtering the '?' rows alone would leave a string
# column behind, so convert it to numbers first ('?' becomes NaN) and then drop
# the rows that have no usable value. The cell is safe to re-run: on an already
# numeric column both steps are no-ops.
bare_nuclei = pd.to_numeric(df['Bare_Nuclei'], errors='coerce')
df = df.assign(Bare_Nuclei=bare_nuclei).dropna(subset=['Bare_Nuclei'])

In [25]:
# Renumber the remaining rows from 0 after the row removal; drop=True discards
# the old index instead of keeping it as an extra column
df = df.reset_index(drop=True)

In [26]:
# Remove the "Sample_Code_Number" column so it does not end up among the features.
# df is rebound to the result instead of being modified with inplace=True, and
# errors="ignore" tolerates the column already being gone, so running this cell
# a second time no longer raises KeyError.
df = df.drop(columns="Sample_Code_Number", errors="ignore")

In [27]:
# Recode the labels in the 'Class' column: 2 becomes 0 and 4 becomes 1
df = df.assign(Class=df['Class'].replace({2: 0, 4: 1}))

In [28]:
# Split the frame into the label vector and the feature matrix
y = df["Class"]               # target: the recoded class label
X = df.drop(columns="Class")  # features: every column except the label

In [29]:
# # Split the Data
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [30]:
# k-nearest-neighbours classifier configured to use 3 neighbours (k=3).
# The estimator is only constructed here; it is not fitted in this cell.
knn = KNeighborsClassifier(n_neighbors=3)

In [31]:
# Cross-validation scheme: 5 stratified folds, with the rows shuffled before
# splitting and a fixed seed so the fold assignment is repeatable
n_splits = 5
cv = StratifiedKFold(n_splits, shuffle=True, random_state=4)

In [33]:
# Evaluate the classifier with the splitter defined in `cv`;
# `scores` receives one accuracy value per fold
scores = cross_val_score(estimator=knn, X=X, y=y, scoring='accuracy', cv=cv)

In [34]:
# Report the accuracy of each fold, numbering the folds from 1
print("Cross-Validation Accuracy Scores:")
for fold_number, fold_accuracy in enumerate(scores, start=1):
    print(f"Fold {fold_number}: {fold_accuracy:.6f}")

Cross-Validation Accuracy Scores:
Fold 1: 0.985401
Fold 2: 0.970803
Fold 3: 0.978102
Fold 4: 0.963235
Fold 5: 0.977941


In [35]:
# Summarise the fold accuracies: their average and their spread.
# np.std is called with its defaults, i.e. the population standard deviation (ddof=0).
mean_accuracy, std_accuracy = np.mean(scores), np.std(scores)
print(f"Mean Accuracy: {mean_accuracy:.6f}")
print(f"Standard Deviation: {std_accuracy:.6f}")

Mean Accuracy: 0.975097
Standard Deviation: 0.007516


#### Random State Loop

In [36]:
# Repeat the 5-fold cross-validation once per shuffling seed and record the
# mean and standard deviation of the fold accuracies for each seed.
random_states = list(range(101))  # seeds 0 through 100; extend as needed

# The fold count is the same for every seed, so it is set once up front
n_splits = 5

# One entry per seed, in the same order as random_states
mean_accuracies = []
std_accuracies = []

for random_state in random_states:
    # Stratified, shuffled splitter; only the seed varies between iterations
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # One accuracy value per fold for this seed
    scores = cross_val_score(knn, X, y, cv=cv, scoring='accuracy')

    # Summary statistics for this seed, stored for the recap and later cells
    mean_accuracy = np.mean(scores)
    std_accuracy = np.std(scores)
    mean_accuracies.append(mean_accuracy)
    std_accuracies.append(std_accuracy)

    # Detailed report for the current seed
    print(f"Random State {random_state}:")
    print("Cross-Validation Accuracy Scores:")
    for fold_number, fold_accuracy in enumerate(scores, start=1):
        print(f"Fold {fold_number}: {fold_accuracy:.6f}")
    print(f"Mean Accuracy: {mean_accuracy:.6f}")
    print(f"Standard Deviation: {std_accuracy:.6f}")
    print()

# Recap: the mean accuracy of every seed, followed by the standard deviation of
# every seed (both listed per seed, not aggregated over the seeds)
print("Mean Accuracy Across Random States:")
for seed, seed_mean in zip(random_states, mean_accuracies):
    print(f"Random State {seed}: {seed_mean:.6f}")
print()
print("Standard Deviation Across Random States:")
for seed, seed_std in zip(random_states, std_accuracies):
    print(f"Random State {seed}: {seed_std:.6f}")

Random State 0:
Cross-Validation Accuracy Scores:
Fold 1: 0.992701
Fold 2: 0.963504
Fold 3: 0.963504
Fold 4: 0.955882
Fold 5: 0.977941
Mean Accuracy: 0.970706
Standard Deviation: 0.013112

Random State 1:
Cross-Validation Accuracy Scores:
Fold 1: 0.985401
Fold 2: 0.970803
Fold 3: 0.970803
Fold 4: 0.970588
Fold 5: 0.977941
Mean Accuracy: 0.975107
Standard Deviation: 0.005856

Random State 2:
Cross-Validation Accuracy Scores:
Fold 1: 0.970803
Fold 2: 0.985401
Fold 3: 0.970803
Fold 4: 0.955882
Fold 5: 0.963235
Mean Accuracy: 0.969225
Standard Deviation: 0.009804

Random State 3:
Cross-Validation Accuracy Scores:
Fold 1: 0.948905
Fold 2: 0.963504
Fold 3: 0.941606
Fold 4: 1.000000
Fold 5: 0.977941
Mean Accuracy: 0.966391
Standard Deviation: 0.020935

Random State 4:
Cross-Validation Accuracy Scores:
Fold 1: 0.970803
Fold 2: 0.978102
Fold 3: 0.970803
Fold 4: 0.985294
Fold 5: 0.970588
Mean Accuracy: 0.975118
Standard Deviation: 0.005835

Random State 5:
Cross-Validation Accuracy Scores:
Fold 

Random State 10:
Cross-Validation Accuracy Scores:
Fold 1: 0.963504
Fold 2: 0.992701
Fold 3: 0.963504
Fold 4: 0.970588
Fold 5: 0.985294
Mean Accuracy: 0.975118
Standard Deviation: 0.011858

Random State 11:
Cross-Validation Accuracy Scores:
Fold 1: 0.963504
Fold 2: 0.985401
Fold 3: 0.985401
Fold 4: 0.948529
Fold 5: 0.948529
Mean Accuracy: 0.966273
Standard Deviation: 0.016548

Random State 12:
Cross-Validation Accuracy Scores:
Fold 1: 0.963504
Fold 2: 0.963504
Fold 3: 0.970803
Fold 4: 0.992647
Fold 5: 0.955882
Mean Accuracy: 0.969268
Standard Deviation: 0.012606

Random State 13:
Cross-Validation Accuracy Scores:
Fold 1: 0.970803
Fold 2: 0.963504
Fold 3: 0.985401
Fold 4: 0.963235
Fold 5: 0.970588
Mean Accuracy: 0.970706
Standard Deviation: 0.008046

Random State 14:
Cross-Validation Accuracy Scores:
Fold 1: 0.978102
Fold 2: 0.956204
Fold 3: 0.970803
Fold 4: 0.977941
Fold 5: 0.963235
Mean Accuracy: 0.969257
Standard Deviation: 0.008517

Random State 15:
Cross-Validation Accuracy Scores:

In [37]:
# # Save accuracy values as numpy array
# np.save('KNN_accuracies.npy',mean_accuracies)

In [38]:
# Rank the seeds from lowest to highest mean accuracy.
# np.argsort gives the positions that would put mean_accuracies in ascending order.
sorted_indices = np.argsort(mean_accuracies)

# Apply that same ordering to all three parallel lists
sorted_mean_accuracies = [mean_accuracies[idx] for idx in sorted_indices]
sorted_std_accuracies = [std_accuracies[idx] for idx in sorted_indices]
sorted_random_states = [random_states[idx] for idx in sorted_indices]

# Mean accuracy per seed, in ascending order
print("Sorted Mean Accuracy Across Random States:")
for seed, seed_mean in zip(sorted_random_states, sorted_mean_accuracies):
    print(f"Random State {seed}: {seed_mean:.6f}")

# Standard deviation per seed, listed in the mean-accuracy order above
# (the values are not sorted by their own magnitude)
print("\nSorted Standard Deviation Across Random States:")
for seed, seed_std in zip(sorted_random_states, sorted_std_accuracies):
    print(f"Random State {seed}: {seed_std:.6f}")


Sorted Mean Accuracy Across Random States:
Random State 17: 0.960434
Random State 60: 0.960477
Random State 98: 0.963386
Random State 91: 0.963407
Random State 48: 0.964824
Random State 53: 0.964910
Random State 11: 0.966273
Random State 52: 0.966305
Random State 43: 0.966316
Random State 64: 0.966327
Random State 57: 0.966327
Random State 44: 0.966327
Random State 51: 0.966327
Random State 84: 0.966337
Random State 20: 0.966337
Random State 85: 0.966370
Random State 3: 0.966391
Random State 27: 0.967744
Random State 26: 0.967744
Random State 97: 0.967744
Random State 77: 0.967765
Random State 90: 0.967776
Random State 71: 0.967776
Random State 39: 0.967776
Random State 33: 0.967776
Random State 37: 0.967787
Random State 73: 0.967797
Random State 25: 0.967797
Random State 41: 0.967808
Random State 5: 0.967808
Random State 8: 0.967819
Random State 67: 0.967840
Random State 72: 0.969225
Random State 2: 0.969225
Random State 28: 0.969225
Random State 62: 0.969225
Random State 61: 0.969236