In [9]:
from tqdm import tqdm
import numpy as np

import medmnist 
from medmnist import BreastMNIST
from medmnist import INFO, Evaluator

import os
import cv2

#Modelling
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV

from scipy.stats import randint
import matplotlib.pyplot as plt

from skimage.feature import hog
import matplotlib.pyplot as plt


In [11]:
# Navigate to the dataset file
npz_file_path = os.path.join(os.path.abspath(os.path.join(os.getcwd(), '..')), 'Datasets', 'breastmnist.npz')

# Step 4: Load the breastmnist.npz file using numpy
data = np.load(npz_file_path)

# Print the names of arrays stored in the .npz file
print("Stored arrays:", data.files)

for item in data.files:
    print(item)
    print(data[item].shape)

train_dataset = data['train_images']
print('Training dataset', train_dataset.shape)

val_dataset = data['val_images']
print('Validation dataset', val_dataset.shape)

test_dataset = data['test_images']
print('Testing dataset', test_dataset.shape)

y_train_labels = data['train_labels']
print('Training label', y_train_labels.shape)


y_val_labels = data['val_labels']
print('Validation label', y_val_labels.shape)


y_test_labels = data['test_labels']
print('Testing label', y_test_labels.shape)

print("Malignant:",train_dataset[4])
print("Malignant:", y_train_labels[4])
cv2.imwrite('Malignant545.png', train_dataset[545])

malignant = np.where(y_train_labels == 0)
print("Total malignant images:", len(malignant[0]))
print("Total malignant images:", malignant[0])


print("Benign", train_dataset[0])
print("Benign", y_train_labels[0])
cv2.imwrite('Benign100.png', train_dataset[100])
cv2.imwrite('Benign50.png', train_dataset[50])

benign = np.where(y_train_labels == 1)
print("Total benign images:", len(benign[0]))
print("Total benign images:", benign[0])

Stored arrays: ['train_images', 'val_images', 'test_images', 'train_labels', 'val_labels', 'test_labels']
train_images
(546, 28, 28)
val_images
(78, 28, 28)
test_images
(156, 28, 28)
train_labels
(546, 1)
val_labels
(78, 1)
test_labels
(156, 1)
Training dataset (546, 28, 28)
Validation dataset (78, 28, 28)
Testing dataset (156, 28, 28)
Training label (546, 1)
Validation label (78, 1)
Testing label (156, 1)
Malignant: [[  1   6  41  79 159 180 175 175 175 181 190 194 193 190 190 188 187 185
  183 180 182 187 191 150  18   1   1   1]
 [  1   1   9 110 187 186 183 183 185 191 196 198 198 201 199 195 197 194
  194 194 190 191 192 180  69   1   1   1]
 [  1   1  31 168 201 200 200 181 160 142 132 134 136 147 163 183 203 207
  207 210 203 200 195 192 140  15   1   1]
 [  1   3  88 184 191 186 153 114  95  88  90  97  93  98 107 120 145 161
  164 177 196 210 208 186 172  64   1   1]
 [  1  23 130 165 172 148 102  87  85  85  92 101  89  93 102  99 105 107
  108 116 132 155 180 178 175 129  14

In [18]:
# Preprocessing test

x_train = np.empty((546, 784))
image = 0


#print("Before flattening", train_dataset[0])
print("Before flattening (train)", train_dataset.shape)

for img in train_dataset:
    #print("Image Shape: ", img.shape)  # Ensure each image is 28x28
    x_train[image] = img.flatten()  # Flatten the image and store it
    image += 1  # Move to the next row in x_train
            
#print("After flattening:", x_train[0])
print("After flattening (train):", x_train.shape)

#print("Label before processing (train)",y_train_labels.shape)
y_train_labels=y_train_labels.flatten()
#print("Label after processing (train)",y_train_labels.shape)
print("\n")

#### Preprocessing  validation
x_val = np.empty((78, 784))
image = 0

print("Before flattening (val)", val_dataset.shape)

for img in val_dataset:
    #print("Image Shape: ", img.shape)  # Ensure each image is 28x28
    x_val[image] = img.flatten()  # Flatten the image and store it
    image += 1  # Move to the next row in x_train

print("After flattening (val):", x_val.shape)

#print("Label after processing (val)",y_val_labels.shape)
y_val_labels=y_val_labels.flatten()
#print("Label after processing (val)",y_val_labels.shape)

### Preprocessing test
x_test = np.empty((156, 784))
image = 0

print("Before flattening (test)", test_dataset.shape)

for img in test_dataset:
    #print("Image Shape: ", img.shape)  # Ensure each image is 28x28
    x_test[image] = img.flatten()  # Flatten the image and store it
    image += 1  # Move to the next row in x_train

print("After flattening (test):", x_test.shape)

#print("Label after processing (val)",y_test_labels.shape)
y_test_labels=y_test_labels.flatten()
#print("Label after processing (val)",y_test_labels.shape)

Before flattening (train) (546, 28, 28)
After flattening (train): (546, 784)


Before flattening (val) (78, 28, 28)
After flattening (val): (78, 784)
Before flattening (test) (156, 28, 28)
After flattening (test): (156, 784)


In [31]:
# Train
x_features_train = np.empty((len(train_dataset), 144))  # Assuming each image has 324 HOG features

# Loop through each image in the dataset
for x in range(len(train_dataset)):
    # Compute the HOG features for the current image
    hog_features, hog_image = hog(
        train_dataset[x],  # Process the current image in the loop
        orientations=9,
        pixels_per_cell=(8, 8),
        cells_per_block=(2, 2),
        visualize=True,
        feature_vector=True
    )
    # Store the HOG features for the current image in the x_features array
    x_features_train[x] = hog_features  # Assign the feature vector for each image

#Valid
x_features_val = np.empty((len(val_dataset), 144))  # Assuming each image has 324 HOG features

# Loop through each image in the dataset
for x in range(len(val_dataset)):
    # Compute the HOG features for the current image
    hog_features, hog_image = hog(
        val_dataset[x],  # Process the current image in the loop
        orientations=9,
        pixels_per_cell=(8, 8),
        cells_per_block=(2, 2),
        visualize=True,
        feature_vector=True
    )
    # Store the HOG features for the current image in the x_features array
    x_features_val[x] = hog_features  # Assign the feature vector for each image

#Test
x_features_test = np.empty((len(test_dataset), 144))  # Assuming each image has 324 HOG features

# Loop through each image in the dataset
for x in range(len(test_dataset)):
    # Compute the HOG features for the current image
    hog_features, hog_image = hog(
        test_dataset[x],  # Process the current image in the loop
        orientations=9,
        pixels_per_cell=(8, 8),
        cells_per_block=(2, 2),
        visualize=True,
        feature_vector=True
    )
    # Store the HOG features for the current image in the x_features array
    x_features_test[x] = hog_features  # Assign the feature vector for each image

In [30]:


rf = KNeighborsClassifier()
rf.fit(x_features_train, y_train_labels)

y_pred = rf.predict(x_features_test)
print(y_pred.shape)
print(y_test_labels.shape)

accuracy = accuracy_score(y_test_labels, y_pred)
precision = precision_score(y_test_labels, y_pred)
recall = recall_score(y_test_labels, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

con_matrix = confusion_matrix(y_test_labels, y_pred) 
print(con_matrix)




(156,)
(156,)
Accuracy: 0.7435897435897436
Precision: 0.7642857142857142
Recall: 0.9385964912280702
[[  9  33]
 [  7 107]]


In [22]:
## Hyperparameter tuning ()
param_dist = {'n_neighbors': [2,3,4,5,6],
              'leaf_size': [1, 2,3, 4, 5, 6, 7,8,9,10],
              'p': [1, 2],}

# Create a random forest classifier
rf = KNeighborsClassifier()

# Use random search to find the best hyperparameters
grid_search = GridSearchCV(rf, 
                            param_grid = param_dist,  
                            cv=5)

# Fit the random search object to the data
grid_search.fit(x_val, y_val_labels)  #13mins 28.9s

In [23]:
# Create a variable for the best model
best_rf = grid_search.best_estimator_

# Print the best hyperparameters
print('Best hyperparameters:',  grid_search.best_params_)

Best hyperparameters: {'leaf_size': 1, 'n_neighbors': 3, 'p': 2}


In [24]:
rf = KNeighborsClassifier(leaf_size= 1, n_neighbors= 3, p= 2)
rf.fit(x_train, y_train_labels)

y_pred = rf.predict(x_test)
print(y_pred.shape)
print(y_test_labels.shape)

accuracy = accuracy_score(y_test_labels, y_pred)
precision = precision_score(y_test_labels, y_pred)
recall = recall_score(y_test_labels, y_pred)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

#Confusion matrix
con_matrix = confusion_matrix(y_test_labels, y_pred) 
print(con_matrix)

(156,)
(156,)
Accuracy: 0.7564102564102564
Precision: 0.8220338983050848
Recall: 0.8508771929824561
[[21 21]
 [17 97]]
