In [1]:
import os
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.layers import Input, Flatten, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
from tqdm import tqdm
from keras.models import Sequential
from keras.layers import Flatten, Dense, Dropout
from keras.preprocessing.image import ImageDataGenerator



In [2]:
# Path to your dataset
data_dir = '/kaggle/input/bone-marrow-cell-classification/bone_marrow_cell_dataset'

# List to store image paths and labels
image_paths = []
labels = []

# Loop through class directories

for class_name in os.listdir(data_dir):
    class_dir = os.path.join(data_dir, class_name)
    
    if os.path.isdir(class_dir):
        for sub_dir_name in os.listdir(class_dir):
            sub_dir_path = os.path.join(class_dir, sub_dir_name)
            
            if os.path.isdir(sub_dir_path):
                for img_name in tqdm(os.listdir(sub_dir_path), desc=f"Processing {class_name}/{sub_dir_name}"):
                    img_path = os.path.join(sub_dir_path, img_name)
                    if os.path.isfile(img_path):
                        image_paths.append(img_path)
                        labels.append(class_name)
            else:
                img_path = os.path.join(class_dir, sub_dir_name)
                if os.path.isfile(img_path):
                    image_paths.append(img_path)
                    labels.append(class_name)
    else:
        img_path = class_dir  # For classes without subdirectories
        if os.path.isfile(img_path):
            image_paths.append(img_path)
            labels.append(class_name)

Processing NGS/3001-4000: 100%|██████████| 1000/1000 [00:00<00:00, 11155.27it/s]
Processing NGS/19001-20000: 100%|██████████| 1000/1000 [00:00<00:00, 13389.25it/s]
Processing NGS/0001-1000: 100%|██████████| 1000/1000 [00:00<00:00, 14720.52it/s]
Processing NGS/4001-5000: 100%|██████████| 1000/1000 [00:00<00:00, 14982.71it/s]
Processing NGS/7001-8000: 100%|██████████| 1000/1000 [00:00<00:00, 18354.29it/s]
Processing NGS/26001-27000: 100%|██████████| 1000/1000 [00:00<00:00, 14115.15it/s]
Processing NGS/13001-14000: 100%|██████████| 1000/1000 [00:00<00:00, 15760.71it/s]
Processing NGS/29001-29424: 100%|██████████| 424/424 [00:00<00:00, 82112.15it/s]
Processing NGS/12001-13000: 100%|██████████| 1000/1000 [00:00<00:00, 16924.12it/s]
Processing NGS/25001-26000: 100%|██████████| 1000/1000 [00:00<00:00, 14317.18it/s]
Processing NGS/27001-28000: 100%|██████████| 1000/1000 [00:00<00:00, 10597.62it/s]
Processing NGS/28001-29000: 100%|██████████| 1000/1000 [00:00<00:00, 12617.71it/s]
Processing NGS

In [3]:
# Splitting the data into train, test, and validation sets
X_train, X_temp, y_train, y_temp = train_test_split(image_paths, labels, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Now X_train, y_train, X_val, y_val, X_test, y_test are the organized data for training, validation, and testing
# Combine all labels from train, validation, and test sets
combined_labels = y_train + y_val + y_test

# Print all unique classes
unique_classes = set(combined_labels)
print("Unique classes:", unique_classes)

Unique classes: {'PLM', 'ABE', 'EBO', 'HAC', 'KSC', 'NIF', 'PEB', 'NGB', 'ART', 'EOS', 'MON', 'NGS', 'MMZ', 'LYT', 'BAS', 'PMO', 'BLA', 'MYB', 'FGC', 'LYI', 'OTH'}


In [4]:
from sklearn.preprocessing import LabelEncoder

# Convert string labels to numerical labels using LabelEncoder
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_val_encoded = label_encoder.transform(y_val)
y_test_encoded = label_encoder.transform(y_test)

# Calculate the number of classes directly from the labels
num_classes = len(set(combined_labels))

# Convert numerical labels to one-hot encoded vectors
y_train_onehot = to_categorical(y_train_encoded, num_classes)
y_val_onehot = to_categorical(y_val_encoded, num_classes)
y_test_onehot = to_categorical(y_test_encoded, num_classes)

num_classes


21

In [5]:
# ResNet-50 as the base model
base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(250, 250, 3))

# Build your custom top layers
x = base_model.output
x = Flatten()(base_model.output)
x = Dense(256, activation='relu')(x)
x = Dropout(0.5)(x)
predictions = Dense(num_classes, activation='softmax')(x)


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5


In [6]:
# Create the final model by combining the base model and custom top layers
model = Model(inputs=base_model.input, outputs=predictions)
# Freeze the layers of the base model
for layer in base_model.layers:
    layer.trainable = False


In [7]:
# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


In [None]:
# Display model summary
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 250, 250, 3  0           []                               
                                )]                                                                
                                                                                                  
 conv1_pad (ZeroPadding2D)      (None, 256, 256, 3)  0           ['input_1[0][0]']                
                                                                                                  
 conv1_conv (Conv2D)            (None, 125, 125, 64  9472        ['conv1_pad[0][0]']              
                                )                                                                 
                                                                                              

In [None]:
# Preprocess images using ImageDataGenerator
datagen = ImageDataGenerator(
    preprocessing_function=preprocess_input,
    rescale=1. / 255
)


batch_size = 32

# Define a custom generator to load and preprocess images on-the-fly
def custom_generator(image_paths, labels, batch_size):
    num_samples = len(image_paths)
    while True:
        indices = np.random.choice(num_samples, size=batch_size, replace=False)
        batch_images = []
        batch_labels = []
        for idx in indices:
            img_path = image_paths[idx]
            label = labels[idx]
            try:
                img = load_img(img_path, target_size=(250, 250))  # Resize images to input size
                img_array = img_to_array(img)
                batch_images.append(img_array)
                batch_labels.append(label)
            except Exception as e:
                print(f"Error loading image {img_path}: {e}")
        yield np.array(batch_images), to_categorical(label_encoder.transform(batch_labels), num_classes)

In [None]:
# Create custom generators for training, validation, and test sets
train_generator = custom_generator(X_train, y_train, batch_size)
val_generator = custom_generator(X_val, y_val, batch_size)
test_generator = custom_generator(X_test, y_test, batch_size)


In [None]:
epochs =2

# Train the model using the custom generators
model.fit(
    train_generator,
    steps_per_epoch=len(X_train) // batch_size,
    epochs=epochs,
    validation_data=val_generator,
    validation_steps=len(X_val) // batch_size,
    verbose=1,
)

 594/3748 [===>..........................] - ETA: 13:02 - loss: 3.4418 - accuracy: 0.2169Error loading image /kaggle/input/bone-marrow-cell-classification/bone_marrow_cell_dataset/MYB/5001-6000/MYB_05527.jpg: broken data stream when reading image file

In [None]:
# Evaluate the model on the test set using the custom generator
test_loss, test_accuracy = model.evaluate(
    test_generator,
    steps=len(X_test) // batch_size,
    verbose=1,
)
print(f'Test Loss: {test_loss:.4f}')
print(f'Test Accuracy: {test_accuracy:.4f}')

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Make predictions on the test set
y_pred = model.predict(test_generator)
y_pred_classes = np.argmax(y_pred, axis=1)

# Get true labels
true_labels = []
for i in range(len(test_generator)):
    true_labels.extend(np.argmax(test_generator[i][1], axis=1))

# Calculate confusion matrix
conf_matrix = confusion_matrix(true_labels, y_pred_classes)

# Calculate classification report
class_report = classification_report(true_labels, y_pred_classes)

# Calculate accuracy
accuracy = accuracy_score(true_labels, y_pred_classes)

print("Confusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)
print("\nAccuracy:", accuracy)
