In [None]:
import numpy as np
import tensorflow as tf
from keras_cv.layers import AugMix
from sklearn.model_selection import train_test_split

data_path = "/kaggle/input/blood-cells/cleaned_training_set.npz"

# Single seed shared by the split and the AugMix layer so that a
# Restart-and-Run-All produces the same train/validation partition.
SEED = 42

# Load the image and label arrays from the .npz archive.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code if the archive is untrusted. Drop the flag if the file
# only contains plain numeric arrays — TODO confirm.
data = np.load(data_path, allow_pickle=True)
X = data['images']
y = data['labels']

# Scale pixel values by 1/255 directly in float32. The previous version went
# through an int64 copy and a float64 division first, i.e. two temporary arrays
# of 8 bytes per pixel.
# NOTE(review): assumes the stored pixel values are already integral (e.g. uint8);
# the earlier astype(int) would have truncated any fractional part — TODO confirm.
X = X.astype('float32')
X /= 255.0
y = tf.keras.utils.to_categorical(y)

# Split into training and validation sets (15% validation, stratified on the
# labels, seeded for reproducibility).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.15, stratify=y, random_state=SEED
)

# Define AugMix with mid-to-high severity. The seed makes the layer's random
# sampling repeatable; exact per-image results may still vary when the layer is
# mapped with parallel calls.
augmix_layer = AugMix(
    value_range=(0, 1),
    severity=0.5,
    num_chains=3,
    chain_depth=(2, 4),
    alpha=1.0,
    seed=SEED,
)

# Per-example AugMix wrapper used when mapping over a dataset
def augment_image(image, label):
    """Pass one image through the module-level ``augmix_layer``.

    A leading axis of size 1 is added to ``image`` before the layer is called,
    and the first entry of the layer's output is returned, so the result has
    no extra leading axis.

    Args:
        image: A single image (no batch axis).
        label: The label paired with ``image``; returned unchanged.

    Returns:
        A ``(augmented_image, label)`` tuple.
    """
    single_item_batch = image[None, ...]
    augmented_batch = augmix_layer(single_item_batch)
    return augmented_batch[0], label

def _augmented_pipeline(images, labels):
    """Build a dataset of AugMix-ed (image, label) pairs, batched by 32 and prefetched."""
    pipeline = tf.data.Dataset.from_tensor_slices((images, labels))
    pipeline = pipeline.map(augment_image, num_parallel_calls=tf.data.AUTOTUNE)
    return pipeline.batch(32).prefetch(tf.data.AUTOTUNE)


def _collect_all(pipeline, n_items):
    """Re-batch ``pipeline`` into a single batch of ``n_items`` and return that batch."""
    return next(iter(pipeline.unbatch().batch(n_items)))


# Augmented tf.data pipelines for both splits (the validation split is augmented too)
train_dataset = _augmented_pipeline(X_train, y_train)
val_dataset = _augmented_pipeline(X_val, y_val)

# Pull every augmented example of each split into one tensor pair
X_train_augmented, y_train_augmented = _collect_all(train_dataset, len(X_train))
X_val_augmented, y_val_augmented = _collect_all(val_dataset, len(X_val))

# Stack the original arrays on top of their augmented counterparts
X_train_combined = np.concatenate([X_train, X_train_augmented.numpy()], axis=0)
y_train_combined = np.concatenate([y_train, y_train_augmented.numpy()], axis=0)
X_val_combined = np.concatenate([X_val, X_val_augmented.numpy()], axis=0)
y_val_combined = np.concatenate([y_val, y_val_augmented.numpy()], axis=0)

# Write the combined training and validation sets as .npz files
np.savez(
    '/kaggle/working/full_and_augMix_training_data.npz',
    images=X_train_combined,
    labels=y_train_combined,
)
np.savez(
    '/kaggle/working/full_and_augMix_validation_data.npz',
    images=X_val_combined,
    labels=y_val_combined,
)

print("Combined training and validation datasets saved successfully.")


Augmenting images:   3%|▎         | 337/10165 [03:39<1:46:53,  1.53it/s]


: 

: 

In [None]:
import matplotlib.pyplot as plt

# Function to display images
def display_images(dataset, num_images):
    """Plot up to ``num_images`` images from the first batch of ``dataset``.

    Each image is titled with the argmax of its label vector. The grid is 5x5
    whenever that is large enough and grows to the smallest larger square grid
    otherwise, so requests for more than 25 images no longer fail.

    Args:
        dataset: An object whose ``take(1)`` yields ``(images, labels)`` batches;
            each indexed element must expose ``.numpy()``.
        num_images: Maximum number of images to draw. If the first batch holds
            fewer images, only those are drawn.

    Returns:
        None. The images are drawn on a new 10x10-inch matplotlib figure.
    """
    plt.figure(figsize=(10, 10))
    for images, labels in dataset.take(1):
        # Never index past the end of the batch.
        count = min(num_images, len(images))

        # Keep the original 5x5 layout; enlarge only when it cannot hold `count`.
        grid = 5
        while grid * grid < count:
            grid += 1

        for i in range(count):
            plt.subplot(grid, grid, i + 1)
            plt.imshow(images[i].numpy())
            plt.title(np.argmax(labels[i].numpy()))
            plt.axis("off")

# Preview 25 augmented images taken from the first batch of the training dataset
display_images(dataset=train_dataset, num_images=25)

In [None]:
# Side-by-side histograms of class frequencies for the training and validation labels
plt.figure(figsize=(12, 6))

panels = (
    (y_train, 'Training Set Label Distribution'),
    (y_val, 'Validation Set Label Distribution'),
)
for panel_idx, (one_hot, panel_title) in enumerate(panels, start=1):
    n_classes = one_hot.shape[1]
    # Recover integer class ids from the one-hot rows
    class_ids = np.argmax(one_hot, axis=1)

    plt.subplot(1, 2, panel_idx)
    # Bin edges at k - 0.5 so that each integer class id sits in the middle of its bar
    plt.hist(class_ids, bins=np.arange(n_classes + 1) - 0.5, edgecolor='black')
    plt.title(panel_title)
    plt.xlabel('Label')
    plt.ylabel('Frequency')
    plt.xticks(np.arange(n_classes))

plt.tight_layout()
plt.show()