# Some global directories and resources

In [1]:
import os

# Root folder of the guava disease image dataset. The "train", "val" and "test"
# split folders are resolved relative to it below.
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
dataset_path = "E:/Gauava Disease Detection/Guava_Dataset/GuavaDiseaseDataset"
os.chdir(dataset_path)  # make the dataset root the current working directory

# One directory per split.
train_dir = os.path.join(dataset_path, "train")
val_dir = os.path.join(dataset_path, "val")
test_dir = os.path.join(dataset_path, "test")
dataset_dirs = [train_dir, test_dir, val_dir]

# Class folder names that exist inside every split directory.
dataset_subdirs_names = ["Anthracnose", "fruit_fly", "healthy_guava"]

# Every split/class directory (e.g. '<root>/train/healthy_guava'), ordered split by
# split, kept for later use. The loop variable is deliberately not called `dir`,
# which would shadow the builtin of the same name for the rest of the notebook.
dataset_subdirs = [
    os.path.join(split_dir, class_name)
    for split_dir in dataset_dirs
    for class_name in dataset_subdirs_names
]

RANDOM_SEED_1 = 42  # Used for random shuffling and sampling
RANDOM_SEED_2 = 101 # Used for other purposes

In [2]:
# Report how many directory entries each split/class folder contains.
for subdir in dataset_subdirs:
    entry_count = len(os.listdir(subdir))
    print(f"{subdir}: {entry_count}")

E:/Gauava Disease Detection/Guava_Dataset/GuavaDiseaseDataset\train\Anthracnose: 1080
E:/Gauava Disease Detection/Guava_Dataset/GuavaDiseaseDataset\train\fruit_fly: 918
E:/Gauava Disease Detection/Guava_Dataset/GuavaDiseaseDataset\train\healthy_guava: 649
E:/Gauava Disease Detection/Guava_Dataset/GuavaDiseaseDataset\test\Anthracnose: 156
E:/Gauava Disease Detection/Guava_Dataset/GuavaDiseaseDataset\test\fruit_fly: 132
E:/Gauava Disease Detection/Guava_Dataset/GuavaDiseaseDataset\test\healthy_guava: 94
E:/Gauava Disease Detection/Guava_Dataset/GuavaDiseaseDataset\val\Anthracnose: 308
E:/Gauava Disease Detection/Guava_Dataset/GuavaDiseaseDataset\val\fruit_fly: 262
E:/Gauava Disease Detection/Guava_Dataset/GuavaDiseaseDataset\val\healthy_guava: 185


# **➡ TensorFlow Datasets**

In [3]:
import tensorflow as tf
from tensorflow.keras.utils import image_dataset_from_directory

In [4]:
## Creating the initial datasets from directory
# Keyword arguments shared by all three splits. Entries tagged "# default" repeat the
# Keras default value and are spelled out only for readability.
params = dict(
    labels="inferred",          # default
    label_mode="categorical",   # labels are encoded as one-hot vectors
    class_names=list(dataset_subdirs_names),  # same class list as the folder names; fixes the label order
    color_mode='rgb',           # default
    batch_size=32,              # default
    image_size=(224, 224),      # NOT the default (Keras default is (256, 256))
    shuffle=True,               # default
    seed=RANDOM_SEED_2,
    validation_split=None,      # default
    subset=None,                # default
    interpolation="bilinear",   # default
    follow_links=False,         # default
    crop_to_aspect_ratio=False, # default
    pad_to_aspect_ratio=False,  # default
    data_format=None,           # default
    verbose=True,               # default
)

# NOTE(review): shuffle=True is applied to the test and validation splits as well;
# confirm that this is intended if predictions must later be matched to files.
train_ds = image_dataset_from_directory(directory=train_dir, **params)
test_ds = image_dataset_from_directory(directory=test_dir, **params)
val_ds = image_dataset_from_directory(directory=val_dir, **params)

Found 2647 files belonging to 3 classes.
Found 382 files belonging to 3 classes.
Found 755 files belonging to 3 classes.


Dataset split (approximate share of the 3,784 images found above):
- 70% - Train
- 10% - Test
- 20% - Validation

In [5]:
# Print a short summary of each split: the dataset object, the number of image
# files behind it, its class names, and the number of batches it yields.
for ds in (train_ds, test_ds, val_ds):
    print(ds)
    file_count = len(ds.file_paths)
    print("Total Filenames:", file_count)
    print("Classes:", ds.class_names)
    batch_count = ds.cardinality().numpy()
    print("Cardinality:", batch_count, "\n")

<_PrefetchDataset element_spec=(TensorSpec(shape=(None, 224, 224, 3), dtype=tf.float32, name=None), TensorSpec(shape=(None, 3), dtype=tf.float32, name=None))>
Total Filenames: 2647
Classes: ['Anthracnose', 'fruit_fly', 'healthy_guava']
Cardinality: 83 

<_PrefetchDataset element_spec=(TensorSpec(shape=(None, 224, 224, 3), dtype=tf.float32, name=None), TensorSpec(shape=(None, 3), dtype=tf.float32, name=None))>
Total Filenames: 382
Classes: ['Anthracnose', 'fruit_fly', 'healthy_guava']
Cardinality: 12 

<_PrefetchDataset element_spec=(TensorSpec(shape=(None, 224, 224, 3), dtype=tf.float32, name=None), TensorSpec(shape=(None, 3), dtype=tf.float32, name=None))>
Total Filenames: 755
Classes: ['Anthracnose', 'fruit_fly', 'healthy_guava']
Cardinality: 24 



# **➡ Saving the datasets**

In [6]:
# Root folder, inside the dataset directory, where the saved datasets are stored.
# It is created here if it does not exist yet.
save_path = os.path.join(dataset_path, "Saved_datasets")
os.makedirs(name=save_path, exist_ok=True)

Saving the train dataset

In [None]:
save_path_subdir = os.path.join(save_path, 'train')
os.makedirs(save_path_subdir, exist_ok=True)

num_shards = 5

# Split the train dataset into `num_shards` shards and save each one in its own
# 'shard_<i>' subdirectory. A shard whose directory already exists is skipped, so
# re-running this cell never overwrites saved data. To regenerate a shard (for
# example after an interrupted save), delete its directory and run the cell again.
# NOTE(review): train_ds is built with shuffle=True. If it reshuffles on every pass,
# each shard() call may see a different batch order and the shards could overlap —
# confirm, or materialise the dataset once before sharding.
for i in range(num_shards):
    shard_path = os.path.join(save_path_subdir, f'shard_{i}')
    if os.path.isdir(shard_path):
        continue  # already saved earlier; keep the existing data untouched
    shard_dataset = train_ds.shard(num_shards=num_shards, index=i)
    shard_dataset.save(shard_path, compression='GZIP')

Saving the validation dataset

In [None]:
save_path_subdir = os.path.join(save_path, 'val')
os.makedirs(save_path_subdir, exist_ok=True)

# Save the validation dataset only while the target directory is still empty, so
# re-running this cell never overwrites a previously saved copy. To regenerate it,
# empty the directory and run the cell again.
if not os.listdir(save_path_subdir):
    val_ds.save(save_path_subdir, compression='GZIP')

Saving the test dataset

In [None]:
save_path_subdir = os.path.join(save_path, 'test')
os.makedirs(save_path_subdir, exist_ok=True)

# Save the test dataset only while the target directory is still empty, so
# re-running this cell never overwrites a previously saved copy. To regenerate it,
# empty the directory and run the cell again.
if not os.listdir(save_path_subdir):
    test_ds.save(save_path_subdir, compression='GZIP')

Extras: reloading the saved train shards and recombining them into a single dataset

In [10]:
# Load every saved train shard back from disk, in shard-index order.
save_path_subdir = os.path.join(save_path, 'train')
shards = [
    tf.data.Dataset.load(
        os.path.join(save_path_subdir, f'shard_{shard_index}'),
        compression='GZIP',
    )
    for shard_index in range(num_shards)
]

# Chain the shards one after another into a single dataset, starting from shard 0.
combined_dataset = shards[0]
for next_shard in shards[1:]:
    combined_dataset = combined_dataset.concatenate(next_shard)

In [11]:
# Cardinality of the recombined dataset, displayed as the cell's output.
combined_batch_count = combined_dataset.cardinality().numpy()
combined_batch_count

np.int64(83)