## Dataset Preprocessing

# Dataset A

In [2]:
import os
import numpy as np
import pickle
from tqdm import tqdm
from keras.preprocessing.image import load_img, img_to_array

def preprocess_images(data_dir, img_size):
    """Load every image below ``data_dir`` into scaled NumPy arrays.

    ``data_dir`` must contain one sub-directory per class whose name is the
    integer class label (for example ``0`` and ``1``); each sub-directory holds
    the image files of that class.

    Args:
        data_dir: Root directory containing the per-class sub-directories.
        img_size: Size passed to ``load_img`` as ``target_size``; every image
            is resized to it while loading.

    Returns:
        A tuple ``(images, labels)`` of NumPy arrays. ``images[i]`` is the
        pixel array of one image divided by 255, and ``labels[i]`` is the
        integer parsed from the name of the directory the image came from.
        Both arrays are empty when no image is found.

    Raises:
        ValueError: If a class sub-directory that contains files has a name
            that cannot be parsed as an integer.
    """
    images = []
    labels = []

    # os.listdir() returns entries in arbitrary, platform-dependent order;
    # sorting makes the sample order (and any seeded shuffle applied to it
    # afterwards) reproducible across machines.
    for label in sorted(os.listdir(data_dir)):
        label_path = os.path.join(data_dir, label)

        # Stray files next to the class folders (Thumbs.db, .DS_Store, ...)
        # are not classes; listing them would raise NotADirectoryError.
        if not os.path.isdir(label_path):
            continue

        for image_file in tqdm(sorted(os.listdir(label_path))):
            image_path = os.path.join(label_path, image_file)

            # Nested folders cannot be opened as images; skip them.
            if not os.path.isfile(image_path):
                continue

            img = load_img(image_path, target_size=img_size)
            img_array = img_to_array(img)

            # Scale pixel values by 1/255 (in place, to avoid a second copy).
            img_array /= 255.0

            images.append(img_array)
            labels.append(int(label))

    return np.array(images), np.array(labels)

def create_data_arrays(dir, img_size, output_dir):
    """Preprocess an image directory, shuffle it reproducibly and pickle it.

    Args:
        dir: Root directory with one integer-named sub-directory per class;
            forwarded to ``preprocess_images``. The name shadows the builtin
            ``dir`` and is kept only so existing callers keep working.
        img_size: Target image size forwarded to ``preprocess_images``.
        output_dir: Path of the pickle *file* to write (despite the name, this
            is not a directory). Missing parent directories are created.

    Side effects:
        Prints the shapes of the shuffled arrays and writes the tuple
        ``(images, labels)`` to ``output_dir`` with ``pickle``.

    Note:
        The printed labels always say "Training", whichever split ``dir``
        points at; the wording is kept unchanged for output compatibility.
    """
    images, labels = preprocess_images(dir, img_size)

    # A private RandomState seeded with 45 produces exactly the permutation
    # that np.random.seed(45) + np.random.shuffle() would, but leaves the
    # global NumPy RNG state untouched for the rest of the notebook.
    rng = np.random.RandomState(45)
    train_indices = np.arange(len(images))
    rng.shuffle(train_indices)

    # Apply the same permutation to both arrays so pairs stay aligned.
    train_images = images[train_indices]
    train_labels = labels[train_indices]

    print(f"Training images shape: {train_images.shape}")
    print(f"Training labels shape: {train_labels.shape}")

    # Make sure the destination folder exists before opening the file.
    parent_dir = os.path.dirname(output_dir)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_dir, 'wb') as f:
        pickle.dump((train_images, train_labels), f)

#### Combined dataset pickle files for training, validation, and testing

In [3]:
# Dataset A, "train" folder: load the images from the class sub-folders,
# resize them to `img_size`, shuffle, and pickle the (images, labels) tuple.
# NOTE: despite its name, `pickle_folder` is the path of the output .pkl file.
# NOTE: both paths are machine-specific absolute paths; adjust before re-running elsewhere.
input_dataset_folder = r"E:\Classification\Focus Conference\Datasets\A\train"
pickle_folder = r"E:\Classification\Focus Conference\Datasets\pickle_data\A_training.pkl"
img_size = (256, 256)
create_data_arrays(input_dataset_folder, img_size, pickle_folder)

100%|██████████| 3315/3315 [01:02<00:00, 52.91it/s] 
100%|██████████| 403/403 [00:09<00:00, 42.53it/s] 


Training images shape: (3718, 256, 256, 3)
Training labels shape: (3718,)


In [4]:
# Dataset A, "val" folder: same pipeline as above, written to A_validation.pkl.
# The shapes printed by create_data_arrays are labelled "Training" for every
# split; here they describe the validation data.
# NOTE: `pickle_folder` is the path of the output .pkl file, not a directory.
input_dataset_folder = r"E:\Classification\Focus Conference\Datasets\A\val"
pickle_folder = r"E:\Classification\Focus Conference\Datasets\pickle_data\A_validation.pkl"
img_size = (256, 256)
create_data_arrays(input_dataset_folder, img_size, pickle_folder)

100%|██████████| 710/710 [00:13<00:00, 51.45it/s] 
100%|██████████| 85/85 [00:02<00:00, 38.90it/s]


Training images shape: (795, 256, 256, 3)
Training labels shape: (795,)


In [5]:
# Dataset A, "test" folder: same pipeline, written to A_testing.pkl.
# The shapes printed by create_data_arrays are labelled "Training" for every
# split; here they describe the test data.
# NOTE: `pickle_folder` is the path of the output .pkl file, not a directory.
input_dataset_folder = r"E:\Classification\Focus Conference\Datasets\A\test"
pickle_folder = r"E:\Classification\Focus Conference\Datasets\pickle_data\A_testing.pkl"
img_size = (256, 256)
create_data_arrays(input_dataset_folder, img_size, pickle_folder)

100%|██████████| 712/712 [00:13<00:00, 52.57it/s] 
100%|██████████| 89/89 [00:02<00:00, 37.49it/s]


Training images shape: (801, 256, 256, 3)
Training labels shape: (801,)


-------------------

In [6]:
# Dataset B, "train" folder: load the images from the class sub-folders,
# resize them to `img_size`, shuffle, and pickle the (images, labels) tuple.
# NOTE: despite its name, `pickle_folder` is the path of the output .pkl file.
# NOTE: both paths are machine-specific absolute paths; adjust before re-running elsewhere.
input_dataset_folder = r"E:\Classification\Focus Conference\Datasets\B\train"
pickle_folder = r"E:\Classification\Focus Conference\Datasets\pickle_data\B_training.pkl"
img_size = (256, 256)
create_data_arrays(input_dataset_folder, img_size, pickle_folder)

100%|██████████| 3551/3551 [01:03<00:00, 56.32it/s] 
100%|██████████| 832/832 [00:12<00:00, 64.75it/s] 


Training images shape: (4383, 256, 256, 3)
Training labels shape: (4383,)


In [7]:
# Dataset B, "val" folder: same pipeline, written to B_validation.pkl.
# The shapes printed by create_data_arrays are labelled "Training" for every
# split; here they describe the validation data.
# NOTE: `pickle_folder` is the path of the output .pkl file, not a directory.
input_dataset_folder = r"E:\Classification\Focus Conference\Datasets\B\val"
pickle_folder = r"E:\Classification\Focus Conference\Datasets\pickle_data\B_validation.pkl"
img_size = (256, 256)
create_data_arrays(input_dataset_folder, img_size, pickle_folder)

100%|██████████| 760/760 [00:13<00:00, 56.37it/s] 
100%|██████████| 176/176 [00:03<00:00, 58.48it/s]


Training images shape: (936, 256, 256, 3)
Training labels shape: (936,)


In [8]:
# Dataset B, "test" folder: same pipeline, written to B_testing.pkl.
# The shapes printed by create_data_arrays are labelled "Training" for every
# split; here they describe the test data.
# NOTE: `pickle_folder` is the path of the output .pkl file, not a directory.
input_dataset_folder = r"E:\Classification\Focus Conference\Datasets\B\test"
pickle_folder = r"E:\Classification\Focus Conference\Datasets\pickle_data\B_testing.pkl"
img_size = (256, 256)
create_data_arrays(input_dataset_folder, img_size, pickle_folder)

100%|██████████| 764/764 [00:13<00:00, 55.62it/s] 
100%|██████████| 182/182 [00:03<00:00, 58.59it/s] 


Training images shape: (946, 256, 256, 3)
Training labels shape: (946,)


--------------------

In [9]:
# Report how many files each class folder of dataset A holds per split.
# Class folder "0" is reported as non-flood and "1" as flood; for each class
# the train, val and test folders are counted in that order.
destination_directory = r"E:\Classification\Focus Conference\Datasets\A"
print("Dataset A")
for class_dir, class_name in (("0", "non-flood"), ("1", "flood")):
    for split_dir, split_name in (("train", "Training"), ("val", "Validation"), ("test", "Test")):
        split_class_path = os.path.join(destination_directory, split_dir, class_dir)
        n_images = len(os.listdir(split_class_path))
        print(f'{split_name} images for {class_name}: {n_images}')

Dataset A
Training images for non-flood: 3315
Validation images for non-flood: 710
Test images for non-flood: 712
Training images for flood: 403
Validation images for flood: 85
Test images for flood: 89


In [10]:
# Report how many files each class folder of dataset B holds per split.
# Class folder "0" is reported as non-flood and "1" as flood; for each class
# the train, val and test folders are counted in that order.
destination_directory = r"E:\Classification\Focus Conference\Datasets\B"
print("Dataset B")
for class_dir, class_name in (("0", "non-flood"), ("1", "flood")):
    for split_dir, split_name in (("train", "Training"), ("val", "Validation"), ("test", "Test")):
        split_class_path = os.path.join(destination_directory, split_dir, class_dir)
        n_images = len(os.listdir(split_class_path))
        print(f'{split_name} images for {class_name}: {n_images}')

Dataset B
Training images for non-flood: 3551
Validation images for non-flood: 760
Test images for non-flood: 764
Training images for flood: 832
Validation images for flood: 176
Test images for flood: 182
