## Extract, Transform and Load (ETL) Step

We are going to use a custom CNN for object identification.

### Setup

In [1]:
import os
import random
import time
from dataclasses import dataclass
from pathlib import Path
from enum import Enum
import shutil
from tqdm import tqdm
import tensorflow as tf
import keras


ROOT_DIR = "datasets\\aircraft"
DATASET_DIR = "..\\dataset\\crop"

## Classes Mapping

We also need to provide a classes mapping, here is how we do that

In [2]:
CLASSES_RAW = [
    "a10",
    "a400m",
    "ag600",
    "ah64",
    "av8b",
    "an124",
    "an22",
    "an225",
    "an72",
    "b1",
    "b2",
    "b21",
    "b52",
    "be200",
    "c130",
    "c17",
    "c2",
    "c390",
    "c5",
    "ch47",
    "cl415",
    "e2",
    "e7",
    "ef2000",
    "f117",
    "f14",
    "f15",
    "f16",
    "f22",
    "f35",
    "f4",
    # IN DATA AS F18
    "f18",
    "h6",
    "j10",
    "j20",
    "jas39",
    "jf17",
    "jh7",
    "kc135",
    "kf21",
    "kj600",
    "ka27",
    "ka52",
    "mq9",
    "mi24",
    "mi26",
    "mi28",
    "mig29",
    "mig31",
    "mirage2000",
    "p3",
    "rq4",
    "rafale",
    "sr71",
    "su24",
    "su25",
    "su34",
    "su57",
    "tb001",
    "tb2",
    "tornado",
    "tu160",
    "tu22m",
    "tu95",
    "u2",
    "uh60",
    "us2",
    "v22",
    "vulcan",
    "wz7",
    "xb70",
    "y20",
    "yf23",
    "z19"
]

CLASSES = {model: i for i, model in enumerate(CLASSES_RAW)}

def get_class_id(class_str: str):
    return CLASSES[class_str.lower()]

print(CLASSES)

{'a10': 0, 'a400m': 1, 'ag600': 2, 'ah64': 3, 'av8b': 4, 'an124': 5, 'an22': 6, 'an225': 7, 'an72': 8, 'b1': 9, 'b2': 10, 'b21': 11, 'b52': 12, 'be200': 13, 'c130': 14, 'c17': 15, 'c2': 16, 'c390': 17, 'c5': 18, 'ch47': 19, 'cl415': 20, 'e2': 21, 'e7': 22, 'ef2000': 23, 'f117': 24, 'f14': 25, 'f15': 26, 'f16': 27, 'f22': 28, 'f35': 29, 'f4': 30, 'f18': 31, 'h6': 32, 'j10': 33, 'j20': 34, 'jas39': 35, 'jf17': 36, 'jh7': 37, 'kc135': 38, 'kf21': 39, 'kj600': 40, 'ka27': 41, 'ka52': 42, 'mq9': 43, 'mi24': 44, 'mi26': 45, 'mi28': 46, 'mig29': 47, 'mig31': 48, 'mirage2000': 49, 'p3': 50, 'rq4': 51, 'rafale': 52, 'sr71': 53, 'su24': 54, 'su25': 55, 'su34': 56, 'su57': 57, 'tb001': 58, 'tb2': 59, 'tornado': 60, 'tu160': 61, 'tu22m': 62, 'tu95': 63, 'u2': 64, 'uh60': 65, 'us2': 66, 'v22': 67, 'vulcan': 68, 'wz7': 69, 'xb70': 70, 'y20': 71, 'yf23': 72, 'z19': 73}


### Defining Dataset Constants

In [3]:
IMG_WIDTH = 256 # Keras default
IMG_HEIGHT = 256 # Keras default
IMG_CHANNELS = 3 # RGB

BATCH_SIZE = 32

### Extracting the Dataset

We need to first extract the x and y data from the dataset (image paths and labels).

### Splitting the Dataset

We need to split out dataset into three parts, train, validation and test.

We're going to use (80%, 10%, 10%) repartition for now.

In [4]:
def split_dataset(paths: list[str], classes: list[int], seed: int = None) -> tuple[list[str], list[int], list[str], list[int], list[str], list[int]]:
    x_train = []
    y_train = []
    x_val = []
    y_val = []
    x_test = []
    y_test = []

    if not seed:
        seed = time.time()

    random.seed(seed)

    for i in tqdm(range(len(paths))):
        rand = random.randint(1, 10)

        # Validation (10%)
        if rand == 9:
            x_val.append(paths[i])
            y_val.append(classes[i])

        # Test (10%)
        elif rand == 10:
            x_test.append(paths[i])
            y_test.append(classes[i])

        # Train (80%)
        else:
            x_train.append(paths[i])
            y_train.append(classes[i])

    return x_train, y_train, x_val, y_val, x_test, y_test

def extract_dataset(dataset_dir: str = DATASET_DIR, seed: int = None) -> None:
    aircraft_filepaths = []
    aircraft_classes = []
    for aircraft_dir in tqdm(os.listdir(dataset_dir)):
        aircraft_class = get_class_id(aircraft_dir)

        dir_path = os.path.join(dataset_dir, aircraft_dir)
        
        for aircraft_img in os.listdir(dir_path):
            aircraft_img_path = os.path.join(dir_path, aircraft_img)

            aircraft_filepaths.append(aircraft_img_path)
            aircraft_classes.append(aircraft_class)

    print(f"Found {len(aircraft_filepaths)} aircraft images")

    return aircraft_filepaths, aircraft_classes

paths, classes = extract_dataset()

x_train, y_train, x_val, y_val, x_test, y_test = split_dataset(paths, classes)

print(f"Train: {len(x_train)}")
print(f"Validation: {len(x_val)}")
print(f"Test: {len(x_test)}")

100%|██████████| 74/74 [00:00<00:00, 865.22it/s]


Found 31917 aircraft images


100%|██████████| 31917/31917 [00:00<00:00, 1387781.85it/s]

Train: 25732
Validation: 3161
Test: 3024





### Transforming the Dataset

We need to transform the dataset into a format that can be used by the model.

This includes

- Loading the images as tensors
- Resizing the images
- Normalizing the images (done later by the model)

### Lazy Loading and Tensorflow Datasets

We're going to use the `tf.data.Dataset` API to load the images lazily.

This is done to avoid loading all the images into RAM at once (10GB+ of images).

We also need to shuffle the dataset to avoid biasing the model since the images are ordered by class.

Notes:

- We're going to use a batch size of 32 for now
- We're going to use a prefetch buffer, automatically tuned by Tensorflow


In [5]:
class AircraftDataGenerator:
    def __init__(self, filepaths: list[str], classes: list[int], batch_size: int = BATCH_SIZE):
        self.filepaths = filepaths
        self.classes = classes
        self.batch_size = batch_size

    def load_image(self, filepath: str) -> tf.Tensor:
        image = tf.io.read_file(filepath)
        image = tf.image.decode_jpeg(image, channels=3)
        image = tf.image.resize(image, [IMG_WIDTH, IMG_HEIGHT])
        return image

    def create_dataset(self) -> tf.data.Dataset:
        dataset = tf.data.Dataset.from_tensor_slices((self.filepaths, self.classes))

        dataset = dataset.shuffle(buffer_size=len(self.filepaths))
        
        dataset = dataset.map(lambda x, y: (self.load_image(x), y), 
                            num_parallel_calls=tf.data.AUTOTUNE)
        
        
        dataset = dataset.batch(self.batch_size)

        dataset = dataset.prefetch(tf.data.AUTOTUNE)
        return dataset
    
train_generator = AircraftDataGenerator(x_train, y_train)
val_generator = AircraftDataGenerator(x_val, y_val)
test_generator = AircraftDataGenerator(x_test, y_test)

train_dataset = train_generator.create_dataset()
val_dataset = val_generator.create_dataset()
test_dataset = test_generator.create_dataset()

### Building the Model



In [None]:
model = keras.Sequential([
    keras.layers.Rescaling(1./255, input_shape=(IMG_HEIGHT, IMG_WIDTH, 3)),
    keras.layers.Conv2D(32, 3, activation='relu'),
    keras.layers.MaxPooling2D(),
    keras.layers.Conv2D(32, 3, activation='relu'),
    keras.layers.MaxPooling2D(),
    keras.layers.Conv2D(32, 3, activation='relu'),
    keras.layers.MaxPooling2D(),
    keras.layers.Flatten(),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(len(CLASSES), activation='softmax')
])

model.compile(optimizer='adam',
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])

model.summary()

model.fit(train_dataset, validation_data=val_dataset, epochs=20)

model.evaluate(test_dataset)

model.save("identification.keras")

print("Model saved")

Epoch 1/20
[1m800/800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m189s[0m 235ms/step - accuracy: 0.0507 - loss: 4.1057 - val_accuracy: 0.0816 - val_loss: 3.8853
Epoch 2/20
[1m800/800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m184s[0m 230ms/step - accuracy: 0.1221 - loss: 3.7063 - val_accuracy: 0.1700 - val_loss: 3.5151
Epoch 3/20
[1m800/800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m185s[0m 231ms/step - accuracy: 0.2444 - loss: 3.1014 - val_accuracy: 0.2258 - val_loss: 3.3727
Epoch 4/20
[1m800/800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m185s[0m 231ms/step - accuracy: 0.3731 - loss: 2.4472 - val_accuracy: 0.2418 - val_loss: 3.4662
Epoch 5/20
[1m800/800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m185s[0m 231ms/step - accuracy: 0.5024 - loss: 1.8862 - val_accuracy: 0.2638 - val_loss: 3.6750
Epoch 6/20
[1m800/800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m184s[0m 230ms/step - accuracy: 0.6124 - loss: 1.4445 - val_accuracy: 0.2610 - val_loss: 3.8860
Epoc



Model saved
