# 1. Data Pre-Processing

### Import Libraries

In [1]:
import os
import shutil
from sklearn.model_selection import train_test_split

### Folder Paths

In [2]:
# Root of the raw dataset. split_data() treats every immediate sub-folder of this
# directory as one signature group. Relative paths resolve against the kernel's
# current working directory.
source_folder = "../dataset/ICDAR-2011"
# Destination roots for the train / test copies; split_data() recreates one
# sub-folder per signature group underneath each of them.
train_folder = "../template/ICDAR-data/train"
test_folder = "../template/ICDAR-data/test"

### Train and Test Data Split

In [30]:
def _copy_files(file_names, source_dir, destination_dir):
    """Copy the named files from ``source_dir`` into ``destination_dir``.

    The destination directory is created on demand; nothing is created when
    ``file_names`` is empty, so no empty folders are left behind.
    """
    if not file_names:
        return
    os.makedirs(destination_dir, exist_ok=True)
    for file_name in file_names:
        shutil.copy(
            os.path.join(source_dir, file_name),
            os.path.join(destination_dir, file_name),
        )


def split_data(source_folder, train_folder, test_folder):
    """Copy every signature folder into an 80/20 train/test directory layout.

    Each immediate sub-directory of ``source_folder`` is split independently:
    80% of its files are copied to ``train_folder/<sub-directory>`` and 20% to
    ``test_folder/<sub-directory>``. Source files are copied, never moved.

    Args:
        source_folder: Directory containing one sub-directory per signature group.
        train_folder: Root directory that receives the training copies.
        test_folder: Root directory that receives the test copies.

    Notes:
        * Directory listings are sorted before splitting. ``os.listdir`` returns
          entries in arbitrary order, so without sorting the fixed
          ``random_state`` would not yield the same split on every machine.
        * Only regular files are split; nested directories are ignored because
          ``shutil.copy`` cannot copy them.
        * A sub-directory with fewer than two files cannot be split by
          ``train_test_split`` (it would raise ``ValueError``), so its files,
          if any, all go to the training set.
    """
    os.makedirs(train_folder, exist_ok=True)
    os.makedirs(test_folder, exist_ok=True)

    signatures = sorted(
        entry
        for entry in os.listdir(source_folder)
        if os.path.isdir(os.path.join(source_folder, entry))
    )
    for signature in signatures:
        signature_path = os.path.join(source_folder, signature)
        images = sorted(
            name
            for name in os.listdir(signature_path)
            if os.path.isfile(os.path.join(signature_path, name))
        )

        if len(images) < 2:
            # Too few samples for a train/test split: keep whatever exists for training.
            train_signature, test_signature = images, []
        else:
            train_signature, test_signature = train_test_split(
                images, test_size=0.2, random_state=3
            )

        _copy_files(train_signature, signature_path, os.path.join(train_folder, signature))
        _copy_files(test_signature, signature_path, os.path.join(test_folder, signature))

In [31]:
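# Left disabled: the call below copies every image from source_folder into
# train_folder / test_folder. Uncomment it to (re)build the split on disk.
# split_data(source_folder, train_folder, test_folder)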

../dataset/ICDAR-2011\001\001_17.PNG
../dataset/ICDAR-2011\001\001_02.PNG
../dataset/ICDAR-2011\001\001_03.PNG
../dataset/ICDAR-2011\001\001_22.PNG
../dataset/ICDAR-2011\001\001_14.PNG
../dataset/ICDAR-2011\001\001_05.PNG
../dataset/ICDAR-2011\001\001_21.PNG
../dataset/ICDAR-2011\001\001_07.PNG
../dataset/ICDAR-2011\001\001_08.PNG
../dataset/ICDAR-2011\001\001_06.PNG
../dataset/ICDAR-2011\001\001_19.PNG
../dataset/ICDAR-2011\001\001_10.PNG
../dataset/ICDAR-2011\001\001_12.PNG
../dataset/ICDAR-2011\001\001_24.PNG
../dataset/ICDAR-2011\001\001_20.PNG
../dataset/ICDAR-2011\001\001_01.PNG
../dataset/ICDAR-2011\001\001_09.PNG
../dataset/ICDAR-2011\001\001_04.PNG
../dataset/ICDAR-2011\001\001_11.PNG
../dataset/ICDAR-2011\001_forg\0201001_01.png
../dataset/ICDAR-2011\001_forg\0201001_03.png
../dataset/ICDAR-2011\001_forg\0119001_04.png
../dataset/ICDAR-2011\001_forg\0119001_02.png
../dataset/ICDAR-2011\001_forg\0119001_01.png
../dataset/ICDAR-2011\001_forg\0119001_03.png
../dataset/ICDAR-2011

# 2. Feature Extraction

### Load VGG16 Model

In [None]:
from tensorflow.keras.applications import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.preprocessing import image
import numpy as np

In [None]:
# Build VGG16 with ImageNet-pretrained weights. include_top=False leaves out the
# fully-connected classifier head, so the network ends at its last convolutional block.
model = VGG16(weights='imagenet', include_top=False)

### Extract Features

In [None]:
from tensorflow.keras.models import Model

In [None]:
# Name of the VGG16 layer whose activations are used as the image features.
layer_name = 'block5_pool'
# Model that maps the base network's input to the output of that layer.
# NOTE(review): with include_top=False, 'block5_pool' appears to already be the
# final layer of `model`, so this is likely equivalent to `model` itself — confirm.
intermediate_layer_model = Model(inputs=model.input, outputs=model.get_layer(layer_name).output)

In [None]:
def extract_features(img_path, target_size=(224, 224)):
    """Return a flat feature vector for a single image.

    The image is loaded and resized, converted to an array, wrapped as a batch
    of one, passed through the VGG16 ``preprocess_input`` step and fed to the
    notebook-level ``intermediate_layer_model``. The resulting activations are
    flattened to one dimension.

    Args:
        img_path: Path of the image file to load.
        target_size: ``(height, width)`` the image is resized to before
            inference. Defaults to ``(224, 224)``. The length of the returned
            vector depends on this size, so use the same value for every image
            whose features will be compared.

    Returns:
        1-D ``numpy.ndarray`` holding the flattened activations.
    """
    img = image.load_img(img_path, target_size=target_size)
    img_array = image.img_to_array(img)
    # The model consumes batches, so add a leading batch axis of size one.
    img_array = np.expand_dims(img_array, axis=0)
    img_array = preprocess_input(img_array)

    # verbose=0 silences the per-call progress bar, which would otherwise print
    # once per image and flood the notebook output when called in a loop.
    features = intermediate_layer_model.predict(img_array, verbose=0)
    return features.flatten()