### Lo scopo di questo codice è prendere il dataset iniziale e trasformarlo tramite metodi di data augmentation in modo da migliorare le prestazioni della CNN.

### Caratteristiche del dataset iniziale: 1000 immagini di risoluzione 83x84.
### Caratteristiche del dataset finale: 1000 x (n+1) immagini (n è passato come argomento alla funzione perform_augmentation) di risoluzione 84x84, divise in 50 classi, per il train/validation dataset; 1000 immagini di risoluzione 84x84, divise in 50 classi, per il test dataset.

### Alle immagini iniziali è stato aggiunto un padding nero di un pixel in modo da ottenere la risoluzione di 84x84. Per vedere le augmentations applicate, guardare la pipeline definita nella funzione augment_image.

### Anche le immagini iniziali sono contenute in questo dataset.

In [1]:
from pathlib import Path
from matplotlib import pyplot as plt
from PIL import Image, ImageEnhance, ImageOps, ImageChops, ImageDraw, ImageFilter, ImageTransform
import albumentations as A
import random
import shutil
import numpy as np
import cv2
import glob
import os

In [2]:
# Set the source and destination directories.
# src_dir: folder with the original images; the code below only lists it and
#          copies files out of it.
# dst_dir: root folder under which the 'train' and 'test' folders are created.
src_dir = r"C:\Users\andma\OneDrive\Documenti\hiragana images\hiragana_images_original\hiragana_images"
dst_dir = r"C:\Users\andma\OneDrive\Documenti\hiragana images\hiragana_images"

In [3]:
# Create the folders for the training/validation and test dataset if they
# don't already exist.
train_data_dir = os.path.join(dst_dir, 'train')
test_data_dir = os.path.join(dst_dir, 'test')
os.makedirs(train_data_dir, exist_ok=True)
os.makedirs(test_data_dir, exist_ok=True)


def _class_name_from_file(file_name):
    """Return the class label encoded in characters 4-6 of *file_name*.

    Digits inside that window are dropped and the remaining characters are
    joined. Slicing is used instead of indexing so that a name shorter than
    seven characters yields a shorter (possibly empty) label rather than an
    IndexError.
    NOTE(review): presumably file names start with a 4-character prefix such
    as "kana" followed by the class letters -- confirm against the dataset.
    """
    return "".join(ch for ch in file_name[4:7] if not ch.isdigit())


# Scan src_dir once and remember the class label of every regular file, so the
# folder-creation and copy steps below share the same parsing logic.
class_by_file = {}
for entry in os.listdir(src_dir):
    if not os.path.isfile(os.path.join(src_dir, entry)):
        # Sub-directories cannot be copied with shutil.copy; ignore them.
        continue
    label = _class_name_from_file(entry)
    if not label:
        # Without a label the file would end up directly in the dataset root.
        print("Skipping file without a class label:", entry)
        continue
    class_by_file[entry] = label

for data_path in [train_data_dir, test_data_dir]:
    # Create the class folders if they don't already exist.
    for label in set(class_by_file.values()):
        os.makedirs(os.path.join(data_path, label), exist_ok=True)

    # Delete all the files left over from a previous run in every class
    # folder of this dataset root. Only regular files are removed: os.remove
    # cannot delete directories, and stray files in the root are not folders.
    for folder in os.listdir(data_path):
        folder_path = os.path.join(data_path, folder)
        if not os.path.isdir(folder_path):
            continue
        for stale in os.listdir(folder_path):
            stale_path = os.path.join(folder_path, stale)
            if os.path.isfile(stale_path):
                os.remove(stale_path)

# Copy all the images from src_dir into their class folder in train_data_dir.
for entry, label in class_by_file.items():
    shutil.copy(os.path.join(src_dir, entry),
                os.path.join(train_data_dir, label))

In [9]:
def augment_image(image):
    """Return a randomly augmented copy of *image*.

    A fresh pipeline is built on every call because the rotation angle and
    the pixel shift are drawn here and passed to ``A.Affine`` as fixed values.

    Pipeline, in order:
      1. pad to at least 84x84 with a constant border of value 0 (always);
      2. affine shift + rotation (p=0.5);
      3. Gaussian blur with kernel size between 3 and 7 (p=0.5);
      4. downscale/upscale with cubic interpolation (p=0.2);
      5. coarse dropout of 2-8 holes sized 3-8 px (p=0.1);
      6. optical distortion (p=0.3).

    Args:
        image: image array as accepted by albumentations (the caller in this
            file passes the result of ``cv2.imread``).

    Returns:
        The augmented image produced by the pipeline.
    """
    # Draw the random parameters in a fixed order: angle, x shift, y shift.
    # int() truncates the shifts toward zero.
    rotation_deg = np.random.uniform(-10, 10)
    shift_x = int(np.random.uniform(-10, 10))
    shift_y = int(np.random.uniform(-8, 8))
    shift_px = {'x': shift_x, 'y': shift_y}

    pad = A.PadIfNeeded(min_height=84, min_width=84, border_mode=0, value=0, always_apply=True)
    affine = A.Affine(translate_px=shift_px, rotate=rotation_deg, p=0.5)
    blur = A.GaussianBlur(blur_limit=[3, 7], sigma_limit=0, p=0.5)
    downscale = A.Downscale(
        scale_min=0.6,
        scale_max=0.9,
        interpolation={'downscale': cv2.INTER_CUBIC, 'upscale': cv2.INTER_CUBIC},
        p=0.2,
    )
    dropout = A.CoarseDropout(
        max_holes=8, max_height=8, max_width=8,
        min_holes=2, min_height=3, min_width=3,
        p=0.1,
    )
    distortion = A.OpticalDistortion(
        distort_limit=0.05,
        shift_limit=0.05,
        interpolation=1,
        border_mode=4,
        p=0.3,
    )

    pipeline = A.Compose([pad, affine, blur, downscale, dropout, distortion])

    # albumentations returns a dict; the transformed array is under 'image'.
    result = pipeline(image=image)
    return result['image']

def _write_image(path, image):
    """Write *image* to *path* with OpenCV, raising OSError if nothing was saved."""
    # cv2.imwrite reports failure (e.g. missing folder) via its return value
    # instead of raising, so the result has to be checked explicitly.
    if not cv2.imwrite(path, image):
        raise OSError("Could not write image to " + path)


def perform_augmentation(dir,subdir_train, subdir_test, file, n):
    """Create *n* augmented training images and one augmented test image.

    The image ``subdir_train/file`` is loaded once. Each of the ``n`` training
    variants is saved next to it as ``<stem>_aug<k>.jpg`` (k = 1..n) and one
    more variant is saved in ``subdir_test`` as ``<stem>_aug_test.jpg``.

    Args:
        dir: unused; kept so existing calls keep working.
        subdir_train: folder containing *file*; training variants go here.
        subdir_test: folder receiving the test variant (created if missing).
        file: file name of the source image inside *subdir_train*.
        n: number of training variants to generate.

    Raises:
        ValueError: if the source image cannot be read by OpenCV.
        OSError: if an augmented image cannot be written.
    """
    source_path = os.path.join(subdir_train, file)
    image = cv2.imread(source_path)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise ValueError("Could not read image: " + source_path)

    # splitext strips only the final extension, so names that contain extra
    # dots are not truncated at the first one.
    stem = os.path.splitext(file)[0]

    for index in range(1, n + 1):
        new_file_name = stem + "_aug" + str(index) + ".jpg"
        _write_image(os.path.join(subdir_train, new_file_name),
                     augment_image(image))

    # One extra, independently drawn augmentation feeds the test dataset.
    os.makedirs(subdir_test, exist_ok=True)
    test_file_name = stem + "_aug_test" + ".jpg"
    _write_image(os.path.join(subdir_test, test_file_name),
                 augment_image(image))
    
def add_padding(image, path):
    """Pad *image* to at least 84x84 with a black border and save it to *path*.

    The file is written to the given *path*. Previously the function ignored
    this parameter, read a module-level variable and saved only the base name,
    which placed the file in the current working directory.

    Args:
        image: image to pad, either a NumPy array or an object that
            ``np.array`` can convert (e.g. a PIL image).
        path: full path of the file to write.

    Raises:
        ValueError: if *image* is None (e.g. a failed ``cv2.imread``).
        OSError: if OpenCV cannot write the padded image.
    """
    if image is None:
        raise ValueError("add_padding received no image for " + str(path))

    transform = A.PadIfNeeded(min_height=84, min_width=84, border_mode=0, value=0, always_apply=True)
    padded = transform(image=np.array(image))['image']

    if isinstance(image, np.ndarray):
        # Arrays are assumed to come from cv2.imread (BGR channel order), as
        # in the caller in this file. cv2.imwrite keeps that order, whereas
        # PIL would interpret the channels as RGB and swap red and blue.
        if not cv2.imwrite(path, padded):
            raise OSError("Could not write padded image to " + str(path))
    else:
        Image.fromarray(padded).save(path)

# Iterate through all subdirectories and image files of the training dataset.
# The `files` list of a folder is produced by os.walk before the loop body
# runs for that folder, so images written during this pass are not revisited.
for subdir, dirs, files in os.walk(train_data_dir):
    # The test images go into the folder with the same class name.
    subdir_test = os.path.join(test_data_dir, os.path.basename(subdir))
    for file in files:
        file_path = os.path.join(subdir, file)
        image = cv2.imread(file_path)
        if image is None:
            # cv2.imread returns None for files it cannot decode; skip them
            # instead of aborting the whole run.
            print("Skipping unreadable file:", file_path)
            continue

        # Add padding and perform augmentation (9 training variants + 1 test).
        add_padding(image, file_path)
        perform_augmentation(dirs,subdir,subdir_test, file, 9)