# Data Augmentation

### Import required libraries

In [None]:
import hashlib
import requests
import zipfile
import os
import pandas as pd
import Augmentor
from datetime import datetime
from datetime import date
import random
import shutil
import time

### Set parameters depending on dataset to be generated

In [None]:
# Dataset being generated: type prefix and number suffix
current_dataset_type = 'TRG_'
current_dataset_number = '1_'

# Each dataset number gets its own data-augmentation seed;
# any number that is not listed falls back to 400
seed_by_dataset_number = {'1_': 100, '2_': 200, '3_': 300}
seed = seed_by_dataset_number.get(current_dataset_number, 400)

### Define functions

In [None]:
# Define function that copies a file from a directory to another
# (despite its name, the source file is left in place)
def move_file(old_dir, old_file_name, new_dir, new_file_name):
    """Copy ``old_dir/old_file_name`` to ``new_dir/new_file_name``.

    The copy is made with ``shutil.copy2``; the source file is not removed.

    Args:
        old_dir: Directory containing the file to copy.
        old_file_name: Name of the file inside ``old_dir``.
        new_dir: Destination directory; created if it does not exist yet.
        new_file_name: Name given to the copy inside ``new_dir``.
    """
    old_file = os.path.join(old_dir, old_file_name)
    new_file = os.path.join(new_dir, new_file_name)
    # Without the destination folder, copy2 either fails or writes a file
    # named like the folder, so make sure it exists first
    if new_dir:
        os.makedirs(new_dir, exist_ok=True)
    # Copy to the full target path so that new_file_name is honoured
    shutil.copy2(old_file, new_file)

# Define a function to compute the hash of a file (except image file)
# and compare it to the hash computed/recorded when the file was generated
def check_hash(directory, file_name, hash_file_name):
    """Check a file against the SHA-512 digest stored in a companion file.

    Prints the file name followed by "File OK" or "File NOK".

    Args:
        directory: Folder containing both the file and its hash file.
        file_name: Name of the file to verify.
        hash_file_name: Name of the text file holding the expected
            hexadecimal SHA-512 digest.

    Returns:
        True when the computed digest equals the stored one, False otherwise.
    """
    print(file_name)
    # Compute hash value, reading in blocks so large archives fit in memory
    BLOCK_SIZE = 65536
    file_hash = hashlib.sha512()
    file_path = os.path.join(directory, file_name)
    with open(file_path, "rb") as f:
        for block in iter(lambda: f.read(BLOCK_SIZE), b""):
            file_hash.update(block)
    computed_hash = file_hash.hexdigest()

    # Extract stored hash value; a byte-order mark, surrounding whitespace
    # (e.g. a trailing newline) or upper-case hex digits must not cause a mismatch
    hash_file_path = os.path.join(directory, hash_file_name)
    with open(hash_file_path, "rb") as f:
        stored_hash = f.read().decode("utf-8-sig").strip().lower()

    # Compare hash values
    hash_matches = computed_hash == stored_hash
    print("File OK" if hash_matches else "File NOK")
    return hash_matches

# Define a function to compute the hash of an image file
# and compare it to the hash computed/recorded when the file was generated
def check_hash_image(directory, file_name, stored_hash):
    """Check an image file against an expected SHA-256 digest.

    Prints the file name followed by "File OK" or "File NOK".

    Args:
        directory: Folder containing the image.
        file_name: Name of the image file to verify.
        stored_hash: Expected hexadecimal SHA-256 digest. Non-string values
            (for instance NaN read from an empty csv cell) never match.

    Returns:
        True when the computed digest equals the expected one, False otherwise.
    """
    print(file_name)
    # Compute hash value, reading the file in blocks
    BLOCK_SIZE = 65536
    file_hash = hashlib.sha256()
    file_path = os.path.join(directory, file_name)
    with open(file_path, "rb") as f:
        for block in iter(lambda: f.read(BLOCK_SIZE), b""):
            file_hash.update(block)
    computed_hash = file_hash.hexdigest()

    # Compare hash values; surrounding whitespace and upper-case hex digits
    # in the recorded value must not cause a mismatch
    expected_hash = str(stored_hash).strip().lower()
    hash_matches = computed_hash == expected_hash
    print("File OK" if hash_matches else "File NOK")
    return hash_matches

In [None]:
# Define function that performs the augmentation, copies the images into the final folder and updates the csv

# Number of images sampled for every modification other than "flip"
_SAMPLES_PER_MODIF = 500

# Modification name -> callable adding the matching Augmentor operation to a pipeline
# NOTE(review): every operation except "flip" uses probability 0.5, so presumably
# about half of the sampled images are unmodified copies that still get their
# flag incremented in the csv - confirm this is intended
_AUGMENTOR_OPERATIONS = {
    "flip": lambda p: p.flip_left_right(probability=1),
    "skew_side": lambda p: p.skew_left_right(0.5, magnitude=1),
    "skew_top": lambda p: p.skew_top_bottom(0.5, magnitude=1),
    "rotate_left": lambda p: p.rotate(probability=0.5, max_left_rotation=5, max_right_rotation=0),
    "rotate_right": lambda p: p.rotate(probability=0.5, max_left_rotation=0, max_right_rotation=5),
    "bright": lambda p: p.random_brightness(probability=0.5, min_factor=1.05, max_factor=1.2),
    "dark": lambda p: p.random_brightness(probability=0.5, min_factor=0.8, max_factor=0.95),
    "shear_left": lambda p: p.shear(probability=0.5, max_shear_left=10, max_shear_right=0.001),
    "shear_right": lambda p: p.shear(probability=0.5, max_shear_left=0.001, max_shear_right=10),
    "distort": lambda p: p.random_distortion(probability=0.5, grid_width=10, grid_height=10, magnitude=2),
    "contrast": lambda p: p.random_contrast(probability=0.5, min_factor=0.90, max_factor=0.95),
}

# Columns whose value is copied unchanged from the source image's row
# ('SIDE' is copied too, then mirrored when the image is flipped)
_INHERITED_COLUMNS = ('HOLDING_POINT', 'EDGE ROTATION', 'EDGE DIST', 'EDGE ANGLE', 'SIDE',
                      'MARKING', ' LIGHT', 'LONG DIST', 'LAT DIST', 'ANGLE')

# Per-modification counter columns; each one is the upper-cased modification name
_FLAG_COLUMNS = ('FLIP', 'SKEW_SIDE', 'SKEW_TOP', 'ROTATE_LEFT', 'ROTATE_RIGHT',
                 'BRIGHT', 'DARK', 'SHEAR_LEFT', 'SHEAR_RIGHT', 'DISTORT', 'CONTRAST')


def _sha256_of_file(file_path):
    """Return the hexadecimal SHA-256 digest of the file at ``file_path``."""
    BLOCK_SIZE = 65536
    file_hash = hashlib.sha256()
    with open(file_path, "rb") as f:
        for block in iter(lambda: f.read(BLOCK_SIZE), b""):
            file_hash.update(block)
    return file_hash.hexdigest()


def modif_save_new_image(old_dir, new_dir, modif, seed):
    """Augment the images of ``new_dir`` and record the results in the csv data.

    An Augmentor pipeline is built on ``new_dir`` and run with the operation
    selected by ``modif`` ("flip" processes every image once, any other known
    modification samples 500 images). Every file then found in ``old_dir`` is
    renamed into ``new_dir`` and a row describing it is appended to the global
    ``combined_csv_df``. The row inherits the parameters of the image it was
    derived from, with its modification counter incremented.

    Args:
        old_dir: Folder whose files are collected after the pipeline has run.
            Callers pass ``<new_dir>/output/``, presumably the folder Augmentor
            writes to - verify against the Augmentor version in use.
        new_dir: Folder the pipeline reads from and the augmented files end up in.
        modif: Name of the modification to apply. An unknown name runs no
            pipeline; files already in ``old_dir`` are still collected.
        seed: Seed given to Augmentor and recorded in the 'SEED' column.

    Raises:
        ValueError: If the image a file was derived from does not appear
            exactly once in ``combined_csv_df``.
    """
    # Set variables
    image_param_column_list = ['UUID','DATE','TIME','HOLDING_POINT', 
                               'EDGE ROTATION', 'EDGE DIST', 'EDGE ANGLE','SIDE',
                               'MARKING', ' LIGHT', 'LONG DIST', 'LAT DIST', 'ANGLE','SHA256',
                               'CORRECTION', 'COMMENT', 'MODIF', 'SEED', 'FLIP', 
                               'SKEW_SIDE', 'SKEW_TOP', 'ROTATE_LEFT', 'ROTATE_RIGHT', 
                               'BRIGHT', 'DARK', 'SHEAR_LEFT', 'SHEAR_RIGHT', 
                               'DISTORT', 'CONTRAST', 'ORIGINAL']
    global combined_csv_df

    # Create and run augmentation pipeline
    Augmentor.Pipeline.set_seed(seed)
    add_operation = _AUGMENTOR_OPERATIONS.get(modif)
    if add_operation is not None:
        pipeline = Augmentor.Pipeline(new_dir)
        add_operation(pipeline)
        if modif == "flip":
            pipeline.process()
        else:
            pipeline.sample(_SAMPLES_PER_MODIF)

    for file in os.listdir(old_dir):
        # Find new file name/UUID and original file name/UUID.
        # File names are expected to look like
        # '<label>_original_<original UUID>.jpg_<new UUID>.jpg'
        long_file_name = os.fsdecode(file)
        split_file_name = long_file_name.split('.jpg_')
        if len(split_file_name) < 2:
            # Not an augmented image (e.g. a stray system file): leave it alone
            print("Skipped (unexpected file name):", long_file_name)
            continue
        original_file_name = split_file_name[0].split('DATASET_original_')[0]
        new_file_name = split_file_name[1]
        original_UUID = original_file_name.split('.jpg')[0].split('_')[2]
        print(original_UUID)
        new_UUID = new_file_name.split('.jpg')[0]
        print(new_UUID)

        # Look up the source image once, before touching the file, so that an
        # unknown source does not leave an unrecorded image in the final folder
        source_rows = combined_csv_df.loc[combined_csv_df['UUID'] == original_UUID]
        if len(source_rows) != 1:
            raise ValueError("Expected exactly one csv row for original image "
                             + str(original_UUID) + ", found " + str(len(source_rows)))

        # Move file to new directory
        old_file = os.path.join(old_dir, long_file_name)
        new_file = os.path.join(new_dir, new_file_name)
        os.rename(old_file, new_file)

        # Record current date/time
        date_now = str(date.today())
        time_now = datetime.now().strftime("%H:%M:%S")

        # Record parameters of the new image
        new_row = {'UUID': new_UUID, 'DATE': date_now, 'TIME': time_now}
        for column in _INHERITED_COLUMNS:
            new_row[column] = source_rows[column].item()
        if modif == "flip":
            # A left/right flip swaps the side (0 <-> 1)
            new_row['SIDE'] = abs(new_row['SIDE'] - 1)
        new_row['SHA256'] = _sha256_of_file(new_file)
        new_row['CORRECTION'] = 0
        new_row['COMMENT'] = 'NA'
        new_row['MODIF'] = modif
        new_row['SEED'] = seed
        for column in _FLAG_COLUMNS:
            # Counters accumulate along the chain of modifications
            applied = 1 if column.lower() == modif else 0
            new_row[column] = source_rows[column].item() + applied
        new_row['ORIGINAL'] = original_UUID
        temp_df = pd.DataFrame([new_row], columns=image_param_column_list)

        # Append temporary dataframe to image parameter dataframe
        # (DataFrame.append no longer exists in pandas 2.x)
        combined_csv_df = pd.concat([combined_csv_df, temp_df])

        
        
        


### Retrieve .zip file and corresponding hash file from Downloads folder

In [None]:
# Source (Downloads) and destination (ORIGINAL dataset) folders
downloads_directory = 'C:/Users/adeli/Downloads/'
local_directory = f'DATASET/{current_dataset_type}ORIGINAL/'
dataset_file_name = 'dataset.zip'
dataset_hash_file_name = 'dataset_hash.txt'

# Bring over the archive first, then its hash file, keeping the same names
for retrieved_file_name in (dataset_file_name, dataset_hash_file_name):
    move_file(downloads_directory, retrieved_file_name, local_directory, retrieved_file_name)

### Check .zip file for corruption

In [None]:
# Verify the retrieved archive against the hash file stored next to it
local_directory = f'DATASET/{current_dataset_type}ORIGINAL/'
dataset_file_name, dataset_hash_file_name = 'dataset.zip', 'dataset_hash.txt'
check_hash(local_directory, dataset_file_name, dataset_hash_file_name)

### Unzip .zip file to TEMP directory

In [None]:
# Extract the archive into the folder it is stored in
original_directory = f'DATASET/{current_dataset_type}ORIGINAL/'
with zipfile.ZipFile(original_directory + 'dataset.zip') as dataset_zipfile:
    dataset_zipfile.extractall(path=original_directory)

### Check .csv files for corruption

In [None]:
temp_directory = f'DATASET/{current_dataset_type}ORIGINAL/dataset/'

# Every extracted <name>.csv is checked against the hash stored in <name>_hash.txt
for file in os.listdir(temp_directory):
    if not file.endswith('.csv'):
        continue
    csv_file_name = file
    csv_hash_file_name = csv_file_name.split('.csv')[0] + '_hash.txt'
    check_hash(temp_directory, csv_file_name, csv_hash_file_name)
    

### Concatenate all .csv files into a single one

In [None]:
temp_directory = f'DATASET/{current_dataset_type}ORIGINAL/dataset/'

# Columns of the combined file: image parameters followed by augmentation bookkeeping
combined_csv_column_list = [
    'UUID', 'DATE', 'TIME', 'HOLDING_POINT',
    'EDGE ROTATION', 'EDGE DIST', 'EDGE ANGLE', 'SIDE',
    'MARKING', ' LIGHT', 'LONG DIST', 'LAT DIST', 'ANGLE', 'SHA256',
    'CORRECTION', 'COMMENT', 'MODIF', 'SEED', 'FLIP',
    'SKEW_SIDE', 'SKEW_TOP', 'ROTATE_LEFT', 'ROTATE_RIGHT',
    'BRIGHT', 'DARK', 'SHEAR_LEFT', 'SHEAR_RIGHT',
    'DISTORT', 'CONTRAST', 'ORIGINAL',
]

# Start from an empty frame and stack every extracted .csv file under it
combined_csv_df = pd.DataFrame(columns=combined_csv_column_list)
for file in os.listdir(temp_directory):
    if not file.endswith('.csv'):
        continue
    new_df = pd.read_csv(temp_directory + file)
    combined_csv_df = pd.concat([combined_csv_df, new_df])

# Written one level up, next to the archive
combined_csv_path = f'DATASET/{current_dataset_type}ORIGINAL/combined_csv.csv'
combined_csv_df.to_csv(combined_csv_path, index=False, encoding='utf-8-sig')

### Perform corrections manually in combined_csv.csv file

### Data augmentation

In [None]:
# Copy all original images in final dataset folder and check all images for corruption

# Set directory names
temp_directory = 'DATASET/' + current_dataset_type + 'ORIGINAL/'
temp_holdingpoint_image_directory = 'DATASET/' + current_dataset_type + 'ORIGINAL/dataset/holdingpoint/'
temp_noholdingpoint_image_directory = 'DATASET/' + current_dataset_type + 'ORIGINAL/dataset/noholdingpoint/'

final_holdingpoint_image_directory = 'DATASET/' + current_dataset_type + current_dataset_number + 'FINAL/holdingpoint/'
final_noholdingpoint_image_directory = 'DATASET/' + current_dataset_type + current_dataset_number + 'FINAL/noholdingpoint/'

# Open .csv file containing all images information
combined_csv_df = pd.read_csv(temp_directory + "combined_csv.csv")

holdingpoint_image_counter = 0
noholdingpoint_image_counter = 0

# For each image listed in the .csv file, copy the image to the relevant folder and check its hash
for index, row in combined_csv_df.iterrows():
    # read_csv turns 'NA' cells into NaN, so both forms mean "no image"
    if pd.isna(row['UUID']) or row['UUID'] == 'NA':
        continue

    # HOLDING_POINT selects the source/destination folders; other values are ignored
    if row['HOLDING_POINT'] == 0:
        is_holdingpoint = False
        source_image_directory = temp_noholdingpoint_image_directory
        final_image_directory = final_noholdingpoint_image_directory
    elif row['HOLDING_POINT'] == 1:
        is_holdingpoint = True
        source_image_directory = temp_holdingpoint_image_directory
        final_image_directory = final_holdingpoint_image_directory
    else:
        continue

    image_name = str(row['UUID']) + '.jpg'
    if not os.path.isfile(source_image_directory + image_name):
        print("Missing:",image_name)
        continue

    move_file(source_image_directory, image_name, final_image_directory, image_name)
    check_hash_image(final_image_directory, image_name, row['SHA256'])
    if is_holdingpoint:
        holdingpoint_image_counter += 1
    else:
        noholdingpoint_image_counter += 1

print("images moved to holdingpoint folder")
print(holdingpoint_image_counter)
# This count used to be reported under the "holdingpoint" label as well
print("images moved to noholdingpoint folder")
print(noholdingpoint_image_counter)

In [None]:
# Perform Data Augmentation

# Reload the (manually corrected) image information
temp_directory = f'DATASET/{current_dataset_type}ORIGINAL/'
combined_csv_df = pd.read_csv(temp_directory + "combined_csv.csv")

# Final folders, each with the 'output' sub-folder the augmented files are collected from
final_dataset_directory = 'DATASET/' + current_dataset_type + current_dataset_number + 'FINAL/'
final_holdingpoint_image_directory = final_dataset_directory + 'holdingpoint/'
output_holdingpoint_image_directory = final_holdingpoint_image_directory + 'output/'
final_noholdingpoint_image_directory = final_dataset_directory + 'noholdingpoint/'
output_noholdingpoint_image_directory = final_noholdingpoint_image_directory + 'output/'

# Modifications applied after the flip, shuffled according to the seed set at the beginning
list_modif = ['skew_side', 'skew_top', 'rotate_left', 'rotate_right', 'bright', 'dark',
              'shear_left', 'shear_right', 'distort', 'contrast']
random.seed(seed)
random.shuffle(list_modif)

# holdingpoint images are handled first, then noholdingpoint images;
# in each folder the flip comes first, followed by the shuffled modifications
image_directory_pairs = (
    (output_holdingpoint_image_directory, final_holdingpoint_image_directory),
    (output_noholdingpoint_image_directory, final_noholdingpoint_image_directory),
)
for output_image_directory, final_image_directory in image_directory_pairs:
    for modif in ['flip'] + list_modif:
        modif_save_new_image(output_image_directory, final_image_directory, modif, seed)

# Save a record of all the modification performed in a .csv file
combined_csv_df.to_csv(final_dataset_directory + 'augmented_csv.csv', index=False)