**Resources**:

- [TF Tutorial](https://www.tensorflow.org/tutorials/images/data_augmentation)
- [Image Augmentation](https://towardsdatascience.com/image-augmentation-for-deep-learning-using-keras-and-histogram-equalization-9329f6ae5085)
- [Image Augmentation with Keras and Histogram Equalization](https://towardsdatascience.com/image-augmentation-for-deep-learning-using-keras-and-histogram-equalization-9329f6ae5085)
- 

In [1]:
import pandas as pd
import numpy as np

import os
import pathlib
import shutil
from skimage import io

import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras import layers

from sklearn.model_selection import train_test_split


# self-written scripts
import sys
sys.path.insert(0, 'Python_Scripts')

import util
import surf_hog_analysis

SEED = 42

---

### 1. Preparations

In [3]:
"""Only execute once to create `.csv` file"""
# Load the complete training table.
df = pd.read_csv('data/train_complete.csv')

# Keep only the rows with `Defect == 1` and pass them, together with the
# image folder name, to the helper that adds the blackness attributes.
df = util.add_blackness_attributes(df.query('Defect==1'), 'train_images')

# NOTE(review): the return value is discarded here. Presumably
# `isolate_single_defects` works in place; if it returns a new frame instead,
# its result never reaches the CSV below -- TODO confirm against `util`.
util.isolate_single_defects(df)

# Persist the result so later cells can start from the CSV.
df.to_csv('data/train_single_defects_with_blackness.csv', sep=',', index=False)

In [None]:
# Read the single-defect table (including the blackness columns) back from disk.
single_defects_csv = 'data/train_single_defects_with_blackness.csv'
df_sd = pd.read_csv(single_defects_csv)

In [None]:
# Separate the label column (`ClassId`) from the remaining feature columns.
y = df_sd['ClassId'].copy()
X = df_sd.drop(columns='ClassId')

In [None]:
# Hold out 40 % of the rows as test data; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=SEED,
)

Since oversampling is only applied to the training data, we need to split the data set before augmenting.

In [None]:
# Collect the training rows of every `ClassId` once, then expose them under
# the per-class names used by the following cells.
images_by_class = {
    class_id: X_train.loc[util.get_indices_for_class_id(y_train, class_id)]
    for class_id in (1, 2, 3, 4)
}
class1_images = images_by_class[1]
class2_images = images_by_class[2]
class3_images = images_by_class[3]
class4_images = images_by_class[4]

# Report how many training images each class has.
for class_id, class_images in images_by_class.items():
    print(f'There are {len(class_images)} train images for ClassId {class_id}')

In [None]:
# Share of `ClassId` 2 within the training split. The ratio is scaled by 100
# so that the number printed in front of the '%' sign really is a percentage.
share_class2 = len(class2_images) / len(X_train)
print(f'ClassId 2 corresponds to {100 * share_class2:.2f} % of train images.')

In [None]:
# Temporarily save all images from `ClassId` 2 in their own folder.
path = pathlib.Path.cwd()
origin_train_path = path.joinpath('data', 'train_images')
target_directory = path.joinpath('data', 'oversampling_test')

# Only an already existing folder is expected here; any other failure
# (permissions, missing `data/` parent, ...) should surface instead of being
# reported as "already exist".
try:
    os.mkdir(target_directory)
except FileExistsError:
    print('Images already exist.')

# Copied from Michael: fills the folder based on the training split.
# The file name is read from the second column of `class2_images`.
for source_file in class2_images.iloc[:, 1]:
    shutil.copy2(origin_train_path.joinpath(source_file), target_directory.joinpath(source_file))


After having a look at the images from `ClassId` 2, it becomes clear that many of them have a high percentage of black pixels; some are even entirely black. It may be useful to delete such images, since augmenting them would not really help the model become more robust at identifying images of this `ClassId`.

In [None]:
def print_batch(df_with_filepath, class_ids, blackness=False, show_keypoints=False, number_images=5):
    """Plot a random sample of images in a grid of `number_images` rows and 2 columns.

    Input parameters:
    df_with_filepath - data frame with the columns `FilePath` and `ImageId`
                       (plus `PercentageBlack` if `blackness` is set and
                       `NumberKP` if `show_keypoints` is set)
    class_ids        - series of class labels; it is accessed by position, so
                       it has to be ordered like the rows of `df_with_filepath`
    blackness        - add the `PercentageBlack` value to each title
    show_keypoints   - add the `NumberKP` value to each title
    number_images    - number of rows in the grid (2 * `number_images` plots)

    Images are drawn with replacement, so the same image may appear twice.
    Raises a ValueError if `df_with_filepath` contains no rows.
    """
    num_available = len(df_with_filepath.ImageId)
    if num_available == 0:
        raise ValueError('`df_with_filepath` does not contain any images to plot.')

    # random row positions, one per subplot
    num_plots = 2 * number_images
    random_index = np.array(np.random.rand(num_plots) * num_available, dtype='int')

    # `squeeze=False` keeps `axes` two-dimensional even for `number_images=1`
    fig, axes = plt.subplots(number_images, 2, figsize=(18, 8), squeeze=False)

    for i, position in enumerate(random_index):
        # gather required info to retrieve image and label the plots
        file_path_to_image = df_with_filepath['FilePath'].iloc[position]
        class_id = class_ids.iloc[position]
        image_id = df_with_filepath['ImageId'].iloc[position]

        title = f'Image ID: {image_id} | ClassId: {class_id}'
        if blackness:
            # keep the value in its own variable: overwriting the `blackness`
            # flag would switch the label off as soon as a value of 0 shows up
            percentage_black = df_with_filepath['PercentageBlack'].iloc[position]
            title += f' | Percentage Black: {percentage_black}'
        if show_keypoints:
            keypoints = df_with_filepath['NumberKP'].iloc[position]
            title += f' | Number Keypoints: {keypoints}'

        # read-in the image
        img = io.imread(file_path_to_image)

        # fill the grid column by column
        ax_ = axes[i % number_images][i // number_images]
        ax_.imshow(img)
        ax_.set_title(title, fontsize=14)
        # hide ticks and frame of this very subplot
        ax_.axis("off")

    # adjust distance between subplots (once, after all of them are drawn)
    fig.subplots_adjust(left=0.1,
                        bottom=0.1,
                        right=1,
                        top=1.2,
                        wspace=0.2,
                        hspace=0.2)

In [None]:
# Labels of all `ClassId` 2 training rows, in the same order as `class2_images`.
class2_index = util.get_indices_for_class_id(y_train, 2)
class_ids = y_train[class2_index]
print_batch(class2_images, class_ids, blackness=True)

In [None]:
# Load one of the copied `ClassId` 2 images and show it on a wide canvas.
image = io.imread('data/oversampling_test/b963c168c.jpg')
fig, ax = plt.subplots(figsize=(18, 5))
ax.imshow(image);

In [None]:
# Count the black pixel columns of the sample image loaded above.
black_columns = util.get_black_columns(image)
# A bare expression as last line makes the notebook display the value.
black_columns

In [None]:
# Show the sample image again with a vertical line at x = `black_columns`.
fig, ax = plt.subplots(figsize=(18, 5))
ax.axvline(x=black_columns)
ax.imshow(image);

In [None]:
#class2_images = util.add_blackness_attributes_for_single_class(class2_images, y_train,'oversampling_test', class_id=2)

In [None]:
# Summary statistics of the numeric columns of the `ClassId` 2 training images.
class2_images.describe()

In [None]:
# Count the `ClassId` 2 images that are black on at least half of their width.
num_mostly_black = class2_images.query('PercentageBlack >= 0.5').ImageId.count()

# Scale the ratio by 100 so the number in front of '%' is a real percentage.
# The message is assembled from two adjacent literals: a backslash
# continuation inside the string would carry the indentation into the output.
share_mostly_black = num_mostly_black / class2_images.shape[0]
print(f'There are {num_mostly_black} pictures (or {100 * share_mostly_black:.2f} % '
      f'of total) images that are mostly black (>= 50 %).')

In [None]:
# Keep the images that are black on at least 50 % of their width ...
is_mostly_black = class2_images['PercentageBlack'] >= 0.5
mostly_black = class2_images[is_mostly_black]
# ... and display the distribution of their blackness.
mostly_black['PercentageBlack'].describe()

It is striking that all `mostly_black` images are at least 72 % black (and up to 94.5 %).

In [None]:
# Visualize some images of `mostly_black`
class_ids_mb = y_train[util.get_indices_for_class_id(y_train, 2)]
print_batch(mostly_black, class_ids_mb, blackness=True)

In [None]:
# examine the distribution of blackness among all `ClassIds`
df_sd.groupby('ClassId').PercentageBlack.describe()

---

### Augmentation trials

[`tf.image`](https://www.tensorflow.org/api_docs/python/tf/image)

In [4]:
import os
import albumentations
import cv2
import time
import random

# self-written scripts
import sys
sys.path.insert(0, 'Python_Scripts')

import data_preparation_cnn

In [5]:
def make_folder():
    """Create the folder `data/augmentations` below the current working directory.

    Missing parent folders are created as well. If the folder is already
    there, a short notice is printed. Any other problem (e.g. missing
    permissions) is raised instead of being reported as "already exists".
    """
    temp_path = os.path.join(os.getcwd(), 'data', 'augmentations')
    try:
        os.makedirs(temp_path)
    except FileExistsError:
        print('Folder already exists.')

In [6]:
import albumentations as A

# Augmentation pipeline that is applied to every sampled image.
# Each transform carries its own probability `p`.
augment = A.Compose([
    # vertical flips are currently switched off
    #A.VerticalFlip(p=0.5),
    A.HorizontalFlip(p=0.5),
    # geometric distortions are currently switched off
    # A.OneOf([
    #     A.ElasticTransform(alpha=120, sigma=120 * 0.05, alpha_affine=120 * 0.03, p=0.5),
    #     A.GridDistortion(p=0.5),
    #     A.OpticalDistortion(distort_limit=2, shift_limit=0.5, p=1)                  
    #     ], p=0.8),
    # contrast / brightness / gamma changes, each with p=0.8
    A.CLAHE(p=0.8),
    A.RandomBrightnessContrast(p=0.8),    
    A.RandomGamma(p=0.8)
])

In [7]:
def augement_images(image_ids, num_augmentations, class_id):
    """Write `num_augmentations` augmented images of one class and describe them.

    Source images are drawn randomly (with replacement) from `image_ids`,
    read from `data/train_images/`, passed through the module-level `augment`
    pipeline and written to `data/augmentations/` as `aug_<i>_<image id>`.

    Input parameters:
    image_ids         - sequence of image file names of the class
    num_augmentations - number of augmented images to create
    class_id          - label stored for every created image

    Returns a data frame with the columns `FilePath`, `ImageId` and `ClassId`,
    one row per written image. `FilePath` points to the written file.

    Raises a ValueError if images are requested but `image_ids` is empty, a
    FileNotFoundError if a source image cannot be read and an OSError if an
    augmented image cannot be written.
    """
    print(f'beginning augmentation for ClassId {class_id}...')
    start = time.time()

    if num_augmentations > 0 and len(image_ids) == 0:
        raise ValueError(f'no images available to augment for ClassId {class_id}')

    # absolute target folder; writing with full paths avoids changing the
    # working directory back and forth for every image
    target_directory_image = os.getcwd() + '/data/augmentations/'

    aug_ids = []
    class_ids = []
    file_paths = []

    for i in range(1, num_augmentations + 1):
        number = random.randint(0, len(image_ids) - 1)
        image_id = image_ids[number]

        aug_id = 'aug_' + str(i) + '_' + image_id
        aug_path = target_directory_image + aug_id

        # `cv2.imread` signals an unreadable file by returning None
        original_image = cv2.imread('data/train_images/' + image_id)
        if original_image is None:
            raise FileNotFoundError('could not read image data/train_images/' + image_id)

        transformed_image = augment(image=original_image)['image']

        # `cv2.imwrite` signals a failed write by returning False
        if not cv2.imwrite(aug_path, transformed_image):
            raise OSError('could not write augmented image ' + aug_path)

        aug_ids.append(aug_id)
        class_ids.append(class_id)
        # store the path of the file that was actually written
        file_paths.append(aug_path)

    temp = pd.DataFrame({'FilePath': file_paths, 'ImageId': aug_ids, 'ClassId': class_ids},
                        columns=['FilePath', 'ImageId', 'ClassId'])

    end = time.time()
    print(f'augmented {num_augmentations} images of ClassId {class_id}')
    print('time required for augmentation:', end - start)
    print()

    return temp

In [8]:
def create_df_aug(df):
    """Create a class-balanced set of augmented images for `ClassId` 1 to 4.

    Every class receives as many augmented images as `df` has images of
    `ClassId` 3. The images are written by `augement_images`; the target
    folder is prepared by `make_folder`.

    Input parameters:
    df - data frame with the columns `ImageId` and `ClassId`

    Returns a data frame with the columns `FilePath`, `ImageId` and `ClassId`
    and a fresh 0..n-1 index. A KeyError is raised if `df` holds no row of
    `ClassId` 3.
    """
    make_folder()

    # `ClassId` 3 sets the number of images created per class
    max_images = df.groupby('ClassId').count().ImageId[3]

    # Collect the per-class frames and concatenate them once. Growing a frame
    # inside the loop copies all rows again on every pass, and starting from
    # an empty frame turns `ClassId` into an object column.
    frames = []
    for class_id in [1, 2, 3, 4]:
        image_ids = df.loc[df['ClassId'] == class_id, 'ImageId'].values
        frames.append(augement_images(image_ids=image_ids,
                                      num_augmentations=max_images,
                                      class_id=class_id))

    return pd.concat(frames, axis=0).reset_index(drop=True)

In [9]:
# df = pd.read_csv('data/train_single_defects_with_blackness.csv')

# df_aug = create_df_aug(df)

# df_aug.to_csv('data/train_single_defects_augmented.csv', sep=',', index=False)

---

### Augmentation for Train-Test-Split

In [10]:
# Start again from the stored single-defect table.
single_defects_csv = 'data/train_single_defects_with_blackness.csv'
df_sd = pd.read_csv(single_defects_csv)

In [11]:
# Separate labels from features ...
y = df_sd['ClassId'].copy()
X = df_sd.drop(columns='ClassId')

# ... and hold out 40 % of the rows as test data (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=SEED,
)

In [12]:
# create data frame for train and test
df_train = pd.concat([X_train, y_train], axis=1)
df_test = pd.concat([X_test, y_test], axis=1)

In [13]:
"""Only execute ONCE"""
# Apply augmentation to the train images only; the test split stays untouched.
# The call writes image files to disk (see `create_df_aug`), which is why the
# cell should not be re-run.
df_train_aug = create_df_aug(df_train)

beginning augmentation for ClassId 1...
augmented 2866 images of ClassId 1
time required for augmentation: 23.72112774848938

beginning augmentation for ClassId 2...
augmented 2866 images of ClassId 2
time required for augmentation: 21.805581092834473

beginning augmentation for ClassId 3...
augmented 2866 images of ClassId 3
time required for augmentation: 24.977601051330566

beginning augmentation for ClassId 4...
augmented 2866 images of ClassId 4
time required for augmentation: 25.065805196762085



In [15]:
# save train-test-splits to .csv to feed them into the models
df_train_aug.to_csv('data/train_set_augmented.csv', sep=',', index=False)
df_test.to_csv('data/test_set_for_augmented.csv', sep=',', index=False)

---

### Functions

In [None]:
def get_indices_for_class_id(y, class_id):
    """returns the index labels of all entries of `y` that equal `class_id`.

    Input parameters:
    y        - series of class labels
    class_id - the label to look for
    """
    matching_entries = y[y == class_id]
    return matching_entries.index.values

In [None]:
def get_black_columns(image, threshold=5):
    """returns the number of pixel columns of `image` that count as black.

    A column counts as black if the sum of all its values is at most
    rows * values per pixel * `threshold`, i.e. if its mean value does not
    exceed `threshold`.

    Input parameters:
    image     - array of shape (rows, columns) or (rows, columns, channels)
    threshold - highest mean value up to which a column counts as black
    """
    # Number of values per pixel: 1 for a grayscale image, the channel count
    # otherwise. A fixed factor of 3 would make the limit three times too
    # lenient for grayscale images.
    values_per_pixel = 1
    for extent in image.shape[2:]:
        values_per_pixel *= extent

    limit = image.shape[0] * values_per_pixel * threshold

    # sum over every axis except the column axis in one vectorized step
    other_axes = tuple(axis for axis in range(image.ndim) if axis != 1)
    column_sums = image.sum(axis=other_axes)

    return int((column_sums <= limit).sum())

In [None]:
def add_blackness_attributes(image_df, folder_extension):
    """returns the `image_df` extended by columns `BlackColumns` and `PercentageBlack`.

    `BlackColumns` is the result of `get_black_columns` for the image,
    `PercentageBlack` is that number divided by the image width.

    Input parameters:
    image_df         - data frame that includes `ImageIds`
    folder_extension - the subfolder in 'data/' where pictures are located

    The passed data frame is not modified; a new one is returned. All rows
    and the index of `image_df` are kept.
    """
    black_columns = []
    black_columns_percentage = []

    for image_id in image_df.ImageId:
        image = io.imread('data/' + folder_extension + '/' + image_id)
        # analyze every image only once and reuse the result
        num_black_columns = get_black_columns(image)
        black_columns.append(num_black_columns)
        black_columns_percentage.append(num_black_columns / image.shape[1])

    # Attach the values by position. Merging on the index with a freshly
    # numbered helper frame would drop or mix up rows whenever `image_df` is a
    # filtered frame whose index is not 0..n-1.
    return image_df.assign(BlackColumns=black_columns,
                           PercentageBlack=black_columns_percentage)

In [None]:
# # Iterate and see the pictures and labels
# img_batch, labels = next(it)
# image = img_batch[0]
# #print(img_batch)
# plt.imshow(image)
# print(labels[0])
# image = image.numpy() 
# image *= 256
# written = cv2.imwrite(cwd.as_posix() + '/image.jpg', image)
# print(written)