**Resources**:

- [TF Tutorial](https://www.tensorflow.org/tutorials/images/data_augmentation)
- [Image Augmentation](https://towardsdatascience.com/image-augmentation-for-deep-learning-using-keras-and-histogram-equalization-9329f6ae5085)

In [None]:
import pandas as pd
import numpy as np

import os
import pathlib
import shutil
from skimage import io

import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras import layers

from sklearn.model_selection import train_test_split


# self-written scripts
import sys
sys.path.insert(0, 'Python_Scripts')

import util
import surf_hog_analysis

SEED = 42

---

### 1. Preparations

In [None]:
# Load the complete training table from CSV into a DataFrame.
df = pd.read_csv('data/train_complete.csv')

In [None]:
# NOTE(review): the return value is only displayed as cell output, it is not
# assigned. Confirm that util.isolate_single_defects modifies `df` in place;
# if it returns a filtered copy instead, `df` is left unchanged by this cell.
util.isolate_single_defects(df)

In [None]:
# Rebind `df` to the result of util.add_blackness_attributes, which receives
# the table and the image folder name 'train_images'.
# presumably this adds per-image blackness columns — verify in util.
df = util.add_blackness_attributes(df, 'train_images')

In [None]:
# Display the DataFrame as the cell output.
df

In [None]:
# Separate the target column 'ClassId' from the features; `df` stays untouched.
y = df['ClassId'].copy()
X = df.drop(columns='ClassId')

In [None]:
# Hold out 40 % of the rows as test data; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=SEED,
)

Since oversampling must only be applied to the training data, we first split the data set into a training and a test set.

In [None]:
def get_indices_for_class_id(y, class_id):
    """Return the index labels of all entries of `y` that equal `class_id`.

    Parameters
    ----------
    y : pandas.Series
        Series of class labels.
    class_id
        Label value to search for.

    Returns
    -------
    numpy.ndarray
        Index labels (not positions) of the matching entries, in the order
        in which they occur in `y`. Empty if nothing matches.
    """
    # Boolean-mask the series itself and read the labels off what is left.
    selected = y[y == class_id]
    return selected.index.values

In [None]:
# One feature table per defect class, restricted to the training split.
class1_images = X_train.loc[util.get_indices_for_class_id(y_train, 1)]
class2_images = X_train.loc[util.get_indices_for_class_id(y_train, 2)]
class3_images = X_train.loc[util.get_indices_for_class_id(y_train, 3)]
class4_images = X_train.loc[util.get_indices_for_class_id(y_train, 4)]

# Report how many training images each class has.
for label, subset in enumerate(
        (class1_images, class2_images, class3_images, class4_images), start=1):
    print(f'There are {len(subset)} train images for ClassId {label}')

In [None]:
# Share of ClassId 2 among all training images. The `%` format type multiplies
# the fraction by 100 and appends the percent sign, so the printed number
# really is a percentage (the raw fraction used to be labelled as "%").
print(f'ClassId 2 corresponds to {len(class2_images) / len(X_train):.2%} of train images.')

In [None]:
# Temporarily save all images from `ClassId` 2 in a separate folder.
path = pathlib.Path.cwd()
origin_train_path = path.joinpath('data', 'train_images')
target_directory = path.joinpath('data', 'oversampling_test')

# An already existing folder is fine. Any other problem (e.g. a missing
# 'data' directory or a missing source image) now raises instead of being
# reported as "Images already exist." by a bare `except`.
target_directory.mkdir(exist_ok=True)

# Folder creation based on X_train (copied from Michael).
# NOTE(review): column position 1 is assumed to hold the image file name —
# confirm against the column layout of `class2_images`.
copied = 0
for source_file in class2_images.iloc[:, 1]:
    target_file = target_directory.joinpath(source_file)
    if target_file.exists():
        # Copied by an earlier run; skipping lets an interrupted run resume.
        continue
    shutil.copy2(origin_train_path.joinpath(source_file), target_file)
    copied += 1

if copied == 0 and len(class2_images) > 0:
    print('Images already exist.')

After having a look at the images from `ClassId` 2, it becomes clear that many of them have a high percentage of black pixels; some are even entirely black. It may be useful to remove such images, since augmenting them would not really help to make the identification of this `ClassId` more robust.

In [None]:
# Labels of the ClassId 2 training rows, selected from y_train by index label.
class_ids = y_train[util.get_indices_for_class_id(y_train, 2)]
# presumably plots a batch of the given images with their labels — see
# surf_hog_analysis.print_batch for the exact behaviour.
surf_hog_analysis.print_batch(class2_images, class_ids)

In [None]:
# Read one example image from the temporary ClassId 2 folder and plot it.
image = io.imread('data/oversampling_test/b963c168c.jpg')
plt.figure(figsize=(18,5))
plt.imshow(image);  # trailing semicolon suppresses the textual cell output

In [None]:
def get_black_columns(image, threshold=5):
    """Count the pixel columns of `image` that are (almost) black.

    A column counts as black when the mean of all its values, taken over
    every row and every channel, is at most `threshold`.

    Parameters
    ----------
    image : numpy.ndarray
        Image array of shape (height, width) or (height, width, channels).
    threshold : int or float, default 5
        Largest mean pixel value for which a column is still considered black.

    Returns
    -------
    int
        Number of black columns (0 for an image without columns).
    """
    # Use the actual number of values per pixel instead of a hard-coded 3, so
    # that grayscale (2-D) images are not judged with a threefold threshold.
    values_per_pixel = 1
    for extent in image.shape[2:]:
        values_per_pixel *= extent
    limit = image.shape[0] * values_per_pixel * threshold

    # Sum over every axis except the column axis (axis 1) in a single
    # vectorised step; numpy accumulates small integer types in a wider one.
    reduce_axes = tuple(axis for axis in range(image.ndim) if axis != 1)
    column_sums = image.sum(axis=reduce_axes)

    return int((column_sums <= limit).sum())

In [None]:
# NOTE(review): this calls util.get_black_columns, not the get_black_columns
# defined in this notebook — confirm that both implementations agree.
black_columns = util.get_black_columns(image)
black_columns

In [None]:
# Plot the example image with a vertical line at x = black_columns.
# NOTE(review): black_columns is a count, not a position; the line marks the
# edge of the black area only if all black columns lie contiguously at the
# left border of the image — confirm for the images used here.
plt.figure(figsize=(18,5))
plt.axvline(x=black_columns)
plt.imshow(image);

In [None]:
def add_blackness_attributes(image_df, folder_extension, class_id, labels=None):
    """Add the columns 'BlackColumns' and 'PercentageBlack' to `image_df`.

    Every image listed in `image_df.ImageId` is read from
    ``data/<folder_extension>/`` and analysed with `get_black_columns`.

    Parameters
    ----------
    image_df : pandas.DataFrame
        Table with an 'ImageId' column holding the image file names. Its rows
        must be in the same order as the index labels that
        `get_indices_for_class_id(labels, class_id)` returns, because the
        computed values are attached to those labels by position.
    folder_extension : str
        Name of the folder below 'data' that contains the images.
    class_id
        Class whose index labels are used for the new columns.
    labels : pandas.Series, optional
        Class labels used to look up the index labels of `class_id`. Defaults
        to the notebook-level `y_train`, which the function always used before
        this parameter existed.

    Returns
    -------
    pandas.DataFrame
        `image_df` joined (on the index) with the two new columns.
    """
    if labels is None:
        # Notebook global, kept as the default for backward compatibility.
        labels = y_train

    black_columns = []
    black_columns_percentage = []
    for image_id in image_df.ImageId:
        image = io.imread(f'data/{folder_extension}/{image_id}')
        # Analyse each image once and reuse the count for the percentage.
        num_black = get_black_columns(image)
        black_columns.append(num_black)
        black_columns_percentage.append(num_black / image.shape[1])

    temp = pd.DataFrame(
        {'BlackColumns': black_columns,
         'PercentageBlack': black_columns_percentage},
        index=get_indices_for_class_id(labels, class_id),
    )
    return pd.merge(image_df, temp, left_index=True, right_index=True)

In [None]:
# NOTE(review): this uses util.add_blackness_attributes_for_single_class
# (which receives y_train explicitly), not the add_blackness_attributes
# function defined in this notebook. Images are read from 'oversampling_test'.
class2_images = util.add_blackness_attributes_for_single_class(class2_images, y_train,'oversampling_test', class_id=2)

In [None]:
# Summary statistics for the ClassId 2 table.
class2_images.describe()

In [None]:
# Number of ClassId 2 images in which at least half of the columns are black.
num_mostly_black = class2_images.query('PercentageBlack >= 0.5').ImageId.count()
# The `%` format type turns the fraction into a real percentage. Adjacent
# string literals replace the backslash continuation, which used to embed the
# next line's indentation in the printed message.
print(f'There are {num_mostly_black} images '
      f'({num_mostly_black / class2_images.shape[0]:.1%} of total) '
      'that are mostly black (>= 50 %).')

In [None]:
# Keep only the rows whose PercentageBlack is at least 0.5 ...
mostly_black = class2_images.query('PercentageBlack >= 0.5')
# ... and summarise the distribution of that column within the subset.
mostly_black.PercentageBlack.describe()

It is striking that the `mostly_black` images are at least 72 % black (and up to 94.5 %).

In [None]:
# Visualize some images of `mostly_black`
# NOTE(review): class_ids_mb holds the labels of ALL ClassId 2 training rows,
# not only those in `mostly_black`, so it can be longer than the image table —
# confirm that print_batch copes with this (all values are 2 either way).
class_ids_mb = y_train[util.get_indices_for_class_id(y_train, 2)]
surf_hog_analysis.print_batch(mostly_black, class_ids_mb, blackness=True)

In [None]:
# Read and plot a second example image from the temporary ClassId 2 folder.
black_image = io.imread('data/oversampling_test/08193cfc8.jpg')
plt.figure(figsize=(18,5))
plt.imshow(black_image);  # trailing semicolon suppresses the textual cell output

---

### Analysis of black percentage per class

In [None]:
# Compute the blackness attributes for the remaining classes 1, 3 and 4.
# These calls read the images from 'train_images', whereas ClassId 2 was read
# from the 'oversampling_test' copy.
class1_images = util.add_blackness_attributes_for_single_class(class1_images, y_train, 'train_images', 1)
class3_images = util.add_blackness_attributes_for_single_class(class3_images, y_train, 'train_images', 3)
class4_images = util.add_blackness_attributes_for_single_class(class4_images, y_train, 'train_images', 4)

In [None]:
# Summary statistics for the ClassId 1 table.
class1_images.describe()

In [None]:
# Summary statistics for the ClassId 3 table.
class3_images.describe()

In [None]:
# Summary statistics for the ClassId 4 table.
class4_images.describe()

---

### Augmentation trials