# CREDO image processing

In [2]:
%run ./notebook_init.py

import os
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from sklearn.model_selection import train_test_split
from core import DATA_FOLDER

In [None]:
# Root folder of the raw dataset, located under DATA_FOLDER.
raw_data = os.path.join(DATA_FOLDER, "credo_raw_dataset")

# One source directory per class, unpacked in the order
# line, worm, dot, artefacts.
(data_path_line,
 data_path_worm,
 data_path_dot,
 data_path_artefacts) = (
    os.path.join(raw_data, subfolder)
    for subfolder in ("hits_votes_4_Lines",
                      "hits_votes_4_Worms",
                      "hits_votes_4_Dots",
                      "artefacts")
)

In [None]:
# File names found in each class directory, one list per class.
# os.listdir() returns entries in arbitrary, filesystem-dependent order, so the
# names are sorted: any seeded shuffle/split applied to these lists then
# selects the same files on every machine and every run.
x_data_line = sorted(os.listdir(data_path_line))
x_data_worm = sorted(os.listdir(data_path_worm))
x_data_dot = sorted(os.listdir(data_path_dot))
x_data_artefacts = sorted(os.listdir(data_path_artefacts))


In [None]:
# Integer label associated with each class name.
# NOTE(review): the labels jump from 2 to 4 (3 is unused) - confirm the gap is
# intentional before treating these values as contiguous class indices.
particle_topology = dict(line=0, worm=1, dot=2, artefacts=4)

In [None]:
# One label per file name: every entry of a class list carries that class's
# integer label from particle_topology.
y_data_line = len(x_data_line) * [particle_topology["line"]]
y_data_worm = len(x_data_worm) * [particle_topology["worm"]]
y_data_dot = len(x_data_dot) * [particle_topology["dot"]]
y_data_artefacts = len(x_data_artefacts) * [particle_topology["artefacts"]]

In [None]:
# Seed used for the random splits.
random_state = 1000

# Target share of the data for the train / validation / test subsets.
train_ratio, validation_ratio, test_ratio = 0.70, 0.15, 0.15

# Share of the data that is NOT used for training (validation + test together).
test_val_size = 1 - train_ratio
# Share of that non-training part that goes to the test subset.
test_size = test_ratio / (test_ratio + validation_ratio)

In [None]:
def _split_train_val_test(file_names, holdout_fraction, test_fraction, seed):
    """Split *file_names* into train, validation and test lists.

    The split is done in two stages with the same seed:

    1. ``holdout_fraction`` of the names is set aside (validation + test);
       the remainder is the training set.
    2. ``test_fraction`` of that hold-out becomes the test set; the rest is
       the validation set.

    Only the file names are split. Each class is split separately, so no
    label array is needed; this also avoids feeding the throwaway name ``_``
    (which is additionally IPython's "last output" variable) back into
    ``train_test_split`` as data.

    Returns:
        A ``(train, val, test)`` tuple.
    """
    train, holdout = train_test_split(file_names,
                                      test_size=holdout_fraction,
                                      random_state=seed)
    val, test = train_test_split(holdout,
                                 test_size=test_fraction,
                                 random_state=seed)
    return train, val, test


x_train_line, x_val_line, x_test_line = _split_train_val_test(
    x_data_line, test_val_size, test_size, random_state)

x_train_worm, x_val_worm, x_test_worm = _split_train_val_test(
    x_data_worm, test_val_size, test_size, random_state)

x_train_dot, x_val_dot, x_test_dot = _split_train_val_test(
    x_data_dot, test_val_size, test_size, random_state)

x_train_artefacts, x_val_artefacts, x_test_artefacts = _split_train_val_test(
    x_data_artefacts, test_val_size, test_size, random_state)

In [None]:
# Report, for each class, the total number of files and how many of them ended
# up in the train, test and validation subsets (in that order).
print("Total line data: {}\n"
      "Train line data: {}\n"
      "Test line data: {}\n"
      "Val line data: {}"
      .format(*map(len, (x_data_line, x_train_line, x_test_line, x_val_line))))
print("---------------------------------")

print("Total worm data: {}\n"
      "Train worm data: {}\n"
      "Test worm data: {}\n"
      "Val worm data: {}"
      .format(*map(len, (x_data_worm, x_train_worm, x_test_worm, x_val_worm))))
print("---------------------------------")

print("Total dot data: {}\n"
      "Train dot data: {}\n"
      "Test dot data: {}\n"
      "Val dot data: {}"
      .format(*map(len, (x_data_dot, x_train_dot, x_test_dot, x_val_dot))))
print("---------------------------------")

print("Total artefacts data: {}\n"
      "Train artefacts data: {}\n"
      "Test artefacts data: {}\n"
      "Val artefacts data: {}"
      .format(*map(len, (x_data_artefacts,
                         x_train_artefacts,
                         x_test_artefacts,
                         x_val_artefacts))))
print("---------------------------------")

In [None]:
# Print dtype, shape and value range of the first raw "line" training images.
# Slicing caps the preview at 10 images without raising IndexError when fewer
# are available.
for i, file_name in enumerate(x_train_line[:10]):
    print("Image", i)
    # The context manager closes the underlying file as soon as the pixel data
    # has been copied into the array.
    with Image.open(os.path.join(data_path_line, file_name)) as img:
        img_array = np.asarray(img)
    print("Data type = ", img_array.dtype)
    print("Array shape = ", img_array.shape)
    print("Max value = ", np.max(img_array))
    print("Min value = ", np.min(img_array))
    print("-----------------------------")

In [None]:
# Show a 5x5 grid of raw "line" training images.
fig, ax = plt.subplots(nrows=5, ncols=5)
for row_idx, row in enumerate(ax):
    for col_idx, col in enumerate(row):
        # Running index over the whole grid. Using only the column index would
        # repeat the same five images on every row.
        idx = row_idx * len(row) + col_idx
        # Leave the cell empty when there are fewer images than grid cells.
        if idx < len(x_train_line):
            with Image.open(os.path.join(data_path_line, x_train_line[idx])) as img:
                col.imshow(img)
        col.axis("off")
plt.show()

In [None]:
def create_directory(folder):
    """Create *folder*, including missing parents, unless the path exists.

    A one-line status message is printed in both cases.
    """
    if os.path.exists(folder):
        print("Folder {} already exists".format(folder))
        return
    print("Creating folder {}".format(folder))
    os.makedirs(folder)

In [None]:
# Root folder of the processed dataset, located under DATA_FOLDER.
processed_data_folder = os.path.join(DATA_FOLDER, "credo_processed_dataset")

# One folder per subset.
train_data_folder, test_data_folder, val_data_folder = (
    os.path.join(processed_data_folder, subset)
    for subset in ("train", "test", "val")
)

# Inside every subset folder: one folder per class, unpacked in the order
# line, worm, dot, artefacts.
(train_data_line_folder,
 train_data_worm_folder,
 train_data_dot_folder,
 train_data_artefacts_folder) = (
    os.path.join(train_data_folder, label)
    for label in ("line", "worm", "dot", "artefacts")
)

(test_data_line_folder,
 test_data_worm_folder,
 test_data_dot_folder,
 test_data_artefacts_folder) = (
    os.path.join(test_data_folder, label)
    for label in ("line", "worm", "dot", "artefacts")
)

(val_data_line_folder,
 val_data_worm_folder,
 val_data_dot_folder,
 val_data_artefacts_folder) = (
    os.path.join(val_data_folder, label)
    for label in ("line", "worm", "dot", "artefacts")
)

In [None]:
# Create the processed-dataset folder tree: root first, then the subset
# folders, then the per-class folders of each subset.
for folder in (
        processed_data_folder,

        train_data_folder,
        test_data_folder,
        val_data_folder,

        train_data_line_folder,
        train_data_worm_folder,
        train_data_dot_folder,
        train_data_artefacts_folder,

        test_data_line_folder,
        test_data_worm_folder,
        test_data_dot_folder,
        test_data_artefacts_folder,

        val_data_line_folder,
        val_data_worm_folder,
        val_data_dot_folder,
        val_data_artefacts_folder,
):
    create_directory(folder)

In [None]:
def save_cmyk2gray(source_files, input_folder, destination_folder):
    """Convert images to Pillow mode ``"L"`` and save them under the same name.

    The function name suggests CMYK inputs, but ``convert("L")`` is applied
    to every file regardless of its source mode.

    Args:
        source_files: Iterable of file names, relative to ``input_folder``.
        input_folder: Directory the source images are read from.
        destination_folder: Directory the converted images are written to.
            It is not created here and must already exist.
    """
    for file in source_files:
        # Close each source image explicitly so that converting a large list
        # of files does not accumulate open file handles.
        with Image.open(os.path.join(input_folder, file)) as img:
            img.convert("L").save(os.path.join(destination_folder, file))

In [None]:
# Convert and store every subset of every class. Each entry is
# (file names, folder to read from, folder to write to).
for files, source_folder, target_folder in (
        # train data
        (x_train_line, data_path_line, train_data_line_folder),
        (x_train_worm, data_path_worm, train_data_worm_folder),
        (x_train_dot, data_path_dot, train_data_dot_folder),
        (x_train_artefacts, data_path_artefacts, train_data_artefacts_folder),

        # test data
        (x_test_line, data_path_line, test_data_line_folder),
        (x_test_worm, data_path_worm, test_data_worm_folder),
        (x_test_dot, data_path_dot, test_data_dot_folder),
        (x_test_artefacts, data_path_artefacts, test_data_artefacts_folder),

        # val data
        (x_val_line, data_path_line, val_data_line_folder),
        (x_val_worm, data_path_worm, val_data_worm_folder),
        (x_val_dot, data_path_dot, val_data_dot_folder),
        (x_val_artefacts, data_path_artefacts, val_data_artefacts_folder),
):
    save_cmyk2gray(files, source_folder, target_folder)

In [None]:
# Print dtype, shape and value range of the first processed "line" training
# images. Slicing caps the preview at 10 images without raising IndexError
# when fewer are available.
for i, file_name in enumerate(x_train_line[:10]):
    print("Image", i)
    # The context manager closes the underlying file as soon as the pixel data
    # has been copied into the array.
    with Image.open(os.path.join(train_data_line_folder, file_name)) as img_processed:
        img_processed_array = np.asarray(img_processed)
    print("Data type = ", img_processed_array.dtype)
    print("Array shape = ", img_processed_array.shape)
    print("Max value = ", np.max(img_processed_array))
    print("Min value = ", np.min(img_processed_array))
    print("-----------------------------")
