# 0. SETUP

Place all the images into a new folder called "all". This notebook will generate a CSV file for analysing them.

In [None]:
import os
import glob
import pandas as pd
import xml.etree.ElementTree as ET
import numpy as np

In [None]:
# Folder that receives the generated label CSV.
CSV_PATH = "workspace/annotations"
# Folder (relative to the working directory) that is scanned for *.xml annotations.
IMAGE_PATH = os.path.join("workspace", "images", "dataset")
# File name of the label CSV written inside CSV_PATH.
LABEL_FILE = "dataset_label.csv"

In [None]:
def xml_to_csv(path):
    """Collect the bounding boxes of every XML annotation in a folder.

    Each ``*.xml`` file directly inside ``path`` is parsed and every
    ``<object>`` element in it becomes one row.  Elements are looked up by
    tag name instead of by position, so files whose tags are ordered
    differently (or that carry extra tags) are still read correctly.

    Args:
        path: Directory containing the annotation files.

    Returns:
        pandas.DataFrame with the columns ``filename``, ``width``,
        ``height``, ``class``, ``xmin``, ``ymin``, ``xmax`` and ``ymax``.
        The frame is empty (but keeps those columns) when no object is found.
    """
    column_name = ['filename', 'width', 'height', 'class', 'xmin', 'ymin', 'xmax', 'ymax']
    box_tags = ('xmin', 'ymin', 'xmax', 'ymax')
    xml_list = []
    for xml_file in glob.glob(os.path.join(path, '*.xml')):
        root = ET.parse(xml_file).getroot()
        members = root.findall('object')
        if not members:
            # Nothing to record; do not require filename/size on such files.
            continue

        filename = root.find('filename').text
        size = root.find('size')
        width = int(size.find('width').text)
        height = int(size.find('height').text)

        for member in members:
            box = member.find('bndbox')
            corners = [int(box.find(tag).text) for tag in box_tags]
            xml_list.append(
                (filename, width, height, member.find('name').text, *corners)
            )
    return pd.DataFrame(xml_list, columns=column_name)



In [None]:
def main():
    """Convert the XML annotations under IMAGE_PATH into the label CSV.

    The annotations are read from ``IMAGE_PATH`` (resolved against the
    current working directory) and written to ``CSV_PATH/LABEL_FILE``
    without the DataFrame index.  The output folder is created first because
    ``DataFrame.to_csv`` does not create missing directories.
    """
    image_path = os.path.join(os.getcwd(), IMAGE_PATH)
    xml_df = xml_to_csv(image_path)
    os.makedirs(CSV_PATH, exist_ok=True)
    xml_df.to_csv(CSV_PATH + "/" + LABEL_FILE, index=False)
    print('Successfully converted xml to csv.')


In [None]:
# Run the conversion: builds the label CSV from the XML annotations.
main()

# 1. Saving the Data

In [None]:
# Load the label CSV, then shuffle the rows (sample(frac=1) returns every row
# in random order) and renumber the index, so the later train/test split is
# drawn from a randomized sequence.
df = pd.read_csv(os.path.join(CSV_PATH, LABEL_FILE))
df = df.sample(frac=1)
df = df.reset_index(drop=True)

df

In [None]:
import random

# Map every image file name to its class label.  A file name that appears in
# several rows keeps the class of its last row.
list_of_image = dict(zip(df["filename"], df["class"]))

len(list_of_image)

# 2. Analysing the Data

In [None]:
import matplotlib.pyplot as plt

# Tally how many images carry each class label.
label_class = {}

for category in list_of_image.values():
    label_class[category] = label_class.get(category, 0) + 1

label_class

In [None]:
# Bar chart of the number of images per class.
fig = plt.figure()
# add_axes takes [left, bottom, width, height].
ax = fig.add_axes([0, 0, 2, 2])

# Class names and their counts, in the same order.
category = list(label_class)
value = [label_class[name] for name in category]

ax.bar(category, value)
plt.show()

print(label_class)

# 3. Split the images

In [None]:
# Destination folders for the train/test split.
TRAIN_FOLDER = os.path.join("workspace", "images", "train")
TEST_FOLDER = os.path.join("workspace", "images", "test")

# os.makedirs needs no shell, creates missing parent folders and does nothing
# when the folder already exists.
os.makedirs(TRAIN_FOLDER, exist_ok=True)
os.makedirs(TEST_FOLDER, exist_ok=True)

# Maximum number of images taken per category before splitting into train/test.
TOTAL_COUNT = 180

In [None]:
import shutil

# Fraction of each category's selected images copied to TRAIN_FOLDER; the
# remaining selected images go to TEST_FOLDER.
TRAIN_DATA = 0.8
# Categories to leave out of the split, for example
# ['moist plunger', 'moist syringe'], ['wet plunger', 'wet syringe', 'moist syringe']
# or ['dry plunger', 'dry syringe'].
exception = []


def _copy_pair(img_file, xml_file, dest_folder):
    """Copy an image and its XML annotation into dest_folder.

    A missing file is reported and skipped so that one bad entry does not
    stop the whole split.
    """
    for src in (img_file, xml_file):
        if os.path.isfile(src):
            shutil.copy(src, dest_folder)
        else:
            print("Missing file, not copied: {}".format(src))


for _category in category:
    if _category in exception:
        continue

    # Use at most TOTAL_COUNT images of this category.
    total_num = min(label_class[_category], TOTAL_COUNT)
    training_num = int(total_num * TRAIN_DATA)

    print(training_num)
    print(_category)

    img_num = 0
    for img_name in list_of_image:
        if list_of_image[img_name] != _category:
            continue
        if img_num >= total_num:
            # Quota reached.  The original test (img_num <= total_num) let
            # one extra image through whenever the category was capped.
            break

        # The annotation shares the image's base name, with an .xml extension.
        name = os.path.splitext(img_name)
        img_file = os.path.join(os.getcwd(), IMAGE_PATH, img_name)
        xml_file = os.path.join(os.getcwd(), IMAGE_PATH, "{}.xml".format(name[0]))

        # The first training_num images train; the rest of the quota tests.
        dest_folder = TRAIN_FOLDER if img_num < training_num else TEST_FOLDER
        _copy_pair(img_file, xml_file, dest_folder)
        img_num = img_num + 1

    # Report the real test count instead of a hard-coded 20 %.
    print("{} done with training image : {}, test image : {}".format(_category, training_num, total_num - training_num))


# 4. Sort the image folder (Optional)

In [None]:
import shutil

# Categories to skip when sorting.
exception = []
for _category in category:
    if _category in exception:
        continue

    print(_category)
    # One sub-folder per category, created inside the image folder.
    CATEGORY_FOLDER = os.path.join(IMAGE_PATH, _category)
    os.makedirs(CATEGORY_FOLDER, exist_ok=True)

    for img_name in list_of_image:
        if list_of_image[img_name] != _category:
            continue

        # Only the image is copied here; the XML annotation is left in place.
        img_file = os.path.join(IMAGE_PATH, img_name)
        if os.path.isfile(img_file):
            shutil.copy(img_file, CATEGORY_FOLDER)
        else:
            print("Missing file, not copied: {}".format(img_file))

    print("done with {}".format(_category))

# 5. Rename all the labels in the xml files (Optional)

In [None]:
# Changing the label name
import fileinput

FOLDER_PATH = os.path.join("workspace", "images", "dataset")
# ALIAS is not used by the relabelling; PREFIX only counts the images visited.
ALIAS = "MS"
PREFIX = 1

# Text to look for in each annotation file and its replacement.
OLD_LABEL = "moist plunger"
NEW_LABEL = "wet plunger"

for img_file in glob.glob(FOLDER_PATH + '/*.jpg'):

    img_base = os.path.basename(img_file)
    name = os.path.splitext(img_base)[0]
    xml_file = os.path.join(FOLDER_PATH, "{}.xml".format(name))

    try:
        with fileinput.FileInput(xml_file, inplace=True) as file:
            for line in file:
                # With inplace=True, print() writes into the file being rewritten.
                print(line.replace(OLD_LABEL, NEW_LABEL), end='')
    except (OSError, UnicodeError) as err:
        # Typically an image without an annotation file: report it and move on
        # instead of silently hiding every possible error.
        print("Skipping {}: {}".format(xml_file, err))
    else:
        print(img_base)

    PREFIX = PREFIX + 1
    


# 6. Change the image name (Optional)

In [None]:
import fileinput

FOLDER_PATH = os.path.join("dataset", "640x640", "workspace_image")
# New names have the form <ALIAS>_<4-digit counter>, e.g. DTP_0001.jpg.
ALIAS = "DTP"
PREFIX = 1

# Sorted so that the numbering is the same on every run.
for img_file in sorted(glob.glob(FOLDER_PATH + '/*.jpg')):

    img_base = os.path.basename(img_file)
    name = os.path.splitext(img_base)[0]
    xml_file = os.path.join(FOLDER_PATH, "{}.xml".format(name))

    new_img_base = "{}_{:0>4}.jpg".format(ALIAS, PREFIX)
    new_img_file = os.path.join(FOLDER_PATH, new_img_base)
    new_xml_file = os.path.join(FOLDER_PATH, "{}_{:0>4}.xml".format(ALIAS, PREFIX))
    # The counter advances for every image, whether or not it gets renamed.
    PREFIX = PREFIX + 1

    same_file = (os.path.normcase(os.path.abspath(new_img_file))
                 == os.path.normcase(os.path.abspath(img_file)))
    if same_file:
        # Already carries its target name (e.g. the cell was run before).
        continue
    if os.path.exists(new_img_file) or os.path.exists(new_xml_file):
        # Never overwrite another image/annotation that already owns the name.
        print("Skipping {}: target {} already exists".format(img_file, new_img_base))
        continue

    if os.path.isfile(xml_file):
        print(xml_file)
        # Point the annotation at the new image name, then rename the file.
        with fileinput.FileInput(xml_file, inplace=True) as file:
            for line in file:
                # With inplace=True, print() writes into the file being rewritten.
                print(line.replace(img_base, new_img_base), end='')
        os.rename(xml_file, new_xml_file)

    # Images without an annotation file are still renamed.
    os.rename(img_file, new_img_file)
    


# 7. Changing the image to grayscale

In [None]:
# Folder whose *.jpg files the cells below read and write.
# NOTE: this rebinds the IMAGE_PATH defined in the setup section.
IMAGE_PATH = os.path.join("dataset", "640x640", "workspace_image")

In [None]:
from PIL import Image

# Prefix of the files written by this cell.
GRAYSCALE_PREFIX = 'grayscale_'

for img_file in glob.glob(IMAGE_PATH + '/*.jpg'):

    img_base = os.path.basename(img_file)

    # Results are saved next to the inputs; skip them so that running the
    # cell again does not produce grayscale_grayscale_* files.
    if img_base.startswith(GRAYSCALE_PREFIX):
        continue

    # The context manager closes the source file once it has been converted.
    with Image.open(img_file) as img:
        imgGray = img.convert('L')
    imgGray.save(os.path.join(IMAGE_PATH, GRAYSCALE_PREFIX + img_base))
    

# 8. Otsu Thresholding

In [None]:
import cv2 as cv
import numpy as np
from matplotlib import pyplot as plt

# For every image, show the original, its grayscale version, a fixed global
# threshold and Otsu's threshold side by side.
for i, img_file in enumerate(glob.glob(IMAGE_PATH + '/*.jpg'), start=1):
    print("Evaluating {}".format(img_file))

    original_image = cv.imread(img_file)
    img = cv.imread(img_file, 0)
    if original_image is None or img is None:
        # cv.imread returns None instead of raising when a file cannot be read.
        print("Could not read {}, skipping".format(img_file))
        continue

    # OpenCV loads colour images as BGR while matplotlib expects RGB.
    original_image = cv.cvtColor(original_image, cv.COLOR_BGR2RGB)

    # global thresholding
    g_threshold, g_image_result = cv.threshold(img, 127, 255, cv.THRESH_BINARY)
    # Otsu's thresholding
    o_threshold, o_image_result = cv.threshold(img, 0, 255, cv.THRESH_BINARY + cv.THRESH_OTSU)

    # plot all the images
    images = [original_image, img, g_image_result, o_image_result]

    titles = ["Original Image",
              "GrayScale",
              "Global Thresholding (v=127)",
              "Ostu's Thresholding"]

    # The colour image uses the default colormap, the others the gray one.
    colormaps = [None, 'gray', 'gray', 'gray']

    # i numbers the figures; it was undefined in the original cell.
    plt.figure(i, figsize=(20, 20))

    for position, (picture, colormap, title) in enumerate(zip(images, colormaps, titles), start=1):
        plt.subplot(2, 2, position)
        plt.imshow(picture, colormap)
        plt.title(title)
        plt.xticks([])
        plt.yticks([])
    plt.show()

In [None]:

# Prefix of the thresholded images written by this cell.
OTSU_PREFIX = "otsu_"

for i, img_file in enumerate(glob.glob(IMAGE_PATH + '/*.jpg'), start=1):
    # File name without the folder part
    img_base = os.path.basename(img_file)

    # Results are saved next to the inputs; skip them so that running the
    # cell again does not produce otsu_otsu_* files.
    if img_base.startswith(OTSU_PREFIX):
        continue

    img = cv.imread(img_file, 0)
    if img is None:
        # cv.imread returns None instead of raising when a file cannot be read.
        print("Could not read {}, skipping".format(img_file))
        continue

    # Otsu's thresholding
    threshold, image_result = cv.threshold(img, 0, 255, cv.THRESH_BINARY + cv.THRESH_OTSU)
    print("Evaluating {}, Otsu thresholding score: {}".format(img_file, threshold))

    # plot the image, its histogram and the thresholded result
    titles = ['Original Noisy Image', 'Histogram', "Otsu's Thresholding"]

    # i numbers the figures; it was undefined in the original cell.
    plt.figure(i, figsize=(20, 20))
    plt.subplot(3, 3, 1), plt.imshow(img, 'gray')
    plt.title(titles[0]), plt.xticks([]), plt.yticks([])

    plt.subplot(3, 3, 2), plt.hist(img.ravel(), 256)
    plt.title(titles[1]), plt.xticks([]), plt.yticks([])

    plt.subplot(3, 3, 3), plt.imshow(image_result, 'gray')
    plt.title(titles[2]), plt.xticks([]), plt.yticks([])
    plt.show()

    # Save the image (the module is imported as cv, not cv2)
    cv.imwrite(os.path.join(IMAGE_PATH, OTSU_PREFIX + img_base), image_result)