### Importing the necessary packages

In [1]:
# handling the file system
import os
from pathlib import Path
# handling metadata in the zip file
import zipfile as zf
import xml.etree.ElementTree as ET
# documenting the annotation
import pandas as pd
# handling json file
import json
# handling the annotation and image
import matplotlib.pyplot as plt
from matplotlib.patches import Polygon
# handling the environment variable
from dotenv import load_dotenv

# Load the environment variables via python-dotenv before any path is built;
# ORI_PATH is read from os.environ in the next cell.
load_dotenv()

True

### Initialize the global variables

In [2]:
# Root directory of the project data, taken from the ORI_PATH environment variable.
path_full = os.environ.get("ORI_PATH")
if path_full is None:
    # Without this check os.path.join(None, ...) fails with an opaque TypeError.
    raise RuntimeError(
        "The ORI_PATH environment variable is not set; "
        "define it in the environment or in the .env file."
    )
# Directories derived from the root.
path_download = os.path.join(path_full, "manual_download")
path_dataset = os.path.join(path_full, "datasets/preprocessed")
path_docs = os.path.join(path_full, "data")
path_zip_files = os.path.join(path_download, "zipped_files_annotation")
path_target_annot = os.path.join(path_dataset, "annotations")
path_src_imgs = os.path.join(path_dataset, "fundus_image")
# Class names; they are also used as sub-directory names and zip-file prefixes.
classes = ['glaucoma', 'non_glaucoma']
# All zip archives in the download folder. os.listdir returns entries in
# arbitrary order, so sort them to make the extraction order reproducible.
zip_files = sorted(file for file in os.listdir(path_zip_files) if file.endswith(".zip"))

### Create the directories to store the annotation files

In [3]:
# Make sure every class has its own sub-directory under the annotation folder.
for label in classes:
    Path(path_target_annot, label).mkdir(parents=True, exist_ok=True)
# Do not leave the loop variable behind in the notebook namespace.
del label

### Handle the zipped files

In [4]:
# Extract every archive into the annotation folder of the class its file name starts with.
for zip_file in zip_files:
    # First class (in `classes` order) whose name prefixes the archive name.
    target_class = next((label for label in classes if zip_file.startswith(label)), None)
    if target_class is None:
        # The archive does not belong to a known class: nothing to extract.
        continue
    with zf.ZipFile(os.path.join(path_zip_files, zip_file), 'r') as zip_ref:
        zip_ref.extractall(os.path.join(path_target_annot, target_class))
# Remove the temporaries from the namespace. pop() is used instead of `del`
# because the loop variables are never bound when there is no archive, and
# `del` on an unbound name raises NameError.
for _tmp_name in ("zip_file", "zip_ref", "target_class", "zip_files"):
    globals().pop(_tmp_name, None)
del _tmp_name

### Handle the annotation files

In [5]:
def get_annotation(xml_file:str, counter_id:int=1):
    """Parse an annotation XML file into annotation records and a summary table.

    Every ``image`` element directly under the XML root yields one record with
    the image metadata and the ``label``/``points`` attributes of each of its
    child elements, plus one row in the documentation table.

    Args:
        xml_file (str): the path to the xml file
        counter_id (int, optional): the id assigned to the first image; it is
            incremented by one for every following image. Defaults to 1.

    Returns:
        list, pd.DataFrame, int: a list of dictionaries
        (``{"metadata": ..., "annotation": [...]}``), a dataframe with the
        columns ``img_name``, ``disc annotated`` and ``cup annotated``, and the
        next unused id (pass it as ``counter_id`` to continue the numbering
        with another file).
    """
    root = ET.parse(xml_file).getroot()
    # One row per image, documenting which structures were annotated.
    documentation = pd.DataFrame(columns=['img_name', 'disc annotated', 'cup annotated'])
    annotations = []

    for image in root:
        # Only "image" elements carry annotations; skip everything else.
        if image.tag != "image":
            continue
        metadata = {
            "id_count": counter_id,
            "img_name": image.attrib["name"],
            "img_width": image.attrib["width"],
            "img_height": image.attrib["height"],
        }
        shapes = [
            {"label": shape.attrib["label"], "points": shape.attrib["points"]}
            for shape in image
        ]
        annotations.append({"metadata": metadata, "annotation": shapes})
        # The cup counts as annotated when the image has exactly two shapes.
        # The count comes from this image's own shapes, so an image without
        # any shape can neither reuse the previous image's result nor hit an
        # unbound loop counter. The disc column is always True, as before.
        documentation.loc[len(documentation)] = [metadata["img_name"], True, len(shapes) == 2]
        counter_id += 1

    return annotations, documentation, counter_id

In [6]:
# Extract the annotations from the XML files.
gcm_annot, gcm_documentation, gcm_lastid = get_annotation(os.path.join(path_target_annot, classes[0], "annotations1.xml"))
# The second glaucoma file continues the id numbering where the first one
# stopped (gcm_lastid is the next unused id returned by get_annotation).
gcm_err_annot, gcm_err_documentation, _ = get_annotation(os.path.join(path_target_annot, classes[0], "annotations.xml"), gcm_lastid)
ngcm_annot, ngcm_documentation, _ = get_annotation(os.path.join(path_target_annot, classes[1], "annotations.xml"))
# Merge both glaucoma annotation lists. extend() binds no loop variable, so
# the clean-up below cannot fail with NameError when the second list is empty.
gcm_annot.extend(gcm_err_annot)
del _

In [7]:
# Stack the three per-file documentation tables into a single one.
annotation_doc = pd.concat(
    [gcm_err_documentation, gcm_documentation, ngcm_documentation],
    ignore_index=True,
)

# Label column: the second "_"-separated token of the image name decides the
# class ("1" maps to classes[0], anything else to classes[1]).
annotation_doc["label"] = annotation_doc["img_name"].apply(
    lambda name: classes[0] if name.split("_")[1] == "1" else classes[1]
)
# Status column: keep an image when its disc and cup flags agree, discard it otherwise.
annotation_doc["status"] = (
    annotation_doc["disc annotated"] == annotation_doc["cup annotated"]
).map({True: "keep", False: "discard"})

In [8]:
# Split the documentation table by class label.
gcm = annotation_doc[annotation_doc["label"] == classes[0]]
ngcm = annotation_doc[annotation_doc["label"] == classes[1]]

In [9]:
# Absolute number of images per status, one table per class.
gcm_view = gcm["status"].value_counts().to_frame()
ngcm_view = ngcm["status"].value_counts().to_frame()

# Share of each status within its class, in percent with two decimals.
gcm_view["glaucoma percentage"] = (gcm["status"].value_counts(normalize=True) * 100).round(2)
ngcm_view["non-glaucoma percentage"] = (ngcm["status"].value_counts(normalize=True) * 100).round(2)

# Turn the status index into a regular column and give the count column a
# class-specific name.
gcm_view = gcm_view.reset_index().rename(columns={"count":"glaucoma cnt"})
ngcm_view = ngcm_view.reset_index().rename(columns={"count":"non-glaucoma cnt"})

In [10]:
# Put the two per-class tables side by side, matched on the status value.
sum_view = pd.merge(ngcm_view, gcm_view, how="outer", on="status")
sum_view

Unnamed: 0,status,non-glaucoma cnt,non-glaucoma percentage,glaucoma cnt,glaucoma percentage
0,discard,29,14.72,36,16.22
1,keep,168,85.28,186,83.78


### Save annotation and documentation

In [11]:
# Documentation of the annotation.
# The annotation folders are created explicitly in earlier cells; do the same
# for the documentation folder so the writes below cannot fail because it is missing.
os.makedirs(path_docs, exist_ok=True)
sum_view.to_excel(os.path.join(path_docs, "summary_annotation_documentation.xlsx"), index=False)
annotation_doc.to_csv(os.path.join(path_docs, "annotation_documentation.csv"), index=False)

In [12]:
# Write each class' annotation list as JSON into that class' annotation folder.
with open(os.path.join(path_target_annot, classes[0], "annotations.json"), mode="w") as file:
    file.write(json.dumps(gcm_annot))
with open(os.path.join(path_target_annot, classes[1], "annotations.json"), mode="w") as file:
    file.write(json.dumps(ngcm_annot))

# Create annotation images

In [13]:
# JSON annotation file of each class (glaucoma first, then non-glaucoma).
path_annot_gcm, path_annot_ngcm = (
    os.path.join(path_target_annot, label, "annotations.json") for label in classes[:2]
)
# Source image folder of each class, in the same order.
path_img_gcm, path_img_ngcm = (
    os.path.join(path_src_imgs, label) for label in classes[:2]
)

In [14]:
# Read the annotation lists back from disk. The context managers close the
# file handles, which a bare json.load(open(...)) leaves open.
with open(path_annot_gcm) as file:
    annot_gcm = json.load(file)
with open(path_annot_ngcm) as file:
    annot_ngcm = json.load(file)

In [15]:
# One output folder per class for the rendered annotation images.
for class_label in classes:
    Path(path_dataset, "annot_image", class_label).mkdir(parents=True, exist_ok=True)

In [16]:
def save_annotated_image(annotation_data, path_img, classes):
    """Draw the annotated polygons on each source image and save the result as PNG.

    The output goes to ``<path_dataset>/annot_image/<classes>/<name>.png``,
    where ``path_dataset`` is the module-level dataset path and ``<name>`` is
    the image file name without its final extension. Images whose output file
    already exists, or whose source file is missing, are skipped.

    Args:
        annotation_data (list): annotation records as loaded from the json file;
            each record provides ``['metadata']['img_name']`` and a list of
            ``{'label', 'points'}`` dictionaries under ``['annotation']``
        path_img (str): the path to directory where image is stored
        classes (str): the class of the image, used as the output sub-directory
    """
    for annot in annotation_data:
        file_name = annot['metadata']['img_name']
        source_path = os.path.join(path_img, file_name)
        # splitext removes only the final extension, so dotted names such as
        # "a.b.jpg" and "a.c.jpg" no longer collapse to the same output file.
        image_path = os.path.join(path_dataset, 'annot_image', classes,
                                  f'{os.path.splitext(file_name)[0]}.png')
        # Skip images that are already rendered or whose source file is missing.
        if os.path.exists(image_path) or not os.path.isfile(source_path):
            continue
        fig, ax = plt.subplots()
        try:
            # The image must be shown on the axes so it is part of the saved figure.
            ax.imshow(plt.imread(source_path))
            for shape in annot['annotation']:
                # The cup outline is blue, every other label is green.
                color = '#0028DB' if shape['label'] == "cup" else '#00DB3B'
                # "x1,y1;x2,y2;..." -> [[x1, y1], [x2, y2], ...]
                points = [list(map(float, item.split(','))) for item in shape['points'].split(';')]
                ax.add_patch(Polygon(points, edgecolor=color, facecolor='none',
                                     label=shape['label'], linewidth=.5))
            ax.axis('off')
            fig.savefig(image_path,
                        bbox_inches='tight', dpi=300,
                        transparent=True, pad_inches=0)
        finally:
            # Always release the figure, even when reading or saving fails.
            plt.close(fig)

In [17]:
# Render the annotated images class by class: glaucoma first, then non-glaucoma.
for class_label, annot, path_img in (
    (classes[0], annot_gcm, path_img_gcm),
    (classes[1], annot_ngcm, path_img_ngcm),
):
    save_annotated_image(annotation_data=annot, path_img=path_img, classes=class_label)