### Importing the necessary packages

In [1]:
# handling the file system
import os
# handling metadata in the zip file
import zipfile as zf
import xml.etree.ElementTree as ET
# documenting the annotation
import pandas as pd
# handling json file
import json

### Initialize the global variables

In [2]:
# Class labels; the same strings are used as annotation sub-folder names
# and as the expected prefixes of the zipped annotation files.
labels = ["glaucoma", "non_glaucoma"]

# Root folders, given relative to the notebook's working directory.
path_docs = "./../../../data"
path_dataset = "./../../../dataset_used/"

# Sub-folders of the dataset root.
path_annotation = os.path.join(path_dataset, "annotations")
path_zip_files = os.path.join(path_dataset, "zipped_files_annotation")

### Create the directories to store the annotation files

In [3]:
# Create one annotation folder per label; report the ones that are already there.
for label in labels:
    labelled_dir = os.path.join(path_annotation, label)
    if os.path.exists(labelled_dir):
        print(f"{labelled_dir} directory already exists")
        continue
    os.makedirs(labelled_dir)
# the loop variable is not needed by later cells
del labelled_dir

./../../../dataset_used/annotations\glaucoma directory already exists
./../../../dataset_used/annotations\non_glaucoma directory already exists


### Handle the zipped files

In [4]:
# Names of all zip archives found in the zipped-annotation folder
zip_files = [name for name in os.listdir(path_zip_files) if name.endswith(".zip")]

for zip_file in zip_files:
    # The archive's file-name prefix decides which label folder receives its content;
    # archives matching neither label are opened but not extracted.
    target_label = next((lbl for lbl in labels[:2] if zip_file.startswith(lbl)), None)
    with zf.ZipFile(os.path.join(path_zip_files, zip_file), 'r') as zip_ref:
        if target_label is not None:
            zip_ref.extractall(os.path.join(path_annotation, target_label))

### Handle the annotation files

In [5]:
def get_annotation(xml_file:str, counter_id:int=1):
    """Read the per-image annotations from an xml file and document them.

    Every direct ``<image>`` child of the xml root yields one entry in the
    returned list and one row in the returned dataframe. Other root children
    are ignored.

    Args:
        xml_file (str): the path to the xml file
        counter_id (int, optional): the id assigned to the first image; the
            following images receive consecutive ids. Defaults to 1.

    Returns:
        list, pd.DataFrame, int:
            - a list of ``{"metadata": ..., "annotation": ...}`` dictionaries.
              ``metadata`` holds ``id_count``, ``img_name``, ``img_width`` and
              ``img_height`` (width/height are kept as the strings found in
              the xml); ``annotation`` is a list of ``{"label", "points"}``
              dictionaries, one per child element of the image.
            - a dataframe with the columns ``img_name``, ``disc annotated``
              and ``cup annotated``. ``disc annotated`` is always ``True``;
              ``cup annotated`` is ``True`` only when the image has exactly
              two child elements.
            - the next unused id, so that numbering can be continued by a
              following call.

    Raises:
        KeyError: if an image lacks the ``name``/``width``/``height``
            attribute or one of its children lacks ``label``/``points``.
    """
    # Parse the xml file
    root = ET.parse(xml_file).getroot()

    annotasi = []
    doc_rows = []

    # Loop through the xml file
    for child in root:
        if child.tag != "image":
            continue

        metadata = {
            "id_count": counter_id,
            "img_name": child.attrib["name"],
            "img_width": child.attrib["width"],
            "img_height": child.attrib["height"],
        }
        annot_content = [
            {"label": subchild.attrib["label"], "points": subchild.attrib["points"]}
            for subchild in child
        ]
        annotasi.append({"metadata": metadata,
                         "annotation": annot_content})

        # The flag is derived from this image's own children. Relying on a loop
        # index left over from the inner loop would raise NameError for a first
        # image without children and reuse the previous image's value otherwise.
        cup_annotated = len(annot_content) == 2
        doc_rows.append([metadata["img_name"], True, cup_annotated])
        counter_id += 1

    # Build the documentation once instead of growing the dataframe row by row
    documentation = pd.DataFrame(doc_rows, columns=['img_name', 'disc annotated', 'cup annotated'])

    return annotasi, documentation, counter_id

In [6]:
# Folders holding the extracted xml files of each class
gcm_dir = os.path.join(path_annotation, labels[0])
ngcm_dir = os.path.join(path_annotation, labels[1])

# Glaucoma annotations are spread over two files; the second file continues
# the id numbering where the first one stopped.
gcm_annot, gcm_documentation, gcm_lastid = get_annotation(os.path.join(gcm_dir, "annotations1.xml"))
gcm_err_annot, gcm_err_documentation, _ = get_annotation(os.path.join(gcm_dir, "annotations.xml"), gcm_lastid)
# Non-glaucoma annotations start again from the default id
ngcm_annot, ngcm_documentation, _ = get_annotation(os.path.join(ngcm_dir, "annotations.xml"))

# Collect both glaucoma files in a single list
gcm_annot.extend(gcm_err_annot)

In [7]:
# Stack the documentation of all annotation files into a single table
annotation_doc = pd.concat(
    [gcm_err_documentation, gcm_documentation, ngcm_documentation],
    ignore_index=True,
)

# The second "_"-separated token of the image name encodes the class:
# "1" maps to the first label, anything else to the second.
annotation_doc["label"] = annotation_doc["img_name"].apply(
    lambda name: labels[0] if name.split("_")[1] == "1" else labels[1]
)

# An image is kept only when its disc and cup flags agree
flags_agree = annotation_doc["disc annotated"] == annotation_doc["cup annotated"]
annotation_doc.loc[flags_agree, "status"] = "keep"
annotation_doc.loc[~flags_agree, "status"] = "discard"

In [8]:
# Rows of each class, selected by the label column
gcm = annotation_doc[annotation_doc["label"] == labels[0]]
ngcm = annotation_doc[annotation_doc["label"] == labels[1]]

In [9]:
# Per-class summary of the keep/discard status: absolute counts plus the
# share of each status in percent (rounded to two decimals).
# The rename relies on value_counts() naming its result "count".
gcm_view = (
    gcm.status.value_counts()
    .to_frame()
    .assign(**{"glaucoma percentage": (gcm.status.value_counts(normalize=True) * 100).round(2)})
    .reset_index()
    .rename(columns={"count": "glaucoma cnt"})
)
ngcm_view = (
    ngcm.status.value_counts()
    .to_frame()
    .assign(**{"non-glaucoma percentage": (ngcm.status.value_counts(normalize=True) * 100).round(2)})
    .reset_index()
    .rename(columns={"count": "non-glaucoma cnt"})
)

In [10]:
# Put both class summaries side by side, one row per status
sum_view = pd.merge(ngcm_view, gcm_view, how="outer", on="status")
sum_view

Unnamed: 0,status,non-glaucoma cnt,non-glaucoma percentage,glaucoma cnt,glaucoma percentage
0,discard,29,14.72,36,16.22
1,keep,168,85.28,186,83.78


### Save annotation and documentation

In [11]:
# documentation of the annotation
# Make sure the output folder exists; to_excel/to_csv do not create missing folders.
os.makedirs(path_docs, exist_ok=True)
sum_view.to_excel(os.path.join(path_docs, "summary_annotation_documentation.xlsx"), index=False)
annotation_doc.to_csv(os.path.join(path_docs, "annotation_documentation.csv"), index=False)

In [12]:
# the annotation: one json file per class, written into that class' annotation folder
for label_name, records in ((labels[0], gcm_annot), (labels[1], ngcm_annot)):
    with open(os.path.join(path_annotation, label_name, "annotations.json"), "w") as file:
        json.dump(records, file)