# Author: Tim Harmling
- **Description:** Creates a dataset of paired *.jpg and *.txt files that can be used by our handwriting model. Each *.jpg file contains a cropped image of handwritten text, and the *.txt file with the same base name contains the text shown in that image.

# Create Dataset

In [None]:
import os
import xml.etree.ElementTree as ET
from builtins import print

import tensorflow as tf
# import keras_cv

from tqdm.auto import tqdm
from tensorflow import keras
#from keras_cv import bounding_box
#from keras_cv import visualization
from keras.models import load_model
import numpy as np
from keras.models import Sequential, model_from_json
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import cv2
from keras.callbacks import EarlyStopping
import re

In [None]:
# Scratch cell: shows that the integer 0 converts to False.
he = 0
print(bool(he))

In [None]:
path = "../data_zettel/Annotations"

# Read the directory listing once; both path lists below are derived from it.
directory_entries = os.listdir(path)

# Sorted full paths of every annotation file (*.xml) in `path`.
xml_files = sorted(
    os.path.join(path, entry)
    for entry in directory_entries
    if entry.endswith(".xml")
)

# Sorted full paths of every *.jpg file in `path`.
# NOTE(review): this scans the annotations directory, not an image directory -
# confirm that is intended.
jpg_files = sorted(
    os.path.join(path, entry)
    for entry in directory_entries
    if entry.endswith(".jpg")
)

In [None]:
def create_box(bbox):
    """Convert a ``bndbox`` XML element into ``[xmin, ymin, xmax, ymax]``.

    Args:
        bbox: Element whose ``xmin``, ``ymin``, ``xmax`` and ``ymax`` children
            hold numeric text.

    Returns:
        A list with the four coordinates as floats, in that order.
    """
    corner_tags = ("xmin", "ymin", "xmax", "ymax")
    return [float(bbox.find(tag).text) for tag in corner_tags]

In [None]:
def _normalise_value(value):
    """Return numeric text as an integer string, any other text unchanged."""
    try:
        # Numbers are truncated to an integer and stored as text again.
        return str(int(float(value)))
    except (ValueError, OverflowError):
        # Not a finite number ("abc", "nan", "inf"): keep the original text.
        return value


def _first_text_value(obj):
    """Return the first usable attribute value of an object, or None."""
    for attribute in obj.findall("attributes/attribute"):
        value_element = attribute.find("value")
        if value_element is None:
            continue
        value = value_element.text
        # Empty values and boolean flags carry no handwritten text.
        if value is None or value.lower() in ("true", "false"):
            continue
        return _normalise_value(value)
    return None


def parse_annotation_fake(xml_file):
    """Parse one annotation XML file.

    Objects named ``Wohnsitz_waehrend_Ausbildung``, ``Ausbildung``, ``Person``
    or ``Wohnsitz`` are treated as main boxes; every other object is a
    sub-field whose first usable attribute value is collected.

    Args:
        xml_file: Path or file object accepted by ``ET.parse``.

    Returns:
        A tuple ``(image_path, main_boxes, main_classes, values, sub_classes)``.
        ``values`` and ``sub_classes`` are parallel lists: ``values[k]``
        belongs to ``sub_classes[k]`` and is ``None`` when the object has no
        usable attribute value. This alignment matters because callers index
        ``values`` by the position of the sub-class.
    """
    root = ET.parse(xml_file).getroot()

    image_name = root.find("filename").text
    image_path = f'../data_zettel/filled_resized/{image_name}'

    main_labels = {
        'Wohnsitz_waehrend_Ausbildung',
        'Ausbildung',
        'Person',
        'Wohnsitz',
    }

    main_classes = []
    main_boxes = []
    sub_classes = []
    values = []

    for obj in root.iter("object"):
        cls = obj.find("name").text

        if cls in main_labels:
            main_classes.append(cls)
            main_boxes.append(create_box(obj.find("bndbox")))
            continue

        # Always append exactly one entry per sub-object so that both lists
        # stay index-aligned.
        sub_classes.append(cls)
        values.append(_first_text_value(obj))

    return image_path, main_boxes, main_classes, values, sub_classes

In [None]:
from bounding_box.model import load_weight_model, predict_image,plot_image, get_templated_data, edit_sub_boxes_cut_links, edit_sub_boxes_cut_top
from bounding_box.template import build_templating_data
# Unpack the ten values returned by build_templating_data() into module-level
# names; the order below matches the original single-line unpacking.
(
    org_ms_boxes_person,
    org_ms_boxes_wohnsitz,
    org_ms_boxes_ausbildung,
    org_ms_boxes_wwa,
    person_class_ids,
    ausbildung_class_ids,
    wohnsitz_class_ids,
    wwa_class_ids,
    widthOrgImag,
    heightOrgImag,
) = build_templating_data()



In [None]:
from bounding_box.model import load_weight_model, predict_image,plot_image, get_templated_data, edit_sub_boxes_cut_links, edit_sub_boxes_cut_top
from bounding_box.template import build_templating_data


In [None]:
def sort_box_to_class(sub_boxes, sub_classes_numbers, sub_classes_string, values):
    """Pair predicted sub-boxes with annotated values via their class.

    Each annotated class name is translated to its numeric class id. Every
    predicted box whose id in ``sub_classes_numbers`` equals that number is
    emitted together with the value annotated for the class name.

    Args:
        sub_boxes: Boxes, parallel to ``sub_classes_numbers``.
        sub_classes_numbers: Numeric class id of each box.
        sub_classes_string: Annotated class names, parallel to ``values``.
        values: Annotated value for each class name.

    Returns:
        Four parallel lists ``(boxes, values, class_names, class_numbers)``,
        ordered by annotated class name first and box position second.
        Class names that are not in the lookup table produce no output.
    """
    # Annotated class name -> numeric class id.
    class_id_by_name = {
        "Ausbildung_Staette": 8,
        "Ausbilung_Abschluss": 5,
        "Ausbildung_Amt": 3,
        "Ausbildung_Foerderungsnummer": 4,

        "Person_Name": 14,
        "Person_Vorname": 16,
        "Person_Geburtsname": 17,
        "Person_Geburtsort": 9,
        "Person_Geburtsdatum": 11,
        "Person_Familienstand_seit": 18,
        "Person_Stattsangehörigkeit_eigene": 19,
        "Person_Kinder": 20,

        "Wohnsitz_Strasse": 22,
        "Wohnsitz_Hausnummer": 25,
        "Wohnsitz_Adresszusatz": 26,
        "Wohnsitz_Land": 23,
        "Wohnsitz_Postleitzahl": 24,
        "Wohnsitz_Ort": 27,

        "Wohnsitz_waehrend_Ausbildung_Strasse": 28,
        "Wohnsitz_waehrend_Ausbildung_Hausnummer": 29,
        "Wohnsitz_waehrend_Ausbildung_Adresszusatz": 33,
        "Wohnsitz_waehrend_Ausbildung_Land": 30,
        "Wohnsitz_waehrend_Ausbildung_Postleitzahl": 31,
        "Wohnsitz_waehrend_Ausbildung_ort": 34,
    }

    matched_boxes = []
    matched_values = []
    matched_names = []
    matched_ids = []

    for name_pos, name in enumerate(sub_classes_string):
        class_id = class_id_by_name.get(name)
        if class_id is None:
            # Class name is not part of the table: nothing to pair.
            continue

        for box_pos, predicted_id in enumerate(sub_classes_numbers):
            if not (class_id == predicted_id):
                continue

            box = sub_boxes[box_pos]
            annotated_value = values[name_pos]

            matched_ids.append(class_id)
            matched_boxes.append(box)
            matched_values.append(annotated_value)
            matched_names.append(name)

    return matched_boxes, matched_values, matched_names, matched_ids

In [None]:
from collections import namedtuple
from bounding_box.template import get_for_main_bbox_sub_bboxes
import os

from bounding_box.ressize import scale_up

# One record per processed image: the image path plus four parallel lists
# (boxes, annotated values, class names, class numbers).
ImageInfo = namedtuple('ImageInfo', ['path', 'boxes', 'values', 'classes', 'numbers'])
image_list = []

# Load the main-box detector once. It does not depend on the file being
# processed, so reloading it for every annotation file only costs time.
# Forward slashes keep the path usable on both Windows and POSIX systems.
bbox_model = load_weight_model("../bounding_box/workspace/models/main_bbox_detector_model.h5", 4)

# Images whose path contains one of these digit sequences are left out.
# NOTE(review): this is a substring test, so e.g. "131" is excluded as well -
# confirm that is intended.
numbers_to_check = ["31", "32", "33", "35", "36"]

for xml_file in tqdm(xml_files):
    # Only `values` and `sub_classes` of the annotation are used below; the
    # image path and the main boxes are replaced by the model prediction.
    image_path, main_boxes, main_classes, values, sub_classes = parse_annotation_fake(xml_file)

    # The image shares the trailing "_<id>" part of the annotation file name.
    base_name = os.path.basename(xml_file)
    file_name = os.path.splitext(base_name)[0]
    desired_part = file_name.split('_')[-1]
    image_path = f"../data_zettel/filled_resized/image_{desired_part}.jpg"

    main_boxes, confidence, classes, ratios = predict_image(image_path, bbox_model)

    ausbildung, person, wohnsitz, wwa, best_predicted = get_templated_data(
        main_boxes, confidence, classes,
        org_ms_boxes_person, org_ms_boxes_wohnsitz,
        org_ms_boxes_ausbildung, org_ms_boxes_wwa,
        person_class_ids, ausbildung_class_ids,
        wohnsitz_class_ids, wwa_class_ids,
    )
    ausbildung, person, wohnsitz, wwa = edit_sub_boxes_cut_top(ausbildung, person, wohnsitz, wwa)
    ausbildung, person, wohnsitz, wwa = scale_up(ausbildung, person, wohnsitz, wwa, ratios)
    print(desired_part)
    plot_image(image_path, ausbildung, person, wohnsitz, wwa, best_predicted)

    # Element 0 of each group holds the boxes, element 1 the class numbers.
    sub_boxes = ausbildung[0] + person[0] + wohnsitz[0] + wwa[0]
    sub_classes_number = ausbildung[1] + person[1] + wohnsitz[1] + wwa[1]
    new_sub_boxes, new_values, sub_classes_string, new_sub_box_number = sort_box_to_class(
        sub_boxes, sub_classes_number, sub_classes, values
    )

    image = ImageInfo(
        path=image_path,
        boxes=new_sub_boxes,
        values=new_values,
        classes=sub_classes_string,
        numbers=new_sub_box_number,
    )

    does_not_contain_numbers = all(number not in image.path for number in numbers_to_check)
    if does_not_contain_numbers:
        image_list.append(image)
 


### Crop ROI

In [None]:
# Sanity check: number of values collected for the first image.
# Raises IndexError when image_list is empty.
print(len(image_list[0].values))

In [None]:
# Directory that receives the cropped images (*.jpg) and their label files (*.txt).
save_path_crops = "../data_zettel/cropped_images"

import cv2
def delete_file(file_path):
    """Remove ``file_path`` and print what happened.

    This is a best-effort helper: a missing file or any other failure is
    printed instead of being raised.

    Args:
        file_path: Path of the file to remove.
    """
    try:
        os.remove(file_path)
        outcome = f"File {file_path} deleted successfully."
    except FileNotFoundError:
        outcome = f"File {file_path} not found."
    except Exception as err:
        outcome = f"An error occurred while deleting {file_path}: {err}"
    print(outcome)
# Crop ROI
import cv2
from bounding_box.ressize import resize_imaged_without_expand_dim
from bounding_box.config import YOLO_WIDTH, YOLO_HEIGHT



In [None]:
# Crop ROI
import cv2
from bounding_box.ressize import resize_imaged_without_expand_dim
from bounding_box.config import YOLO_WIDTH, YOLO_HEIGHT
from bounding_box.model import load_weight_model,predict_image,get_image_as_array, show_image 
from bounding_box.config import NUM_CLASSES_ALL,BBOX_PATH,MAIN_BBOX_DETECTOR_MODEL,SUB_BBOX_DETECTOR_MODEL  
from bounding_box.model import load_weight_model, predict_image,plot_image, get_templated_data, edit_sub_boxes_cut_links, edit_sub_boxes_cut_top
from bounding_box.template import build_templating_data

def crop(xmin, ymin, xmax, ymax, image_path):
    """Cut one box out of the resized image stored at ``image_path``.

    The image is read with ``cv2.imread``, passed through
    ``resize_imaged_without_expand_dim`` with ``YOLO_WIDTH`` / ``YOLO_HEIGHT``
    and then sliced with the rounded box coordinates.

    NOTE(review): the coordinates are applied to the *resized* image, while
    the caller passes boxes that went through ``scale_up`` - confirm both use
    the same coordinate system.

    Args:
        xmin, ymin, xmax, ymax: Box corners in pixels; rounded to integers.
        image_path: Path of the image file to read.

    Returns:
        The cropped image region, or ``None`` when the image could not be read.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports a missing or unreadable file by returning None
        # instead of raising; hand that on so the caller can skip the box.
        return None

    image = resize_imaged_without_expand_dim(image, YOLO_WIDTH, YOLO_HEIGHT)

    # Clamp at zero: a negative slice bound would count from the opposite
    # edge and produce a wrong or empty region.
    xmin, ymin, xmax, ymax = (
        max(0, int(round(coordinate)))
        for coordinate in (xmin, ymin, xmax, ymax)
    )
    return image[ymin:ymax, xmin:xmax]


In [None]:
def is_number(value):
    """Tell whether ``value`` is a number or unsigned numeric text.

    Args:
        value: Object to inspect.

    Returns:
        True for ``int`` / ``float`` instances and for strings that consist
        only of digits once at most one ``.`` is removed; False otherwise
        (including signed strings, empty strings and other types).
    """
    if isinstance(value, (int, float)):
        return True
    # Removing the first "." leaves a pure digit string unchanged, so this
    # single test covers both integer and decimal text.
    return isinstance(value, str) and value.replace('.', '', 1).isdigit()
        
# Annotated values and class numbers that are never written out as samples.
not_list = ["0", "1", "", None]
not_classes_list = [31]

# cv2.imwrite does not create missing directories, so make sure the target
# directory exists before the first crop is written.
os.makedirs(save_path_crops, exist_ok=True)

for index, image in enumerate(image_list):
    boxes = image.boxes
    for i, box in enumerate(boxes):
        value = image.values[i]

        # Filter first: crop() reads the image from disk, which is wasted
        # work for boxes that are skipped anyway.
        if image.numbers[i] in not_classes_list:
            continue
        if value in not_list or not is_number(value):
            continue

        xmin, ymin, xmax, ymax = np.array(box)
        imgCropped = crop(xmin, ymin, xmax, ymax, image.path)
        if imgCropped is None:
            continue

        # Image and label share the base name "<image index>_<box index>".
        img_file_path = f"{save_path_crops}/{index}_{i}.jpg"
        txt_file_path = f"{save_path_crops}/{index}_{i}.txt"
        try:
            # cv2.imwrite returns False when the file could not be written.
            # Treat that as a failure so no label is left without an image.
            if not cv2.imwrite(img_file_path, imgCropped):
                raise OSError(f"cv2.imwrite could not write {img_file_path}")

            # Spaces are stored as "|" in the label file.
            image.values[i] = value.replace(" ", "|")
            with open(txt_file_path, 'w', encoding='utf-8') as file:
                file.write(image.values[i])
        except Exception as error:
            # Remove whatever was written so image and label stay in pairs.
            print(f"Skipping {img_file_path}: {error}")
            delete_file(img_file_path)
            delete_file(txt_file_path)
            continue

        print(f"{image.numbers[i]}: {image.classes[i]}: {image.values[i]}")