In [1]:
from glob import glob
from pathlib import Path

import cv2
import xml.etree
import numpy as np
import os
import xml.etree.ElementTree as ET

In [2]:
#collect and filter xml and image files

xmls = glob('/home/erik/Riksarkivet/Projects/helsinki/data/NAF_GT_Court_Records/newer_material/**/*.xml', recursive=True)
xmls = [x for x in xmls if Path(x).name != 'metadata.xml' and Path(x).name != 'mets.xml']
imgs = glob('/home/erik/Riksarkivet/Projects/helsinki/data/NAF_GT_Court_Records/newer_material/**/*.jpg', recursive=True)

xmls_check = ['/'.join(x.split('/')[-3:]) for x in xmls]
imgs = [x for x in imgs if '/'.join([x.split('/')[-2], 'page', x.split('/')[-1]]).replace('jpg', 'xml') in xmls_check]

xmls.sort()
imgs.sort()

#group the files in volumes, to later print the dataset to (path_to_line_imgs or path to gt_files)/NAF_GT_Court_Records/volume/page/(gt_file or line_imgs)
groups = list(set([x.split('/')[-2] for x in imgs]))
groups.sort()

grouped_imgs = []
grouped_xmls = []

for group in groups:
    imgs_group = []
    xmls_group = []
    for img, xml in zip(imgs, xmls):
        if group in img:
            imgs_group.append(img)
        if group in xml:
            xmls_group.append(xml)


    grouped_imgs.append(imgs_group)
    grouped_xmls.append(xmls_group)


In [3]:
# assert that xmls and imgs matches

for g_i, g_x in zip(grouped_imgs, grouped_xmls):
    assert len(g_i) == len(g_x)

In [5]:
# Parse xmls to get ground truth transcriptions, crop and binarize images, match with transcriptions, write line_images and gt file for each page

no_of_lines = 0 

path_to_line_images = '/home/erik/Riksarkivet/Projects/helsinki/data/line_images/NAF_GT_Court_records'
path_to_gt_files = '/home/erik/Riksarkivet/Projects/helsinki/data/gt_files/NAF_GT_Court_Records'

#for each volume  
for v, (gr_imgs, gr_xmls) in enumerate(zip(grouped_imgs, grouped_xmls)):

    print(v)
    
    volume_str = gr_imgs[0].split('/')[-2]
    volume_gts_path = os.path.join(path_to_gt_files, volume_str)
    volume_imgs_path = os.path.join(path_to_line_images, volume_str)

    
    os.mkdir(volume_gts_path)
    os.mkdir(volume_imgs_path)

    #for each img, page-xml in volume
    for i, (image, page) in enumerate(zip(gr_imgs, gr_xmls)):

        ground_truths = []
        line_number = 0

        page_str = image.split('/')[-1].replace('.jpg', '')
        page_imgs_path = os.path.join(volume_imgs_path, volume_str + '_' + page_str)
      
        try:
            os.mkdir(page_imgs_path)
        except:
            pass
      
        #binarize image
        img_ori = cv2.imread(image)
        img_gray = cv2.cvtColor(img_ori, cv2.COLOR_BGR2GRAY)
        dst = cv2.fastNlMeansDenoising(img_gray, h=31, templateWindowSize=7, searchWindowSize=21)
        img_blur = cv2.medianBlur(dst,3).astype('uint8')
        img = cv2.adaptiveThreshold(img_blur, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)

        #parse xml
        tree = ET.parse(page)
        root=tree.getroot()    

        #iterate through text_lines in xml
        for text_line in root.iter('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}TextLine'):
            empty_text_field = False #exclude empty transcriptions
          
            for child in text_line:
            
                #get ground truth trancription and if not empty, save to gt_list
                if child.tag == '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}TextEquiv':
              
                    for text_field in child:
                
                        if text_field.text == None or text_field.text == '':
                            empty_text_field = True
                            continue
                        else:
                            #string wrangling to get the right gt_line
                            ground_truths.append(os.path.join(path_to_line_images.split('/')[-1], volume_imgs_path.split('/')[-1], page_imgs_path.split('/')[-1], volume_str + '_' + page_str + '_' + str(line_number).zfill(4) + '.jpg') + '|' + text_field.text)

                #get cropping coordinates and crop the text_line, using the polygon mask in the xml-file, 
                #mask out everything else, calculate bounding box and crop the line from the masked image 
                elif child.tag == '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Coords':
              
                    mask = np.zeros(img.shape[0:2], dtype=np.uint8)
              
                    coordinates = child.attrib['points'].split()
                    temp = [coord.split(',') for coord in coordinates]
                    temp2 = [[int(x) for x in lst] for lst in temp]
                    points = np.array(temp2)
              
                    cv2.drawContours(mask, [points], -1, (255, 255, 255), -1, cv2.LINE_AA)
                    res = cv2.bitwise_and(img,img,mask = mask)
                    rect = cv2.boundingRect(points)

                    wbg = np.ones_like(img, np.uint8)*255
                    cv2.bitwise_not(wbg,wbg, mask=mask)
              
                    #overlap the resulted cropped image on the white background
                    dst = wbg+res
              
                    cropped = dst[rect[1]: rect[1] + rect[3], rect[0]: rect[0] + rect[2]]
                    #cropped_border = cv2.copyMakeBorder(src=cropped, top=10, bottom=10, left=10, right=10, borderType=cv2.BORDER_CONSTANT, value=(255, 255, 255),)                   

            #if not empty transcription, write image
            if not empty_text_field:
                no_of_lines += 1
            
                img_file_path = os.path.join(page_imgs_path, volume_str + '_' + page_str + '_' + str(line_number).zfill(4) + '.jpg')
                try:
                    cv2.imwrite(img_file_path, cropped)
                    line_number += 1
                except:
                    ground_truths.pop()
                    print(image)

        #write gt_file for entire image
        path_to_ground_truths = os.path.join(volume_gts_path, volume_str + '_' + page_str + '_' + 'gt.txt')
        with open(path_to_ground_truths, 'w') as f:
            f.write('\n'.join(ground_truths))
      
print(no_of_lines)

0


FileExistsError: [Errno 17] File exists: '/home/erik/Riksarkivet/Projects/helsinki/data/gt_files/NAF_GT_Court_Records/Alavuden_Kob55_1908'