In [2]:
from glob import glob
import os
from pathlib import Path
from pathlib import PurePath
import cv2
from riks_ds_utils.page_transforms import PageTransforms
import numpy as np
import multiprocessing
from multiprocessing import Pool
import shutil
import json
import random
from riks_ds_utils.preprocess import Preprocess

In [3]:
def get_img_page_pairs(basepath):
    imgs = glob(os.path.join(basepath, '**'), recursive=True)
    page = glob(os.path.join(basepath, '**'), recursive=True)

    page = [x for x in imgs if x.lower().endswith(('.xml'))]
    imgs = [x for x in imgs if x.lower().endswith(('.png', '.jpg', '.jpeg', '.tiff', 'tif','.bmp', '.gif'))]

    imgs.sort()

    #page_names = [Path(x).stem for x in page]
    page_names = ['_'.join(PurePath(x).parts[-3:]).split('.')[0] for x in page]

    page_names = [x.replace('_page', '') for x in page_names]
    imgs_page = list()

    for i, img in enumerate(imgs):
        img_name = '_'.join(PurePath(img).parts[-2:]).split('.')[0]

        #img_name = Path(img).stem
    
        try:
            ind = page_names.index(img_name)
            imgs_page.append((imgs[i], page[ind]))    
        except:
            pass

    return imgs_page

In [4]:
#write the code for conditional file_names if duplicates, also write code for structure the dataset by subpaths (volumes)

def binarise_and_write_dataset_structure(img, page, output_path, duplicates):

    #if the image names are unique within the dataset
    img_stem = Path(img).stem
    img_name = Path(img).name
    page_name = Path(page).name

    #otherwise, use part of the path to separate the images, giving them new unique names
    #img_stem = '_'.join(PurePath(img).parts[-2:]).split('.')[0]
    #img_name = '_'.join(PurePath(img).parts[-2:])
    #page_name = '_'.join(PurePath(page).parts[-3:]).split('.')[0].replace('_page', '')

    #police reports separated by volumes as well?

    try:
        img_ori = cv2.imread(img)
        img_gray = cv2.cvtColor(img_ori, cv2.COLOR_BGR2GRAY)
        dst = cv2.fastNlMeansDenoising(img_gray, h=31, templateWindowSize=7, searchWindowSize=21)
        img_blur = cv2.medianBlur(dst,3).astype('uint8')
        threshed = cv2.adaptiveThreshold(img_blur, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
    except:
        pass
        

    os.makedirs(os.path.join(output_path, img_stem), exist_ok=True)
    os.makedirs(os.path.join(output_path, img_stem, 'bin_image'), exist_ok=True)
    os.makedirs(os.path.join(output_path, img_stem, 'ori_image'), exist_ok=True)
    os.makedirs(os.path.join(output_path, img_stem, 'page'), exist_ok=True)
    os.makedirs(os.path.join(output_path, img_stem, 'text_regions_bin'), exist_ok=True)
    os.makedirs(os.path.join(output_path, img_stem, 'text_regions_ori'), exist_ok=True)
    os.makedirs(os.path.join(output_path, img_stem, 'line_images_bin'), exist_ok=True)
    os.makedirs(os.path.join(output_path, img_stem, 'line_images_ori'), exist_ok=True)
    os.makedirs(os.path.join(output_path, img_stem, 'gt'), exist_ok=True)

    cv2.imwrite(os.path.join(output_path, img_stem, 'bin_image', 'bin_' + img_name), threshed)
    
    shutil.copy(page, os.path.join(output_path, img_stem, 'page', page_name))
    shutil.copy(img, os.path.join(output_path, img_stem, 'ori_image', img_name))
    #dst_file = os.path.join(output_path, dataset_path,  Path(page).name)
    

In [5]:
def write_bin_ori_gt(f_w_ori, f_w_bin, ground_truths_ori, ground_truths_bin):
    for gt_ori, gt_bin in zip(ground_truths_ori, ground_truths_bin):
            line_ori = json.loads(gt_ori)
            line_bin = json.loads(gt_bin)
            s_ori = json.dumps(line_ori, ensure_ascii=False)
            s_bin = json.dumps(line_bin, ensure_ascii=False)
            f_w_ori.write(s_ori)
            f_w_ori.write('\n')
            f_w_bin.write(s_bin)
            f_w_bin.write('\n')

In [6]:
def collect_and_write_text_rec_gt(basepath: str, split: float):

    gts = glob(os.path.join(basepath, '**', 'gt', '*.txt'), recursive=True)
    gts_bin = [x for x in gts if 'bin_gt' in x]
    gts_ori = [x for x in gts if 'bin_gt' not in x]

    assert len(gts_bin) == len(gts_ori)

    ground_truths_ori = list()
    ground_truths_bin = list()

    for gt_ori, gt_bin in zip(gts_ori, gts_bin):
        with open(gt_ori, 'r') as f_ori, open(gt_bin, 'r') as f_bin:
            ground_truths_ori += f_ori.readlines()
            ground_truths_bin += f_bin.readlines()


    with open(os.path.join(basepath, 'gt_files', 'text_recognition_all_ori.jsonl'), 'w') as f_w_ori, open(os.path.join(basepath, 'gt_files', 'text_recognition_all_bin.jsonl'), 'w') as f_w_bin:
        write_bin_ori_gt(f_w_ori, f_w_bin, ground_truths_ori, ground_truths_bin)

    cutoff = int(len(ground_truths_ori) * split)

    random.shuffle(ground_truths_ori)
    random.shuffle(ground_truths_bin)

    val_ori = ground_truths_ori[0:cutoff]
    train_ori = ground_truths_ori[cutoff:]
    val_bin = ground_truths_bin[0:cutoff]
    train_bin = ground_truths_bin[cutoff:]


    with open(os.path.join(basepath, 'gt_files', 'text_recognition_ori_train.jsonl'), 'w') as f_w_ori, open(os.path.join(basepath, 'gt_files', 'text_recognition_bin_train.jsonl'), 'w') as f_w_bin:
        write_bin_ori_gt(f_w_ori, f_w_bin, train_ori, train_bin)

    with open(os.path.join(basepath, 'gt_files', 'text_recognition_ori_val.jsonl'), 'w') as f_w_ori, open(os.path.join(basepath, 'gt_files', 'text_recognition_bin_val.jsonl'), 'w') as f_w_bin:
        write_bin_ori_gt(f_w_ori, f_w_bin, val_ori, val_bin)

In [15]:
def check_for_duplicate_file_names(imgs):
    img_names = [Path(x).stem for x in imgs]
    
    if len(set(img_names)) == len(img_names):
        return False
    else:
        return True
        
        """
        img_names = ['_'.join(PurePath(x[0]).parts[-2:]).split('.')[0] for x in imgs_page]
        if len(set(img_names)) == len(img_names):
            print(len(img_names))
            print(len(set(img_names)))
            return 'no duplicates when fixed'
        else:
            return 'still duplicates'
    
    #img_stem = '_'.join(PurePath(img).parts[-2:]).split('.')[0]
    #img_name = '_'.join(PurePath(img).parts[-2:])
    #page_name = '_'.join(PurePath(page).parts[-3:]).split('.')[0].replace('_page', '')
    """

In [16]:
basepath = '/media/erik/Elements/Riksarkivet/data/datasets/htr/HTR_1700_23_09_29'

print("binarizing and writing dataset structure")

#os.makedirs(basepath, exist_ok=True)

imgs_page = get_img_page_pairs('/media/erik/Elements/Riksarkivet/data/datasets/htr/raw/HTR_1700')

print(check_for_duplicate_file_names(imgs_page))




binarizing and writing dataset structure
5209
4178
5209
5209
no duplicates when fixed


In [9]:
#binarize and write dataset structure

basepath = '/media/erik/Elements/Riksarkivet/data/datasets/htr/Trolldomskommissionen2'

print("binarizing and writing dataset structure")

basepath = '/media/erik/Elements/Riksarkivet/data/datasets/htr/Trolldomskommissionen2'
os.makedirs(basepath, exist_ok=True)

imgs_page = get_img_page_pairs('/home/erik/Riksarkivet/Projects/riks_ds_utils/data/Trolldomskommissionen-GT-2023-07-08') #transkribus-export
imgs = [x[0] for x in imgs_page]
if check_for_duplicate_file_names(imgs):
    imgs_new = ['_'.join(PurePath(x).parts[-2:]).split('.')[0] for x in imgs]
    duplicates = True
    if check_for_duplicate_file_names(imgs_new):
        raise Exception('Duplicate file_names and could not be fixed by including part of path, needs to be dealt with manually')
else:
    duplicates = False

args = [(img, page, basepath, duplicates) for img, page in imgs_page]

cpu_count = multiprocessing.cpu_count()
p = Pool(cpu_count)
p.starmap(binarise_and_write_dataset_structure, args)

#Write coco-file for regions

print('Creating coco-file for regions')

os.makedirs(os.path.join(basepath, 'gt_files'), exist_ok=True)

imgs_bin = glob(os.path.join(basepath, '**', 'bin_image', '**'), recursive=True)
imgs_ori = glob(os.path.join(basepath, '**', 'ori_image', '**'), recursive=True)
page = glob(os.path.join(basepath, '**', 'page', '**'), recursive=True)
imgs_bin = [x.strip() for x in imgs_bin if os.path.isfile(x)]
imgs_ori = [x.strip() for x in imgs_ori if os.path.isfile(x)]
page = [x for x in page if os.path.isfile(x)]

imgs_bin.sort()
imgs_ori.sort()
page.sort()

assert len(imgs_bin) == len(page) == len(imgs_ori)

PageTransforms.page_to_region_coco(
    xmls=page,
    imgs=imgs_ori,
    out_path=os.path.join(basepath, 'gt_files', 'coco_regions_ori_TK.json'),
    elems=['TextRegion'],
    schema='http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15'
)

PageTransforms.page_to_region_coco(
    xmls=page,
    imgs=imgs_bin,
    out_path=os.path.join(basepath, 'gt_files', 'coco_regions_bin_TK.json'),
    elems=['TextRegion'],
    schema='http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15'
)



#Crop text-line images and write a text-recognition gt_file for each page

print('Crop text-line images and writing jsonl for text-recognition for each page')

args = [
    (p, 
     im, 
     basepath, 
     os.path.join(*PurePath(im).parts[0:-2], 'line_images_ori'), 
     os.path.join(*PurePath(p).parts[0:-2], 'gt'), 
     'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15', 
     False) for p, im in zip(page, imgs_ori)
     ]

args_bin = [
    (p, 
     im, 
     basepath, 
     os.path.join(*PurePath(im).parts[0:-2], 'line_images_bin'), 
     os.path.join(*PurePath(p).parts[0:-2], 'gt'), 
     'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15', 
     True) for p, im in zip(page, imgs_bin)
     ]

cpu_count = multiprocessing.cpu_count()
p = Pool(cpu_count)
p.starmap(PageTransforms.crop_line_imgs_page, args)
p.starmap(PageTransforms.crop_line_imgs_page, args_bin)

#Collect all gt_files and write combined gt_files for text-recognition

print('Collect all gt_files and write combined gt_files for text-recognition')

collect_and_write_text_rec_gt(basepath, 0.1)

#Crop regions and write coco_lines (images in coco are all the cropped regions, line coords are translated with respect to these regions)

print('Crop regions and write coco_lines')

PageTransforms.crop_text_reg_write_text_line_coco(page, imgs_bin, os.path.join(basepath, 'gt_files', 'coco_lines_bin_TK.json'), binarized=True)
PageTransforms.crop_text_reg_write_text_line_coco(page, imgs_ori, os.path.join(basepath, 'gt_files', 'coco_lines_ori_TK.json'), binarized=False)





In [14]:
basepath = '/media/erik/Elements/Riksarkivet/data/datasets/htr/Trolldomskommissionen'

imgs = glob(os.path.join(basepath, '**', 'bin_image', '**'), recursive=True)
page = glob(os.path.join(basepath, '**', 'page', '**'), recursive=True)
imgs = [x for x in imgs if os.path.isfile(x)]
page = [x for x in page if os.path.isfile(x)]

imgs.sort()
page.sort()

assert len(imgs) == len(page)

#imgs_checked = glob(os.path.join('/media/erik/Elements/Riksarkivet/data/datasets/htr/segmentation/ICDAR-2019/fiftyone/data', '**'))
#imgs_checked = [Path(x).name for x in imgs_checked]

imgs_page = list(zip(imgs, page))

print(len(imgs_page))

#imgs_page = [(x, y) for (x, y) in imgs_page if Path(x).name in imgs_checked]

#print(len(imgs_page))

imgs = [x for (x, y) in imgs_page]
page = [y for (x, y) in imgs_page]

PageTransforms.page_to_region_coco(
    xmls=page,
    imgs=imgs,
    out_path='/media/erik/Elements/Riksarkivet/data/datasets/htr/Trolldomskommissionen/gt_files/coco_regions_TK.json',
    elems=['TextRegion'],
    schema='http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15'
)


780
0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750


In [3]:
basepath = '/media/erik/Elements/Riksarkivet/data/datasets/htr/Trolldomskommissionen'

imgs = glob(os.path.join(basepath, '**', 'bin_image', '**'), recursive=True)
page = glob(os.path.join(basepath, '**', 'page', '**'), recursive=True)
imgs = [x for x in imgs if os.path.isfile(x)]
page = [x for x in page if os.path.isfile(x)]

imgs.sort()
page.sort()



print(len(imgs))
print(len(page))

# imgs = [os.path.sep.join(x.split(os.path.sep)[-3:]) for x in imgs]

"""
for img in imgs:
    dst = os.path.join('/media/erik/T7/Data/Text_line_segmentation/ICDAR-2019/fiftyone/data', Path(img).name)
    shutil.copy(img, dst)




PageTransforms.page_to_region_coco(
    xmls=page,
    imgs=imgs,
    out_path='/media/erik/Elements/Riksarkivet/data/datasets/htr/segmentation/police_records/gt_files/coco_regions.json',
    elems=['TextRegion'],
    schema='http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15'
)



args = [(p, im, basepath, os.path.join(*PurePath(im).parts[0:-2], 'line_images'), os.path.join(*PurePath(p).parts[0:-2], 'gt'), 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15') for p, im in zip(page, imgs)]

cpu_count = multiprocessing.cpu_count()
p = Pool(cpu_count)
p.starmap(PageTransforms.crop_line_imgs_page, args)

"""

PageTransforms.crop_text_reg_write_text_line_coco(page, imgs, os.path.join(basepath, 'gt_files', 'coco_lines_TK.json'))

780
780
0
50
100
150
200
250
300
350
400
min() arg is an empty sequence
450
500
550
600
650
700
750


In [2]:
gts = glob(os.path.join('/media/erik/Elements/Riksarkivet/data/datasets/htr/Trolldomskommissionen', '**', 'gt', '*.txt'), recursive=True)

ground_truths = list()

for gt in gts:
    with open(gt, 'r') as f:
        ground_truths += f.readlines()


with open('/media/erik/Elements/Riksarkivet/data/datasets/htr/Trolldomskommissionen/gt_files/text_recognition_all.jsonl', 'w') as f:
    for gt in ground_truths:
        line = json.loads(gt)
        s = json.dumps(line, ensure_ascii=False)
        f.write(s)
        f.write('\n')

cutoff = int(len(ground_truths) * 0.1)

random.shuffle(ground_truths)

val = ground_truths[0:cutoff]
train = ground_truths[cutoff:]


with open('/media/erik/Elements/Riksarkivet/data/datasets/htr/Trolldomskommissionen/gt_files/text_recognition_train.jsonl', 'w') as f:
    for gt in train:
        line = json.loads(gt)
        s = json.dumps(line, ensure_ascii=False)
        f.write(s)
        f.write('\n')

with open('/media/erik/Elements/Riksarkivet/data/datasets/htr/Trolldomskommissionen/gt_files/text_recognition_val.jsonl', 'w') as f:
    for gt in val:
        line = json.loads(gt)
        s = json.dumps(line, ensure_ascii=False)
        f.write(s)
        f.write('\n')

In [None]:
PageTransforms.crop_line_imgs_page(
    image='/home/erik/Riksarkivet/Data/HTR/HTR_1700_clean/images/Bergskollegium_E3_10_(1718-1727)*40004031_00007.tif',
    page_file='/home/erik/Riksarkivet/Data/HTR/HTR_1700_clean/page/Bergskollegium_E3_10_(1718-1727)*40004031_00007.xml',
    schema='',
    output_base_path='/home/erik/Riksarkivet/Data/HTR/HTR_1700_clean',
    separator='*'
)

In [None]:
PageTransforms.page_to_mmlabs_ocr(
    page_path='/home/erik/Riksarkivet/Data/HTR/HTR_1700_clean/page',
    imgs_path='/home/erik/Riksarkivet/Data/HTR/HTR_1700_clean/images',
    out_path='/home/erik/Riksarkivet/Projects/riks_ds_utils/data/processed/OCRDataset2_1700.json',
    schema='http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15'
    )

In [None]:
from riks_ds_utils.mmlabs_utils import MMLabsUtils

train, val = MMLabsUtils.split_ocr_dataset('/home/erik/Riksarkivet/Projects/riks_ds_utils/data/processed/OCRDataset2_1700.json', 0.1)

PageTransforms._write_json('/home/erik/Riksarkivet/Projects/riks_ds_utils/data/processed/OCRDataset2_1700_train.json', train)
PageTransforms._write_json('/home/erik/Riksarkivet/Projects/riks_ds_utils/data/processed/OCRDataset2_1700_val.json', val)

In [None]:
dict = PageTransforms.extract_dict_from_page('/home/erik/Riksarkivet/Data/HTR/HTR_1700_clean/page', 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15')

In [None]:
with open('/home/erik/Riksarkivet/Projects/riks_ds_utils/data/processed/dict1700.txt', 'w') as f:
    for char in dict:
        f.write(char + '\n')

In [None]:
for i, (img, page) in enumerate(imgs_page):
    im = cv2.imread(img)
    if im is None:
        print('a')
    else:
        if i % 100 == 0:
            print(i)
