In [19]:
from glob import glob
import os
from pathlib import Path
from pathlib import PurePath
import cv2
from riks_ds_utils.page_transforms import PageTransforms
import numpy as np
import multiprocessing
from multiprocessing import Pool
import shutil
import json
import random
from riks_ds_utils.preprocess import Preprocess

In [10]:
def get_img_page_pairs(basepath):
    imgs = glob(os.path.join(basepath, '**'), recursive=True)
    page = glob(os.path.join(basepath, '**'), recursive=True)

    page = [x for x in imgs if x.lower().endswith(('.xml'))]
    imgs = [x for x in imgs if x.lower().endswith(('.png', '.jpg', '.jpeg', '.tiff', 'tif','.bmp', '.gif'))]

    imgs.sort()

    #page_names = [Path(x).stem for x in page]
    page_names = ['_'.join(PurePath(x).parts[-3:]).split('.')[0] for x in page]

    page_names = [x.replace('_page', '') for x in page_names]
    imgs_page = list()

    for i, img in enumerate(imgs):
        img_name = '_'.join(PurePath(img).parts[-2:]).split('.')[0]

        #img_name = Path(img).stem
    
        try:
            ind = page_names.index(img_name)
            imgs_page.append((imgs[i], page[ind]))    
        except:
            pass

    return imgs_page

In [11]:
def binarise_and_write_pairs(img, page, output_path):

    #if the image names are unique within the dataset
    img_stem = Path(img).stem
    img_name = Path(img).name
    page_name = Path(page).name

    #otherwise, use part of the path to separate the images, giving them new unique names
    #img_stem = '_'.join(PurePath(img).parts[-2:]).split('.')[0]
    #img_name = '_'.join(PurePath(img).parts[-2:])
    #page_name = '_'.join(PurePath(page).parts[-3:]).split('.')[0].replace('_page', '')

    try:
        img_ori = cv2.imread(img)
        img_gray = cv2.cvtColor(img_ori, cv2.COLOR_BGR2GRAY)
        dst = cv2.fastNlMeansDenoising(img_gray, h=31, templateWindowSize=7, searchWindowSize=21)
        img_blur = cv2.medianBlur(dst,3).astype('uint8')
        threshed = cv2.adaptiveThreshold(img_blur, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
    except:
        return 0
        

    os.makedirs(os.path.join(output_path, img_stem), exist_ok=True)
    os.makedirs(os.path.join(output_path, img_stem, 'bin_image'), exist_ok=True)
    os.makedirs(os.path.join(output_path, img_stem, 'ori_image'), exist_ok=True)
    os.makedirs(os.path.join(output_path, img_stem, 'page'), exist_ok=True)
    os.makedirs(os.path.join(output_path, img_stem, 'text_regions'), exist_ok=True)
    os.makedirs(os.path.join(output_path, img_stem, 'line_images'), exist_ok=True)
    os.makedirs(os.path.join(output_path, img_stem, 'gt'), exist_ok=True)

    
 
    cv2.imwrite(os.path.join(output_path, img_stem, 'bin_image', 'bin_' + img_name), threshed)
    
    shutil.copy(page, os.path.join(output_path, img_stem, 'page', page_name))
    shutil.copy(img, os.path.join(output_path, img_stem, 'ori_image', img_name))
    #dst_file = os.path.join(output_path, dataset_path,  Path(page).name)
    
    return 1 

In [12]:
imgs_page = get_img_page_pairs('/home/erik/Riksarkivet/Projects/riks_ds_utils/data/Trolldomskommissionen-GT-2023-07-08')
print(imgs_page[0])
args = [(img, page, '/media/erik/Elements/Riksarkivet/data/datasets/htr/Trolldomskommissionen') for img, page in imgs_page]

cpu_count = multiprocessing.cpu_count()
p = Pool(cpu_count)
p.starmap(binarise_and_write_pairs, args)

('/home/erik/Riksarkivet/Projects/riks_ds_utils/data/Trolldomskommissionen-GT-2023-07-08/654345/Trolldom_och_annan_vidskepelse,_Skrivelser_till_Kungl__Maj_t_(1584_–_1734)/A0060186_00007.tif', '/home/erik/Riksarkivet/Projects/riks_ds_utils/data/Trolldomskommissionen-GT-2023-07-08/654345/Trolldom_och_annan_vidskepelse,_Skrivelser_till_Kungl__Maj_t_(1584_–_1734)/page/A0060186_00007.xml')


[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,


In [14]:
basepath = '/media/erik/Elements/Riksarkivet/data/datasets/htr/Trolldomskommissionen'

imgs = glob(os.path.join(basepath, '**', 'bin_image', '**'), recursive=True)
page = glob(os.path.join(basepath, '**', 'page', '**'), recursive=True)
imgs = [x for x in imgs if os.path.isfile(x)]
page = [x for x in page if os.path.isfile(x)]

imgs.sort()
page.sort()

assert len(imgs) == len(page)

#imgs_checked = glob(os.path.join('/media/erik/Elements/Riksarkivet/data/datasets/htr/segmentation/ICDAR-2019/fiftyone/data', '**'))
#imgs_checked = [Path(x).name for x in imgs_checked]

imgs_page = list(zip(imgs, page))

print(len(imgs_page))

#imgs_page = [(x, y) for (x, y) in imgs_page if Path(x).name in imgs_checked]

#print(len(imgs_page))

imgs = [x for (x, y) in imgs_page]
page = [y for (x, y) in imgs_page]

PageTransforms.page_to_region_coco(
    xmls=page,
    imgs=imgs,
    out_path='/media/erik/Elements/Riksarkivet/data/datasets/htr/Trolldomskommissionen/gt_files/coco_regions_TK.json',
    elems=['TextRegion'],
    schema='http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15'
)


780
0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750


In [20]:
basepath = '/media/erik/Elements/Riksarkivet/data/datasets/htr/Trolldomskommissionen'

imgs = glob(os.path.join(basepath, '**', 'bin_image', '**'), recursive=True)
page = glob(os.path.join(basepath, '**', 'page', '**'), recursive=True)
imgs = [x for x in imgs if os.path.isfile(x)]
page = [x for x in page if os.path.isfile(x)]

imgs.sort()
page.sort()



print(len(imgs))
print(len(page))

# imgs = [os.path.sep.join(x.split(os.path.sep)[-3:]) for x in imgs]

"""
for img in imgs:
    dst = os.path.join('/media/erik/T7/Data/Text_line_segmentation/ICDAR-2019/fiftyone/data', Path(img).name)
    shutil.copy(img, dst)




PageTransforms.page_to_region_coco(
    xmls=page,
    imgs=imgs,
    out_path='/media/erik/Elements/Riksarkivet/data/datasets/htr/segmentation/police_records/gt_files/coco_regions.json',
    elems=['TextRegion'],
    schema='http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15'
)



args = [(p, im, basepath, os.path.join(*PurePath(im).parts[0:-2], 'line_images'), os.path.join(*PurePath(p).parts[0:-2], 'gt'), 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15') for p, im in zip(page, imgs)]

cpu_count = multiprocessing.cpu_count()
p = Pool(cpu_count)
p.starmap(PageTransforms.crop_line_imgs_page, args)

"""

PageTransforms.crop_text_reg_write_text_line_coco(page, imgs, os.path.join(basepath, 'text_regions'), os.path.join(basepath, 'gt_files', 'coco_lines_TK.json'))

780
780


[ WARN:0@12913.481] global loadsave.cpp:244 findDecoder imread_('/media/erik/Elements/Riksarkivet/data/datasets/htr/Trolldomskommissionen/text_regions/A0060185_00003/text_regions/bin_A0060185_00003_000.jpg'): can't open/read file: check file path/integrity


AttributeError: 'NoneType' object has no attribute 'shape'

In [None]:
gts = glob(os.path.join('/home/erik/Riksarkivet/Data/HTR/HTR_1700_testsets_clean/HTR-validering', '**', 'gt', '*.txt'), recursive=True)

ground_truths = list()

for gt in gts:
    with open(gt, 'r') as f:
        ground_truths += f.readlines()


with open('/home/erik/Riksarkivet/Projects/riks_ds_utils/data/processed/1700_testsets_gt/testsets_combined_all.jsonl', 'w') as f:
    for gt in ground_truths:
        line = json.loads(gt)
        s = json.dumps(line, ensure_ascii=False)
        f.write(s)
        f.write('\n')

cutoff = int(len(ground_truths) * 0.1)

random.shuffle(ground_truths)

val = ground_truths[0:cutoff]
train = ground_truths[cutoff:]


with open('/home/erik/Riksarkivet/Projects/riks_ds_utils/data/processed/1700_testsets_gt/testsets_combined_train.jsonl', 'w') as f:
    for gt in train:
        line = json.loads(gt)
        s = json.dumps(line, ensure_ascii=False)
        f.write(s)
        f.write('\n')

with open('/home/erik/Riksarkivet/Projects/riks_ds_utils/data/processed/1700_testsets_gt/testsets_combined_val.jsonl', 'w') as f:
    for gt in val:
        line = json.loads(gt)
        s = json.dumps(line, ensure_ascii=False)
        f.write(s)
        f.write('\n')

In [None]:
PageTransforms.crop_line_imgs_page(
    image='/home/erik/Riksarkivet/Data/HTR/HTR_1700_clean/images/Bergskollegium_E3_10_(1718-1727)*40004031_00007.tif',
    page_file='/home/erik/Riksarkivet/Data/HTR/HTR_1700_clean/page/Bergskollegium_E3_10_(1718-1727)*40004031_00007.xml',
    schema='',
    output_base_path='/home/erik/Riksarkivet/Data/HTR/HTR_1700_clean',
    separator='*'
)

In [None]:
PageTransforms.page_to_mmlabs_ocr(
    page_path='/home/erik/Riksarkivet/Data/HTR/HTR_1700_clean/page',
    imgs_path='/home/erik/Riksarkivet/Data/HTR/HTR_1700_clean/images',
    out_path='/home/erik/Riksarkivet/Projects/riks_ds_utils/data/processed/OCRDataset2_1700.json',
    schema='http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15'
    )

In [None]:
from riks_ds_utils.mmlabs_utils import MMLabsUtils

train, val = MMLabsUtils.split_ocr_dataset('/home/erik/Riksarkivet/Projects/riks_ds_utils/data/processed/OCRDataset2_1700.json', 0.1)

PageTransforms._write_json('/home/erik/Riksarkivet/Projects/riks_ds_utils/data/processed/OCRDataset2_1700_train.json', train)
PageTransforms._write_json('/home/erik/Riksarkivet/Projects/riks_ds_utils/data/processed/OCRDataset2_1700_val.json', val)

In [None]:
dict = PageTransforms.extract_dict_from_page('/home/erik/Riksarkivet/Data/HTR/HTR_1700_clean/page', 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15')

In [None]:
with open('/home/erik/Riksarkivet/Projects/riks_ds_utils/data/processed/dict1700.txt', 'w') as f:
    for char in dict:
        f.write(char + '\n')

In [None]:
for i, (img, page) in enumerate(imgs_page):
    im = cv2.imread(img)
    if im is None:
        print('a')
    else:
        if i % 100 == 0:
            print(i)
