In [1]:
from glob import glob
import os
from pathlib import Path
from pathlib import PurePath
import cv2
from riks_ds_utils.page_transforms import PageTransforms
import numpy as np
import multiprocessing
from multiprocessing import Pool
import shutil

In [None]:
def get_img_page_pairs(basepath):
    """Pair every image under ``basepath`` with its PAGE XML file.

    The tree below ``basepath`` is searched recursively. An image and an XML
    file belong together when they produce the same key:

    * image key: last two path components joined by ``_``, cut at the
      first ``.``
    * page key: last three path components joined by ``_``, cut at the
      first ``.``, with every ``_page`` occurrence removed

    So ``<volume>/<name>.jpg`` matches ``<volume>/page/<name>.xml``.

    Args:
        basepath: Root directory to search.

    Returns:
        A list of ``(image_path, page_xml_path)`` tuples, in the order the
        images were found. Images without a matching XML file are left out.
        If several XML files share a key, the first one found is used.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg', '.tiff', '.tif', '.bmp', '.gif')

    # A single recursive walk is enough; both lists are filtered from it.
    paths = glob(os.path.join(basepath, '**'), recursive=True)
    page = [x for x in paths if x.lower().endswith('.xml')]
    imgs = [x for x in paths if x.lower().endswith(image_suffixes)]

    # Key -> XML path lookup. setdefault keeps the first XML seen per key,
    # which is what a linear list.index() search would have returned.
    page_by_name = {}
    for page_path in page:
        page_name = '_'.join(PurePath(page_path).parts[-3:]).split('.')[0]
        page_by_name.setdefault(page_name.replace('_page', ''), page_path)

    imgs_page = []
    for img in imgs:
        img_name = '_'.join(PurePath(img).parts[-2:]).split('.')[0]
        matching_page = page_by_name.get(img_name)
        if matching_page is not None:
            imgs_page.append((img, matching_page))

    return imgs_page

In [None]:
def binarise_and_write_pairs(img, page, output_path):
    """Binarise one image and store it together with its PAGE XML file.

    The binarised image is written to ``<output_path>/images`` and the XML
    file is copied to ``<output_path>/page``. Both sub-directories are
    created when missing. Output names are built from the trailing path
    components (two for the image, three for the XML, minus ``_page``),
    joined by ``_`` and with spaces replaced by ``_``.

    Args:
        img: Path of the source image.
        page: Path of the PAGE XML file that belongs to ``img``.
        output_path: Root directory receiving ``images/`` and ``page/``.

    Raises:
        ValueError: If ``img`` cannot be decoded by OpenCV.
    """
    images_dir = os.path.join(output_path, 'images')
    page_dir = os.path.join(output_path, 'page')
    os.makedirs(images_dir, exist_ok=True)
    os.makedirs(page_dir, exist_ok=True)

    img_name = '_'.join(PurePath(img).parts[-2:]).replace(' ', '_')
    page_name = '_'.join(PurePath(page).parts[-3:]).replace(' ', '_')
    page_name = page_name.replace('_page', '')

    image = cv2.imread(img)
    if image is None:
        # cv2.imread signals failure by returning None; fail with the path
        # instead of an opaque "NoneType is not subscriptable" TypeError.
        raise ValueError(f'could not read image: {img}')

    # First channel only (blue, given cv2.imread's default BGR ordering).
    bgray = image[..., 0]

    # Dividing a lightly smoothed copy by a heavily smoothed one flattens
    # the background before thresholding.
    blured1 = cv2.medianBlur(bgray, 3)
    blured2 = cv2.medianBlur(bgray, 51)
    divided = np.ma.divide(blured1, blured2).data

    peak = divided.max()
    if peak > 0:
        normed = np.uint8(255*divided/peak)
    else:
        # Completely black input: avoid 0/0 and keep an all-zero image.
        normed = np.zeros(divided.shape, dtype=np.uint8)

    # With THRESH_OTSU the threshold is computed by OpenCV; the 100 is a
    # placeholder that Otsu's method overrides.
    th, threshed = cv2.threshold(normed, 100, 255, cv2.THRESH_OTSU)

    cv2.imwrite(os.path.join(images_dir, img_name), threshed)

    # Copy straight to the final name. Staging the file in output_path under
    # its bare basename lets parallel workers that handle same-named pages
    # from different volumes overwrite or steal each other's copy.
    shutil.copy(page, os.path.join(page_dir, page_name))
    

In [None]:
# Pair every image of the raw dataset with its PAGE XML file.
imgs_page = get_img_page_pairs('/home/erik/Riksarkivet/Data/HTR/HTR_1700')

# One argument tuple per pair; all results go to the same output root.
args = [(img, page, '/home/erik/Riksarkivet/Data/HTR/HTR_1700_clean') for img, page in imgs_page]

cpu_count = multiprocessing.cpu_count()

# The context manager tears the worker processes down once starmap has
# returned, so re-running this cell does not pile up idle workers.
with Pool(cpu_count) as p:
    p.starmap(binarise_and_write_pairs, args)

In [None]:
# Convert the cleaned PAGE XML / image folders into one OCR annotation file.
# The keyword arguments are collected first so each path is easy to spot.
mmlabs_ocr_kwargs = {
    'page_path': '/home/erik/Riksarkivet/Data/HTR/HTR_1700_clean/page',
    'imgs_path': '/home/erik/Riksarkivet/Data/HTR/HTR_1700_clean/images',
    'out_path': '/home/erik/Riksarkivet/Projects/riks_ds_utils/data/processed/OCRDataset2_1700.json',
    'schema': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15',
}
PageTransforms.page_to_mmlabs_ocr(**mmlabs_ocr_kwargs)

In [4]:
from riks_ds_utils.mmlabs_utils import MMLabsUtils

# Split the annotation file into two subsets.
# NOTE(review): 0.1 is presumably the validation share; confirm against
# MMLabsUtils.split_ocr_dataset.
train, val = MMLabsUtils.split_ocr_dataset(
    '/home/erik/Riksarkivet/Projects/riks_ds_utils/data/processed/OCRDataset2_1700.json',
    0.1,
)

# Write each subset to its own JSON file.
for subset_path, subset in (
    ('/home/erik/Riksarkivet/Projects/riks_ds_utils/data/processed/OCRDataset2_1700_train.json', train),
    ('/home/erik/Riksarkivet/Projects/riks_ds_utils/data/processed/OCRDataset2_1700_val.json', val),
):
    PageTransforms._write_json(subset_path, subset)

In [2]:
# Build the dictionary from the cleaned PAGE XML files.
# NOTE(review): the result is bound to `dict`, which shadows the builtin for
# the rest of the kernel session. The name is kept because a later cell
# iterates over it.
clean_page_dir = '/home/erik/Riksarkivet/Data/HTR/HTR_1700_clean/page'
page_schema_url = 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15'
dict = PageTransforms.extract_dict_from_page(clean_page_dir, page_schema_url)

In [4]:
# Write the dictionary entries one per line.
# The encoding is pinned to UTF-8 so non-ASCII characters are written the
# same way regardless of the machine's locale.
with open('/home/erik/Riksarkivet/Projects/riks_ds_utils/data/processed/dict1700.txt', 'w', encoding='utf-8') as f:
    for char in dict:
        f.write(char + '\n')

In [None]:
# Sanity check: make sure OpenCV can decode every paired image.
# Unreadable files are collected and reported with their path, so they can
# be inspected or excluded afterwards.
unreadable_imgs = []
for i, (img, _page) in enumerate(imgs_page):
    # Progress marker every 100 images, whether or not the image is readable.
    if i % 100 == 0:
        print(i)
    if cv2.imread(img) is None:
        unreadable_imgs.append(img)
        print(f'unreadable image: {img}')
