In [170]:
import pandas as pd
import os
import json

class Const:
    OCR = 'ocr'
    GROUND = 'groundtruth'
    REGION = 'region'
    LINE = 'line'
    SENTENCE = 'sentence'
    FILE = 'filename'
    DATASET = 'dataset_name'
    PREDICTION = 'prediction'
    PROMPT = 'prompt'
    LANGUAGE = 'language'
    NONE = None


In [238]:
datasets = []

for root, dirs, files in os.walk('../data/datasets/ocr/converted'):
    for file in files:
        if file.endswith(".jsonl"):
            input_file = os.path.join(root, file)
            print(input_file)
            with open(input_file) as f:
                lines = f.read().splitlines()
            df_inter = pd.DataFrame(lines)
            df_inter.columns = ['json_element']
            df_inter['json_element'].apply(json.loads)
            df = pd.json_normalize(df_inter['json_element'].apply(json.loads))
            datasets.append(df)

../data/datasets/ocr/converted/ajmc_mixed.jsonl
../data/datasets/ocr/converted/ajmc_primary_text.jsonl
../data/datasets/ocr/converted/icdar-2017.jsonl
../data/datasets/ocr/converted/overproof.jsonl
../data/datasets/ocr/converted/icdar-2019.jsonl
../data/datasets/ocr/converted/impresso-nzz.jsonl


In [239]:
for dataset in datasets:
    print(dataset['dataset_name'].unique())
    print('No. lines:', dataset['ocr.line']. nunique(), len(dataset['ocr.sentence']), 
          'No. sentences:', dataset['ocr.sentence']. nunique(), len(dataset['ocr.sentence']), 
          'No. regions:', dataset['ocr.region']. nunique(), len(dataset['ocr.region']))

['ajmc']
No. lines: 870 2131 No. sentences: 679 2131 No. regions: 63 2131
['ajmc']
No. lines: 151 330 No. sentences: 112 330 No. regions: 33 330
['icdar-2017']
No. lines: 0 477 No. sentences: 461 477 No. regions: 28 477
['overproof']
No. lines: 2278 2669 No. sentences: 399 2669 No. regions: 41 2669
['icdar-2019']
No. lines: 0 404 No. sentences: 404 404 No. regions: 41 404
['impresso-nzz']
No. lines: 3709 6140 No. sentences: 1943 6140 No. regions: 635 6140


In [240]:
datasets[0].head()

Unnamed: 0,filename,dataset_name,ocr.line,ocr.sentence,ocr.region,groundtruth.line,groundtruth.sentence,groundtruth.region
0,bsb10234118,ajmc,ἀπ’ ἐκείνων ἐπὶ τὰ πάθη μετάβασις Comm. III. i...,ἀπ’ ἐκείνων ἐπὶ τὰ πάθη μετάβασις Comm.,ἀπ’ ἐκείνων ἐπὶ τὰ πάθη μετάβασις Comm. II. in...,ἀπ’ ἐκείνων ἐπὶ τὰ πάθη μετάβασις Comm. II. in...,ἀπ’ ἐκείνων ἐπὶ τὰ πάθη μετάβασις Comm.,ἀπ’ ἐκείνων ἐπὶ τὰ πάθη μετάβασις Comm. III. i...
1,bsb10234118,ajmc,I. T. XVI. p. 731. et 718. eamque κακόζηλον ἑρ...,p. 731.,ἀπ’ ἐκείνων ἐπὶ τὰ πάθη μετάβασις Comm. II. in...,1. T. XVI. p. 731. et 718. eamque κακόζηλον ἑρ...,p. 731.,ἀπ’ ἐκείνων ἐπὶ τὰ πάθη μετάβασις Comm. III. i...
2,bsb10234118,ajmc,V. 9. Ἔνδον γὰρ ἀνήρ - Olim adnotavi articulum...,V. 9.,ἀπ’ ἐκείνων ἐπὶ τὰ πάθη μετάβασις Comm. II. in...,"V. 9. ""Evdov γὰρ ‘arıjg — Olim adnotavi articu...",V. 9.,ἀπ’ ἐκείνων ἐπὶ τὰ πάθη μετάβασις Comm. III. i...
3,bsb10234118,ajmc,δατος Aristoph. Lys. 370. αἱρώμεθ’ ὑμεῖς θοὔδ...,Lys.,ἀπ’ ἐκείνων ἐπὶ τὰ πάθη μετάβασις Comm. II. in...,durog Aristoph. Lys. 370. αἱρώμεθ’ ὑμεῖς θοὔδα...,Lys.,ἀπ’ ἐκείνων ἐπὶ τὰ πάθη μετάβασις Comm. III. i...
4,bsb10234118,ajmc,xerit Sophocles χθονὸς ἀείρας et Oppian. Cyn. ...,Cyn.,ἀπ’ ἐκείνων ἐπὶ τὰ πάθη μετάβασις Comm. II. in...,xerit Sophocles χθονὸς deigag et Oppian. Cyn. ...,Cyn.,ἀπ’ ἐκείνων ἐπὶ τὰ πάθη μετάβασις Comm. III. i...


In [241]:
dataset = pd.concat(datasets)

In [242]:
len(dataset)

12151

In [243]:
dataset.columns

Index(['filename', 'dataset_name', 'ocr.line', 'ocr.sentence', 'ocr.region',
       'groundtruth.line', 'groundtruth.sentence', 'groundtruth.region',
       'language', 'File', 'Date', 'Type', 'NbAlignedChar', 'article_id'],
      dtype='object')

In [244]:
from sklearn.model_selection import train_test_split


files_keep, files_removed, _, _ = train_test_split(dataset, dataset['dataset_name'], test_size=0.90, random_state=42)

In [245]:
len(files_keep)

1215

In [246]:
2836*10


28360

In [248]:
output_file = '../data/datasets/ocr/converted/sample/sample.jsonl'
with open(output_file, "w") as outfile:
    for index, row in files_keep.iterrows():
    
        json_line = json.dumps({Const.LANGUAGE: row['language'],
                                Const.FILE: row['filename'],
                                Const.DATASET: row['dataset_name'],
                                Const.OCR: {Const.LINE: row['ocr.line'],
                                            Const.SENTENCE: row['ocr.sentence'],
                                            Const.REGION: row['ocr.region']}, 
                                Const.GROUND: {Const.LINE: row['groundtruth.line'],
                                               Const.SENTENCE: row['groundtruth.sentence'],
                                               Const.REGION: row['groundtruth.region']},
                                'File': row['File'], 
                                'Date': row['Date'],
                                'Type': row['Type'], 
                                'NbAlignedChar': row['NbAlignedChar'], 
                                'article_id': row['article_id']
                                })

        outfile.write(json_line + "\n")
        outfile.flush()

In [249]:
results = []

for root, dirs, files in os.walk('../data/output'):
    for file in files:
        if file.endswith(".jsonl"):
            input_file = os.path.join(root, file)
            print(input_file)
            with open(input_file) as f:
                lines = f.read().splitlines()
            df_inter = pd.DataFrame(lines)
            df_inter.columns = ['json_element']
            df_inter['json_element'].apply(json.loads)
            df = pd.json_normalize(df_inter['json_element'].apply(json.loads))
            results.append(df)
            print(df.head())

../data/output/prompt_basic_01/sample/results-sample-davinci.jsonl
   language                                           filename  dataset_name  \
0       NaN  ../../data/datasets/ocr/original/impresso-nzz/...  impresso-nzz   
1       NaN  ../../data/datasets/ocr/original/impresso-nzz/...  impresso-nzz   
2       NaN  ../../data/datasets/ocr/original/impresso-nzz/...  impresso-nzz   
3       NaN  ../../data/datasets/ocr/original/overproof/dat...     overproof   
4       NaN  ../../data/datasets/ocr/original/overproof/dat...     overproof   

   File  Date  Type  NbAlignedChar  \
0   NaN   NaN   NaN            NaN   
1   NaN   NaN   NaN            NaN   
2   NaN   NaN   NaN            NaN   
3   NaN   NaN   NaN            NaN   
4   NaN   NaN   NaN            NaN   

                                          article_id  \
0                                                NaN   
1                                                NaN   
2                                                NaN   

  language                                           filename  dataset_name  \
0      NaN  ../../data/datasets/ocr/original/impresso-nzz/...  impresso-nzz   
1      NaN  ../../data/datasets/ocr/original/impresso-nzz/...  impresso-nzz   
2      NaN  ../../data/datasets/ocr/original/impresso-nzz/...  impresso-nzz   
3       fr  ../../data/datasets/ocr/original/icdar-2019/IC...    icdar-2019   
4       fr  ../../data/datasets/ocr/original/icdar-2019/IC...    icdar-2019   

  File Date Type NbAlignedChar article_id  \
0  NaN  NaN  NaN           NaN        NaN   
1  NaN  NaN  NaN           NaN        NaN   
2  NaN  NaN  NaN           NaN        NaN   
3  NaN  NaN  NaN           NaN        NaN   
4  NaN  NaN  NaN           NaN        NaN   

                                            ocr.line  \
0  ve>nachlässig'c» Slilrichtung wieder gebührend...   
1  ve>nachlässig'c» Slilrichtung wieder gebührend...   
2  ve>nachlässig'c» Slilrichtung wieder gebührend...   
3                             

In [250]:
results[0].columns

Index(['language', 'filename', 'dataset_name', 'File', 'Date', 'Type',
       'NbAlignedChar', 'article_id', 'ocr.line', 'ocr.sentence', 'ocr.region',
       'groundtruth.line', 'groundtruth.sentence', 'groundtruth.region',
       'prediction.prompt', 'prediction.line', 'prediction.sentence',
       'prediction.region'],
      dtype='object')

In [251]:
for _, item in results[1].iterrows():
    print('**', item['groundtruth.sentence'])
    print('**', item['prediction.sentence'])
    print('--'*100)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [252]:
from Levenshtein import distance

def levenshtein(reference, hypothesis, progress_bar=False):
    print(reference, hypothesis)
    
    assert len(reference) == len(hypothesis)
    text = zip(reference, hypothesis)
    if progress_bar:
        text = tqdm(text, total=len(reference))
    d = [distance(r, h) for r, h in text]
    output = pd.DataFrame({"reference": reference, "hypothesis": hypothesis})\
        .assign(distance=lambda df: d)\
        .assign(
        cer=lambda df: df.apply(
            lambda r: 100 * r["distance"] / max(len(r["reference"]), 1),
            axis=1
        )
    )
    return output

In [253]:
levenshtein("ve>nachlässig'c» Slilrichtung wieder gebührende Beach".split(), 
            "ve>nachlässig'c» Slilrichtung wieder gebührende Beach".split(), progress_bar=False)

["ve>nachlässig'c»", 'Slilrichtung', 'wieder', 'gebührende', 'Beach'] ["ve>nachlässig'c»", 'Slilrichtung', 'wieder', 'gebührende', 'Beach']


Unnamed: 0,reference,hypothesis,distance,cer
0,ve>nachlässig'c»,ve>nachlässig'c»,0,0.0
1,Slilrichtung,Slilrichtung,0,0.0
2,wieder,wieder,0,0.0
3,gebührende,gebührende,0,0.0
4,Beach,Beach,0,0.0


In [254]:
result.isnull().values.any()

False

In [257]:
from genalog.text import anchor

def align_texts(gt_text, ocr_text):

    # We align the texts with RETAS Method
    aligned_gt, aligned_noise = anchor.align_w_anchor(gt_text, ocr_text)
    
    print('GT:', gt_text)
    print('OCR:', ocr_text)
    print('--'*100)
    return aligned_gt, aligned_noise

In [258]:
for idx, result in enumerate(results):
    
    results[idx] = results[idx].fillna('No text')
    
    def replace(x):
        if len(x.strip()) == 0:
            return 'No text'
        return x
    
    for column in results[idx].columns:
        results[idx][column] = results[idx][column].apply(lambda x: replace(x))
        
#     results[idx][['groundtruth.line', 'ocr.line']] = results[idx].apply(lambda x: align_texts(x['groundtruth.line'], 
#                                                                                               x['ocr.line']), axis=1)
    results[idx][['groundtruth.line', 'prediction.line']] = results[idx].apply(lambda x: align_texts(x['groundtruth.line'], 
                                                                                                     x['prediction.line']), axis=1)
    
#     x = results[idx].apply(lambda x: levenshtein(x['groundtruth.line'].split(), 
#                                                             x['ocr.line'].split()), axis=1)
#     x = results[idx].apply(lambda x: levenshtein(x['groundtruth.sentence'].split(), 
#                                                                 x['ocr.sentence'].split()), axis=1)
#     x = results[idx].apply(lambda x: levenshtein(x['groundtruth.region'].split(), 
#                                                               x['ocr.region'].split()), axis=1)
    print(x)
    

GT: lich den Mobilisirungsbeschluß, der uns an das Lichterauslöschen
OCR: lich den Mobilistrungöbeschluß, der unS an daS LtchterauSlöschen
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
GT: lich den Mobilisirungsbeschluß, der uns an das Lichterauslöschen
OCR: lich den Mobilistrungöbeschluß, der unS an daS LtchterauSlöschen
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
GT: lich den Mobilisirungsbeschluß, der uns an das Lichterauslöschen
OCR: lich den Mobilistrungöbeschluß, der unS an daS LtchterauSlöschen
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

ValueError: Columns must be same length as key

In [100]:
idx = 9

In [101]:
df.iloc[idx]['ocr.line']

'Frankreich. Paris, 7 Pluv. (17 Jan.)'

In [102]:
df.iloc[idx]['ocr.sentence']

'Frankreich.'

In [103]:
df.iloc[idx]['ocr.region']

'Frankreich. Paris, 7 Pluv. (17 Jan.) Das Tribunal hat nach einem zweimaligen Scratin Dupuy (Mitglied des National-Institutö ) zu seinem Candidaten für den Erhaltunge - Senat ernannt. Im gesezgebcnden Rath erhielt noch niemand die absolute Mehrheit; wohl aber erblikt man unter den in Vorschlag gebrachten Namen die Exdi- rektorrn Merlin, Revcilliere und Treilhard, dexen erster sogar zi Stimmen hatte.'

In [104]:
df.columns

Index(['filename', 'ocr.line', 'ocr.sentence', 'ocr.region',
       'groundtruth.line', 'groundtruth.sentence', 'groundtruth.region'],
      dtype='object')

In [115]:
# ht_raw = " ".join(df['ocr.sentence'].to_list())
# print(f"{len(set(ht_raw.lower()))} characters in human transcription")
# print(f"The following characters have not been system-transcribed: \n{set(ht_raw.lower())-set(st_raw.lower())}")
tokens = ht_raw.split()
WORDS = set(tokens)

In [116]:
def eddi(input_text, reference_words=WORDS, ed_threshold=25, max_unk_tokens=3):
    """ Baseline I: Edit distance -based Baseline
    An edit distance-based baseline: Given a list of valid (reference) words,
    this baseline (called eddi) detects words not in the reference list and 
    changes them to the closest one in the reference list.
    :param input_text: the source text
    :param reference_words: a list of valid words (e.g., computed from the target data) 
    :param ed_threshold: the edit distance threshold below from which a word is replaced
    :param max_unk_tokens: the max number of unknown tokens in the transcribed text 
    :return: the new text
    """
    tokens = input_text.split()
    # Unknown transcribed tokens; proceed only if few
    unknowns = [i for i, w in enumerate(tokens) if w not in reference_words]
    if len(unknowns) > max_unk_tokens:
        return " ".join(tokens)

    for ind in unknowns:
        # Replace each uknown token with the ground truth token w/min edit distance 
        word = tokens[ind]
        min_cer, new_word = 100, word
        for ref in reference_words:
            candidate_min_cer = pywer.cer([ref], [word])
            if candidate_min_cer < min_cer:
                min_cer = candidate_min_cer
                if min_cer < ed_threshold:
                    new_word = ref
    tokens[ind] = new_word
    
    return " ".join(tokens)