In [14]:
import pandas as pd
import os
import json

!pip install pywer
import pywer



In [37]:
datasets = []

for root, dirs, files in os.walk('../data/outputs'):
    for file in files:
        if file.endswith(".jsonl"):
            
            input_file = os.path.join(root, file)
            print(input_file)
            
            with open(input_file) as f:
                lines = f.read().splitlines()
            df_inter = pd.DataFrame(lines)
            df_inter.columns = ['json_element']
            df_inter['json_element'].apply(json.loads)
            df = pd.json_normalize(df_inter['json_element'].apply(json.loads))
            
            datasets.append(df)
            

../data/outputs/prompt_complex_02/icdar-2017/results-icdar-2017-gpt2.jsonl


In [34]:
dataset = datasets[0]

ht_raw = " ".join(dataset['groundtruth.sentence'].to_list())
st_raw = " ".join(dataset['prediction.sentence'].to_list())
print(f"{len(set(ht_raw.lower()))} characters in human transcription")
print(f"{len(set(st_raw.lower()))} characters in system transcription")
print(f"The following characters have not been system-transcribed: \n{set(ht_raw.lower())-set(st_raw.lower())}")
print(f"The following *have been* system-transcribed: \n{set(ht_raw.lower()).intersection(set(st_raw.lower()))}")

46 characters in human transcription
56 characters in system transcription
The following characters have not been system-transcribed: 
{'•', '■', '!'}
The following *have been* system-transcribed: 
{'6', 'o', 'd', 'e', 'l', 'x', 'b', ' ', 'k', 's', 'h', 'c', '8', '.', 'q', '9', '-', '5', '7', 'r', '2', '4', 'f', 'n', 'p', '0', 'g', '?', 'y', 'a', 'j', 'z', 'v', 'u', 'm', '1', ',', 't', 'i', '3', "'", 'w', '£'}


In [36]:
sample = dataset#.sample(frac=0.1)
for ocr_sent, ground_sent, pred_sent in zip(sample['ocr.sentence'], sample['groundtruth.sentence'], sample['prediction.sentence']):
    print('--'*100)
    print('*', ocr_sent)
    print('**', ground_sent)
    print('***', pred_sent)
                                                                                                          
                                                                                                

--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
* The price one penny, is pub- lished every Monday, Tuesday, Wednesday, Thursday, and Friday mornings, at the oﬃces, No. 38, New Street, Birmingham, and is ready for delivery to news agents at six o'clock.
** The Birmingham Daily Post, 1 prioe one penny, is 'pub luhed every Monday, Tuesday, Wednesday, Thursday, end Friday mornings, at the offices, No. 38, New Street, Birmingham, and is ready for delivery to news agents at six o'clock.
*** "The price one penny, is pub- lished every Monday, Tuesday, Wednesday, Thursday, and Friday mornings, at the oﬃces, No. 38, New Street,
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
* Arrangements have

In [10]:
tokens = ht_raw.split()
WORDS = set(tokens)

In [22]:
def eddi(input_text, reference_words=WORDS, ed_threshold=25, max_unk_tokens=3):
    """ Baseline I: Edit distance -based Baseline
      An edit distance-based baseline: Given a list of valid (reference) words,
      this baseline (called eddi) detects words not in the reference list and 
      changes them to the closest one in the reference list.
      :param input_text: the source text
      :param reference_words: a list of valid words (e.g., computed from the target data) 
      :param ed_threshold: the edit distance threshold below from which a word is replaced
      :param max_unk_tokens: the max number of unknown tokens in the transcribed text 
      :return: the new text
    """
    tokens = input_text.split()
    # Unknown transcribed tokens; proceed only if few
    unknowns = [i for i, w in enumerate(tokens) if w not in reference_words]
    if len(unknowns) > max_unk_tokens:
        return " ".join(tokens)

    for ind in unknowns:
        # Replace each uknown token with the ground truth token w/min edit distance 
        word = tokens[ind]
        min_cer, new_word = 100, word
        for ref in reference_words:
            candidate_min_cer = pywer.cer([ref], [word])
            if candidate_min_cer < min_cer:
                min_cer = candidate_min_cer
                if min_cer < ed_threshold:
                    new_word = ref
        tokens[ind] = new_word
    return " ".join(tokens)

In [23]:
# Predict for dataset
dataset["B1"] = dataset['prediction.sentence'].apply(eddi)

# Calculate CER for baseline predictions
dataset["B1_CER"] = dataset.apply(lambda row: pywer.cer([row['groundtruth.sentence']], [row.B1]), axis=1)
print(f"B1 CER: {dataset.B1_CER.mean()}")

# Computing the character error *reduction* rate (CERR)
dataset["CER"] = dataset.apply(lambda row: pywer.cer([row['groundtruth.sentence']], [row['prediction.sentence']]), axis=1)
print(f"B1 CERR: {(dataset.CER - dataset.B1_CER).mean()}")

B1 CER: 98.36057509612908
B1 CERR: 0.014437290709310576


In [None]:
# Use B1 to predict for the test
# test_df["B1"] = test_df['prediction.sentence'].apply(eddi)
# test_df.sample()

In [17]:
# LMing -based baseline
!git clone https://github.com/ipavlopoulos/lm.git
from lm.markov.models import LM
wlm = LM(gram="WORD").train(tokens)
wlm.generate_text()

Cloning into 'lm'...
remote: Enumerating objects: 510, done.[K
remote: Counting objects: 100% (28/28), done.[K
remote: Compressing objects: 100% (17/17), done.[K
remote: Total 510 (delta 19), reused 11 (delta 11), pack-reused 482[K
Receiving objects: 100% (510/510), 117.99 KiB | 2.14 MiB/s, done.
Resolving deltas: 100% (277/277), done.


"The Birmingham Daily Post, 1 prioe one penny, is 'pub luhed every Monday, Tuesday, Wednesday, Thursday, end Friday mornings, at the offices, No. 38, New Street, Birmingham, and is ready for delivery to news agents at six o'clock. Arrangements have been made for the delivery of the Post to subscribers in Birmingham by eight o'clock, and we believe that all news agents will ensure its prompt delivery by that hour. The Post will be on sale in all the principal towns of Warwickshire, Staffordshire, East Worcestershire, and Salop, by seven o'clock on the mornings of publication. The price of the Dailt Post will be 5s. 5d. per quarter, or 10s. lOtf. per half year. The Saturday Evening Post, price three-halfpence, a weekly journal for the working classes, will be published every Saturday afternoon at fire o'clock, and may be obtained from all news agents in Birmingham and the district. The Daily and Saturday Evening Post are registered for transmission abroad. Three copies of the Daily Post 

In [18]:
def lamo(input_text, reference_words=WORDS, lm = wlm, max_unk_tokens=2):
  """ Baseline II: LM-based Baseline
  Any unknown words in the transcribed text are replaced by word suggested by
  a language model trained on the ground truth texts.
  :param input_text: the (transcribed) text in question
  :param reference_words: the reference vocabulary
  :param lm: a word-based statistical language model
  :param max_unk_tokens: the max number of unkown words in the text 
  :return: the new text
  """
  tokens = input_text.split()
  # Unknown transcribed tokens; proceed only if few
  unknowns = [i for i, w in enumerate(tokens) if w not in reference_words]
  if len(unknowns) > max_unk_tokens:
    return " ".join(tokens)
  for ind in unknowns:
    # Replace each uknown token with the ground truth token w/min edit distance 
    new_word = wlm.generate_next_gram(tokens[:ind-1])
    tokens[ind] = new_word
  return " ".join(tokens)

In [19]:
# computing the B2 
dataset["B2"] = dataset['ocr.sentence'].apply(lamo)
# computing the B2 CER
dataset["B2_CER"] = dataset.apply(lambda row: pywer.cer([row['groundtruth.sentence']], [row.B2]), axis=1)
print("B2's CER:", dataset.B2_CER.mean())
# computing B2's CERR
print((dataset.CER - dataset.B2_CER).mean())

B2's CER: 11.172359861073762
87.2026525257646


In [None]:
# Use B2 to predict for the test
# test_df["B2"] = test_df.SYSTEM_TRANSCRIPTION.apply(lamo)
# test_df.sample()