# Grams

To battle OCR errors, we work with letter 2,3-grams, filter them to retain
only the ones that may occur in real words in order to distil a set of legal words.

In [2]:
import sys
import os
import collections

from tf.app import use
from tf.core.helpers import unexpanduser

In [3]:
A = use("CLARIAH/wp6-daghregisters:clone", checkout="clone", hoist=globals())

This is Text-Fabric 9.2.2
Api reference : https://annotation.github.io/text-fabric/tf/cheatsheet.html

14 features found and 0 ignored


In [47]:
wordOccs = collections.defaultdict(list)
for w in range(1, F.otype.maxSlot + 1):
    wordOccs[F.letters.v(w)].append(w)

In [87]:
CHAR_CLASSES = """
*0 •™_~"[
i1 fijklrtBDEFIJKLPRT1!ïÈÉËÏ£|!\
i2 nhuHNUüÜ«°]
i3 mM
o1 abdgopqOQ690óöÓÖ()»}#&><^
c1 ecCGèéêë€*®?
v1 vxyVXY
v2 ww
s1 sS5$§
z1 zZ
21 2%
a1 A
"""

In [88]:
OCR_KEY = {}

for line in CHAR_CLASSES.strip().split("\n"):
    (clsCard, chars) = line.split(" ", 1)
    (cls, card) = clsCard
    for c in chars:
        OCR_KEY[c] = (cls, card)

In [89]:
def getOcrKey(word):
    clses = []
    for c in word:
        (cls, card) = OCR_KEY.get(c, (c, 1))
        if clses and clses[-1][0] == cls:
            clses[-1][1] += card
        else:
            clses.append([cls, card])
    return "".join(f"{cls}{card}" for (cls, card) in clses)

In [51]:
# doe dit ook op woord niveau met edit distances:
# maak een similarity matrix tussen laagfrequente woorden en iets hoger frequente woorden
# op basis van Levenshtein
N = 3

GRAM = collections.defaultdict(list)
GRAM_CONTEXT = {x: collections.defaultdict(collections.Counter) for x in (False, True)}

# wordOccs = {"/2": 1}
for (word, occs) in wordOccs.items():
    testWord = "".join(c for c in word if c.isalnum())
    if not testWord or testWord.isdigit():
        continue
    freq = len(occs)
    wl = len(word)
    if wl <= N:
        GRAM[word].append(word)
        GRAM_CONTEXT[False][word][""] += freq
        GRAM_CONTEXT[True][word][""] += freq
        continue
    for i in range(wl - N + 1):
        gram = word[i:i + N]
        pre = word[max((i - N, 0)):i]
        post = word[i + N:min((i + N + N, wl))]
        GRAM[gram].append(word)
        GRAM_CONTEXT[False][gram][pre] += freq
        GRAM_CONTEXT[True][gram][post] += freq
print(len(GRAM))
GRAM_FREQ = {gram: sum(len(wordOccs[word]) for word in words) for (gram, words) in GRAM.items()}

10179


In [53]:
ghBase = os.path.expanduser(f"~/github")
reportDir = f"{ghBase}/clariah/wp6-daghregisters/postocr"
filePath = f"{reportDir}/gramfreq.tsv"

with open(filePath, "w") as fh:
    fh.write("gram\tfreq\n")
    for (gram, freq) in sorted(GRAM_FREQ.items()):
        fh.write(f"{gram}\t{freq}\n")
print(f"GRAM_FREQ written to {unexpanduser(filePath)}")

GRAM_FREQ written to ~/github/clariah/wp6-daghregisters/postocr/gramfreq.tsv


In [93]:
HAPAX_GRAMS = sorted(gram for (gram, freq) in GRAM_FREQ.items() if freq <= 2)
GOOD_GRAMS = sorted(gram for (gram, freq) in GRAM_FREQ.items() if freq > 20)
nHAPAX = len(HAPAX_GRAMS)
nGOOD = len(GOOD_GRAMS)
total = nHAPAX * nGOOD

print(f"{nHAPAX=} {nGOOD=} {total=}")

nHAPAX=5317 nGOOD=2346 total=12473682


In [94]:
GRAM_INCLUSION_Q = collections.defaultdict(lambda: collections.defaultdict(list))

total = nHAPAX * nGOOD 
print(f"Computing {total} inclusions")

k = 0
c = 0
chunkSize = int(round(total / 100))

for i in range(nHAPAX):
    gramH = HAPAX_GRAMS[i]
    presH = GRAM_CONTEXT[False][gramH]
    postsH = GRAM_CONTEXT[True][gramH]
    preWeight = sum(presH.values())
    postWeight = sum(postsH.values())
    keyH = getOcrKey(gramH)
    
    for j in range(nGOOD):
        if c == chunkSize:
            c = 0
            sys.stdout.write(f"\r{k:>9} = {int(round(k / chunkSize)):>3} %")
        k += 1
        c += 1
        gramG = GOOD_GRAMS[j]
        keyG = getOcrKey(gramG)
        if keyG != keyH:
            continue
        presG = GRAM_CONTEXT[False][gramG]
        postsG = GRAM_CONTEXT[True][gramG]
        qualityPre = sum(freq for (pre, freq) in presH.items() if pre in presG) / preWeight
        qualityPost = sum(freq for (post, freq) in postsH.items() if post in postsG) / postWeight
        thisValue = qualityPre * qualityPost
        GRAM_INCLUSION_Q[gramH][thisValue].append(gramG)
sys.stdout.write(f"\r{k:>9} = {int(round(k / chunkSize)):>3} %")
print("\nDone")

Computing 12473682 inclusions
 12473682 = 100 %
Done


In [95]:
GRAM_INCLUSION = {}

for (gramH, qualities) in GRAM_INCLUSION_Q.items():
    found = False
    gramG = None
    qual = None
    for quality in reversed(sorted(qualities)):
        if gramG is None:
            gramGs = qualities[quality]
            if len(gramGs) > 1:
                found = False
                break
            gramG = gramGs[0]
            qual = quality
        else:
            if quality > qual - 0.2:
                found = False
                break
            GRAM_INCLUSION[gramH] = (gramG, qual)
    if not found:
        continue

print(len(GRAM_INCLUSION))

803


In [96]:
filePath = f"{reportDir}/gramhapax.tsv"

with open(filePath, "w") as fh:
    fh.write("gramHapax\tgramGood\tquality\n")
    for (gramH, (gramG, quality)) in GRAM_INCLUSION.items():
        fh.write(f"{gramH}\t{gramG}\t{quality}\n")
print(f"GRAM_INCLUSION written to {unexpanduser(filePath)}")

GRAM_INCLUSION written to ~/github/clariah/wp6-daghregisters/postocr/gramhapax.tsv


In [102]:
gramH = "]ol"
gramG = "Got"
  
presH = GRAM_CONTEXT[False][gramH]
postsH = GRAM_CONTEXT[True][gramH]
preWeight = sum(presH.values())
postWeight = sum(postsH.values())

In [103]:
presH

Counter({'': 1})

In [104]:
postsH

Counter({'uar': 1})

In [105]:
presG = GRAM_CONTEXT[False][gramG]
postsG = GRAM_CONTEXT[True][gramG]

In [106]:
postsG["uar"]

2