# OCR - From images to text

In [None]:
# basic python tools
import re, os, sys
sys.path.append(os.path.join("..", "..", "CDS-LANG"))

# OCR tools
import cv2
import pytesseract
from utils.imutils import jimshow, jimshow_channel

# data processing tools
import numpy as np 
import pandas as pd 
import gensim
import gensim.downloader as api

# readymade spellchecker
from autocorrect import Speller

In [None]:
"""
Peter Norvig's spell-checker, modified to work with ranked embeddings

http://norvig.com/spell-correct.html
"""

def words(text):
    "Lower-case `text` and return every run of word characters as a list."
    lowered = text.lower()
    token_pattern = re.compile(r'\w+')
    return token_pattern.findall(lowered)

def P(word):
    "Score of `word`: the value stored for it in WORDS, or 0 if absent."
    # the stored rank stands in for likelihood;
    # words missing from the dictionary score 0
    missing_score = 0
    return WORDS.get(word, missing_score)

def correction(word):
    "Return the candidate for `word` that P scores highest."
    options = candidates(word)
    best = max(options, key=P)
    return best

def candidates(word):
    "Return the first non-empty tier of known spellings for `word`."
    # tier 0: the word itself is in the dictionary
    exact = known([word])
    if exact:
        return exact

    # tier 1: dictionary words one edit away
    one_away = known(edits1(word))
    if one_away:
        return one_away

    # tier 2: dictionary words two edits away
    two_away = known(edits2(word))
    if two_away:
        return two_away

    # nothing known nearby: fall back to the word unchanged
    return [word]

def known(words):
    "Return, as a set, the members of `words` that are present in WORDS."
    return {candidate for candidate in words if candidate in WORDS}

def edits1(word):
    "Return the set of strings reachable from `word` by a single edit."
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    # every way of cutting the word into a (head, tail) pair
    pieces = [(word[:cut], word[cut:]) for cut in range(len(word) + 1)]

    removed, swapped, substituted, added = [], [], [], []
    for head, tail in pieces:
        if tail:
            # drop the first character of the tail
            removed.append(head + tail[1:])
            # swap the first character of the tail for each letter
            substituted.extend(head + ch + tail[1:] for ch in alphabet)
        if len(tail) > 1:
            # exchange the first two characters of the tail
            swapped.append(head + tail[1] + tail[0] + tail[2:])
        # slot each letter in at the cut point
        added.extend(head + ch + tail for ch in alphabet)

    return set(removed + swapped + substituted + added)

def edits2(word):
    "Lazily yield every string obtained by applying two single edits to `word`."
    # the first round of edits is computed up front; the second round is
    # produced on demand as the generator is consumed
    first_round = edits1(word)
    return (twice for once in first_round for twice in edits1(once))

def check(target):
    """Report `target` if it cannot be looked up in WORDS.

    When `WORDS[target]` raises KeyError, print an error line that includes
    the suggestion returned by `correction(target)`. Print nothing when the
    lookup succeeds. Always returns None.
    """
    try:
        WORDS[target]
    except KeyError:
        # Only a failed lookup means "unknown word". A bare `except:` would
        # also swallow KeyboardInterrupt, SystemExit and genuine bugs.
        print(f"[ERROR]: Word '{target}' not found. Suggested correction: '{correction(target)}'")
        
def regex_clean(string):
    """Strip common OCR clutter from `string` and normalise its whitespace.

    Newlines become spaces; the literal sequences "__", " - " and '-""' are
    replaced by a space; "|" and "!" are removed. Finally every run of
    whitespace is collapsed to a single space and leading/trailing
    whitespace is dropped.

    Returns the cleaned string (empty if nothing but clutter remains).
    """
    # Applied in order: newlines must become spaces first so that a dash
    # sitting alone between two lines is seen as " - ".
    substitutions = (
        ("\n", " "),
        ("__", " "),
        (" - ", " "),
        ('-""', " "),
        ("|", ""),
        ("!", ""),
    )
    processed = string
    for old, new in substitutions:
        processed = processed.replace(old, new)
    # split()/join collapses all whitespace runs and trims both ends. This
    # replaces the former `.replace("\s\s", " ")`, which used an invalid
    # escape sequence and, being a plain str.replace, never matched
    # whitespace at all.
    return " ".join(processed.split())

## OCR using ```Tesseract```

## Preprocess with OpenCV

Note that the Tesseract documentation on GitHub gives a number of tips on how best to preprocess images to improve OCR performance: https://github.com/tesseract-ocr/tessdoc/blob/main/ImproveQuality.md#rescaling. You should have the skills to do all of these things using OpenCV.

__Crop__

__Greyscale__

__OCR again__

__Thresholding__

## Quick and cheap spell checking

__Initialize speller__

## Use word embedding rankings

## Get word rankings from model

__Check word embeddings rankings for candidate replacements__