# Spelling correction
## by
### Florian Eder    00819174
### Moritz Enderle  00819536

#### imports

In [1]:
import re
import sys
from collections import Counter

#### load the corpus and count word frequencies

In [2]:
def words(_text):
    """
    Split a text into its lowercase word tokens
    :param _text: text to be parsed
    :return: list of lowercase tokens (runs of letters, digits and
             underscores) in order of appearance
    """
    lowered = _text.lower()
    return re.findall(r'\w+', lowered)

# Read the whole corpus into memory; the file only needs to stay open for the read.
with open('big.txt', encoding="utf-8") as f:
    text = f.read()

# Token -> number of occurrences in the corpus.
WORDS = Counter(words(text))

#### given functions

In [3]:
def probability(_word, _no_words=sum(WORDS.values())):
    """
    Relative frequency of `_word` in the corpus
    :param _word: word to be probed
    :param _no_words: total number of counted tokens; the default is computed
                      once, at the moment this function is defined
    :return: count of `_word` in WORDS divided by `_no_words`
    """
    occurrences = WORDS[_word]
    return occurrences / _no_words


def correction(_word):
    """
    Pick the candidate correction with the highest corpus probability
    :param _word: word to be corrected
    :return: the candidate for which `probability` is largest
    """
    options = candidates(_word)
    return max(options, key=probability)


def candidates(_word):
    """
    Collect plausible corrections, preferring the smallest edit distance
    :param _word: word to be corrected
    :return: set of known words at the first distance (0, 1, then 2 edits)
             that yields any; a one-element list holding `_word` itself when
             nothing known is found
    """
    exact = known([_word])
    if exact:
        return exact

    one_edit_away = known(edits1(_word))
    if one_edit_away:
        return one_edit_away

    # Only reached (and only computed) when nothing closer is known.
    two_edits_away = known(edits2(_word))
    if two_edits_away:
        return two_edits_away

    return [_word]


def known(_words):
    """
    Keep only the words that occur in WORDS
    :param _words: iterable of words to be checked
    :return: set of the given words that are keys of WORDS
    """
    return {candidate for candidate in _words if candidate in WORDS}


def edits1(_word):
    """
    Every string reachable from `_word` with one edit
    :param _word: word to be corrected
    :return: set of strings produced by one deletion, one adjacent
             transposition, one replacement or one insertion of a
             lowercase ASCII letter
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    variants = set()
    for cut in range(len(_word) + 1):
        head, tail = _word[:cut], _word[cut:]

        # Insertion is possible at every cut, including the very end.
        for letter in alphabet:
            variants.add(head + letter + tail)

        if not tail:
            continue

        # Deletion and replacement both act on the first character of the tail.
        variants.add(head + tail[1:])
        for letter in alphabet:
            variants.add(head + letter + tail[1:])

        # Transposition needs two characters to swap.
        if len(tail) > 1:
            variants.add(head + tail[1] + tail[0] + tail[2:])
    return variants


def edits2(_word):
    """
    Every string reachable from `_word` by applying `edits1` twice
    :param _word: word to be corrected
    :return: generator over the results (not de-duplicated)
    """
    first_pass = edits1(_word)
    return (
        second_edit
        for first_edit in first_pass
        for second_edit in edits1(first_edit)
    )

#### improvements
#### context approach

In [4]:
%%time
# CONTEXT[word][previous] = how often `previous` directly precedes `word`
# in the filtered token list.
CONTEXT = {}

# Keep only purely alphabetic whitespace-separated tokens, lowercased.
# NOTE(review): tokens carrying punctuation (e.g. "end." or "word,") fail
# isalpha() and are dropped, so the words on either side of a dropped token
# are counted as neighbours - confirm this is intended.
text = [token.lower() for token in text.split() if token.isalpha()]

for index, word in enumerate(text):
    predecessors = CONTEXT.setdefault(word, {})
    if index == 0:
        # The first token has no predecessor; it is only registered.
        continue
    previous = text[index - 1]
    predecessors[previous] = predecessors.get(previous, 0) + 1


CPU times: total: 109 ms
Wall time: 115 ms


In [5]:
def correction_context(_word, _context):
    """
    Most probable spelling correction for word, given the word before it
    :param _word: word to be corrected (case-insensitive)
    :param _context: word preceding `_word` (case-insensitive); an empty
                     value selects the context-free correction
    :return: most probable spelling correction for word
    """
    _word = _word.lower()
    if not _context:
        return max(candidates(_word), key=probability)

    # CONTEXT keys are lowercase, so the lookup key has to be as well.
    _context = _context.lower()
    # The unigram probability breaks ties - most importantly when the context
    # was never seen before any candidate and every contextual score is 0.
    # Without it the winner would depend on set iteration order.
    return max(
        candidates(_word),
        key=lambda x: (probability_context(x, _context), probability(x)),
    )


def probability_context(_word, _context):
    """
    Share of the recorded occurrences of `_word` that follow `_context`
    :param _word: word to be probed
    :param _context: word expected directly before `_word`
    :return: count of `_context` before `_word` divided by the sum of all
             predecessor counts of `_word`; 0 if the pair was never recorded
    """
    predecessors = CONTEXT.get(_word)
    if predecessors is None or _context not in predecessors:
        return 0
    return predecessors[_context] / sum(predecessors.values())

#### testing

In [6]:
from pprint import pprint

# Show which words directly precede "their" in the corpus, and how often.
their_predecessors = CONTEXT["their"]
pprint(their_predecessors)

{'about': 2,
 'after': 2,
 'all': 1,
 'and': 3,
 'as': 2,
 'at': 2,
 'break': 1,
 'buy': 1,
 'by': 2,
 'closed': 1,
 'completed': 1,
 'covered': 1,
 'digesting': 1,
 'do': 2,
 'done': 1,
 'down': 1,
 'find': 1,
 'for': 6,
 'from': 4,
 'half': 1,
 'have': 2,
 'in': 7,
 'into': 2,
 'loading': 1,
 'made': 1,
 'make': 2,
 'monotonous': 1,
 'of': 11,
 'on': 1,
 'out': 1,
 'over': 1,
 'prove': 1,
 'put': 1,
 'regained': 1,
 'send': 1,
 'studying': 1,
 'supply': 1,
 'that': 2,
 'the': 1,
 'to': 6,
 'told': 1,
 'upon': 4,
 'was': 1,
 'what': 1,
 'where': 1,
 'with': 6,
 'within': 1}


In [7]:
# Context-free baseline first, then the same typo after three different words.
print(correction("ther"))

for preceding_word in ("and", "but", "about"):
    print(correction_context("ther", preceding_word))

the
then
there
their


In [8]:
%%script echo skipping
def spelltest(tests, verbose=False):
    """
    Run correction(wrong) on all (right, wrong) pairs; report results.
    :param tests: sized collection of (right, wrong) pairs
    :param verbose: if true, print every correction that missed its target
    :return: None; the summary is printed
    """
    import time
    n = len(tests)
    if n == 0:
        # Nothing to score; the ratios below would divide by zero.
        print('0 tests; nothing to report')
        return

    # perf_counter is monotonic and high-resolution, unlike the wall clock.
    start = time.perf_counter()
    good, unknown = 0, 0
    for right, wrong in tests:
        w = correction(wrong)
        if w == right:
            good += 1
            continue
        # A miss counts as "unknown" when the target is not in the dictionary.
        unknown += (right not in WORDS)
        if verbose:
            print('correction({}) => {} ({}); expected {} ({})'
                  .format(wrong, w, WORDS[w], right, WORDS[right]))
    dt = time.perf_counter() - start
    # Guard against an elapsed time that reads as zero on a very fast run.
    rate = n / dt if dt > 0 else float('inf')
    print('{:.0%} of {} correct ({:.0%} unknown) at {:.0f} words per second '
          .format(good / n, n, unknown / n, rate))

def testset(lines):
    "Parse 'right: wrong1 wrong2' lines into [('right', 'wrong1'), ('right', 'wrong2')] pairs."
    return [(right, wrong)
            for (right, wrongs) in (line.split(':') for line in lines)
            for wrong in wrongs.split()]

# Score the corrector on both labelled sets; `with` closes each file
# instead of leaving the handle open.
with open('spell-testset1.txt') as development_file:  # Development set
    spelltest(testset(development_file))
with open('spell-testset2.txt') as final_file:  # Final test set
    spelltest(testset(final_file))

Couldn't find program: 'echo'
