# Spelling correction
## by
### Florian Eder    00819174
### Moritz Enderle  00819536

#### imports

In [13]:
import os
import re
from collections import Counter

#### open txt

In [14]:
def words(_text):
    """
    Return all words in text
    :param text: text to be parsed
    :return: list of words
    """
    return re.findall(r'\w+', _text.lower())

WORDS = Counter()
texts = []
am = 0
for file in os.listdir("files"):
    if file.endswith(".txt"):
        with open("files/" + file, "r") as f:
            text = f.read()
            am += len(words(text))
            texts.append(text)
            WORDS.update(words(text))
len(WORDS)

20799

#### given functions

In [15]:
def probability(_word, _no_words=sum(WORDS.values())):
    """
    Probability of `word`
    :param _word: word to be probed
    :param _no_words: number of words in the counter
    :return probability of `word`
    """
    return WORDS[_word] / _no_words


def correction(_word):
    """
    Most probable spelling correction for word
    :param _word: word to be corrected
    :return most probable spelling correction for word
    """
    return max(candidates(_word), key=probability)


def candidates(_word):
    """
    Generate possible spelling corrections for word
    :param _word: word to be corrected
    :return: list of possible spelling corrections for word
    """
    return known([_word]) or known(edits1(_word)) or known(edits2(_word)) or [_word]


def known(_words):
    """
    The subset of `words` that appear in the dictionary of WORDS
    :param _words: list of words to be checked
    :return: list of words that appear in the dictionary of WORDS
    """
    return set(w for w in _words if w in WORDS)


def edits1(_word):
    """All edits that are one edit away from `word`
    :param _word: word to be corrected
    :return: list of possible spelling corrections for word
    """
    letters = 'abcdefghijklmnopqrstuvwxyz'
    splits = [(_word[:i], _word[i:]) for i in range(len(_word) + 1)]
    deletes = [L + R[1:] for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    inserts = [L + c + R for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)


def edits2(_word):
    """
    All edits that are two edits away from `word`
    :param _word: word to be corrected
    :return: list of possible spelling corrections for word
    """
    return (e2 for e1 in edits1(_word) for e2 in edits1(e1))

#### improvements
#### context approach

In [16]:
%%time
CONTEXT = {}
for text in texts:
    text = text.strip().split()
    text = [x.strip().replace('.', '').replace('"', '').lower() for x in text if x.isalpha()]
    for index, word in enumerate(text):
        word = word.lower()
        if word not in CONTEXT:
            CONTEXT[word] = {}
        if index > 0:
            if text[index - 1] not in CONTEXT[word]:
                CONTEXT[word][text[index - 1]] = 0
            CONTEXT[word][text[index - 1]] += 1


CPU times: total: 828 ms
Wall time: 1.04 s


In [17]:
def correction_context(_sentence):
    """
    Most probable spelling correction for word
    :param _sentence: sentence to be corrected
    :return: most probable spelling correction for word
    """
    _words = _sentence.lower().split()
    result = ""
    for i in range(len(_words)):
        _word, _context = _words[i], _words[i - 1] if i > 0 else (_words[i], "")
        if _context == "":
            result += max(candidates(_word), key=probability)
        else:
            result += max(candidates(_word), key=lambda x: probability_context(x, _context))

    return result


def probability_context(_word, _context):
    """
    Probability of `word`
    :param _word: word to be probed
    :param _context: context of the word
    :return probability of `word`
    """
    if _context in CONTEXT[_word]:
        return CONTEXT[_word][_context] / sum(CONTEXT[_word].values())
    else:
        return probability(_word=_word)

#### testing

In [22]:
from pprint import pprint
pprint(CONTEXT)

{'a': {'a': 36,
       'about': 21,
       'above': 1,
       'accented': 1,
       'accenting': 1,
       'accept': 1,
       'accomodate': 2,
       'accounts': 1,
       'accuracy': 1,
       'achieve': 1,
       'acm': 1,
       'acquire': 2,
       'across': 4,
       'activate': 1,
       'actually': 1,
       'add': 3,
       'added': 4,
       'address': 1,
       'addressed': 1,
       'adds': 1,
       'admit': 2,
       'admittedly': 1,
       'adopt': 1,
       'adopted': 2,
       'adopting': 1,
       'advanced': 1,
       'advancemeny': 1,
       'afforded': 1,
       'afraid': 1,
       'after': 22,
       'afterwards': 2,
       'again': 5,
       'against': 7,
       'aged': 1,
       'air': 1,
       'all': 5,
       'allophones': 1,
       'allow': 2,
       'allowed': 2,
       'allowing': 1,
       'allows': 4,
       'almost': 6,
       'along': 3,
       'already': 3,
       'also': 12,
       'although': 7,
       'always': 5,
       'am': 21,
       'amid': 2,

In [19]:
print(correction("ther"))

print(correction_context("ther", "and"))

print(correction_context("ther", "but"))

print(correction_context("ther", "about"))

print(correction_context("ther", "at"))

the
then
there
their
the
