# Spelling correction
## by
### Florian Eder    00819174
### Moritz Enderle  00819536

#### imports

In [1]:
import re
import sys
from collections import Counter

#### open txt

In [2]:
def words(_text):
    """
    Tokenize a text into lower-cased words
    :param _text: text to be tokenized
    :return: list of every run of word characters, lower-cased, in order of appearance
    """
    lowered = _text.lower()
    return re.findall(r'\w+', lowered)

# Read the whole corpus into memory; the file is only needed for the read itself.
with open('big.txt', encoding="utf-8") as f:
    text = f.read()

# Frequency table: word -> number of occurrences in the corpus.
WORDS = Counter(words(text))

#### given functions

In [3]:
def probability(_word, _no_words=sum(WORDS.values())):
    """
    Relative frequency of `_word` in the corpus
    :param _word: word to be looked up in WORDS
    :param _no_words: total number of counted words; the default is evaluated
        once, when this function is defined
    :return: count of `_word` divided by `_no_words`
    """
    occurrences = WORDS[_word]
    return occurrences / _no_words


def correction(_word):
    """
    Pick the best spelling correction for a word
    :param _word: word to be corrected
    :return: the candidate with the highest corpus probability
    """
    options = candidates(_word)
    return max(options, key=probability)


def candidates(_word):
    """
    Collect plausible spelling corrections for a word
    :param _word: word to be corrected
    :return: the first non-empty group of: the word itself if known, known
        words one edit away, known words two edits away; otherwise a list
        holding only the word itself
    """
    exact = known([_word])
    if exact:
        return exact

    one_edit_away = known(edits1(_word))
    if one_edit_away:
        return one_edit_away

    two_edits_away = known(edits2(_word))
    if two_edits_away:
        return two_edits_away

    # Nothing in the dictionary is close enough: hand the input back unchanged.
    return [_word]


def known(_words):
    """
    Filter an iterable of words down to those present in WORDS
    :param _words: iterable of words to be checked
    :return: set of the words that appear in the dictionary of WORDS
    """
    return {entry for entry in _words if entry in WORDS}


def edits1(_word):
    """
    Every string reachable from `_word` with a single edit
    :param _word: word to be edited
    :return: set of strings produced by one deletion, one transposition of
        adjacent characters, one replacement or one insertion of a letter a-z
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    variants = set()
    for cut in range(len(_word) + 1):
        head, tail = _word[:cut], _word[cut:]

        # An insertion is possible at every position, including the very end.
        variants.update(head + letter + tail for letter in alphabet)

        if not tail:
            continue

        # Drop the character right after the cut.
        variants.add(head + tail[1:])
        # Replace the character right after the cut.
        variants.update(head + letter + tail[1:] for letter in alphabet)

        if len(tail) > 1:
            # Swap the two characters right after the cut.
            variants.add(head + tail[1] + tail[0] + tail[2:])
    return variants


def edits2(_word):
    """
    Every string reachable from `_word` with two successive single edits
    :param _word: word to be edited
    :return: generator over the results (duplicates are not removed)
    """
    first_pass = edits1(_word)
    return (twice for once in first_pass for twice in edits1(once))

#### improvements
#### context approach

In [4]:
%%time
# CONTEXT maps each word to a dict of {preceding word: number of times that
# word came directly before it in the filtered token list}.
CONTEXT = {}

# Tokenize on whitespace and keep only purely alphabetic tokens. Tokens that
# carry punctuation or digits are dropped entirely, so "preceding word" means
# the previous *kept* token.
text = text.strip().split()
# Fixed: the original chained `.replace().lower('"', '')`, which raises
# TypeError (replace() needs arguments, lower() takes none).
text = [x.strip().replace('.', '').replace('"', '').lower() for x in text if x.isalpha()]

for index, word in enumerate(text):
    # Register every word, even the first token, which has no predecessor.
    predecessors = CONTEXT.setdefault(word, {})
    if index > 0:
        previous = text[index - 1]
        predecessors[previous] = predecessors.get(previous, 0) + 1


CPU times: total: 109 ms
Wall time: 115 ms


In [5]:
def correction_context(_word, _context):
    """
    Most probable spelling correction for word, given the word before it
    :param _word: word to be corrected (compared case-insensitively)
    :param _context: the word preceding `_word`; an empty value falls back to
        the context-free corpus probability
    :return: most probable spelling correction for word
    """
    # The docstring must be the first statement; the original placed the
    # assignment above it, which left the function without a __doc__.
    _word = _word.lower()
    if not _context:
        return max(candidates(_word), key=probability)

    # The keys of CONTEXT are lower-cased, so the context has to be as well,
    # otherwise a capitalized context word never matches.
    _context = _context.lower()

    # Rank by context probability first. The plain corpus probability breaks
    # ties, so an unseen context (all scores 0) still yields the most frequent
    # candidate instead of whichever element the candidate set yields first.
    return max(
        candidates(_word),
        key=lambda x: (probability_context(x, _context), probability(x)),
    )


def probability_context(_word, _context):
    """
    Share of the occurrences of `_word` that were preceded by `_context`
    :param _word: word to be probed
    :param _context: word expected directly before `_word`
    :return: count of `_context` before `_word` divided by the count of all
        recorded predecessors of `_word`; 0 if the pair was never recorded
    """
    predecessors = CONTEXT.get(_word)
    if predecessors is None or _context not in predecessors:
        return 0
    return predecessors[_context] / sum(predecessors.values())

#### testing

In [9]:
from pprint import pprint

# Show the recorded predecessor counts for "the": each key is a word that
# appeared directly before "the" in the token list, each value how often.
pprint(CONTEXT["the"])

{'a': 20,
 'abominable': 1,
 'about': 32,
 'above': 11,
 'accumulated': 1,
 'across': 16,
 'add': 1,
 'addressed': 1,
 'adler': 1,
 'admirably': 1,
 'admire': 1,
 'admiring': 1,
 'advise': 1,
 'affect': 1,
 'afraid': 1,
 'after': 17,
 'afterwards': 3,
 'against': 29,
 'ago': 1,
 'agony': 1,
 'alice': 1,
 'alicia': 1,
 'all': 54,
 'along': 6,
 'aloysius': 1,
 'already': 2,
 'also': 1,
 'alter': 1,
 'always': 2,
 'am': 3,
 'amid': 10,
 'among': 24,
 'an': 3,
 'and': 196,
 'announced': 1,
 'answer': 1,
 'answered': 1,
 'answering': 2,
 'apparently': 1,
 'apply': 1,
 'approached': 2,
 'are': 23,
 'around': 2,
 'arrange': 1,
 'arrested': 1,
 'arrived': 1,
 'as': 36,
 'ascend': 1,
 'ascended': 2,
 'aside': 2,
 'ask': 1,
 'asked': 1,
 'assisted': 1,
 'at': 246,
 'attacked': 1,
 'attention': 1,
 'attract': 1,
 'autumnal': 1,
 'avoid': 1,
 'avoided': 1,
 'avoiding': 1,
 'away': 2,
 'bachelor': 2,
 'back': 2,
 'balancing': 1,
 'band': 1,
 'be': 34,
 'bear': 3,
 'beat': 2,
 'beautiful': 1,
 'beau

In [10]:
# Baseline: correction without any context.
print(correction("ther"))

# Same misspelling, corrected with different preceding words.
for preceding in ("and", "but", "about", "at"):
    print(correction_context("ther", preceding))

the
then
there
their
the
