# Spelling correction
## by
### Florian Eder    00819174
### Moritz Enderle  00819536

#### imports

In [1]:
import os
import re
from collections import Counter

#### open txt

In [2]:
def words(_text):
    """
    Tokenise a text into lower-case words.
    :param _text: text to be parsed
    :return: list of all maximal runs of word characters, lower-cased
    """
    lowered = _text.lower()
    tokens = re.findall(r'\w+', lowered)
    return tokens

# Word-frequency table over the whole corpus, plus the raw texts themselves
# (the raw texts are needed again later to build the context statistics).
WORDS = Counter()
texts = []
txt_names = [name for name in os.listdir("files") if name.endswith(".txt")]
for name in txt_names:
    with open("files/" + name, "r") as handle:
        content = handle.read()
    texts.append(content)
    WORDS.update(words(content))
# Notebook display: the ten most frequent words with their counts.
WORDS.most_common(10)

[('the', 21698),
 ('of', 10500),
 ('a', 9957),
 ('and', 9818),
 ('to', 9438),
 ('in', 6787),
 ('i', 5433),
 ('is', 4951),
 ('it', 4649),
 ('that', 4480)]

#### given functions

In [3]:
def probability(_word, _no_words=sum(WORDS.values())):
    """
    Relative frequency of a word in the WORDS counter.
    :param _word: word to be probed
    :param _no_words: denominator; the default is the sum of all counts in
        WORDS, evaluated once when this function is defined
    :return: count of `_word` in WORDS divided by `_no_words`
    """
    occurrences = WORDS[_word]
    return occurrences / _no_words


def correction_old(_word):
    """
    Pick the correction candidate with the highest corpus probability.
    :param _word: word to be corrected
    :return: the candidate of `_word` for which `probability` is largest
    """
    options = candidates(_word)
    return max(options, key=probability)


def candidates(_word):
    """
    Collect possible spelling corrections, preferring the smallest edit distance.
    :param _word: word to be corrected
    :return: the first non-empty of: the word itself if known, known words one
        edit away, known words two edits away (each a set); otherwise a
        one-element list holding the unchanged word
    """
    exact = known([_word])
    if exact:
        return exact

    one_edit = known(edits1(_word))
    if one_edit:
        return one_edit

    # Only reached when nothing closer matched, so the expensive two-edit
    # expansion is not evaluated unnecessarily.
    two_edits = known(edits2(_word))
    if two_edits:
        return two_edits

    return [_word]


def known(_words):
    """
    Filter an iterable of words down to those present in WORDS.
    :param _words: iterable of words to be checked
    :return: set of the words that appear in WORDS
    """
    found = set()
    for token in _words:
        if token in WORDS:
            found.add(token)
    return found


def edits1(_word):
    """
    Build every string that is exactly one simple edit away from a word.
    An edit is a deletion, a transposition of two adjacent characters, a
    replacement by a lower-case ASCII letter, or an insertion of one.
    :param _word: word to be edited
    :return: set of all resulting strings
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    variants = set()
    for cut in range(len(_word) + 1):
        head, tail = _word[:cut], _word[cut:]

        # insertion of a letter at the cut position (also valid at the very end)
        for letter in alphabet:
            variants.add(head + letter + tail)

        if not tail:
            continue

        # deletion of the character right after the cut
        variants.add(head + tail[1:])

        # replacement of the character right after the cut
        for letter in alphabet:
            variants.add(head + letter + tail[1:])

        # transposition of the two characters right after the cut
        if len(tail) > 1:
            variants.add(head + tail[1] + tail[0] + tail[2:])
    return variants


def edits2(_word):
    """
    Lazily produce the strings reachable by applying `edits1` twice.
    :param _word: word to be edited
    :return: generator over the results of `edits1` applied to every result of
        `edits1(_word)`; the same string may be yielded more than once
    """
    first_round = edits1(_word)
    return (twice for once in first_round for twice in edits1(once))

#### improvements
#### context approach

In [4]:
%%time
# CONTEXT[word][previous] = how often `previous` directly precedes `word`
# in the filtered token stream of the corpus.
CONTEXT = {}
for raw_text in texts:
    # Only purely alphabetic tokens are kept; a token carrying punctuation or
    # digits is dropped entirely, so its neighbours become adjacent.
    tokens = [token.lower() for token in raw_text.strip().split() if token.isalpha()]
    previous = None
    for current in tokens:
        predecessors = CONTEXT.setdefault(current, {})
        if previous is not None:
            predecessors[previous] = predecessors.get(previous, 0) + 1
        previous = current


CPU times: total: 891 ms
Wall time: 1.14 s


In [5]:
def correction_context(_sentence):
    """
    Correct every word of a sentence, using the preceding word as context.
    The sentence is lower-cased and split on whitespace. The first word is
    corrected by plain corpus probability. Every later word is corrected by
    `probability_context`, where the context is the `correction_old` result
    of the preceding input word.
    :param _sentence: sentence to be corrected
    :return: the corrected words joined by single spaces ("" for an empty sentence)
    """
    corrected = []
    previous = None
    for token in _sentence.lower().split():
        if previous is None:
            best = max(candidates(token), key=probability)
        else:
            # Resolve the context once per token; doing it inside the key
            # function would repeat the full candidate search for every
            # candidate of `token`.
            context = correction_old(previous)
            best = max(
                candidates(token),
                key=lambda candidate: probability_context(candidate, context),
            )
        corrected.append(best)
        previous = token
    return " ".join(corrected)

def probability_context(_word, _context):
    """
    Score a word given the word that precedes it.
    :param _word: word to be probed
    :param _context: word assumed to stand directly before `_word`
    :return: 0 if `_word` has no entry in CONTEXT; the share of `_context`
        among all recorded predecessors of `_word` if that pair was seen;
        otherwise the plain `probability` of `_word`
    """
    predecessors = CONTEXT.get(_word)
    if predecessors is None:
        return 0
    if _context not in predecessors:
        # pair never observed: fall back to the context-free frequency
        return probability(_word)
    return predecessors[_context] / sum(predecessors.values())

#### Improvement: correction if the word is not in the list

In [6]:
def endings(_word):
    """
    Repair the last character of a word whose stem implies a fixed ending.
    Rules (stem directly before the last character -> required last character):
    'ilit' -> 'y', 'call' -> 'y', 'ial' -> 'y', 'abl' -> 'e'.
    :param _word: word to be corrected
    :return: the word with its last character replaced according to the first
        matching rule, or the word unchanged when no rule applies
    """
    suffix_rules = (
        ('ilit', 'y'),
        ('call', 'y'),
        ('ial', 'y'),
        ('abl', 'e'),
    )
    # Slicing (instead of indexing) keeps the empty string safe.
    stem, last = _word[:-1], _word[-1:]
    for expected_stem, expected_last in suffix_rules:
        if stem.endswith(expected_stem) and last != expected_last:
            return stem + expected_last
    # No rule matched: hand the word back instead of an implicit None, which
    # callers that join the results into a sentence cannot handle.
    return _word

def correction_not_in_list(_word):
    """
    Correct a single word, falling back to ending rules for unknown words.
    :param _word: word to be corrected
    :return: `_word` itself if it is in WORDS; otherwise the most probable
        candidate; if that candidate is just `_word` again (nothing better
        was found), the result of `endings`, or `_word` unchanged when
        `endings` yields nothing
    """
    if _word in WORDS:
        return _word

    # Computed once: the candidate search is expensive for unknown words.
    best = max(candidates(_word), key=probability)
    if best != _word:
        return best

    repaired = endings(_word)
    # Never return None, so callers can always join the results into a sentence.
    return _word if repaired is None else repaired

In [7]:
def correction(_sentence):
    """
    Run the full correction pipeline on a sentence.
    Each whitespace-separated token first goes through
    `correction_not_in_list`; the re-joined sentence is then passed to
    `correction_context`.
    :param _sentence: sentence to be corrected
    :return: the corrected sentence as returned by `correction_context`
    """
    pre_corrected = (correction_not_in_list(token) for token in _sentence.split())
    return correction_context(" ".join(pre_corrected))

#### testing

In [16]:
# context-free baseline
print(correction_old("ther"))

# same misspelling, different preceding word
for sentence in ("and ther", "but ther", "about ther", "at ther"):
    print(correction_context(sentence))

# unknown words repaired through the ending rules
for misspelled in ("transportabilitx", "addressablr", "electroencephalographicallz"):
    print(correction_not_in_list(misspelled))

# full pipeline
print(correction("hte quik browwn f9x junpa ovrr ghe lazz dog."))

the
and then
but there
about their
at the
transportability
transportibility
addressable
electroencephalographically
the quick brown fox jumps over the lazy dog


In [40]:
print(correction("ther"))

# correction_context takes one sentence ("<context> <word>"), not a
# (word, context) pair; the corrected word is the last token of its result.
for context in ("and", "but", "about", "at"):
    print(correction_context(context + " ther").split()[-1])

print(correction_not_in_list("transportabilitx"))

print(correction_not_in_list("transportibilitx"))

print(correction_not_in_list("addressablr"))

print(correction_not_in_list("electroencephalographicallz"))

the
then
there
their
the
transportability
transportibility
addressable
electroencephalographically
