In [1]:
import re
from collections import Counter

def words(text):
    "List every run of word characters in 'text', lower-cased, in order of appearance."
    lowered = text.lower()
    # With no capture groups, each match's group() is exactly what findall would return.
    return [match.group() for match in re.finditer(r'\w+', lowered)]

# Word-frequency table for the corpus: maps each token produced by words()
# from big.txt to the number of times it occurs. The context manager closes
# the file handle as soon as the text has been read, instead of leaving the
# open handle for the garbage collector to clean up.
with open('big.txt') as corpus_file:
    WORDS = Counter(words(corpus_file.read()))

def P(word, N=sum(WORDS.values())):
    """Relative frequency of 'word': its count in WORDS divided by N.

    N defaults to the sum of all counts in WORDS. That default is evaluated
    once, when this function is defined, not on every call.
    """
    occurrences = WORDS[word]
    return occurrences / N

def correction(word):
    "Return the candidate correction for 'word' that has the highest probability P."
    options = candidates(word)
    best = max(options, key=P)
    return best

def candidates(word):
    """Return the plausible corrections for 'word', preferring the closest match.

    Tiers are tried in order and the first non-empty one wins: the word itself
    if known, then known words one edit away, then known words two edits away.
    If none yields anything, the result is the one-element list [word].
    """
    exact = known([word])
    if exact:
        return exact

    one_edit = known(edits1(word))
    if one_edit:
        return one_edit

    # The two-edit neighbourhood is only computed when the cheaper tiers found nothing.
    two_edits = known(edits2(word))
    if two_edits:
        return two_edits

    return [word]

def known(words):
    "Return, as a set, those items of 'words' that are keys of WORDS."
    return {candidate for candidate in words if candidate in WORDS}

def edits1(word):
    """Return the set of strings reachable from 'word' by a single simple edit.

    The edits are: delete one character, swap two adjacent characters, replace
    one character with a lowercase a-z letter, or insert a lowercase a-z letter
    at any position. Because a letter may be "replaced" by itself, the result
    contains 'word' itself whenever 'word' has a lowercase a-z letter in it.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    results = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]

        # Insertion is possible at every cut point, including the very end.
        results.update(head + ch + tail for ch in alphabet)

        if not tail:
            continue

        after_first = tail[1:]
        results.add(head + after_first)                              # deletion
        results.update(head + ch + after_first for ch in alphabet)   # replacement

        if len(tail) > 1:
            results.add(head + tail[1] + tail[0] + tail[2:])         # transposition
    return results

def edits2(word):
    """Lazily yield every string obtained by applying edits1 twice to 'word'.

    The result is a one-shot generator and may repeat values; wrap it in
    set() to de-duplicate.
    """
    first_pass = edits1(word)
    return (twice for once in first_pass for twice in edits1(once))

In [2]:
# A single-edit typo: one inserted letter ('g') reaches a known word.
correction('spellin')

'spelling'

In [3]:
# A two-edit typo: two letter replacements ('k'->'c', 'u'->'e') are needed.
correction('korrectud')

'corrected'

In [4]:
# correction() does not tokenize: the whole string, space included, is edited
# as one "word". Here two deletions ('I' and the space) reach a known word.
correction('I want')

'want'

In [5]:
# Number of distinct strings one edit away (edits1 returns a set).
len(edits1('somthing'))

442

In [6]:
# Of those one-edit strings, keep only the ones present in WORDS.
known(edits1('somthing'))

{'something', 'soothing'}

In [7]:
# edits2 is a generator that can repeat values, so set() de-duplicates before counting.
# Note this cell probes 'something', not the misspelling 'somthing' used in nearby cells.
len(set(edits2('something')))

114324

In [8]:
# Known words within two edits; a superset of the one-edit matches for this input.
known(edits2('somthing'))

{'loathing',
 'nothing',
 'scathing',
 'seething',
 'smoothing',
 'something',
 'soothing',
 'sorting'}

In [9]:
# Number of distinct words (keys) in the frequency table.
len(WORDS)

32198

In [10]:
# Total number of word occurrences; this is the default N that P() divides by.
sum(WORDS.values())

1115585

In [11]:
# Raw occurrence count for a single word.
WORDS['some']

1536

In [12]:
# The ten most frequent words with their counts, highest first.
WORDS.most_common(10)

[('the', 79809),
 ('of', 40024),
 ('and', 38312),
 ('to', 28765),
 ('in', 22023),
 ('a', 21124),
 ('that', 12512),
 ('he', 12401),
 ('was', 11410),
 ('it', 10681)]

In [13]:
# Iterating a Counter yields its keys, so this picks the word with the highest P.
max(WORDS, key=P)

'the'

In [14]:
# Probability of the most frequent word: its count divided by the total count.
P('the')

0.07154004401278254

In [15]:
# A word that occurs exactly once in the corpus: probability is 1 / N.
P('outrivaled')

8.963906829152417e-07

In [16]:
# A Counter returns 0 for a missing key instead of raising KeyError,
# so a word absent from the corpus gets probability 0.0.
P('unmentioned')

0.0

In [17]:
# NOTE(review): with no argument this lists every (word, count) pair in WORDS,
# producing a very long output; consider most_common(n) to keep the notebook readable.
WORDS.most_common()

[('the', 79809),
 ('of', 40024),
 ('and', 38312),
 ('to', 28765),
 ('in', 22023),
 ('a', 21124),
 ('that', 12512),
 ('he', 12401),
 ('was', 11410),
 ('it', 10681),
 ('his', 10034),
 ('is', 9773),
 ('with', 9739),
 ('as', 8064),
 ('i', 7684),
 ('had', 7383),
 ('for', 6941),
 ('at', 6789),
 ('by', 6735),
 ('on', 6639),
 ('not', 6626),
 ('be', 6155),
 ('from', 5689),
 ('but', 5653),
 ('s', 5626),
 ('you', 5622),
 ('or', 5352),
 ('her', 5284),
 ('him', 5230),
 ('which', 4842),
 ('were', 4289),
 ('all', 4143),
 ('this', 4063),
 ('she', 3946),
 ('they', 3938),
 ('are', 3630),
 ('have', 3493),
 ('said', 3464),
 ('an', 3421),
 ('one', 3371),
 ('who', 3050),
 ('so', 3017),
 ('what', 3011),
 ('there', 2972),
 ('their', 2955),
 ('when', 2920),
 ('been', 2599),
 ('may', 2551),
 ('if', 2371),
 ('no', 2348),
 ('up', 2283),
 ('my', 2249),
 ('them', 2241),
 ('into', 2124),
 ('more', 1997),
 ('out', 1987),
 ('pierre', 1964),
 ('would', 1953),
 ('prince', 1935),
 ('me', 1920),
 ('we', 1906),
 ('did', 18