In [1]:
import re
from collections import Counter
from time import perf_counter

def words(text):
    """Lower-case `text` and return every run of word characters, in order."""
    lowered = text.lower()
    return re.findall(r'\w+', lowered)

# Raw string: the Windows path contains backslash sequences (\D, \S, \L, \w)
# that are invalid escapes in a normal string literal and trigger warnings.
# The resulting path value is unchanged.
path_corpus = r"D:\Dokumen\SKRIPSI\Language Model\DATASET\wortschatz.leipzig.txt"

# Token -> occurrence count for the whole corpus. The `with` block closes the
# file handle as soon as the text has been read.
with open(path_corpus, encoding="utf8") as corpus_file:
    WORDS = Counter(words(corpus_file.read()))

In [2]:
def P(word, N=sum(WORDS.values())):
    """Return the relative frequency of `word` among the counts in WORDS.

    `N` defaults to the sum of all counts in WORDS, evaluated once when this
    function is defined.
    """
    occurrences = WORDS[word]
    return occurrences / N
def correction(word):
    """Return the candidate for `word` that has the highest probability `P`."""
    options = candidates(word)
    return max(options, key=P)

def candidates(word):
    """Return the first non-empty group of possible corrections for `word`.

    Groups are tried in order: the word itself if known, known words one edit
    away, known words two edits away, and finally `[word]` unchanged.
    """
    exact = known([word])
    if exact:
        return exact

    one_away = known(edits1(word))
    if one_away:
        return one_away

    two_away = known(edits2(word))
    if two_away:
        return two_away

    # Nothing known nearby: fall back to the input itself.
    return [word]

def known(words):
    """Return, as a set, the items of `words` that are keys of WORDS."""
    return {entry for entry in words if entry in WORDS}

def edits1(word, letters='abcdefghijklmnopqrstuvwxyz'):
    """Return the set of strings that are one edit away from `word`.

    An edit is a single-character deletion, a swap of two adjacent characters,
    a replacement of one character, or an insertion of one character.

    Parameters:
        word: the string to edit.
        letters: characters used for replacements and insertions. Defaults to
            the lowercase ASCII alphabet; pass a different string to support
            other alphabets.

    Returns:
        A set of strings (it can include `word` itself, e.g. when a character
        is replaced by the same character).
    """
    # Every way of cutting the word into a (head, tail) pair, including the
    # cuts before the first and after the last character.
    splits     = [(word[:i], word[i:])                   for i in range(len(word) + 1)]
    deletes    = [head + tail[1:]                        for head, tail in splits if tail]
    transposes = [head + tail[1] + tail[0] + tail[2:]    for head, tail in splits if len(tail) > 1]
    replaces   = [head + c + tail[1:]                    for head, tail in splits if tail for c in letters]
    inserts    = [head + c + tail                        for head, tail in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def edits2(word):
    """Return a generator over the strings reachable from `word` by applying `edits1` twice.

    The result may yield the same string more than once.
    """
    first_round = edits1(word)
    return (twice for once in first_round for twice in edits1(once))



In [3]:
# Demo: list every (prefix, suffix) cut of a sample phrase, from the cut
# before the first character to the cut after the last one.
word = 'semester akhir'
splits = list((word[:cut], word[cut:]) for cut in range(1 + len(word)))
splits

[('', 'semester akhir'),
 ('s', 'emester akhir'),
 ('se', 'mester akhir'),
 ('sem', 'ester akhir'),
 ('seme', 'ster akhir'),
 ('semes', 'ter akhir'),
 ('semest', 'er akhir'),
 ('semeste', 'r akhir'),
 ('semester', ' akhir'),
 ('semester ', 'akhir'),
 ('semester a', 'khir'),
 ('semester ak', 'hir'),
 ('semester akh', 'ir'),
 ('semester akhi', 'r'),
 ('semester akhir', '')]

In [4]:
# Spot-check: run the corrector on a single input and display the result.
correction('makang')

'makan'

In [7]:
def unit_tests():
    """Run fixed sanity checks on the corrector, the tokenizer and the corpus counts.

    Raises AssertionError on the first failing check; otherwise returns the
    string 'unit_tests pass'.
    """
    # (input, expected correction); the trailing note names the edit involved.
    spelling_cases = [
        ('panh', 'panah'),              # insert
        ('przjuryt', 'prajurit'),       # replace 2
        ('minjadi', 'menjadi'),         # replace
        ('merupan', 'merupakan'),       # insert 2
        ('teersebut', 'tersebut'),      # delete
        ('naivgasi', 'navigasi'),       # transpose
        ('naivgasii', 'navigasi'),      # transpose + delete
        ('memang', 'memang'),           # known
        ('kalkulatif', 'kalkulatif'),   # unknown
    ]
    for given, expected in spelling_cases:
        assert correction(given) == expected

    # Tokenizer behaviour.
    assert words('This is a TEST.') == ['this', 'is', 'a', 'test']
    expected_counts = Counter({'123': 1, 'a': 2, 'is': 2, 'test': 2, 'this': 2})
    assert Counter(words('This is a test. 123; A TEST this is.')) == expected_counts

    # Values pinned to the loaded corpus.
    assert len(WORDS) == 34805
    assert sum(WORDS.values()) == 171674
    top_ten = [
        ('dan', 4644),
        ('yang', 4520),
        ('di', 3056),
        ('pada', 2277),
        ('dengan', 2012),
        ('dari', 1986),
        ('ini', 1835),
        ('untuk', 1739),
        ('dalam', 1417),
        ('tahun', 1162),
    ]
    assert WORDS.most_common(10) == top_ten
    assert WORDS['dan'] == 4644
    assert P('kalkulatif') == 0
    assert 0.01 < P('dan') < 0.04
    return 'unit_tests pass'

def spelltest(tests, verbose=False):
    """Run `correction` on every (right, wrong) pair in `tests` and print a summary.

    Parameters:
        tests: a sized collection of (right, wrong) pairs, as built by Testset().
        verbose: when true, print one line for each pair whose correction
            differs from `right`, including the WORDS counts of both words.

    Prints the share of correct results, the share of failures whose expected
    word is not in WORDS, and the throughput in words per second.
    Returns None.
    """
    n = len(tests)
    if n == 0:
        # Nothing to score; the percentages below would divide by zero.
        print('0 tests; nothing to evaluate')
        return
    start = perf_counter()
    good, unknown = 0, 0
    for right, wrong in tests:
        w = correction(wrong)
        if w == right:
            good += 1
            continue
        # A failure counts as "unknown" when the expected word is not in WORDS.
        unknown += (right not in WORDS)
        if verbose:
            print('correction({}) => {} ({}); expected {} ({})'
                  .format(wrong, w, WORDS[w], right, WORDS[right]))
    dt = perf_counter() - start
    # Guard against a zero elapsed time so the rate never divides by zero.
    rate = n / dt if dt > 0 else float('inf')
    print('{:.0%} of {} correct ({:.0%} unknown) at {:.0f} words per second '
          .format(good / n, n, unknown / n, rate))

def Testset(lines):
    """Parse 'right: wrong1 wrong2 ...' lines into a list of (right, wrong) pairs.

    Parameters:
        lines: an iterable of strings (for example an open text file), one
            test entry per line.

    Returns:
        A list with one (right, wrong) tuple per misspelling, in input order.

    Raises:
        ValueError: if a non-blank line does not contain exactly one ':'.

    Blank lines are skipped, and whitespace around the correct word is
    stripped so that it can compare equal to a corrector result.
    """
    pairs = []
    for line in lines:
        if not line.strip():
            continue  # blank line (e.g. a trailing newline at end of file)
        fields = line.split(':')
        if len(fields) != 2:
            raise ValueError(
                'expected "right: wrong1 wrong2 ..." but got {!r}'.format(line))
        right, wrongs = fields
        right = right.strip()
        pairs.extend((right, wrong) for wrong in wrongs.split())
    return pairs

print(unit_tests())
# Testset() returns a fully built list, so the file can be closed before
# spelltest() runs. The encoding is explicit, matching how the corpus is read.
with open('ngrams_insertion.txt', encoding="utf8") as insertion_file:
    insertion_tests = Testset(insertion_file)
spelltest(insertion_tests, True)

unit_tests pass
correction(taas) => atas (210); expected tas (1)
correction(topii) => topik (9); expected topi (7)
correction(bantaal) => bantuan (36); expected bantal (0)
correction(kumbangg) => kembang (5); expected kumbang (0)
correction(s) => s (70); expected sapu (1)
correction(telppon) => telepon (7); expected telpon (0)
correction(jambbu) => bambu (8); expected jambu (0)
correction(rookk) => rokok (3); expected rok (1)
correction(taas) => atas (210); expected tas (1)
correction(guunting) => genting (3); expected gunting (0)
correction(jalanan) => jalanan (11); expected jalan (109)
correction(senndal) => sentral (9); expected sendal (0)
correction(senndal) => sentral (9); expected sendal (0)
correction(s) => s (70); expected sirip (7)
correction(konspirasii) => konspirasii (0); expected konspirasi (0)
correction(jenglott) => jenglott (0); expected jenglot (0)
correction(sinematografii) => sinematografii (0); expected sinematografi (0)
correction(quassar) => quassar (0); expected 

In [None]:
print(unit_tests())
# Testset() returns a fully built list, so the file can be closed before
# spelltest() runs. The encoding is explicit, matching how the corpus is read.
with open('ngrams_subtitution.txt', encoding="utf8") as substitution_file:
    substitution_tests = Testset(substitution_file)
spelltest(substitution_tests, True)

unit_tests pass
correction(stxlah) => telah (409); expected sekolah (126)
correction(mkan) => akan (448); expected makan (28)
correction(putei) => putri (40); expected puteri (8)
correction(putxe) => putra (54); expected puteri (8)
correction(loptap) => lontar (2); expected laptop (1)
correction(kls) => kls (1); expected kelas (61)
correction(kpe) => ke (842); expected kopi (25)
correction(matari) => materi (12); expected matahari (33)
correction(baxsa) => basa (3); expected bahasa (203)
correction(jal) => hal (207); expected jam (51)
correction(meji) => mei (72); expected meja (8)
correction(pustaja) => pustaka (2); expected pustaha (0)
correction(dwitungal) => dwitungal (0); expected dwitunggal (0)
correction(transenden) => transgender (5); expected transenden (0)
correction(syafaat) => syarat (8); expected syafaat (0)
correction(abdullah) => abdullah (11); expected abdulah (0)
correction(ksenufobi) => ksenufobi (0); expected xenofobi (0)
correction(katalis) => natalis (4); expected 

In [None]:
print(unit_tests())
# Testset() returns a fully built list, so the file can be closed before
# spelltest() runs. The encoding is explicit, matching how the corpus is read.
with open('ngram_deletion.txt', encoding="utf8") as deletion_file:
    deletion_tests = Testset(deletion_file)
spelltest(deletion_tests, True)

unit_tests pass
correction(kesthan) => kesatuan (11); expected kesehatan (25)
correction(komper) => koper (2); expected komputer (43)
correction(olagra) => laura (8); expected olahraga (19)
correction(perusaan) => perasaan (10); expected perusahaan (116)
correction(perencaan) => berencana (13); expected perencanaan (5)
correction(kemanan) => kemanan (1); expected keamanan (27)
correction(perekomoni) => perekomoni (0); expected perekonomian (8)
correction(kebijaksanan) => kebijaksanan (0); expected kebijaksanaan (0)
correction(perusaan) => perasaan (10); expected perusahaan (116)
correction(mkanan) => kanan (30); expected makanan (28)
correction(tepon) => temon (1); expected telepon (7)
correction(solah) => salah (176); expected sekolah (126)
correction(duna) => dua (252); expected dunia (170)
correction(akir) => air (158); expected akhir (97)
correction(wara) => para (279); expected warna (63)
correction(seang) => sedang (87); expected senang (16)
correction(pemelajaran) => pemelajaran