In [25]:
import re
from collections import Counter
from time import perf_counter

def words(text):
    """Return the lowercase word tokens of `text`, in order of appearance.

    The text is lowercased first; a token is then any maximal run of regex
    word characters, so spaces, punctuation and hyphens act as separators.
    """
    lowered = text.lower()
    return re.findall(r'\w+', lowered)

# Raw string: the backslashes of this Windows path must stay literal. In a
# normal string, sequences such as "\D" or "\w" are unrecognized escapes that
# Python warns about.
path_corpus = r"D:\Dokumen\SKRIPSI\Language Model\DATASET\wortschatz.leipzig.txt"

# Token -> occurrence count for the whole corpus file. The context manager
# closes the file handle as soon as the text has been read.
with open(path_corpus, encoding="utf8") as corpus_file:
    WORDS = Counter(words(corpus_file.read()))

def P(word, N=sum(WORDS.values())):
    """Estimate the probability of `word` as its count in WORDS divided by N.

    N defaults to the sum of all counts in WORDS. That default is computed
    once, when this `def` statement runs, so it does not follow later changes
    to WORDS. A word that is not in WORDS has a count of 0, giving 0.0.
    """
    occurrences = WORDS[word]
    return occurrences / N

def correction(word):
    """Return the most probable spelling correction for `word`.

    The result is the entry of candidates(word) with the highest P value.
    """
    options = candidates(word)
    best_guess = max(options, key=P)
    return best_guess

def candidates(word):
    """Return the possible corrections for `word`, preferring the closest.

    The first non-empty result wins, in this order: the word itself if it is
    known, known words one edit away, known words two edits away. If none of
    these exists, the word is returned unchanged inside a list.
    """
    exact = known([word])
    if exact:
        return exact

    one_edit_away = known(edits1(word))
    if one_edit_away:
        return one_edit_away

    two_edits_away = known(edits2(word))
    if two_edits_away:
        return two_edits_away

    return [word]

def known(words):
    """Return, as a set, the entries of `words` that occur in WORDS."""
    return {candidate for candidate in words if candidate in WORDS}

def edits1(word):
    """Return the set of all strings that are one simple edit away from `word`.

    An edit is a deletion of one character, a swap of two adjacent characters,
    a replacement of one character by a letter a-z, or an insertion of a
    letter a-z at any position.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    variants = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]

        # Insertion is possible at every split point, including the very end.
        for ch in alphabet:
            variants.add(head + ch + tail)

        if not tail:
            continue  # nothing to delete, replace or swap past the last character

        rest = tail[1:]
        variants.add(head + rest)                                # deletion
        for ch in alphabet:
            variants.add(head + ch + rest)                       # replacement
        if len(tail) > 1:
            variants.add(head + tail[1] + tail[0] + tail[2:])    # transposition
    return variants

def edits2(word):
    """Lazily produce every string that is two edits1 steps away from `word`.

    Returns a generator; the same string can be produced more than once.
    """
    one_step = edits1(word)
    return (two_step for middle in one_step for two_step in edits1(middle))


In [26]:
# Sample sentence used to show what words() returns.
text = "Ini adalah contoh kalimat yang akan di-tokenize."
tokens = words(text)

# Print the tokenization result
print(tokens)

['ini', 'adalah', 'contoh', 'kalimat', 'yang', 'akan', 'di', 'tokenize']


In [27]:
correction('mengujar')

'mengajar'

In [28]:
def edits1(word):
    """Return every string obtainable from `word` with a single edit.

    The four edit kinds are: delete one character, swap two neighbouring
    characters, replace one character with a letter a-z, and insert a letter
    a-z at any position. Duplicates collapse because the result is a set.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    # Every way of cutting the word into a (left, right) pair.
    pieces = [(word[:k], word[k:]) for k in range(len(word) + 1)]

    one_edit = set()
    one_edit.update(left + right[1:] for left, right in pieces if right)
    one_edit.update(left + right[1] + right[0] + right[2:]
                    for left, right in pieces if len(right) > 1)
    one_edit.update(left + ch + right[1:]
                    for left, right in pieces if right for ch in alphabet)
    one_edit.update(left + ch + right
                    for left, right in pieces for ch in alphabet)
    return one_edit

In [29]:
len(edits1('mengujar'))

442

In [30]:
def known(words):
    """Collect into a set the entries of `words` that are present in WORDS."""
    found = set()
    for entry in words:
        if entry in WORDS:
            found.add(entry)
    return found
known(edits1('mengujar'))

{'mengajar', 'mengejar'}

In [31]:
def edits2(word):
    """Return a generator over all strings two edits1 steps away from `word`.

    Results are not deduplicated; wrap the generator in set() for unique strings.
    """
    return (
        second
        for first in edits1(word)
        for second in edits1(first)
    )

In [32]:
len(set(edits2('mengejar')))

90901

In [33]:
known(edits2('mengejar'))

{'mengajak',
 'mengajar',
 'mengajari',
 'mengakar',
 'mengecam',
 'mengecat',
 'mengeja',
 'mengejar',
 'mengejek',
 'mengena',
 'mengenai',
 'mengenal',
 'mengeras',
 'menggelar',
 'menyebar',
 'pengajar'}

In [34]:
known(edits2('mngejar'))

{'mengajar', 'mengeja', 'mengejar'}

In [35]:
def words(text):
    """Lowercase `text` and return its word tokens as a list, in order.

    Tokens are maximal runs of regex word characters; any other character
    separates tokens.
    """
    token_pattern = re.compile(r'\w+')
    return token_pattern.findall(text.lower())
# Token -> occurrence count for the whole corpus file.
# The path is a raw string so its backslashes stay literal instead of being
# parsed as (unrecognized) escape sequences. The context manager closes the
# file handle once the text has been read.
with open(r'D:\Dokumen\SKRIPSI\Language Model\DATASET\wortschatz.leipzig.txt', encoding="utf8") as corpus_file:
    WORDS = Counter(words(corpus_file.read()))
def P(word, N=sum(WORDS.values())):
    """Return the relative frequency of `word`: WORDS[word] divided by N.

    The default for N is the total of all counts in WORDS, evaluated once when
    this definition is executed.
    """
    frequency = WORDS[word] / N
    return frequency

In [36]:
len(WORDS)

34805

In [37]:
sum(WORDS.values())

171674

In [38]:
WORDS.most_common(10)

[('dan', 4644),
 ('yang', 4520),
 ('di', 3056),
 ('pada', 2277),
 ('dengan', 2012),
 ('dari', 1986),
 ('ini', 1835),
 ('untuk', 1739),
 ('dalam', 1417),
 ('tahun', 1162)]

In [39]:
max(WORDS, key=P)

'dan'

In [40]:
P('kucing')

0.00011649987767512844

In [41]:
P('dan')

0.027051271596164824

In [42]:
P('kalkulatif')

0.0

In [43]:
def correction(word):
    """Pick, from candidates(word), the spelling with the highest P value."""
    pool = candidates(word)
    return max(pool, key=P)

def candidates(word):
    """Return the closest non-empty group of known corrections for `word`.

    Groups are tried from nearest to farthest: the word itself, strings one
    edit away, strings two edits away. Each group is only generated when the
    previous one contained no known word. Falls back to [word].
    """
    # Thunks keep the generation lazy: edits2 is never computed when a nearer
    # group already produced a known word.
    generators = (
        lambda: [word],
        lambda: edits1(word),
        lambda: edits2(word),
    )
    for generate in generators:
        recognised = known(generate())
        if recognised:
            return recognised
    return [word]

In [44]:
def unit_tests():
    """Run assertion-based sanity checks and return 'unit_tests pass'.

    Covers correction() on sample misspellings, the words() tokenizer, and a
    few statistics of WORDS (vocabulary size, token total, most common
    entries, P values). The first failing check raises AssertionError.
    """
    # (misspelling, expected correction), annotated with the edit involved.
    correction_cases = [
        ('panh', 'panah'),            # insert
        ('przjuryt', 'prajurit'),     # replace 2
        ('minjadi', 'menjadi'),       # replace
        ('merupan', 'merupakan'),     # insert 2
        ('teersebut', 'tersebut'),    # delete
        ('naivgasi', 'navigasi'),     # transpose
        ('naivgasii', 'navigasi'),    # transpose + delete
        ('memang', 'memang'),         # known
        ('kalkulatif', 'kalkulatif'), # unknown
    ]
    for misspelt, expected in correction_cases:
        assert correction(misspelt) == expected

    # Tokenizer behaviour.
    assert words('This is a TEST.') == ['this', 'is', 'a', 'test']
    expected_counts = Counter({'123': 1, 'a': 2, 'is': 2, 'test': 2, 'this': 2})
    assert Counter(words('This is a test. 123; A TEST this is.')) == expected_counts

    # Statistics of the loaded corpus.
    assert len(WORDS) == 34805
    assert sum(WORDS.values()) == 171674
    top_ten = [
        ('dan', 4644),
        ('yang', 4520),
        ('di', 3056),
        ('pada', 2277),
        ('dengan', 2012),
        ('dari', 1986),
        ('ini', 1835),
        ('untuk', 1739),
        ('dalam', 1417),
        ('tahun', 1162),
    ]
    assert WORDS.most_common(10) == top_ten
    assert WORDS['dan'] == 4644
    assert P('kalkulatif') == 0
    assert 0.01 < P('dan') < 0.04
    return 'unit_tests pass'

def spelltest(tests, verbose=False):
    """Run correction() on (right, wrong) pairs and print accuracy and speed.

    Parameters:
        tests: sized collection of (right, wrong) pairs, e.g. the list
            returned by Testset.
        verbose: when true, print one line per wrong correction showing the
            result, the expected word and their counts in WORDS.

    Prints a summary line with the share of correct results, the number of
    pairs, the share of pairs that failed and whose expected word is missing
    from WORDS, and the number of pairs processed per second. Returns None.

    An empty `tests` is reported as all zeros instead of raising
    ZeroDivisionError, and a zero elapsed time no longer breaks the speed figure.
    """
    start = perf_counter()
    good, unknown = 0, 0
    n = len(tests)
    for right, wrong in tests:
        w = correction(wrong)
        if w == right:
            good += 1
            continue
        # Wrong result: record whether the expected word is even in the vocabulary.
        unknown += (right not in WORDS)
        if verbose:
            print('correction({}) => {} ({}); expected {} ({})'
                  .format(wrong, w, WORDS[w], right, WORDS[right]))
    dt = perf_counter() - start

    # Guard both divisions: n is 0 for an empty test set, and dt can be 0 when
    # the clock does not advance between the two readings.
    accuracy = good / n if n else 0.0
    unknown_share = unknown / n if n else 0.0
    if dt > 0:
        speed = n / dt
    else:
        speed = float('inf') if n else 0.0
    print('{:.0%} of {} correct ({:.0%} unknown) at {:.0f} words per second '
          .format(accuracy, n, unknown_share, speed))

def Testset(lines):
    """Parse 'right: wrong1 wrong2 ...' lines into a list of (right, wrong) pairs.

    Parameters:
        lines: iterable of strings (for example an open text file), each
            holding the correct word, a colon, and whitespace-separated
            misspellings.

    Returns:
        A list with one (right, wrong) tuple per misspelling, in input order.

    Raises:
        ValueError: if a non-blank line does not contain exactly one colon.

    Blank lines are skipped, and whitespace around the correct word is removed
    so that 'buku : bukuuuu' yields 'buku' rather than 'buku '.
    """
    pairs = []
    for line in lines:
        if not line.strip():
            continue  # e.g. an empty line at the end of the file
        right, wrongs = line.split(':')
        right = right.strip()
        pairs.extend((right, wrong) for wrong in wrongs.split())
    return pairs

print(unit_tests())

# Parse the test file inside a context manager so the handle is closed before
# the (slow) evaluation starts.
with open('norvig_insertion.txt') as test_file:
    insertion_tests = Testset(test_file)
spelltest(insertion_tests, verbose=True)

unit_tests pass


correction(sekkoolahh) => sekkoolahh (0); expected sekolah (126)
correction(sekolahan) => sekolahan (1); expected sekolah (126)
correction(bukuuuu) => bukuuuu (0); expected buku (78)
correction(tman) => taman (37); expected teman (32)
correction(makanan) => makanan (28); expected makan (28)
correction(buung) => burung (37); expected bunga (36)
correction(jalanan) => jalanan (11); expected jalan (109)
correction(lapot) => dapat (442); expected laptop (1)
correction(laptopan) => laporan (25); expected laptop (1)
correction(matari) => materi (12); expected matahari (33)
correction(lautann) => lautan (4); expected laut (86)
correction(kucingan) => kuningan (2); expected kucing (20)
correction(pelajarn) => pelajar (19); expected pelajaran (15)
correction(putrii) => putri (40); expected puteri (8)
correction(pisangan) => pasangan (43); expected pisang (3)
correction(pisangan) => pasangan (43); expected pisang (3)
correction(bauu) => baru (251); expected baju (7)
correction(bajuan) => batuan 

In [45]:
print(unit_tests())

# Parse the test file inside a context manager so the handle is closed before
# the (slow) evaluation starts.
with open('norvig_deletion.txt') as test_file:
    deletion_tests = Testset(test_file)
spelltest(deletion_tests, verbose=True)

unit_tests pass
correction(mkanan) => kanan (30); expected makanan (28)
correction(maaan) => makan (28); expected makanan (28)
correction(tepon) => temon (1); expected telepon (7)
correction(melis) => medis (12); expected menulis (50)
correction(pitu) => itu (770); expected pintu (31)
correction(pelis) => pelvis (1); expected penulis (32)
correction(seolah) => seolah (3); expected sekolah (126)
correction(solah) => salah (176); expected sekolah (126)
correction(kaar) => kabar (18); expected kamar (9)
correction(baa) => baja (13); expected bahasa (203)
correction(jan) => jan (3); expected jalan (109)
correction(sdiri) => diri (128); expected sendiri (152)
correction(seri) => seri (39); expected sendiri (152)
correction(temperat) => tempat (179); expected temperatur (6)
correction(dua) => dua (252); expected dunia (170)
correction(akir) => air (158); expected akhir (97)
correction(aki) => aki (1); expected akhir (97)
correction(jenda) => benda (36); expected jendela (6)
correction(kuci) 

In [46]:
print(unit_tests())

# Parse the test file inside a context manager so the handle is closed before
# the (slow) evaluation starts.
with open('norvig_substitution.txt') as test_file:
    substitution_tests = Testset(test_file)
spelltest(substitution_tests, verbose=True)

unit_tests pass
correction(txman) => taman (37); expected teman (32)
correction(sekula) => semula (7); expected sekolah (126)
correction(sekoa) => serta (150); expected sekolah (126)
correction(sekalh) => sekali (46); expected sekolah (126)
correction(mkan) => akan (448); expected makan (28)
correction(mkann) => akan (448); expected makan (28)
correction(makn) => maka (80); expected makan (28)
correction(putri) => putri (40); expected puteri (8)
correction(putry) => putra (54); expected puteri (8)
correction(putrei) => putri (40); expected puteri (8)
correction(bermyn) => berwyn (1); expected bermain (49)
correction(putre) => putra (54); expected puteri (8)
correction(mgambar) => gambar (43); expected menggambar (5)
correction(mngambar) => gambar (43); expected menggambar (5)
correction(kuching) => kuching (1); expected kucing (20)
correction(loptap) => lontar (2); expected laptop (1)
correction(kls) => kls (1); expected kelas (61)
correction(kella) => kalla (4); expected kelas (61)
co