<a href="https://colab.research.google.com/github/Atirtacx/Spell_Checking_Bahasa_Indonesia/blob/main/Spell_checking_Indonesia_language_method_using_Peter_Norvig.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Spelling corrector for Indonesian words, based on Peter Norvig's algorithm

import re
from collections import Counter
import time

def load_words(file_path):
    """Read a text file and return its lines as a list of strings.

    Line terminators are stripped by `str.splitlines`; an empty file yields
    an empty list.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default locale encoding.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.read().splitlines()

def words(text):
    "Lower-case `text` and return the list of its word tokens."
    return re.findall(r'\w+', text.lower())

# Word -> occurrence count for every token in the dictionary file.
# The file is read inside a `with` block so the handle is closed promptly.
with open('kata-dasar.txt', encoding='utf-8') as f:
    WORDS = Counter(words(f.read()))

def P(word, N=sum(WORDS.values())):
    """Return the relative frequency of `word` in WORDS.

    N defaults to the sum of all counts in WORDS, evaluated once when this
    function is defined. A word missing from WORDS has a count of 0.
    """
    occurrences = WORDS[word]
    return occurrences / N

def correction(word):
    """Return the candidate correction for `word` with the highest P value."""
    options = candidates(word)
    best = max(options, key=P)
    return best

def candidates(word):
    """Return possible spelling corrections for `word`.

    The first non-empty result wins, in this order: the word itself if it is
    known, known words one edit away, known words two edits away. If none of
    those exist, the word is returned unchanged inside a list.
    """
    exact = known([word])
    if exact:
        return exact

    one_away = known(edits1(word))
    if one_away:
        return one_away

    two_away = known(edits2(word))
    if two_away:
        return two_away

    return [word]

def known(words):
    """Return the set of items from `words` that are present in WORDS."""
    return {candidate for candidate in words if candidate in WORDS}

def edits1(word):
    """Return the set of strings exactly one simple edit away from `word`.

    An edit is a single-character deletion, a swap of two adjacent
    characters, a replacement of one character by a lowercase ASCII letter,
    or an insertion of a lowercase ASCII letter at any position.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    results = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]

        # Insertion is possible at every cut point, including the very end.
        for ch in alphabet:
            results.add(head + ch + tail)

        if not tail:
            continue

        rest = tail[1:]
        # Deletion of the character right after the cut.
        results.add(head + rest)
        # Replacement of that character by each letter.
        for ch in alphabet:
            results.add(head + ch + rest)
        # Transposition needs at least two characters after the cut.
        if len(tail) > 1:
            results.add(head + tail[1] + tail[0] + tail[2:])
    return results

def edits2(word):
    """Return a generator over every string two edits away from `word`.

    Each result of `edits1(word)` is fed through `edits1` again. The output
    is lazy and may contain duplicates.
    """
    first_pass = edits1(word)
    return (twice for once in first_pass for twice in edits1(once))

def suggestion_word(word):
    """Return the candidate corrections for `word`, highest P value first."""
    ranked = list(candidates(word))
    ranked.sort(key=P, reverse=True)
    return ranked

# Load the dictionary and report how long it took.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short intervals, whereas time.time() follows the adjustable wall clock.
start_time = time.perf_counter()
dictionary = load_words('kata-dasar.txt')
print(f"Dictionary loaded in {time.perf_counter()-start_time} seconds.")

# Testing: run the corrector on a mix of misspelled and correctly spelled
# words, printing the chosen correction, all suggestions and the elapsed time.
test_words = ['kemaren',
              'ku',
              'knalpot',
              'brangkas',
              'biskuut',
              'kemeja',
              'selasa',
              'kemna',
              'minm',
              'siap']

for word in test_words:
    # The lookups take only microseconds, so use the high-resolution monotonic
    # perf_counter() clock instead of the wall clock.
    start = time.perf_counter()
    corrected_word = correction(word)
    suggestions = suggestion_word(word)
    end = time.perf_counter()
    print(f"{word} -> {corrected_word}, suggestions: {suggestions}, runtime: {end-start:.6f} seconds")


Dictionary loaded in 0.003116130828857422 seconds.
kemaren -> kemarin, suggestions: ['kemarin'], runtime: 0.000508 seconds
ku -> kau, suggestions: ['kau', 'kue', 'kuk', 'kui', 'kur', 'aku', 'kup', 'kus', 'kiu', 'mu', 'kru'], runtime: 0.000179 seconds
knalpot -> knalpot, suggestions: ['knalpot'], runtime: 0.000009 seconds
brangkas -> brankas, suggestions: ['brankas', 'bangkas'], runtime: 0.000400 seconds
biskuut -> biskuit, suggestions: ['biskuit'], runtime: 0.000371 seconds
kemeja -> kemeja, suggestions: ['kemeja'], runtime: 0.000008 seconds
selasa -> selasa, suggestions: ['selasa'], runtime: 0.000007 seconds
kemna -> kena, suggestions: ['kena', 'keman', 'kempa'], runtime: 0.000247 seconds
minm -> minim, suggestions: ['minim', 'mini', 'minum', 'min', 'mina', 'mim'], runtime: 0.000200 seconds
siap -> siap, suggestions: ['siap'], runtime: 0.000008 seconds
