<a href="https://colab.research.google.com/github/Atirtacx/Spell_Checking_Bahasa_Indonesia/blob/main/Spell_checking_using_Lavenshtein_Distance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Lavenshtein_Distance
import time

def load_words(file_path):
    with open(file_path, 'r') as f:
        words = f.read().splitlines()
    return words

def levenshtein_distance(s, t):
    m = len(s)
    n = len(t)
    d = [[0] * (n + 1) for _ in range(m + 1)]

    for i in range(1, m + 1):
        d[i][0] = i

    for j in range(1, n + 1):
        d[0][j] = j

    for j in range(1, n + 1):
        for i in range(1, m + 1):
            if s[i - 1] == t[j - 1]:
                d[i][j] = d[i - 1][j - 1]
            else:
                d[i][j] = min(d[i - 1][j] + 1,  # deletion
                              d[i][j - 1] + 1,  # insertion
                              d[i - 1][j - 1] + 1)  # substitution

    return d[m][n]

# fungsi spell checker dengan spell suggestion dan spell correction
def spell_checker(word, dictionary):
    min_distance = float('inf')
    closest_word = ''
    suggestions = []
    for dict_word in dictionary:
        distance = levenshtein_distance(word, dict_word)
        if distance < min_distance:
            min_distance = distance
            closest_word = dict_word
        if distance <= 1: # atur threshold jarak terdekat
            suggestions.append(dict_word)
            
    if min_distance == 0:
        return word
    elif len(suggestions) > 0:
        return f"Kata yang dimaksud mungkin adalah: {', '.join(suggestions)}"
    else:
        return f"Tidak ditemukan kata yang cocok. Kata yang dimaksud mungkin adalah: {closest_word}"

# load dictionary
start_time = time.time()
dictionary = load_words('kata-dasar.txt')
print(f"Dictionary loaded in {time.time()-start_time} seconds.")

# test spell checker
words_to_test = ['kemaren', 
                 'ku', 
                 'knalpot', 
                 'brangkas', 
                 'biskuut', 
                 'kemeja', 
                 'selasa', 
                 'kmana', 
                 'kenap', 
                 'siap']

for word in words_to_test:
    start_time = time.time()
    suggestion = spell_checker(word, dictionary)
    print(f"{word} -> {suggestion} (Time taken: {time.time()-start_time} seconds)")

Dictionary loaded in 0.01443624496459961 seconds.
kemaren -> Kata yang dimaksud mungkin adalah: kemarin (Time taken: 1.2569715976715088 seconds)
ku -> Kata yang dimaksud mungkin adalah: aku, kau, kiu, kru, kue, kui, kuk, kup, kur, kus, mu (Time taken: 0.4618368148803711 seconds)
knalpot -> knalpot (Time taken: 1.2939159870147705 seconds)
brangkas -> Kata yang dimaksud mungkin adalah: bangkas, brankas (Time taken: 1.4427282810211182 seconds)
biskuut -> Kata yang dimaksud mungkin adalah: biskuit (Time taken: 0.9540843963623047 seconds)
kemeja -> kemeja (Time taken: 1.4093878269195557 seconds)
selasa -> selasa (Time taken: 1.4027934074401855 seconds)
kmana -> Kata yang dimaksud mungkin adalah: kana, koana, mana (Time taken: 1.003403663635254 seconds)
kenap -> kenap (Time taken: 0.9372379779815674 seconds)
siap -> siap (Time taken: 0.5043070316314697 seconds)
