In [1]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize #for tokenization
from nltk.stem import PorterStemmer #for stemming
import snowballstemmer #We also import this one for "Turkish"
from nltk.corpus import stopwords 
import numpy as np
import pandas as pd
from collections import Counter

In [2]:
def processing_data(file_name):
    
    with open(file_name, encoding="utf8") as f:
        file_name_d = f.read()
    file_name_d_lower = file_name_d.lower() #.lower() for lowering the letters
    # tokenize the text word by word
    words = word_tokenize(file_name_d_lower)
    
    return words

In [3]:
word_list = processing_data('oguzatay.txt')
vocab = set(word_list) # as a vocabulary we choose Oguz Atay : Tehlikeli Oyunlar

In [4]:
# We are going to define get_count function for counting how many times a word appear in the corpus

In [5]:
def get_count(word_list):
    word_count_dict = {} #fill this with word counts
    word_count_dict = Counter(word_list)
    return word_count_dict

In [6]:
word_count_dict = get_count(word_list)

In [7]:
# We are going to implement get_probs function which gives you the probability that a word occurs in a sample.

In [8]:
def get_probs(word_count_dict):
    probs = {}
    cardinality = sum(word_count_dict.values())
    for key in word_count_dict.keys():
        probs[key] = word_count_dict[key]/cardinality
    return probs

In [9]:
probs = get_probs(word_count_dict)

In [10]:
def delete_letter(word, verbose=False):
    
    delete_l = []
    split_l = []

    split_l = [(word[:i],word[i:]) for i in range(len(word)+1)]
    delete_l = [(L+R[1:]) for L, R in split_l if R]

    if verbose: print(f"input word {word}, \nsplit_l = {split_l}, \ndelete_l = {delete_l}")

    return delete_l

In [11]:
def switch_letter(word, verbose=False):

    switch_l = []
    split_l = []

    split_l = [(word[:i],word[i:]) for i in range(len(word)+1)]
    switch_l = [(L + R[1] + R[0] + R[2:]) for L, R in split_l if len(R)>=2]

    if verbose: print(f"Input word = {word} \nsplit_l = {split_l} \nswitch_l = {switch_l}") 

    return switch_l

In [12]:
def replace_letter(word, verbose=False):

    letters = 'abcçdefgğhıijklmnoöpqrsştuüvyz'
    replace_l = []
    split_l = []

    split_l = [(word[:i],word[i:]) for i in range(len(word)+1)]
    replace_l = [(L+C+R[1:]) for L, R in split_l if len(R)>=1 for C in letters]
    replace_set = set(replace_l)
    replace_set.discard(word)

    replace_l = sorted(list(replace_set))
    
    if verbose: print(f"Input word = {word} \nsplit_l = {split_l} \nreplace_l {replace_l}")   
    
    return replace_l

In [13]:
def insert_letter(word, verbose=False):
    
    letters = 'abcçdefgğhıijklmnoöpqrsştuüvyz'
    insert_l = []
    split_l = []
    
    split_l = [(word[:i],word[i:]) for i in range(len(word)+1)]
    insert_l = [(L+c+R) for L,R in split_l if 1 for c in letters]
    
    if verbose: print(f"Input word {word} \nsplit_l = {split_l} \ninsert_l = {insert_l}")
    
    return insert_l

In [14]:
def edit_one_letter(word, allow_switches = True):

    edit_one_set = set()
    
    edit_one_set.update(delete_letter(word))
    if allow_switches:
        edit_one_set.update(switch_letter(word))
    edit_one_set.update(replace_letter(word))
    edit_one_set.update(insert_letter(word))

    return edit_one_set

In [15]:
def edit_two_letters(word, allow_switches = True):

    edit_two_set = set()

    edit_one = edit_one_letter(word,allow_switches=allow_switches)
    for w in edit_one:
        edit_two = edit_one_letter(w,allow_switches=allow_switches)
        edit_two_set.update(edit_two)

    return edit_two_set

In [16]:
def get_corrections(word, probs, vocab, n=2, verbose = False):

    suggestions = []
    n_best = []

    suggestions = list((word in vocab and word) or edit_one_letter(word).intersection(vocab) or edit_two_letters(word).intersection(vocab))
    
    suggestions = list(reversed(suggestions))
    for w in suggestions:
        n_best.append([w,probs[w]])    

    return n_best

In [17]:
text = 'Evden ekmek elmak için Ahmet ila birlakte dışarı çıktım.'
text_tokenized = word_tokenize(text)
print(text)

corrected_text = ''
for i in range(len(text_tokenized)):
    l_word = text_tokenized[i].lower()
    if l_word in vocab:
        corrected_text = corrected_text + ' ' + text_tokenized[i]
    else:
        tmp_corrections = get_corrections(l_word, probs, vocab, 2, verbose=True)
        for i, word_prob in enumerate(tmp_corrections):
            ans = input(f"Did you mean [{word_prob[0]}] instead of [{l_word}] ? [Y/N]")
            if ans == 'Y':
                corrected_text = corrected_text + ' ' + word_prob[0]
                break
            else:
                print('Hmm... Let me think')


print("So your corrected text is:\n",corrected_text)

Evden ekmek elmak için Ahmet ila birlakte dışarı çıktım.
Did you mean [elma] instead of [elmak] ? [Y/N]N
Hmm... Let me think
Did you mean [olmak] instead of [elmak] ? [Y/N]N
Hmm... Let me think
Did you mean [almak] instead of [elmak] ? [Y/N]Y
Did you mean [ima] instead of [ila] ? [Y/N]N
Hmm... Let me think
Did you mean [cila] instead of [ila] ? [Y/N]N
Hmm... Let me think
Did you mean [la] instead of [ila] ? [Y/N]N
Hmm... Let me think
Did you mean [imla] instead of [ila] ? [Y/N]N
Hmm... Let me think
Did you mean [ilaç] instead of [ila] ? [Y/N]N
Hmm... Let me think
Did you mean [isa] instead of [ila] ? [Y/N]N
Hmm... Let me think
Did you mean [ilk] instead of [ila] ? [Y/N]N
Hmm... Let me think
Did you mean [ile] instead of [ila] ? [Y/N]Y
Did you mean [birlikte] instead of [birlakte] ? [Y/N]Y
So your corrected text is:
  Evden ekmek almak için Ahmet ile birlikte dışarı çıktım .
