In [17]:
import numpy as np
import pandas as pd
import enchant

In [18]:
# Corpus of known misspellings; parsed by load_typo_data as "correct: typo, typo, ..." lines.
file_path = 'Dataset/Spelling Dataset/spell-errors.txt'

# Word list (one word per line) used to decide whether an input is a real word.
dictionary_file_path = 'Dataset/Spelling Dataset/test/Dictionary/dictionary.data'

# Confusion-matrix files, one per edit type: deletion, insertion, substitution, transposition.
del_confusion_matrix_file_path = 'Dataset/Spelling Dataset/test/Confusion Matrix/del-confusion.data'
ins_confusion_matrix_file_path = 'Dataset/Spelling Dataset/test/Confusion Matrix/ins-confusion.data'
sub_confusion_matrix_file_path = 'Dataset/Spelling Dataset/test/Confusion Matrix/sub-confusion.data'
tra_confusion_matrix_file_path = 'Dataset/Spelling Dataset/test/Confusion Matrix/Transposition-confusion.data'

In [19]:
def read_dictionary(file_path):
    """Load a word list from a text file.

    Each line of the file becomes one entry, with surrounding whitespace
    (including the trailing newline) stripped.

    Parameters
    ----------
    file_path : str
        Path of the text file to read.

    Returns
    -------
    numpy.ndarray
        Array holding the stripped lines in file order.
    """
    with open(file_path, 'r') as handle:
        entries = list(map(str.strip, handle))

    return np.array(entries)

dictionary = read_dictionary(dictionary_file_path)

# Preview the first few entries to confirm the word list loaded as expected.
print("First 10 words in the dictionary:", dictionary[:10], sep="\n")

First 10 words in the dictionary:
['aa' 'aah' 'aahed' 'aahing' 'aahs' 'aal' 'aalii' 'aaliis' 'aals'
 'aardvark']


In [20]:
def load_typo_data(file_path):
    """Parse a misspelling corpus into a DataFrame.

    Every line is stripped and split on ':'. Lines that do not yield exactly
    two fields are skipped. The first field is kept as the correct word; the
    second is split on ',' and each piece is stripped to form the typo list.

    Parameters
    ----------
    file_path : str
        Path of the corpus file.

    Returns
    -------
    pandas.DataFrame
        Columns ``CorrectWord`` (str) and ``Typos`` (list of str), one row per
        accepted line, in file order.
    """
    correct_words = []
    typo_lists = []

    with open(file_path, 'r') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split(':')
            if len(fields) != 2:
                # Not a "word: typos" line; ignore it.
                continue
            head, tail = fields
            correct_words.append(head)
            typo_lists.append([piece.strip() for piece in tail.split(',')])

    return pd.DataFrame({'CorrectWord': correct_words, 'Typos': typo_lists})

# Parse the misspelling corpus; the bare name on the last line displays the frame.
typo_df = load_typo_data(file_path)
typo_df

Unnamed: 0,CorrectWord,Typos
0,raining,"[rainning, raning]"
1,writings,[writtings]
2,disparagingly,[disparingly]
3,yellow,[yello]
4,four,"[forer, fours, fuore, fore*5, for*4]"
...,...,...
7836,jewel,"[jewl, jule]"
7837,commencement,[commencment]
7838,suppressing,[supressing]
7839,tonner,[toner]


In [21]:
import ast

def read_confusion_matrix(file_path):
    """Read a file containing one Python literal and return the parsed value.

    The whole file is read as text and evaluated with ``ast.literal_eval``,
    so only literal syntax is accepted (no arbitrary code is executed).

    Parameters
    ----------
    file_path : str
        Path of the file holding the literal.

    Returns
    -------
    object
        Whatever the literal evaluates to.
        NOTE(review): the matrices displayed in this notebook look like dicts
        mapping two-character strings to integer counts; confirm for all four files.
    """
    with open(file_path, 'r') as handle:
        return ast.literal_eval(handle.read())

In [22]:
# Load the four confusion matrices (deletion, insertion, substitution,
# transposition) in that order; map() evaluates the paths left to right.
(
    del_confusion_matrix,
    ins_confusion_matrix,
    sub_confusion_matrix,
    tra_confusion_matrix,
) = map(
    read_confusion_matrix,
    (
        del_confusion_matrix_file_path,
        ins_confusion_matrix_file_path,
        sub_confusion_matrix_file_path,
        tra_confusion_matrix_file_path,
    ),
)

In [23]:
def damerau_levenshtein_distance(s1, s2):
    """
    Edit distance between two sequences with adjacent transpositions.

    Counts the minimum number of deletions, insertions, substitutions and
    swaps of two adjacent characters needed to turn ``s1`` into ``s2``.
    A swap is only considered between the last two characters of the current
    prefixes (the optimal-string-alignment recurrence), so a swapped pair is
    never edited again afterwards.

    Parameters
    ----------
    s1, s2 : str
        The sequences to compare.

    Returns
    -------
    int
        The computed distance (0 when the inputs are equal).
    """
    rows, cols = len(s1) + 1, len(s2) + 1

    # table[r][c] holds the distance between s1[:r] and s2[:c].
    # First row / first column: distance from or to the empty prefix.
    table = [list(range(cols))] + [[r] + [0] * (cols - 1) for r in range(1, rows)]

    for r, ch1 in enumerate(s1, start=1):
        for c, ch2 in enumerate(s2, start=1):
            mismatch = int(ch1 != ch2)
            best = min(
                table[r - 1][c] + 1,             # drop ch1
                table[r][c - 1] + 1,             # add ch2
                table[r - 1][c - 1] + mismatch,  # keep or replace
            )
            swapped = r > 1 and c > 1 and ch1 == s2[c - 2] and s1[r - 2] == ch2
            if swapped:
                best = min(best, table[r - 2][c - 2] + mismatch)  # adjacent swap
            table[r][c] = best

    return table[-1][-1]


# Worked example: a single substitution separates these two strings.
s1, s2 = "acress", "access"

distance = damerau_levenshtein_distance(s1, s2)
print(f"Damerau–Levenshtein distance between '{s1}' and '{s2}': {distance}")

Damerau–Levenshtein distance between 'acress' and 'access': 1


In [24]:
def find_correct_words(typo):
    """Return spelling suggestions that are exactly one edit away from ``typo``.

    Asks the ``en_US`` enchant dictionary for suggestions and keeps only those
    whose ``damerau_levenshtein_distance`` to ``typo`` equals 1, preserving the
    order in which enchant returned them.

    Parameters
    ----------
    typo : str
        The word to find corrections for.

    Returns
    -------
    list of str
        Suggestions at distance 1 (possibly empty).
    """
    speller = enchant.Dict("en_US")
    return [
        suggestion
        for suggestion in speller.suggest(typo)
        if damerau_levenshtein_distance(typo, suggestion) == 1
    ]

# Spot check with a correctly spelled input: only suggestions at distance 1 are
# returned, so the word itself (distance 0) never appears in the result.
typo = 'problem'
correct_words = find_correct_words(typo)
correct_words

['problems']

In [25]:
# Read the whole corpus into memory; text_data is also used later for raw
# substring counts.
with open('Dataset/Spelling Dataset/test/Dictionary/Dataset.data', encoding='utf-8') as file:
    text_data = file.read()

# Cut the corpus at every '<s>' marker (presumably a sentence boundary in this dataset).
sentences = text_data.split('<s>')

# Whitespace-tokenize each piece, then flatten everything into one token array.
tokenized_sentences = np.concatenate(list(map(str.split, sentences)), axis=0)

In [26]:
# Count how often each distinct token occurs in the corpus.
words, counts = np.unique(tokenized_sentences, return_counts=True)

# One row per distinct token, most frequent first.
word_counts = pd.DataFrame({'word': words, 'count': counts})
dataset = word_counts.sort_values(['count'], ascending=False)
dataset

Unnamed: 0,word,count
72392,the,282764
5941,and,155164
51147,of,151505
73254,to,115135
37328,in,87003
...,...,...
39687,jaculation,1
39690,jadau,1
39694,jadon,1
39695,jaeger,1


In [27]:
def find_type_and_letter(x, w):
    """Classify the first edit that separates typo ``x`` from candidate ``w``.

    The pair is compared left to right and the first point of disagreement
    decides the result:

    * ``len(x) > len(w)`` -> insertion: returns ``ins_confusion_matrix`` and the
      extra character ``x[i]``.
    * ``len(x) < len(w)`` -> deletion: returns ``del_confusion_matrix`` and the
      missing character ``w[i]``.
    * equal length -> substitution (``sub_confusion_matrix``, ``w[i] + x[i]``)
      or transposition (``tra_confusion_matrix``, ``w[i] + w[i + 1]``).

    Only the first difference is inspected, so the result is meaningful when
    ``x`` and ``w`` are a single edit apart.

    NOTE(review): insertions and deletions yield a one-character key, while the
    matrix displayed in this notebook uses two-character keys and
    ``Pxw('acress', 'cress')`` was recorded raising ``KeyError: 'a'``. Confirm
    the key format of the ins/del matrix files.

    Parameters
    ----------
    x : str
        The observed (possibly misspelled) word.
    w : str
        The candidate correction.

    Returns
    -------
    tuple
        ``(confusion_matrix, characters)``.

    Raises
    ------
    ValueError
        If the words are identical, or if two same-length words differ in a way
        that is neither a substitution nor an adjacent transposition.
        Previously these cases returned ``None`` / ``"?"``, which callers
        could not unpack.
    """
    if len(x) > len(w):
        for i in range(len(x)):
            # Past the end of w means w is a prefix of x and x[i] was appended.
            # (The old `i == len(w) - 1` test reported the last shared character
            # instead, and indexed past the end of an empty w.)
            if i >= len(w) or w[i] != x[i]:
                return ins_confusion_matrix, x[i]
    elif len(x) < len(w):
        for i in range(len(x)):
            if w[i] != x[i]:
                return del_confusion_matrix, w[i]
        # x is a proper prefix of w: the dropped character follows the shared prefix.
        return del_confusion_matrix, w[len(x)]
    else:  # transposition or substitution
        for i in range(len(x)):
            if w[i] == x[i]:
                continue
            if i == len(x) - 1 or w[i + 1] == x[i + 1]:
                return sub_confusion_matrix, w[i] + x[i]
            if w[i] == x[i + 1]:
                return tra_confusion_matrix, w[i] + w[i + 1]
            raise ValueError(
                f"cannot classify the edit between {x!r} and {w!r} as a single edit"
            )

    raise ValueError(f"no edit to classify: {x!r} and {w!r} are identical")
# Hand-written candidate list for the typo "acress" (not used by the call below).
candidates = ['caress', 'acres', 'cress', 'actress', 'across', 'access', "acre's", 'a cress', 'acre ss', 'acre-ss', 'acres s']
# Spot check on a same-length pair; the cell displays the returned (matrix, characters) tuple.
find_type_and_letter("acress", "acre's")

({'gw': 1,
  'gv': 0,
  'gu': 0,
  'gt': 21,
  'gs': 13,
  'gr': 5,
  'gq': 3,
  'gp': 1,
  'gz': 0,
  'gy': 3,
  'gx': 0,
  'gg': 0,
  'gf': 2,
  'ge': 9,
  'gd': 11,
  'gc': 11,
  'gb': 1,
  'ga': 4,
  'go': 2,
  'gn': 0,
  'gm': 0,
  'gl': 3,
  'gk': 1,
  'gj': 1,
  'gi': 0,
  'gh': 0,
  'tz': 6,
  'tx': 0,
  'ty': 7,
  'tv': 2,
  'tw': 19,
  'tt': 0,
  'tu': 0,
  'tr': 11,
  'ts': 37,
  'tp': 6,
  'tq': 0,
  'tn': 5,
  'to': 5,
  'tl': 14,
  'tm': 9,
  'tj': 1,
  'tk': 0,
  'th': 5,
  'ti': 0,
  'tf': 5,
  'tg': 19,
  'td': 42,
  'te': 7,
  'tb': 4,
  'tc': 9,
  'ta': 3,
  'vu': 0,
  'zl': 7,
  'zm': 5,
  'zn': 0,
  'zo': 0,
  'zh': 0,
  'zi': 0,
  'zj': 0,
  'zk': 0,
  'zd': 7,
  'ze': 0,
  'zf': 0,
  'zg': 0,
  'za': 0,
  'zb': 0,
  'zc': 0,
  'zx': 0,
  'zy': 3,
  'zz': 0,
  'zt': 3,
  'zu': 0,
  'zv': 0,
  'zw': 0,
  'zp': 0,
  'zq': 0,
  'zr': 2,
  'zs': 21,
  'wl': 0,
  'va': 0,
  'vc': 7,
  'wk': 1,
  'vh': 0,
  'wj': 0,
  'vi': 0,
  'vj': 0,
  'vk': 0,
  'vl': 1,
  'vm': 0,

In [28]:
def Pw(w):
    """Add-one smoothed unigram probability of ``w``.

    Computes ``(count(w) + 1) / (N + V)`` where ``count(w)`` is the ``count``
    of the first ``dataset`` row whose ``word`` equals ``w`` (0 if there is
    none), ``N`` is the sum of the ``count`` column and ``V`` is the number of
    rows in ``dataset``.

    Parameters
    ----------
    w : str
        The word to score.

    Returns
    -------
    float
        The smoothed probability; always greater than zero.
    """
    matching_counts = dataset.loc[dataset['word'] == w, 'count']
    frequency_of_w = 0 if matching_counts.empty else matching_counts.iloc[0]

    total_tokens = dataset['count'].sum()
    vocabulary_size = len(dataset['word'])
    return (frequency_of_w + 1) / (total_tokens + vocabulary_size)

def Pxw(x, w):
    """Estimate the channel probability of typing ``x`` when ``w`` was intended.

    ``find_type_and_letter`` picks the confusion matrix and the character key
    for the edit between the two words. The result is

        (confusion_matrix[characters] + 1) / (text_data.count(characters) + 1)

    i.e. the recorded error count over the number of occurrences of the key in
    the raw corpus text, with one added on both sides so neither is zero.

    Parameters
    ----------
    x : str
        The observed (possibly misspelled) word.
    w : str
        The candidate correction.

    Returns
    -------
    float
        The estimated probability.

    Raises
    ------
    KeyError
        If ``characters`` is not a key of the selected confusion matrix.
    """
    confusion_matrix, characters = find_type_and_letter(x, w)
    # The stray debugging print() that emitted a blank line on every call was removed.
    numerator = confusion_matrix[characters] + 1
    # NOTE: this scans the whole corpus string on each call.
    denominator = text_data.count(characters) + 1
    return numerator / denominator

In [29]:
# Spot check of the channel model on an insertion ("acress" typed for "cress").
# NOTE(review): the recorded run of this cell raised KeyError: 'a'.
Pxw('acress', 'cress')

KeyError: 'a'

In [66]:
def apply(x, no_error_probability=0.95):
    """Pick the most probable intended word for ``x`` with a noisy-channel score.

    Candidates come from ``find_correct_words``. When ``x`` is itself in
    ``dictionary`` it is added as the first candidate. Each candidate ``w`` is
    scored with ``log P(x|w) + log P(w)`` and the best-scoring one is returned.
    Progress is printed for every candidate.

    Parameters
    ----------
    x : str
        The word to correct.
    no_error_probability : float, optional
        Channel probability used when the candidate is ``x`` itself, i.e. the
        chance that a real word was typed as intended. There is no edit to look
        up in that case, so ``Pxw`` cannot be used (it failed on identical
        words). The default 0.95 is an assumption to tune for the data.

    Returns
    -------
    str
        The highest-scoring candidate, or ``x`` unchanged when there are no
        candidates at all.
    """
    candidates = find_correct_words(x)
    if x in dictionary:
        print("Real word")
        candidates.insert(0, x)
    else:
        print("Non word")
    print("candidates : ", candidates)

    if not candidates:
        # Nothing to rank; max() over an empty mapping would raise ValueError.
        return x

    candidates_dict = {}
    for candida in candidates:
        print("----------")
        print("word : ", candida)
        probability_of_w = Pw(candida)
        print("P(w) is ", probability_of_w)
        if candida == x:
            # Identity candidate: no edit happened, so use the fixed
            # no-error probability instead of a confusion-matrix lookup.
            probability_of_x_if_w = no_error_probability
        else:
            probability_of_x_if_w = Pxw(x, candida)
        print("P(x|w) is ", probability_of_x_if_w)
        # Sum of logs: the value printed below is a log-score, not a probability.
        probability_of_w_if_x = np.log(probability_of_x_if_w) + np.log(probability_of_w)
        print("P(w|x) is ", probability_of_w_if_x)
        candidates_dict[candida] = probability_of_w_if_x
    return max(candidates_dict, key=candidates_dict.get)
# End-to-end run on the typo "acress".
# NOTE(review): the recorded run of this cell stopped on the first candidate
# with "TypeError: string indices must be integers".
apply("acress")

Non word
candidates :  ['caress', 'acres', 'cress', 'actress', 'across', 'access', "acre's", 'a cress', 'acre ss', 'acre-ss', 'acres s']
----------
word :  caress
P(w) is  1.8288709221512644e-06


TypeError: string indices must be integers