**Importing libraries**

In [None]:
import pandas as pd
import numpy as np
import time
from functools import wraps
from collections import defaultdict, Counter
import itertools


In [None]:
!pip install openpyxl  #engine download



In [None]:
path = '/content/drive/My Drive/work_2/data_mispelled.csv'

df = pd.read_excel(path, engine='openpyxl')

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Showing the misspelled words

In [None]:
df

Unnamed: 0,WORDS
0,cokter
1,dentiists
2,Enginir
3,PoLICE
4,engneiear
...,...
1401,enginir
1402,aCcountANT
1403,accountant
1404,POlIcE


In [None]:
# Reference dictionary: the canonical spelling of every word the input may contain.
correctly_spelled_words = [
    "Doctor",
    "Lawyer",
    "Teacher",
    "Engineer",
    "Accountant",
    "Nurse",
    "Police",
    "Architect",
    "Dentist",
    "Pharmacist",
]


In [None]:
correctly_spelled_words

['Doctor',
 'Lawyer',
 'Teacher',
 'Engineer',
 'Accountant',
 'Nurse',
 'Police',
 'Architect',
 'Dentist',
 'Pharmacist']

In [None]:
# The WORDS column as a Series: the input words that the correction steps iterate over.
misspelled = df["WORDS"]


Decorator that prints a function's execution time

In [None]:
def print_timing(func):
    """Decorator that prints how long each call to ``func`` takes.

    The wrapped function keeps its name and docstring (via ``functools.wraps``)
    and its return value is passed through unchanged. If ``func`` raises, the
    exception propagates and no timing line is printed.

    Args:
        func: The callable to time.

    Returns:
        A wrapper that calls ``func`` with the same arguments, prints the
        elapsed time in seconds, and returns ``func``'s result.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic and high-resolution, so the measured
        # interval cannot be distorted by system clock adjustments.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        end_time = time.perf_counter()
        print(f"Function {func.__name__} took {end_time - start_time} seconds to execute.")
        return result
    return wrapper

**Algorithms for correcting misspelled words**

Levenshtein distance algorithm

In [None]:
def levenshtein_distance(a: str, b: str, verbose: bool = False) -> int:
    """Return the Levenshtein (edit) distance between ``a`` and ``b``.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn ``a`` into ``b``. Characters
    are compared exactly, so the comparison is case-sensitive.

    Args:
        a: Source string.
        b: Target string.
        verbose: When true, print the full dynamic-programming matrix.

    Returns:
        The edit distance as a built-in ``int``.
    """
    m, n = len(a), len(b)
    # d[i, j] = levenshtein_distance(a[:i], b[:j])
    d = np.zeros((m + 1, n + 1), dtype=int)

    # Base cases: reaching / leaving the empty prefix costs one edit per character.
    d[:, 0] = np.arange(m + 1)
    d[0, :] = np.arange(n + 1)

    # Populate the matrix
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            cost = 0 if a[i - 1] == b[j - 1] else 1
            d[i, j] = min(
                d[i - 1, j] + 1,        # Deletion: drop a[i-1]
                d[i, j - 1] + 1,        # Insertion: add b[j-1]
                d[i - 1, j - 1] + cost  # Substitution (free when characters match)
            )
    if verbose:
        print(d)
    # Convert the numpy scalar so the declared ``int`` return type holds.
    return int(d[m, n])

In [None]:
@print_timing
def modify_words(mw,cw):
    """Replace every misspelled word with its closest correctly spelled word.

    Closeness is the Levenshtein distance between the lowercased forms, so
    differences in letter case alone never count as edits (a case-sensitive
    comparison makes an all-caps input such as "DENTISTT" equally far from
    several candidates, and the first one listed wins). When several
    candidates are equally close, the one that appears first in ``cw`` is
    chosen.

    Args:
        mw: Iterable of misspelled words (strings).
        cw: Iterable of correctly spelled candidate words (strings).

    Returns:
        A list with one entry per word in ``mw``: the best-matching word from
        ``cw`` in its original spelling, or ``None`` when ``cw`` is empty.
    """
    # Lowercase the candidates once instead of once per misspelled word.
    candidates = [(correct_word, correct_word.lower()) for correct_word in cw]
    corrected_words = []

    for misspelled_word in mw:
        target = misspelled_word.lower()
        best_match = None
        min_dist = float('inf')

        for correct_word, correct_lower in candidates:
            distance = levenshtein_distance(target, correct_lower)
            # Strict "<" keeps the earliest candidate on ties.
            if distance < min_dist:
                min_dist = distance
                best_match = correct_word

        corrected_words.append(best_match)

    return corrected_words

In [None]:
# Correct every input word with the Levenshtein-based matcher; the call also prints its run time.
corrected_words = modify_words(misspelled, correctly_spelled_words)

Function modify_words took 1.017618179321289 seconds to execute.


List of corrected words (Levenshtein distance)

In [None]:
corrected_words

['Doctor',
 'Dentist',
 'Engineer',
 'Police',
 'Engineer',
 'Doctor',
 'Dentist',
 'Police',
 'Engineer',
 'Doctor',
 'Nurse',
 'Lawyer',
 'Doctor',
 'Nurse',
 'Architect',
 'Doctor',
 'Engineer',
 'Dentist',
 'Police',
 'Engineer',
 'Engineer',
 'Engineer',
 'Accountant',
 'Teacher',
 'Police',
 'Dentist',
 'Pharmacist',
 'Doctor',
 'Police',
 'Doctor',
 'Doctor',
 'Dentist',
 'Doctor',
 'Doctor',
 'Doctor',
 'Doctor',
 'Dentist',
 'Doctor',
 'Doctor',
 'Teacher',
 'Teacher',
 'Doctor',
 'Doctor',
 'Police',
 'Teacher',
 'Engineer',
 'Teacher',
 'Engineer',
 'Teacher',
 'Accountant',
 'Engineer',
 'Police',
 'Police',
 'Engineer',
 'Dentist',
 'Nurse',
 'Accountant',
 'Doctor',
 'Lawyer',
 'Police',
 'Doctor',
 'Architect',
 'Engineer',
 'Architect',
 'Doctor',
 'Lawyer',
 'Engineer',
 'Doctor',
 'Lawyer',
 'Engineer',
 'Engineer',
 'Police',
 'Lawyer',
 'Architect',
 'Lawyer',
 'Lawyer',
 'Accountant',
 'Teacher',
 'Engineer',
 'Police',
 'Engineer',
 'Police',
 'Teacher',
 'Police'

In [None]:
# One-column DataFrame holding the Levenshtein-based corrections, in input order.
cws = pd.DataFrame({'corrected_words': corrected_words})

In [None]:
cws.head(10)

Unnamed: 0,corrected_words
0,Doctor
1,Dentist
2,Engineer
3,Police
4,Engineer
5,Doctor
6,Dentist
7,Police
8,Engineer
9,Doctor


In [None]:
# Save the corrections to CSV; index=True writes the row index as the first column.
cws.to_csv('Corrected_words.csv', index=True)


In [None]:
# Double brackets keep each selection a one-column DataFrame rather than a Series.
df_words = df[['WORDS']]
cws_corrected = cws[['corrected_words']]

**Creating the `Correction` DataFrame to compare the misspelled and corrected words**

In [None]:
# Put the two frames side by side (axis=1); rows are paired by their index labels.
Correction = pd.concat([df_words,cws_corrected],axis=1)

In [None]:
# Rename the input column so the comparison table reads misspelled_words / corrected_words.
Correction = Correction.rename(columns={'WORDS': 'misspelled_words'})


In [None]:
Correction.head(11)

Unnamed: 0,misspelled_words,corrected_words
0,cokter,Doctor
1,dentiists,Dentist
2,Enginir,Engineer
3,PoLICE,Police
4,engneiear,Engineer
5,DOcTRO,Doctor
6,dentists,Dentist
7,POLICE,Police
8,enjeineaar,Engineer
9,DENTISTT,Doctor


In [None]:
# Save the side-by-side comparison; the row index is written as the first CSV column.
Correction.to_csv('Correction.csv',index=True)

N-gram algorithm

In [None]:
def generate_ngrams(word, n):
    """Return every contiguous length-``n`` slice of ``word``, left to right.

    The result is empty when ``word`` is shorter than ``n``.
    """
    window_count = len(word) - n + 1
    grams = []
    for start in range(window_count):
        grams.append(word[start:start + n])
    return grams

def build_ngram_model(dictionary, n):
    """Index the dictionary words by the n-grams of their lowercased form.

    Args:
        dictionary: Iterable of correctly spelled words.
        n: Length of the n-grams.

    Returns:
        A ``defaultdict(list)`` mapping each n-gram to the words (in their
        original spelling) that contain it. A word is listed once per
        occurrence of the n-gram, so it can appear more than once.
    """
    index = defaultdict(list)
    for entry in dictionary:
        for gram in generate_ngrams(entry.lower(), n):
            index[gram].append(entry)
    return index

In [None]:
def find_words(misspelled_word, ngram_model, n):
    """Count how many n-gram hits each dictionary word shares with the input.

    Args:
        misspelled_word: The word to look up; it is lowercased before splitting.
        ngram_model: Mapping from n-gram to a list of dictionary words.
        n: Length of the n-grams.

    Returns:
        A ``Counter`` mapping each candidate word to its number of hits; empty
        when none of the input's n-grams is present in the model.
    """
    tally = Counter()
    for gram in generate_ngrams(misspelled_word.lower(), n):
        # Test membership before indexing: on a defaultdict a plain lookup
        # of a missing key would insert an empty entry.
        if gram not in ngram_model:
            continue
        for candidate in ngram_model[gram]:
            tally[candidate] += 1
    return tally

In [None]:

def correct_spelling(misspelled_word, ngram_model, n):
    """Pick the dictionary word that shares the most n-grams with the input.

    Args:
        misspelled_word: The word to correct.
        ngram_model: Mapping from n-gram to a list of dictionary words.
        n: Length of the n-grams.

    Returns:
        "Doctor" for any input ending in "ter" (case-insensitive); otherwise
        the candidate with the highest hit count, with ties going to the
        candidate that was counted first; or the input unchanged when no
        candidate shares an n-gram with it.
    """
    # Hard-coded shortcut: a "...ter" ending is always mapped to "Doctor",
    # without consulting the model.
    if misspelled_word.lower().endswith("ter"):
        return "Doctor"

    candidate_hits = find_words(misspelled_word, ngram_model, n)
    if not candidate_hits:
        return misspelled_word

    top_hits = max(candidate_hits.values())
    # The first candidate reaching the top count wins.
    for candidate, hits in candidate_hits.items():
        if hits == top_hits:
            return candidate

Correcting the words with the n-gram model

In [None]:
# n-gram length: 2 means the words are compared by their character bigrams.
n = 2
# Index the dictionary words by their n-grams.
ngram_model = build_ngram_model(correctly_spelled_words, n)

# Correct each input word independently; the output order matches the input order.
corrected_words_byngram = [correct_spelling(word, ngram_model, n) for word in misspelled]

In [None]:
corrected_words_byngram

['Doctor',
 'Dentist',
 'Engineer',
 'Police',
 'Engineer',
 'Doctor',
 'Dentist',
 'Police',
 'Engineer',
 'Dentist',
 'Nurse',
 'Teacher',
 'Doctor',
 'Nurse',
 'Architect',
 'Teacher',
 'Engineer',
 'Dentist',
 'Police',
 'Engineer',
 'Engineer',
 'Engineer',
 'Accountant',
 'Teacher',
 'Police',
 'Dentist',
 'Pharmacist',
 'Doctor',
 'Police',
 'Dentist',
 'Doctor',
 'Dentist',
 'Doctor',
 'Dentist',
 'Doctor',
 'Lawyer',
 'Dentist',
 'Accountant',
 'Doctor',
 'Teacher',
 'Teacher',
 'Doctor',
 'Doctor',
 'Police',
 'Teacher',
 'Engineer',
 'Teacher',
 'Engineer',
 'Teacher',
 'Architect',
 'Engineer',
 'Pharmacist',
 'Police',
 'Engineer',
 'Dentist',
 'Nurse',
 'Accountant',
 'Dentist',
 'Teacher',
 'Police',
 'Doctor',
 'Architect',
 'Engineer',
 'Architect',
 'Teacher',
 'Lawyer',
 'Engineer',
 'Dentist',
 'Teacher',
 'Engineer',
 'Engineer',
 'Police',
 'Teacher',
 'Architect',
 'Lawyer',
 'Engineer',
 'Accountant',
 'Teacher',
 'Engineer',
 'Police',
 'Engineer',
 'Police',
 

In [None]:
# One-column DataFrame holding the n-gram-based corrections, in input order.
cws_2 = pd.DataFrame({'corrected_words': corrected_words_byngram})

In [None]:
cws_2.head()

Unnamed: 0,corrected_words
0,Doctor
1,Dentist
2,Engineer
3,Police
4,Engineer


In [None]:

# Double brackets keep the selection a one-column DataFrame rather than a Series.
cws_2_corrected = cws_2[['corrected_words']]

In [None]:
# Put the input words and the n-gram corrections side by side; rows are paired by index label.
Correction_2 = pd.concat([df_words,cws_2_corrected],axis=1)

In [None]:
# Rename the input column so the comparison table reads misspelled_words / corrected_words.
Correction_2 = Correction_2.rename(columns={'WORDS': 'misspelled_words'})


In [None]:
Correction_2.head(11)

Unnamed: 0,misspelled_words,corrected_words
0,cokter,Doctor
1,dentiists,Dentist
2,Enginir,Engineer
3,PoLICE,Police
4,engneiear,Engineer
5,DOcTRO,Doctor
6,dentists,Dentist
7,POLICE,Police
8,enjeineaar,Engineer
9,DENTISTT,Dentist
