# Download necessary NLTK data for tokenization


In [None]:
import nltk
# Download the 'punkt' tokenizer package; the tokenization done with
# word_tokenize in this notebook relies on NLTK tokenizer data.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from nltk.tokenize import word_tokenize
import string

# Load the vocabulary file and store words in a set for fast lookup


In [None]:
# Read the vocabulary (one entry per line) into a set for fast membership tests.
# Iterating the file object streams the lines instead of materialising them all
# with readlines(), and blank lines are skipped so the empty string never ends
# up in the vocabulary as a candidate correction.
with open('/content/Vocabulary.txt', 'r', encoding='utf-8') as f:
    vocabulary = {entry for entry in (line.strip() for line in f) if entry}

# Function to calculate the Damerau-Levenshtein distance (restricted / optimal string alignment variant) between two strings


In [None]:
def damerau_levenshtein_distance(s1, s2):
    """Return the edit distance between ``s1`` and ``s2``.

    Allowed unit-cost operations are insertion, deletion, substitution and
    the swap of two adjacent characters. The swap is only considered between
    the two most recent characters of each prefix (the restricted "optimal
    string alignment" form of the Damerau-Levenshtein distance).

    Args:
        s1: First sequence.
        s2: Second sequence.

    Returns:
        The minimum number of operations, as an int.
    """
    rows = len(s1) + 1
    cols = len(s2) + 1

    # table[r][c] holds the distance between s1[:r] and s2[:c].
    # Row 0 / column 0 are the distances to the empty prefix.
    table = [list(range(cols))]
    for r in range(1, rows):
        table.append([r] + [0] * (cols - 1))

    for r in range(1, rows):
        left_char = s1[r - 1]
        for c in range(1, cols):
            right_char = s2[c - 1]
            substitution = 0 if left_char == right_char else 1

            best = min(
                table[r - 1][c] + 1,                 # deletion
                table[r][c - 1] + 1,                 # insertion
                table[r - 1][c - 1] + substitution,  # match / substitution
            )

            # Adjacent transposition: the last two characters are swapped.
            swapped = (
                r > 1
                and c > 1
                and left_char == s2[c - 2]
                and s1[r - 2] == right_char
            )
            if swapped:
                best = min(best, table[r - 2][c - 2] + substitution)

            table[r][c] = best

    return table[-1][-1]

# Function to suggest corrections for a given word


In [None]:
def suggest_corrections(word, vocabulary, split_hyphen=True):
    """Return the vocabulary entry closest to ``word`` by edit distance.

    Args:
        word: Token to correct.
        vocabulary: Collection of known words. It is tested for membership
            and iterated; when several entries tie for the smallest
            distance, the first one in iteration order is returned.
        split_hyphen: When true and ``word`` contains ``'-'``, every
            hyphen-separated part is corrected on its own and the results
            are joined back together with ``'-'``.

    Returns:
        For a hyphenated word (with ``split_hyphen``), the re-joined string;
        parts that are empty or have no available correction are kept as
        they are. Otherwise ``word`` itself if it is already in
        ``vocabulary``, else the closest entry, or ``None`` when
        ``vocabulary`` is empty.
    """
    if split_hyphen and '-' in word:
        corrected_parts = []
        for part in word.split('-'):
            # Empty parts come from leading, trailing or doubled hyphens;
            # they must survive unchanged rather than be "corrected" to
            # whichever vocabulary word happens to be shortest.
            suggestion = (
                suggest_corrections(part, vocabulary, split_hyphen=False)
                if part
                else None
            )
            # A None suggestion (empty vocabulary) would break str.join, so
            # fall back to the original part.
            corrected_parts.append(part if suggestion is None else suggestion)
        return '-'.join(corrected_parts)

    # A word that is already known is its own best match (distance 0), so
    # the full scan over the vocabulary can be skipped.
    if word in vocabulary:
        return word

    # min() returns the first entry with the smallest distance, which matches
    # a strict "<" scan; default=None covers an empty vocabulary.
    return min(
        vocabulary,
        key=lambda candidate: damerau_levenshtein_distance(word, candidate),
        default=None,
    )

# Define the input sentence to be processed


In [None]:
sentence = "Multipmodal usion si a ore proble fofr mpltimodal seuntiment aanlysis."
words = word_tokenize(sentence)

# Process each token in the sentence and print a suggestion for every token
# that is not found in the vocabulary.
for word in words:
    lowered = word.lower()

    # Skip tokens made up solely of punctuation characters. A per-character
    # check is needed: `token in string.punctuation` is a substring test, so
    # multi-character tokens such as "...", "--" or quote marks would not be
    # recognised as punctuation and would be sent to the spell checker.
    if all(ch in string.punctuation for ch in lowered):
        continue

    if lowered not in vocabulary:
        correction = suggest_corrections(lowered, vocabulary)
        print(f"Suggestion for '{word}': {correction}")

Suggestion for 'Multipmodal': multinomial
Suggestion for 'usion': union
Suggestion for 'si': bi
Suggestion for 'proble': problem
Suggestion for 'fofr': four
Suggestion for 'mpltimodal': optional
Suggestion for 'seuntiment': sentiment
Suggestion for 'aanlysis': analysis
