<a href="https://colab.research.google.com/github/Amelbnmbh/Text-Correction-NLP-/blob/main/Text_Correction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Prepare the Environment

<h2>The following files are required:</h2>

`englishvoc.txt:` This file should contain a list of English words (the lexicon), one word per line.

`text.txt:` This is the file containing the text with spelling errors.

`ref.txt:` This file is the corrected version of the text. If you don't have it yet, you can manually create it by correcting **text.txt**.

<h2>Install Required Libraries</h2>

In [None]:
# Install the third-party packages imported below: NLTK (stopword list) and
# python-Levenshtein (edit distance between words).
!pip install nltk
!pip install python-Levenshtein



<h2>Import Required Libraries</h2>

In [None]:
import re
from collections import Counter

import nltk
from Levenshtein import distance
from nltk.corpus import stopwords
from sklearn.metrics import precision_score, recall_score, accuracy_score

<h2>Download NLTK Stopwords</h2>

In [None]:
# Fetch the NLTK stopword corpus; NLTK reports "already up-to-date" when it is present.
nltk.download('stopwords')
# Keep the English stopwords in a set; preprocess_text tests word membership against it.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<h2>Define File Reading Functions</h2>

In [None]:
# Load a whole text file into memory
def read_file(file_path):
    """Return the entire contents of the UTF-8 text file at ``file_path`` as one string."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

In [None]:
# Load the lexicon, one entry per line
def read_vocab(file_path):
    """Return the lines of the UTF-8 vocabulary file at ``file_path`` as a list of strings.

    Line terminators are removed; the lines are otherwise returned unchanged,
    in file order (blank lines become empty strings).
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        raw = handle.read()
    entries = raw.splitlines()
    return entries

<h2>Define File Paths</h2>

In [None]:
# Input files, given as paths relative to the working directory
text_file_path = 'text.txt'          # text containing spelling errors
vocab_file_path = 'englishvoc.txt'   # lexicon of English words
ref_file_path = 'ref.txt'            # corrected reference version of the text

<h2>Read the Files</h2>

In [None]:
# Load the three inputs using the paths defined above.
# text: the text to be corrected, as a single string.
text = read_file(text_file_path)
# vocab: the lexicon as a list with one entry per line of the file.
vocab = read_vocab(vocab_file_path)
# ref: the reference (corrected) text, as a single string.
ref = read_file(ref_file_path)

# Levenshtein Distance Calculation

<h2>Preprocess the Text</h2>

In [None]:
def preprocess_text(text, stopword_set=None):
    """Normalise text for lexicon lookup.

    Lowercases ``text``, deletes every character that is neither a word
    character (letters, digits, underscore) nor whitespace, then drops
    stopwords and re-joins the remaining tokens with single spaces.

    Args:
        text: Raw input string.
        stopword_set: Optional container of lowercase words to drop. When
            omitted, the notebook-level ``stop_words`` set is used, so
            existing one-argument calls behave as before.

    Returns:
        The cleaned text as a single space-separated string.
    """
    if stopword_set is None:
        # Fall back to the stopwords loaded earlier in the notebook.
        stopword_set = stop_words
    lowered = text.lower()
    # Punctuation is deleted outright (not replaced by a space), so "don't" becomes "dont".
    stripped = re.sub(r'[^\w\s]', '', lowered)
    return ' '.join(word for word in stripped.split() if word not in stopword_set)

<h2>Check Words Against Lexicon</h2>

In [None]:
def check_words(text, lexicon):
    """Partition the whitespace-separated words of ``text`` by lexicon membership.

    Returns a ``(correct_words, incorrect_words)`` pair of lists. Each list
    keeps its words in their original order, duplicates included.
    """
    found, missing = [], []
    for token in text.split():
        # Route the token to whichever list matches its membership status.
        bucket = found if token in lexicon else missing
        bucket.append(token)
    return found, missing

<h2>Suggest Corrections for Incorrect Words</h2>

In [None]:
def suggest_correction(word, lexicon):
    """Return the lexicon entry with the smallest Levenshtein distance to ``word``.

    On ties the entry encountered first while iterating ``lexicon`` wins, so
    the result for an unordered container such as a ``set`` depends on its
    iteration order. An empty lexicon yields ``word`` unchanged.
    """
    # min() returns the first minimal element, matching a strict "<" scan.
    return min(lexicon, key=lambda candidate: distance(word, candidate), default=word)

<h2>Preprocess and Check Words</h2>

In [None]:
# Normalise the raw text (lowercase, punctuation and stopwords removed).
preprocessed_text = preprocess_text(text)
# Split its words into those found in the lexicon and those that are not;
# the lexicon is passed as a set built from the vocab list.
correct_words, incorrect_words = check_words(preprocessed_text, set(vocab))

<h2>Correct Incorrect Words</h2>

In [None]:
# Sort the lexicon once: it is loop-invariant, and a fixed iteration order makes
# suggest_correction's tie-breaking (first closest entry wins) reproducible
# across runs, which iterating a freshly built set of strings does not guarantee.
sorted_lexicon = sorted(set(vocab))

# Look up each distinct misspelling only once; repeated words reuse the result.
corrections = {}
for word in incorrect_words:
    if word not in corrections:
        corrections[word] = suggest_correction(word, sorted_lexicon)

# Suggested replacement for every entry of incorrect_words, in the same order.
corrected_words = [corrections[word] for word in incorrect_words]

# Final corrected text: walk the preprocessed text in its original order and
# substitute only the words that were flagged, so word order is preserved
# (concatenating correct_words + corrected_words would move every corrected
# word to the end of the text).
corrected_text = ' '.join(corrections.get(word, word) for word in preprocessed_text.split())

<h2>Evaluate the Correction</h2>

In [None]:
corrected_words = corrected_text.split()
# Normalise the reference exactly like the corrected text (lowercase, no
# punctuation, no stopwords); otherwise identical words fail to match.
reference_words = preprocess_text(ref).split()

# Compare the two texts as bags of words. The multiset intersection counts how
# many output tokens are backed by a reference token (each reference token can
# be matched at most once).
corrected_counts = Counter(corrected_words)
reference_counts = Counter(reference_words)
matched = sum((corrected_counts & reference_counts).values())  # true positives
spurious = len(corrected_words) - matched                      # false positives: output tokens not in the reference
missed = len(reference_words) - matched                        # false negatives: reference tokens never produced

# Binary labels, one entry per token outcome: y_true marks tokens that belong
# to the reference, y_pred marks tokens that the system produced. Unlike
# labelling every output token as predicted-positive, this lets recall drop
# below 1.0 when reference words are missing from the output.
y_true = [1] * matched + [0] * spurious + [1] * missed
y_pred = [1] * matched + [1] * spurious + [0] * missed

# Calculate precision, recall, and accuracy.
# precision = matched / len(corrected_words)
# recall    = matched / len(reference_words)
# accuracy  = matched / (matched + spurious + missed); there are no true negatives
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
accuracy = accuracy_score(y_true, y_pred)

# Results

In [None]:
# Report the evaluation metrics computed in the previous cell.
print("Precision:", precision)
print("Recall:", recall)
print("Accuracy:", accuracy)

Precision: 0.6202046035805626
Recall: 1.0
Accuracy: 0.6202046035805626
