<a href="https://colab.research.google.com/github/B-Sumanth/2203A51551_NLP/blob/main/Textprocessing_1551.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import re
import string
from bs4 import BeautifulSoup

def clean_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = BeautifulSoup(text, "html.parser").get_text()  # Remove HTML tags
    return text
corpus =[
    "Automated essay scoring (AES) is the use of specialized"
    "computer programs to assign grades to essays written "
    "in an educational setting."
]
cleaned_corpus = [clean_text(doc) for doc in corpus]
print(cleaned_corpus)


['automated essay scoring aes is the use of specializedcomputer programs to assign grades to essays written in an educational setting']


In [5]:
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

tokenized_corpus = [word_tokenize(doc) for doc in cleaned_corpus]
print(tokenized_corpus)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


[['automated', 'essay', 'scoring', 'aes', 'is', 'the', 'use', 'of', 'specializedcomputer', 'programs', 'to', 'assign', 'grades', 'to', 'essays', 'written', 'in', 'an', 'educational', 'setting']]


In [6]:
from nltk.corpus import stopwords
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
filtered_corpus = [[word for word in doc if word not in stop_words] for doc in tokenized_corpus]
print(filtered_corpus)


[['automated', 'essay', 'scoring', 'aes', 'use', 'specializedcomputer', 'programs', 'assign', 'grades', 'essays', 'written', 'educational', 'setting']]


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [7]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
nltk.download('wordnet')

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

stemmed_corpus = [[stemmer.stem(word) for word in doc] for doc in filtered_corpus]
lemmatized_corpus = [[lemmatizer.lemmatize(word) for word in doc] for doc in filtered_corpus]
print(stemmed_corpus)
print(lemmatized_corpus)


[nltk_data] Downloading package wordnet to /root/nltk_data...


[['autom', 'essay', 'score', 'ae', 'use', 'specializedcomput', 'program', 'assign', 'grade', 'essay', 'written', 'educ', 'set']]
[['automated', 'essay', 'scoring', 'aes', 'use', 'specializedcomputer', 'program', 'assign', 'grade', 'essay', 'written', 'educational', 'setting']]


In [9]:
!pip install contractions # Install the contractions module
import contractions # Import the contractions module

expanded_corpus = [contractions.fix(doc) for doc in cleaned_corpus]
print(expanded_corpus)

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (110 kB)
[2

In [10]:
import contractions

expanded_corpus = [contractions.fix(doc) for doc in cleaned_corpus]
print(expanded_corpus)


['automated essay scoring aes is the use of specializedcomputer programs to assign grades to essays written in an educational setting']


In [12]:
!pip install pyspellchecker
from spellchecker import SpellChecker # Change to from pyspellchecker

spell = SpellChecker()
corrected_corpus = [[spell.correction(word) for word in doc] for doc in tokenized_corpus]
print(corrected_corpus)

Collecting pyspellchecker
  Downloading pyspellchecker-0.8.1-py3-none-any.whl.metadata (9.4 kB)
Downloading pyspellchecker-0.8.1-py3-none-any.whl (6.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m51.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.8.1
[['automated', 'essay', 'scoring', 'as', 'is', 'the', 'use', 'of', None, 'programs', 'to', 'assign', 'grades', 'to', 'essays', 'written', 'in', 'an', 'educational', 'setting']]


In [13]:
from spellchecker import SpellChecker

spell = SpellChecker()
corrected_corpus = [[spell.correction(word) for word in doc] for doc in tokenized_corpus]
print(corrected_corpus)


[['automated', 'essay', 'scoring', 'as', 'is', 'the', 'use', 'of', None, 'programs', 'to', 'assign', 'grades', 'to', 'essays', 'written', 'in', 'an', 'educational', 'setting']]
