# Import Necessary Libraries

Text preprocessing in NLP is the process by which we clean the raw text data by removing the noise, such as punctuations, emojis, and common words, to make it ready for our model to train. It is very important to remove unhelpful data or parts from our text.

In [1]:
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Sample Text

In [2]:
sample_text = """
Natural language processing (NLP) is a field of artificial intelligence that focuses on the interaction
between computers and humans through natural language. The ultimate goal of NLP is to read, decipher,
understand, and make sense of human language in a way that is both valuable and meaningful.
"""

# Tokenization

In [16]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [11]:
#Tokenization
sentences = sent_tokenize(sample_text)

words = [word_tokenize(sentence) for sentence in sentences]
print(words)

[['Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'a', 'field', 'of', 'artificial', 'intelligence', 'that', 'focuses', 'on', 'the', 'interaction', 'between', 'computers', 'and', 'humans', 'through', 'natural', 'language', '.'], ['The', 'ultimate', 'goal', 'of', 'NLP', 'is', 'to', 'read', ',', 'decipher', ',', 'understand', ',', 'and', 'make', 'sense', 'of', 'human', 'language', 'in', 'a', 'way', 'that', 'is', 'both', 'valuable', 'and', 'meaningful', '.']]


# Lowercasing and Removing Special Characters

In [12]:
cleaned_words = [[re.sub(r'[^a-zA-Z0-9]', '', word.lower()) for word in sentence] for sentence in words]
print(cleaned_words)

[['natural', 'language', 'processing', '', 'nlp', '', 'is', 'a', 'field', 'of', 'artificial', 'intelligence', 'that', 'focuses', 'on', 'the', 'interaction', 'between', 'computers', 'and', 'humans', 'through', 'natural', 'language', ''], ['the', 'ultimate', 'goal', 'of', 'nlp', 'is', 'to', 'read', '', 'decipher', '', 'understand', '', 'and', 'make', 'sense', 'of', 'human', 'language', 'in', 'a', 'way', 'that', 'is', 'both', 'valuable', 'and', 'meaningful', '']]


# Removing Stopwords - i.e removing a, an, the

In [13]:
stop_words = set(stopwords.words('english'))
filtered_words = [[word for word in sentence if word not in stop_words] for sentence in cleaned_words]
print(filtered_words)

[['natural', 'language', 'processing', '', 'nlp', '', 'field', 'artificial', 'intelligence', 'focuses', 'interaction', 'computers', 'humans', 'natural', 'language', ''], ['ultimate', 'goal', 'nlp', 'read', '', 'decipher', '', 'understand', '', 'make', 'sense', 'human', 'language', 'way', 'valuable', 'meaningful', '']]


# Stemming

In [14]:
stemmer = PorterStemmer()
stemmed_words = [[stemmer.stem(word) for word in sentence] for sentence in filtered_words]
print(stemmed_words)


[['natur', 'languag', 'process', '', 'nlp', '', 'field', 'artifici', 'intellig', 'focus', 'interact', 'comput', 'human', 'natur', 'languag', ''], ['ultim', 'goal', 'nlp', 'read', '', 'deciph', '', 'understand', '', 'make', 'sens', 'human', 'languag', 'way', 'valuabl', 'meaning', '']]


# Lemmatization

In [17]:
lemmatizer = WordNetLemmatizer()
lemmatized_words = [[lemmatizer.lemmatize(word) for word in sentence] for sentence in filtered_words]
print(lemmatized_words)

[['natural', 'language', 'processing', '', 'nlp', '', 'field', 'artificial', 'intelligence', 'focus', 'interaction', 'computer', 'human', 'natural', 'language', ''], ['ultimate', 'goal', 'nlp', 'read', '', 'decipher', '', 'understand', '', 'make', 'sense', 'human', 'language', 'way', 'valuable', 'meaningful', '']]


# Printing Processed Sentences

In [18]:
print("Original Sentences:")
for sentence in sentences:
    print(sentence)

print("\nProcessed Sentences (Lemmatized):")
for sentence in lemmatized_words:
    print(' '.join(sentence))

Original Sentences:

Natural language processing (NLP) is a field of artificial intelligence that focuses on the interaction
between computers and humans through natural language.
The ultimate goal of NLP is to read, decipher,
understand, and make sense of human language in a way that is both valuable and meaningful.

Processed Sentences (Lemmatized):
natural language processing  nlp  field artificial intelligence focus interaction computer human natural language 
ultimate goal nlp read  decipher  understand  make sense human language way valuable meaningful 
