# Text Preprocessing

In [1]:
# Importing Required Libraries
import nltk
import string
import re
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [13]:
# Download the NLTK resources this notebook relies on.
# Each call is a no-op when the package is already up to date.
nltk.download('punkt')      # tokenizer models used by word_tokenize / sent_tokenize
# NOTE(review): newer NLTK releases look up 'punkt_tab' instead of 'punkt' when
# tokenizing — confirm against the installed NLTK version.
nltk.download('punkt_tab')
nltk.download('stopwords')  # stopword lists read via stopwords.words('english')
# The recorded cell output already shows this package being fetched, so request
# it explicitly to keep a fresh "Restart & Run All" reproducible.
nltk.download('omw-1.4')
nltk.download('wordnet')    # WordNet data used by WordNetLemmatizer

[nltk_data] Downloading package punkt to /home/omar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/omar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/omar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/omar/nltk_data...


True

In [3]:
# Sample text reused by every later cell in the notebook.
# The triple-quoted literal begins and ends with a newline, which is why the
# first sentence token printed later starts with '\n'.
text = """
Hello, world! Welcome to text preprocessing. Let's explore how to clean and prepare text data for NLP tasks.
In this notebook, we'll cover various techniques like tokenization, stopword removal, stemming, lemmatization, 
lowercasing, punctuation removal, and text normalization. Let's get started! 🚀
"""
# Display the original text. print() joins its arguments with a space, so the
# output carries a single space right after the "Original Text:" line break.
print("Original Text:\n", text)

Original Text:
 
Hello, world! Welcome to text preprocessing. Let's explore how to clean and prepare text data for NLP tasks.
In this notebook, we'll cover various techniques like tokenization, stopword removal, stemming, lemmatization, 
lowercasing, punctuation removal, and text normalization. Let's get started! 🚀



## 1. Tokenization ✂️
Tokenization is the process of splitting text into individual words or sentences.

In [4]:
# Split the sample text two ways: sentence-level and word-level tokens.
sentences = sent_tokenize(text)
words = word_tokenize(text)

# Report the word tokens first, then the sentences.
print("Words:", words)
print("Sentences:", sentences)

Words: ['Hello', ',', 'world', '!', 'Welcome', 'to', 'text', 'preprocessing', '.', 'Let', "'s", 'explore', 'how', 'to', 'clean', 'and', 'prepare', 'text', 'data', 'for', 'NLP', 'tasks', '.', 'In', 'this', 'notebook', ',', 'we', "'ll", 'cover', 'various', 'techniques', 'like', 'tokenization', ',', 'stopword', 'removal', ',', 'stemming', ',', 'lemmatization', ',', 'lowercasing', ',', 'punctuation', 'removal', ',', 'and', 'text', 'normalization', '.', 'Let', "'s", 'get', 'started', '!', '🚀']
Sentences: ['\nHello, world!', 'Welcome to text preprocessing.', "Let's explore how to clean and prepare text data for NLP tasks.", "In this notebook, we'll cover various techniques like tokenization, stopword removal, stemming, lemmatization, \nlowercasing, punctuation removal, and text normalization.", "Let's get started!", '🚀']


## 2. Stopword Removal 🚫
Stopwords are common words that may not contribute much meaning, such as "the," "is," "in," etc.

In [5]:
# Drop every token whose lowercase form is an English stopword.
stop_words = set(stopwords.words('english'))
filtered_words = list(
    filter(lambda token: token.lower() not in stop_words, words)
)

print("Filtered Words:", filtered_words)

Filtered Words: ['Hello', ',', 'world', '!', 'Welcome', 'text', 'preprocessing', '.', 'Let', "'s", 'explore', 'clean', 'prepare', 'text', 'data', 'NLP', 'tasks', '.', 'notebook', ',', "'ll", 'cover', 'various', 'techniques', 'like', 'tokenization', ',', 'stopword', 'removal', ',', 'stemming', ',', 'lemmatization', ',', 'lowercasing', ',', 'punctuation', 'removal', ',', 'text', 'normalization', '.', 'Let', "'s", 'get', 'started', '!', '🚀']


## 3. Stemming 🌱
Stemming reduces words to their base or root form. It may not always produce actual words, but it helps in reducing inflected words to a common base.

In [6]:
# Reduce each remaining token to its Porter stem.
stemmer = PorterStemmer()
stemmed_words = list(map(stemmer.stem, filtered_words))

print("Stemmed Words:", stemmed_words)

Stemmed Words: ['hello', ',', 'world', '!', 'welcom', 'text', 'preprocess', '.', 'let', "'s", 'explor', 'clean', 'prepar', 'text', 'data', 'nlp', 'task', '.', 'notebook', ',', "'ll", 'cover', 'variou', 'techniqu', 'like', 'token', ',', 'stopword', 'remov', ',', 'stem', ',', 'lemmat', ',', 'lowercas', ',', 'punctuat', 'remov', ',', 'text', 'normal', '.', 'let', "'s", 'get', 'start', '!', '🚀']


## 4. Lemmatization 🌱
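Lemmatization also reduces words to their base form, but it aims to return actual words that belong to the language. By default, NLTK's `WordNetLemmatizer` treats every token as a noun, so verb forms such as "started" are left unchanged unless a part-of-speech tag is passed through the `pos` argument.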

In [7]:
# Map each remaining token to its WordNet lemma (lemmatize() called with its defaults).
lemmatizer = WordNetLemmatizer()
lemmatized_words = list(map(lemmatizer.lemmatize, filtered_words))

print("Lemmatized Words:", lemmatized_words)

Lemmatized Words: ['Hello', ',', 'world', '!', 'Welcome', 'text', 'preprocessing', '.', 'Let', "'s", 'explore', 'clean', 'prepare', 'text', 'data', 'NLP', 'task', '.', 'notebook', ',', "'ll", 'cover', 'various', 'technique', 'like', 'tokenization', ',', 'stopword', 'removal', ',', 'stemming', ',', 'lemmatization', ',', 'lowercasing', ',', 'punctuation', 'removal', ',', 'text', 'normalization', '.', 'Let', "'s", 'get', 'started', '!', '🚀']


## 5. Lowercasing 🔡
Lowercasing helps in maintaining consistency, especially for tasks where case sensitivity does not matter.

In [8]:
# Fold every lemma to lowercase so tokens compare case-insensitively.
lowercased_words = list(map(str.lower, lemmatized_words))

print("Lowercased Words:", lowercased_words)

Lowercased Words: ['hello', ',', 'world', '!', 'welcome', 'text', 'preprocessing', '.', 'let', "'s", 'explore', 'clean', 'prepare', 'text', 'data', 'nlp', 'task', '.', 'notebook', ',', "'ll", 'cover', 'various', 'technique', 'like', 'tokenization', ',', 'stopword', 'removal', ',', 'stemming', ',', 'lemmatization', ',', 'lowercasing', ',', 'punctuation', 'removal', ',', 'text', 'normalization', '.', 'let', "'s", 'get', 'started', '!', '🚀']


## 6. Punctuation Removal ❌
Punctuation removal helps in reducing the vocabulary size and noise in text data.

In [9]:
# Punctuation Removal
# `string.punctuation` is one str, so `word not in string.punctuation` is a
# substring test: multi-character punctuation tokens such as '...', '--' or the
# `` / '' quote tokens would be kept. Check the token character by character
# against a set instead.
punctuation_chars = set(string.punctuation)
no_punctuation = [
    word
    for word in lowercased_words
    # all() is True for an empty token, so empty strings are dropped as before;
    # tokens that mix letters and punctuation (e.g. "'s") are kept as before.
    if not all(char in punctuation_chars for char in word)
]

print("Words without Punctuation:", no_punctuation)

Words without Punctuation: ['hello', 'world', 'welcome', 'text', 'preprocessing', 'let', "'s", 'explore', 'clean', 'prepare', 'text', 'data', 'nlp', 'task', 'notebook', "'ll", 'cover', 'various', 'technique', 'like', 'tokenization', 'stopword', 'removal', 'stemming', 'lemmatization', 'lowercasing', 'punctuation', 'removal', 'text', 'normalization', 'let', "'s", 'get', 'started', '🚀']


## 7. Text Normalization 📝
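Text normalization is the process of transforming text into a single canonical form. It involves combining all of the above steps to clean text data, and it can also include task-specific rules; the example below shows one such rule, replacing numbers with a `<NUM>` placeholder.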

In [10]:
# Text Normalization Example
# Matches one whole numeric literal: either comma-grouped thousands
# ("1,000", "12,345,678.9") or a plain run of digits, each with an optional
# decimal part. A bare r'\d+' would split "3.14" into two separate matches.
_NUMBER_PATTERN = re.compile(r'\d{1,3}(?:,\d{3})+(?!\d)(?:\.\d+)?|\d+(?:\.\d+)?')


def normalize_text(text):
    """Replace every number in `text` with the placeholder ``<NUM>``.

    Integers, decimals ("3.14") and comma-grouped thousands ("1,000") each
    collapse to a single placeholder. Comma-separated single digits such as
    "1,2,3" are treated as three separate numbers.

    Args:
        text: Input string.

    Returns:
        str: A copy of `text` with each number replaced by ``<NUM>``.
    """
    # Replace numbers with a placeholder
    normalized_text = _NUMBER_PATTERN.sub('<NUM>', text)
    # Handle any other custom normalization rules here
    return normalized_text

# Try the normalizer on a sentence that contains two numbers.
normalized_text = normalize_text(
    'I have 3 apples and 20 bananas.'
)
print("Normalized Text:", normalized_text)

Normalized Text: I have <NUM> apples and <NUM> bananas.


## Complete Preprocessing Pipeline
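Now that we have explored each preprocessing step individually, let's combine them into a complete preprocessing pipeline. Note that the pipeline stems before it lemmatizes, so the lemmatizer receives stems rather than dictionary words and the result matches the stemmed output; in practice you would usually apply only one of the two.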

In [11]:
def preprocess_text(text):
    """Run the notebook's full preprocessing pipeline on a string.

    Steps, in order: word tokenization, English stopword removal, Porter
    stemming, WordNet lemmatization, lowercasing and punctuation removal.

    Relies on the notebook-level `stop_words`, `stemmer` and `lemmatizer`
    objects created in earlier cells, so those cells must have been run.

    Args:
        text: Raw input string.

    Returns:
        list: The processed tokens, in their original order.
    """
    # Tokenization
    words = word_tokenize(text)

    # Stopword Removal (case-insensitive comparison against the stopword set)
    filtered_words = [word for word in words if word.lower() not in stop_words]

    # Stemming
    stemmed_words = [stemmer.stem(word) for word in filtered_words]

    # Lemmatization
    # NOTE(review): the lemmatizer receives stems here, not the original words,
    # so it is unlikely to change much — consider keeping only one of the two.
    lemmatized_words = [lemmatizer.lemmatize(word) for word in stemmed_words]

    # Lowercasing
    lowercased_words = [word.lower() for word in lemmatized_words]

    # Punctuation Removal
    # `string.punctuation` is one str, so `word not in string.punctuation` would
    # be a substring test that keeps multi-character punctuation tokens such as
    # '...' or '--'. Drop a token only when every character is punctuation;
    # all() is True for an empty token, so empty strings are still dropped.
    punctuation_chars = set(string.punctuation)
    no_punctuation = [
        word
        for word in lowercased_words
        if not all(char in punctuation_chars for char in word)
    ]

    # Return the processed text
    return no_punctuation

# Apply the complete pipeline to the sample text and show the resulting tokens.
processed_text = preprocess_text(text)
print("Processed Text:", processed_text)

Processed Text: ['hello', 'world', 'welcom', 'text', 'preprocess', 'let', "'s", 'explor', 'clean', 'prepar', 'text', 'data', 'nlp', 'task', 'notebook', "'ll", 'cover', 'variou', 'techniqu', 'like', 'token', 'stopword', 'remov', 'stem', 'lemmat', 'lowercas', 'punctuat', 'remov', 'text', 'normal', 'let', "'s", 'get', 'start', '🚀']


## Summary
In this notebook, we explored several essential text preprocessing techniques that form the backbone of many NLP tasks. These steps help transform raw text into a format suitable for various natural language processing applications, such as sentiment analysis, text classification, and more.

Experiment with these techniques and apply them to your own text datasets to see the power of text preprocessing in action! 🚀