# Text Cleaning in NLP - Hands-On Tutorial

In [22]:
# Import required libraries
import re
import string
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from bs4 import BeautifulSoup
import nltk

In [23]:
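# Download required NLTK data.
# Uncomment and run these once on a fresh environment: the tokenizers,
# the stopword list and the WordNet lemmatizer used below load these resources.
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')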

In [24]:
# Example text data
# Each sample exercises a different kind of noise that the cells below clean up.
text_data = [
    # Elongated word, repeated punctuation, emoji, hashtag and a URL
    "Hello! I looooveee this movie!!! 😍😍😍 #awesome http://movie.com",
    # URL in the middle of the sentence plus a hashtag
    "Check out my blog at http://example.com #blog",
    # Plain sentence with only a trailing period
    "This is an example sentence.",
    # @-mention, apostrophe and an ASCII emoticon
    "What a great day, @John! Let's grab lunch :)",
    # Contractions and digits
    "I can't believe it's already 2024!",
    # Inflected forms of one verb, for stemming/lemmatization
    "Running, ran, runner!",
    np.nan  # Added NaN for demonstration purposes
]

In [25]:
# 1. Basic Preprocessing
def basic_preprocessing(text):
    """Lowercase a text and normalise its whitespace.

    Parameters
    ----------
    text : Any
        The raw value to clean. Anything that is not a ``str`` (for example
        ``np.nan`` or ``None`` standing in for a missing value) is treated
        as missing.

    Returns
    -------
    str
        The lowercased text with leading/trailing whitespace removed and
        every internal run of whitespace collapsed to a single space, or
        ``""`` when ``text`` is not a string.
    """
    # Missing values arrive as floats (NaN) or None; map them to empty text
    # so that later string operations do not raise.
    if not isinstance(text, str):
        return ""

    # str.split() with no argument splits on any whitespace run and discards
    # empty pieces, so re-joining with one space both trims the ends and
    # removes the *extra* whitespace inside the text (strip() alone only
    # handled the ends).
    return " ".join(text.lower().split())

In [26]:
# Run the basic preprocessing step over every sample and show each result
cleaned_data = list(map(basic_preprocessing, text_data))
print("\n===== Basic Preprocessing =====")
for position, cleaned in enumerate(cleaned_data, start=1):
    print(f"Text {position}: {cleaned}")


===== Basic Preprocessing =====
Text 1: hello! i looooveee this movie!!! 😍😍😍 #awesome http://movie.com
Text 2: check out my blog at http://example.com #blog
Text 3: this is an example sentence.
Text 4: what a great day, @john! let's grab lunch :)
Text 5: i can't believe it's already 2024!
Text 6: running, ran, runner!
Text 7: 


In [27]:
# 2. Tokenization
def tokenize_text(text):
    """Split a text into sentences and each sentence into word tokens.

    Parameters
    ----------
    text : Any
        The text to tokenize. Non-string values (such as the NaN entry in
        the sample data) are treated as missing text.

    Returns
    -------
    list[list[str]]
        One list of word tokens per detected sentence; an empty list when
        ``text`` is not a string.
    """
    # Mirror basic_preprocessing: a missing value yields "no sentences"
    # instead of a TypeError from the tokenizer.
    if not isinstance(text, str):
        return []

    # Sentence Tokenization
    sentences = sent_tokenize(text)

    # Word Tokenization for each sentence
    return [word_tokenize(sentence) for sentence in sentences]

In [28]:
# Tokenize the first cleaned sample and list its tokens sentence by sentence
example_text = cleaned_data[0]
tokens = tokenize_text(example_text)
print("\n===== Tokenization =====")
for sentence_no, words in enumerate(tokens, start=1):
    print(f"Sentence {sentence_no} Tokens: {words}")


===== Tokenization =====
Sentence 1 Tokens: ['hello', '!']
Sentence 2 Tokens: ['i', 'looooveee', 'this', 'movie', '!', '!', '!']
Sentence 3 Tokens: ['😍😍😍', '#', 'awesome', 'http', ':', '//movie.com']


In [29]:
# 3. Handling Stopwords
# Build the English stopword lookup once at cell level; remove_stopwords
# checks every token against it, and a set keeps that membership test cheap.
stop_words = set(stopwords.words('english'))

def remove_stopwords(tokens, stopword_set=None):
    """Return the tokens that are not stopwords, ignoring letter case.

    Parameters
    ----------
    tokens : iterable of str
        Word tokens to filter.
    stopword_set : collection of str, optional
        Lowercase words to drop. Defaults to the module-level ``stop_words``
        set, so existing single-argument calls behave as before.

    Returns
    -------
    list[str]
        The surviving tokens in their original order and original casing.
    """
    if stopword_set is None:
        stopword_set = stop_words

    # Compare the lowercased token: the stopword entries are expected to be
    # lowercase, so without this "The" or "IS" would slip through whenever
    # the caller has not lowercased the text beforehand.
    return [word for word in tokens if word.lower() not in stopword_set]

In [30]:
# Filter stopwords out of every tokenized sentence of the example
tokens_without_stopwords = list(map(remove_stopwords, tokens))
print("\n===== Tokens without Stopwords =====")
for sentence_no, kept_tokens in enumerate(tokens_without_stopwords, start=1):
    print(f"Sentence {sentence_no} Tokens: {kept_tokens}")


===== Tokens without Stopwords =====
Sentence 1 Tokens: ['hello', '!']
Sentence 2 Tokens: ['looooveee', 'movie', '!', '!', '!']
Sentence 3 Tokens: ['😍😍😍', '#', 'awesome', 'http', ':', '//movie.com']


In [31]:
# 4. Dealing with Noise
def remove_noise(text):
    """Strip HTML markup, URLs, ASCII punctuation and non-ASCII characters.

    Parameters
    ----------
    text : str
        The text to clean.

    Returns
    -------
    str
        The text with the noise removed. Whitespace is left as-is, so a
        removed URL or emoji can leave consecutive or trailing spaces.
    """
    # Strip HTML first. Running the URL regex on raw markup would swallow
    # everything from a URL inside an attribute up to the next whitespace
    # (e.g. `href="http://x.com">label</a>`), breaking the tag and throwing
    # away the visible text together with it.
    text = BeautifulSoup(text, "html.parser").get_text()

    # Remove URLs (anything starting with "http" up to the next whitespace)
    text = re.sub(r'http\S+', '', text)

    # Remove punctuation (the ASCII characters in string.punctuation)
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove emojis. NOTE: the pattern drops *every* non-ASCII character,
    # so accented letters and non-Latin scripts are removed as well.
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    return text

In [32]:
# Remove noise from every preprocessed sample and show each result
noise_removed_data = list(map(remove_noise, cleaned_data))
print("\n===== Noise Removed Text =====")
for position, denoised in enumerate(noise_removed_data, start=1):
    print(f"Text {position}: {denoised}")


===== Noise Removed Text =====
Text 1: hello i looooveee this movie  awesome 
Text 2: check out my blog at  blog
Text 3: this is an example sentence
Text 4: what a great day john lets grab lunch 
Text 5: i cant believe its already 2024
Text 6: running ran runner
Text 7: 


In [33]:
# 5. Normalization Techniques (Stemming and Lemmatization)
# Shared normalizer instances, created once and reused by normalize_text
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()


def normalize_text(tokens):
    """Produce both the stemmed and the lemmatized form of each token.

    Parameters
    ----------
    tokens : iterable of str
        Word tokens to normalize.

    Returns
    -------
    tuple[list[str], list[str]]
        ``(stems, lemmas)``, each aligned with the input order. The
        lemmatizer is called without a part-of-speech tag, so it applies
        its default.
    """
    stems = list(map(stemmer.stem, tokens))
    lemmas = list(map(lemmatizer.lemmatize, tokens))
    return stems, lemmas

In [34]:
# Compare stemming and lemmatization on the first noise-free sample
tokens = word_tokenize(noise_removed_data[0])
stemmed, lemmatized = normalize_text(tokens)
print(
    "\n===== Stemming and Lemmatization =====",
    f"Stemmed: {stemmed}",
    f"Lemmatized: {lemmatized}",
    sep="\n",
)


===== Stemming and Lemmatization =====
Stemmed: ['hello', 'i', 'loooovee', 'thi', 'movi', 'awesom']
Lemmatized: ['hello', 'i', 'looooveee', 'this', 'movie', 'awesome']


In [35]:
# 6. Regular Expressions (RegEx)
def apply_regex(text):
    """Extract e-mail addresses from a text and strip its digits.

    Parameters
    ----------
    text : str
        The text to process.

    Returns
    -------
    tuple[str, list[str]]
        ``(text_without_digits, emails)`` where ``emails`` lists the
        addresses found in the *original* text, in order of appearance.
    """
    # Extract emails before touching the digits: stripping digits first
    # would corrupt addresses such as "user123@example.com".
    # The top-level-domain class is [A-Za-z]; a "|" inside a character class
    # is a literal pipe, not an alternation, so it must not appear there.
    emails = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', text)

    # Remove digits
    text = re.sub(r'\d+', '', text)

    return text, emails

In [36]:
# Run the regex step on the first noise-free sample
regex_applied, extracted_emails = apply_regex(noise_removed_data[0])
print("\n===== RegEx (After Removing Digits) =====")
print(f"Text: {regex_applied}")
# An empty result list is falsy, so fall back to the literal 'None'
print(f"Extracted Emails: {extracted_emails or 'None'}")


===== RegEx (After Removing Digits) =====
Text: hello i looooveee this movie  awesome 
Extracted Emails: None


In [37]:
# 7. Dealing with Duplicates and Incomplete Text
# Example text data with duplicates and missing values
# NOTE: this rebinds `text_data`; cells that run after this one (including
# the final pipeline cell) therefore operate on this five-item list.
text_data = [
    "This is a sentence.",
    "This is a sentence.",
    "Another sentence.",
    np.nan,
    "Running, ran, runner!"
]

df = pd.DataFrame(text_data, columns=["text"])

# Drop repeated rows first, then rows with a missing value; the original
# row labels are kept so it stays visible which rows survived.
df_cleaned = df.drop_duplicates().dropna()

print("\n===== Cleaned DataFrame (After Removing Duplicates and NaNs) =====")
print(df_cleaned)


===== Cleaned DataFrame (After Removing Duplicates and NaNs) =====
                    text
0    This is a sentence.
2      Another sentence.
4  Running, ran, runner!


In [38]:
# 8. Advanced Cleaning Techniques (Handling Slang)
# Lookup table from a slang token to its expansion, written as (token, expansion) pairs
slang_dict = dict([
    ("u", "you"),
    ("r", "are"),
    ("lol", "laughing out loud"),
    ("brb", "be right back"),
])

In [39]:
def replace_slang(text):
    """Expand slang tokens using ``slang_dict``.

    The text is word-tokenized, every token that is a key of ``slang_dict``
    is swapped for its expansion (exact, case-sensitive match), and the
    tokens are re-joined with single spaces.
    """
    expanded = []
    for token in word_tokenize(text):
        # Unknown tokens fall through unchanged
        expanded.append(slang_dict.get(token, token))
    return " ".join(expanded)

In [40]:
# Expand slang in the first noise-free sample
advanced_cleaned_text = replace_slang(noise_removed_data[0])
print(
    "\n===== After Replacing Slang =====",
    f"Text: {advanced_cleaned_text}",
    sep="\n",
)


===== After Replacing Slang =====
Text: hello i looooveee this movie awesome


In [41]:
# 9. Putting It All Together: Building a Cleaning Pipeline
def text_cleaning_pipeline(text):
    """Run the full cleaning chain on one text and return a cleaned string.

    Steps, in order: basic preprocessing, noise removal, word tokenization,
    stopword removal, then normalization. Only the lemmatized output of
    ``normalize_text`` is kept; the stems are discarded.

    Returns the remaining lemmas joined by single spaces.
    """
    normalized = basic_preprocessing(text)
    denoised = remove_noise(normalized)
    content_tokens = remove_stopwords(word_tokenize(denoised))
    # normalize_text returns (stemmed, lemmatized); index 1 is the lemmas
    lemmas = normalize_text(content_tokens)[1]
    return " ".join(lemmas)

In [42]:
# Run the complete pipeline over whatever `text_data` currently holds
final_cleaned_data = list(map(text_cleaning_pipeline, text_data))
print("\n===== Final Cleaned Data =====")
for position, cleaned in enumerate(final_cleaned_data, start=1):
    print(f"Text {position}: {cleaned}")


===== Final Cleaned Data =====
Text 1: sentence
Text 2: sentence
Text 3: another sentence
Text 4: 
Text 5: running ran runner
