In [1]:
# Install the third-party packages imported below (nltk, pandas).
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
%pip install nltk
%pip install pandas



In [2]:
# Import necessary libraries
import re
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

In [7]:
# One-time setup: fetch the NLTK data packages this notebook relies on
# (tokenizer models, the stopword list and the WordNet corpus).
for package in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(package)
# Left as the cell's final bare expression so its return value is still displayed.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [8]:

# Sample text for demonstration.
# It deliberately mixes capital letters, punctuation, digits, symbols and a URL
# so that each cleaning step has something to remove.
# Note: the line break and the leading spaces on the second line are part of
# the string, because it is a triple-quoted literal.
text = """Hey there! This is an example text with some numbers (123) and special characters @#$.
         I love programming in Python! Python is amazing... Check out https://www.example.com"""

In [19]:
def clean_text(text, verbose=True, stem=True):
    """Normalize raw text into one string of cleaned, space-separated tokens.

    Pipeline: lowercase -> strip URLs -> replace every non-letter with a
    space -> tokenize -> drop English stopwords -> lemmatize -> optionally
    stem -> re-join with single spaces.

    Args:
        text: Raw input string. ``None`` and float NaN (the value pandas uses
            for a missing cell) are treated as an empty string.
        verbose: If True (default, the original behaviour), print the
            intermediate result after each step. Pass False when applying
            the function over many rows to avoid flooding the output.
        stem: If True (default, the original behaviour), run the Porter
            stemmer on the lemmatized tokens. Pass False to keep the
            lemmatized words as they are.

    Returns:
        str: The cleaned tokens joined by single spaces; an empty string
        when no token survives the cleaning.
    """
    def trace(label, value):
        # Labels are the same ones the step-by-step printout has always used.
        if verbose:
            print(label, value)

    # A missing value carries no text; NaN is the only float that differs
    # from itself, which lets us detect it without importing anything.
    if text is None or (isinstance(text, float) and text != text):
        text = ''

    # Step 1: Convert to lowercase
    # Done first so the stopword comparison below is case-insensitive.
    text = text.lower()
    trace("\n--> After lowercase:", text)

    # Step 2: Remove URLs
    # Must happen before punctuation is stripped, otherwise the URL would be
    # broken into letter fragments that survive as tokens.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    trace("\n--> After removing URLs:", text)

    # Step 3: Remove special characters and numbers
    # Each non-letter is replaced by a space instead of being deleted, so
    # words separated only by punctuation or digits ("hello,world") stay
    # separate tokens rather than being fused into "helloworld".
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    trace("\n--> After removing special characters:", text)

    # Step 4: Tokenization
    # Split text into individual words
    tokens = word_tokenize(text)
    trace("\n--> After tokenization:", tokens)

    # Step 5: Remove stopwords
    # A set gives constant-time membership checks while filtering.
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    trace("\n--> After removing stopwords:", tokens)

    # Step 6: Lemmatization
    # Convert words to their base form
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    trace("\n--> After lemmatization:", tokens)

    # Step 7: Stemming (optional)
    # More aggressive than lemmatization and can produce non-words,
    # so callers may switch it off with stem=False.
    if stem:
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(word) for word in tokens]
        trace("\n--> After stemming:", tokens)

    # Step 8: Join tokens back to text
    # The local name differs from the function name to avoid shadowing it.
    cleaned = ' '.join(tokens)
    return cleaned

In [21]:
# Process the text and show final result
final_text = clean_text(text)
print("\n--> Final cleaned text:", final_text)


--> After lowercase: hey there! this is an example text with some numbers (123) and special characters @#$. 
         i love programming in python! python is amazing... check out https://www.example.com

--> After removing URLs: hey there! this is an example text with some numbers (123) and special characters @#$. 
         i love programming in python! python is amazing... check out 

--> After removing special characters: hey there this is an example text with some numbers  and special characters  
         i love programming in python python is amazing check out 

--> After tokenization: ['hey', 'there', 'this', 'is', 'an', 'example', 'text', 'with', 'some', 'numbers', 'and', 'special', 'characters', 'i', 'love', 'programming', 'in', 'python', 'python', 'is', 'amazing', 'check', 'out']

--> After removing stopwords: ['hey', 'example', 'text', 'numbers', 'special', 'characters', 'love', 'programming', 'python', 'python', 'amazing', 'check']

--> After lemmatization: ['hey', 'example

In [11]:
# Bonus: Example with multiple texts using pandas
def process_multiple_texts(texts=None):
    """Clean a batch of texts with ``clean_text`` through a pandas DataFrame.

    Args:
        texts: Optional iterable of strings to clean. When omitted, three
            built-in sample sentences are used (the original behaviour).

    Returns:
        pandas.DataFrame: The inputs in column ``'text'`` and the result of
        ``clean_text`` for each row in column ``'cleaned_text'``. The frame
        is also printed, as before; returning it lets the caller keep
        working with the result instead of losing it.
    """
    if texts is None:
        texts = [
            "First example! With numbers 123",
            "Second example... with special chars @#$",
            "Third example with URL https://example.com"
        ]

    # list() lets any iterable (tuple, generator, Series) be passed in.
    df = pd.DataFrame({'text': list(texts)})

    # Apply cleaning function to entire column
    df['cleaned_text'] = df['text'].apply(clean_text)
    print("\nProcessing multiple texts using pandas:")
    print(df)
    return df

In [22]:
# Uncomment the next line to run the pandas example; clean_text prints every intermediate step for each row, so expect long output.
# process_multiple_texts()


--> After lowercase: first example! with numbers 123

--> After removing URLs: first example! with numbers 123

--> After removing special characters: first example with numbers 

--> After tokenization: ['first', 'example', 'with', 'numbers']

--> After removing stopwords: ['first', 'example', 'numbers']

--> After lemmatization: ['first', 'example', 'number']

--> After stemming: ['first', 'exampl', 'number']

--> After lowercase: second example... with special chars @#$

--> After removing URLs: second example... with special chars @#$

--> After removing special characters: second example with special chars 

--> After tokenization: ['second', 'example', 'with', 'special', 'chars']

--> After removing stopwords: ['second', 'example', 'special', 'chars']

--> After lemmatization: ['second', 'example', 'special', 'char']

--> After stemming: ['second', 'exampl', 'special', 'char']

--> After lowercase: third example with url https://example.com

--> After removing URLs: third exampl