In [7]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

# NLTK data packages this notebook relies on. nltk.download() reports
# "already up-to-date" for packages that are present, so re-running this cell
# on every kernel start is cheap.
#   punkt / punkt_tab : tokenizer models used by word_tokenize
#                       (newer NLTK releases load "punkt_tab" instead of "punkt")
#   stopwords         : stopword lists (the English list is used below)
#   wordnet / omw-1.4 : data used by WordNetLemmatizer
for resource in ("punkt", "punkt_tab", "stopwords", "wordnet", "omw-1.4"):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /Users/chaklader/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chaklader/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/chaklader/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/chaklader/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


### Load data

Next, we need to load the data that we want to preprocess. In this example, we will use the following sentence:

In [3]:
# Sample sentence that the following cells normalize, tokenize, and filter.
text = "The quick brown fox jumped over the lazy dog."

### Text Normalization

Text normalization is the process of converting text into a standard format. This involves converting all characters to lowercase and removing any punctuation.

In [4]:
import string

# Lowercase first so every later step sees a single casing.
text = text.lower()

# Strip every ASCII punctuation character (string.punctuation) rather than a
# hand-picked few, so the cell really removes "any punctuation" as the
# description says. str.translate deletes all listed characters in one pass.
text = text.translate(str.maketrans("", "", string.punctuation))

### Tokenization

Tokenization is the process of splitting a sentence into individual words or tokens.

In [5]:
# Split the normalized sentence into word tokens with NLTK's word_tokenize
# and print the resulting list for inspection.
tokens = word_tokenize(text)
print(tokens)

['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog']


### Stopword Removal

Stopwords are common words that do not carry much meaning and can be removed from the text. We will use NLTK's list of stopwords and remove them from the tokenized text.

In [6]:
# Build the English stopword set once (set membership checks are fast),
# then keep only the tokens that are not in it.
stop_words = set(stopwords.words('english'))
filtered_tokens = list(filter(lambda token: token not in stop_words, tokens))
print(filtered_tokens)

['quick', 'brown', 'fox', 'jumped', 'lazy', 'dog']


### Stemming

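Stemming is the process of reducing a word to its base or root form by stripping suffixes with fixed rules. The resulting stem is not always a dictionary word (for example, `lazy` becomes `lazi`). We will use the Porter stemmer from NLTK for stemming.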

### Lemmatization

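Lemmatization is the process of converting a word to its base or dictionary form (its lemma). We will use the WordNet lemmatizer from NLTK for lemmatization. When no part-of-speech tag is given, the lemmatizer treats every word as a noun, which is why a verb form such as `jumped` is left unchanged in the output below.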

In [8]:
# Stem each remaining token with the Porter stemmer
stemmer = PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, filtered_tokens))
print(stemmed_tokens)

# Lemmatize the same tokens with the WordNet lemmatizer (no POS tag supplied)
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = list(map(lemmatizer.lemmatize, filtered_tokens))
print(lemmatized_tokens)

##################################

def demonstrate_stemming_vs_lemmatization(words_list: list) -> None:
    """
    Print a side-by-side table of Porter stems and WordNet lemmas.

    Every word is passed through ``PorterStemmer.stem`` and through
    ``WordNetLemmatizer.lemmatize`` (called without a POS tag). One table row
    is printed per word with the original word, its stem, its lemma, and a
    marker saying whether the stem and the lemma are the same string.

    Stemming uses rule-based suffix removal to reduce words to approximate
    roots, while lemmatization uses dictionary lookups to find actual
    dictionary word forms (lemmas).

    Args:
        words_list (list): Words to process. Including various word forms
            (plurals, verb forms, adjectives) makes the differences between
            the two approaches visible.

    Returns:
        None: The comparison table is printed to standard output.

    Example:
        >>> test_words = ["running", "better", "mice", "feet"]
        >>> demonstrate_stemming_vs_lemmatization(test_words)
    """
    # PorterStemmer: rule-based algorithm that removes common suffixes.
    # WordNetLemmatizer: dictionary-based tool that finds proper word forms.
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    # Table header; the column widths match the row format used in the loop.
    print(f"{'Original Word':<15} {'Stemmed':<15} {'Lemmatized':<15} {'Difference'}")
    print("-" * 65)

    for word in words_list:
        # Run the same word through both tools so the row compares like with like.
        stemmed = stemmer.stem(word)
        lemmatized = lemmatizer.lemmatize(word)

        # Flag rows where the two techniques disagree.
        difference = "✓ Same" if stemmed == lemmatized else "✗ Different"

        print(f"{word:<15} {stemmed:<15} {lemmatized:<15} {difference}")

def advanced_lemmatization_demo(words_with_pos: list) -> None:
    """
    Print WordNet lemmas with and without an explicit part-of-speech (POS) tag.

    For every ``(word, pos)`` pair the word is lemmatized twice: once with the
    lemmatizer's default POS and once with the supplied ``pos``. One table row
    is printed per pair showing both lemmas and whether they are the same
    string, which makes visible how the POS tag changes the result.

    Args:
        words_with_pos (list): List of ``(word, pos_tag)`` tuples.
            POS tags use WordNet format: 'n' (noun), 'v' (verb),
            'a' (adjective), 'r' (adverb).

    Returns:
        None: The comparison table is printed to standard output.

    Example:
        >>> words = [("better", "a"), ("better", "r"), ("running", "v"), ("running", "n")]
        >>> advanced_lemmatization_demo(words)
    """
    lemmatizer = WordNetLemmatizer()

    # Table header; the column widths match the row format used in the loop.
    print(f"{'Word':<12} {'POS':<5} {'Default Lemma':<15} {'POS-aware Lemma':<18} {'Difference'}")
    print("-" * 70)

    for word, pos in words_with_pos:
        # Default lemmatization assumes the noun form; the POS-aware call
        # uses the grammatical role supplied by the caller.
        default_lemma = lemmatizer.lemmatize(word)
        pos_aware_lemma = lemmatizer.lemmatize(word, pos=pos)

        # Flag rows where supplying the POS tag changed the lemma.
        difference = "✓ Same" if default_lemma == pos_aware_lemma else "✗ Different"

        print(f"{word:<12} {pos:<5} {default_lemma:<15} {pos_aware_lemma:<18} {difference}")

def process_sentence_comparison(sentence: str) -> None:
    """
    Tokenize a sentence and print its stemmed and lemmatized versions.

    The sentence is split with ``word_tokenize``; every token is then stemmed
    with ``PorterStemmer`` and, separately, lemmatized with
    ``WordNetLemmatizer`` (no POS tag supplied). The original sentence, the
    token list, both processed token lists, and both lists re-joined with
    single spaces are printed. No stopword or punctuation removal is applied.

    Args:
        sentence (str): Input sentence to process through both methods.
            Including various word forms makes the differences visible.

    Returns:
        None: The results are printed to standard output.

    Example:
        >>> process_sentence_comparison("The children were running quickly")
    """
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    # Both stemming and lemmatization operate on individual tokens.
    tokens = word_tokenize(sentence)

    # Stemming: rule-based suffix removal for approximate root forms.
    # Fast, but may produce non-dictionary words.
    stemmed_tokens = [stemmer.stem(token) for token in tokens]

    # Lemmatization: dictionary-based reduction to proper word forms (lemmas).
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]

    print(f"Original:    {sentence}")
    print(f"Tokens:      {tokens}")
    print(f"Stemmed:     {stemmed_tokens}")
    print(f"Lemmatized:  {lemmatized_tokens}")
    print(f"Stemmed text:     {' '.join(stemmed_tokens)}")
    print(f"Lemmatized text:  {' '.join(lemmatized_tokens)}")


# Demonstration Examples

def _print_banner(title: str, leading_blank_line: bool = True) -> None:
    """Print ``title`` framed by two 60-character ``=`` rules.

    Args:
        title (str): Section title printed between the two rules.
        leading_blank_line (bool): When True, an empty line is printed before
            the first rule to separate this section from the previous output.
    """
    rule = "=" * 60
    print("\n" + rule if leading_blank_line else rule)
    print(title)
    print(rule)


_print_banner("STEMMING vs LEMMATIZATION COMPARISON", leading_blank_line=False)

# Test words chosen to highlight key differences between stemming and lemmatization:
# - Irregular plurals (mice, feet, children)
# - Verb inflections (running/runs/ran, studies/studying/studied)
# - Comparative/superlative adjectives (better, best, happier)
# - Words where stemming may over-reduce (beautiful, connection, happiness)
test_words = [
    "running", "runs", "ran",
    "better", "best", "good",
    "mice", "mouse", "feet", "foot",
    "children", "child",
    "beautiful", "beautifully",
    "connection", "connected", "connecting",
    "happiness", "happy", "happier",
    "studies", "studying", "studied"
]

demonstrate_stemming_vs_lemmatization(test_words)

_print_banner("ADVANCED LEMMATIZATION WITH POS TAGS")

# Demonstrate how POS context improves lemmatization accuracy.
# The same word can have different lemmas depending on its grammatical role.
words_with_pos = [
    ("better", "a"),    # adjective: better -> good
    ("better", "r"),    # adverb: better -> well
    ("running", "v"),   # verb: running -> run
    ("running", "n"),   # noun: running -> running
    ("leaves", "n"),    # noun: leaves -> leaf
    ("leaves", "v"),    # verb: leaves -> leave
]

advanced_lemmatization_demo(words_with_pos)

_print_banner("SENTENCE PROCESSING COMPARISON")

# Real-world example showing how both methods affect sentence processing.
sample_sentence = "The children were running faster and played better games"
process_sentence_comparison(sample_sentence)

_print_banner("KEY DIFFERENCES SUMMARY")
# Note: the Porter stemmer's suffix rules were written for English, so the
# summary does not describe it as language-independent; its actual advantage
# is that it needs no dictionary.
print("""
STEMMING (Porter Stemmer):
✓ Fast and simple rule-based approach
✓ No dictionary or corpus lookup required
✓ Good for information retrieval and search
✗ May produce non-dictionary words (e.g., 'beauti' from 'beautiful')
✗ Can over-stem or under-stem words
✗ No understanding of word meaning or context

LEMMATIZATION (WordNet):
✓ Produces valid dictionary words (lemmas)
✓ Uses linguistic knowledge and morphology
✓ More accurate for semantic analysis
✓ Handles irregular word forms correctly
✗ Slower due to dictionary lookups
✗ Requires language-specific resources
✗ May need POS tags for optimal accuracy

WHEN TO USE WHICH:
- Stemming: Information retrieval, search engines, quick preprocessing
- Lemmatization: Semantic analysis, sentiment analysis, text classification
""")

['quick', 'brown', 'fox', 'jump', 'lazi', 'dog']
['quick', 'brown', 'fox', 'jumped', 'lazy', 'dog']
STEMMING vs LEMMATIZATION COMPARISON
Original Word   Stemmed         Lemmatized      Difference
-----------------------------------------------------------------
running         run             running         ✗ Different
runs            run             run             ✓ Same
ran             ran             ran             ✓ Same
better          better          better          ✓ Same
best            best            best            ✓ Same
good            good            good            ✓ Same
mice            mice            mouse           ✗ Different
mouse           mous            mouse           ✗ Different
feet            feet            foot            ✗ Different
foot            foot            foot            ✓ Same
children        children        child           ✗ Different
child           child           child           ✓ Same
beautiful       beauti          beautiful       ✗ Dif

### Output the results

Finally, we print the result of each step of the text preprocessing pipeline.

In [11]:
# `text` was lowercased and stripped of punctuation by the normalization cell,
# so by this point it holds the normalized sentence, not the original one.
print("Normalized text: ", text)
print("Tokenized text: ", tokens)
print("Filtered tokens: ", filtered_tokens)
print("Stemmed tokens: ", stemmed_tokens)
print("Lemmatized tokens: ", lemmatized_tokens)

Original text:  the quick brown fox jumped over the lazy dog
Tokenized text:  ['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog']
Filtered tokens:  ['quick', 'brown', 'fox', 'jumped', 'lazy', 'dog']
Stemmed tokens:  ['quick', 'brown', 'fox', 'jump', 'lazi', 'dog']
Lemmatized tokens:  ['quick', 'brown', 'fox', 'jumped', 'lazy', 'dog']
