# Assignment 1: Tokenization and Stemming

This notebook demonstrates:
- **Tokenization**: Whitespace, Punctuation-based, Treebank, Tweet, MWE
- **Stemming**: Porter Stemmer, Snowball Stemmer
- **Lemmatization**: WordNet Lemmatizer

In [1]:
# Import required libraries
import nltk

# Download required NLTK data
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('punkt_tab')

from nltk.tokenize import (
    WhitespaceTokenizer,
    WordPunctTokenizer,
    TreebankWordTokenizer,
    TweetTokenizer,
    MWETokenizer
)
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ayush\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ayush\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ayush\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ayush\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## Sample Text

In [2]:
sample_text = "The quick brown foxes are jumping over the lazy dogs! @user #NLP isn't that amazing? :) I'm loving it."
print(f"Original Text: {sample_text}")

Original Text: The quick brown foxes are jumping over the lazy dogs! @user #NLP isn't that amazing? :) I'm loving it.


## 1. Tokenization Methods

In [3]:
# 1. Whitespace Tokenizer
print("1. WHITESPACE TOKENIZER:")
whitespace_tokenizer = WhitespaceTokenizer()
whitespace_tokens = whitespace_tokenizer.tokenize(sample_text)
print(f"   Tokens: {whitespace_tokens}")

# 2. Punctuation-based Tokenizer (WordPunctTokenizer)
print("\n2. PUNCTUATION-BASED TOKENIZER:")
punct_tokenizer = WordPunctTokenizer()
punct_tokens = punct_tokenizer.tokenize(sample_text)
print(f"   Tokens: {punct_tokens}")

# 3. Treebank Tokenizer
print("\n3. TREEBANK TOKENIZER:")
treebank_tokenizer = TreebankWordTokenizer()
treebank_tokens = treebank_tokenizer.tokenize(sample_text)
print(f"   Tokens: {treebank_tokens}")

# 4. Tweet Tokenizer
print("\n4. TWEET TOKENIZER:")
tweet_tokenizer = TweetTokenizer(preserve_case=False, strip_handles=False, reduce_len=True)
tweet_tokens = tweet_tokenizer.tokenize(sample_text)
print(f"   Tokens: {tweet_tokens}")

# 5. Multi-Word Expression (MWE) Tokenizer
print("\n5. MULTI-WORD EXPRESSION (MWE) TOKENIZER:")
mwe_tokenizer = MWETokenizer([('quick', 'brown'), ('lazy', 'dogs')])
base_tokens = nltk.word_tokenize(sample_text)
mwe_tokens = mwe_tokenizer.tokenize(base_tokens)
print(f"   Base Tokens: {base_tokens}")
print(f"   MWE Tokens: {mwe_tokens}")

1. WHITESPACE TOKENIZER:
   Tokens: ['The', 'quick', 'brown', 'foxes', 'are', 'jumping', 'over', 'the', 'lazy', 'dogs!', '@user', '#NLP', "isn't", 'that', 'amazing?', ':)', "I'm", 'loving', 'it.']

2. PUNCTUATION-BASED TOKENIZER:
   Tokens: ['The', 'quick', 'brown', 'foxes', 'are', 'jumping', 'over', 'the', 'lazy', 'dogs', '!', '@', 'user', '#', 'NLP', 'isn', "'", 't', 'that', 'amazing', '?', ':)', 'I', "'", 'm', 'loving', 'it', '.']

3. TREEBANK TOKENIZER:
   Tokens: ['The', 'quick', 'brown', 'foxes', 'are', 'jumping', 'over', 'the', 'lazy', 'dogs', '!', '@', 'user', '#', 'NLP', 'is', "n't", 'that', 'amazing', '?', ':', ')', 'I', "'m", 'loving', 'it', '.']

4. TWEET TOKENIZER:
   Tokens: ['the', 'quick', 'brown', 'foxes', 'are', 'jumping', 'over', 'the', 'lazy', 'dogs', '!', '@user', '#nlp', "isn't", 'that', 'amazing', '?', ':)', "i'm", 'loving', 'it', '.']

5. MULTI-WORD EXPRESSION (MWE) TOKENIZER:
   Base Tokens: ['The', 'quick', 'brown', 'foxes', 'are', 'jumping', 'over', 'the', 'l

## 2. Stemming Methods

In [4]:
# Words to stem
words_to_stem = ["running", "runs", "ran", "easily", "fairly", "jumping", "foxes", "loving", "amazing"]

# 1. Porter Stemmer
print("1. PORTER STEMMER:")
porter_stemmer = PorterStemmer()
porter_stems = [(word, porter_stemmer.stem(word)) for word in words_to_stem]
print("   Word -> Stem:")
for word, stem in porter_stems:
    print(f"   {word:15} -> {stem}")

# 2. Snowball Stemmer (English)
print("\n2. SNOWBALL STEMMER:")
snowball_stemmer = SnowballStemmer("english")
snowball_stems = [(word, snowball_stemmer.stem(word)) for word in words_to_stem]
print("   Word -> Stem:")
for word, stem in snowball_stems:
    print(f"   {word:15} -> {stem}")

1. PORTER STEMMER:
   Word -> Stem:
   running         -> run
   runs            -> run
   ran             -> ran
   easily          -> easili
   fairly          -> fairli
   jumping         -> jump
   foxes           -> fox
   loving          -> love
   amazing         -> amaz

2. SNOWBALL STEMMER:
   Word -> Stem:
   running         -> run
   runs            -> run
   ran             -> ran
   easily          -> easili
   fairly          -> fair
   jumping         -> jump
   foxes           -> fox
   loving          -> love
   amazing         -> amaz


## 3. Lemmatization

In [5]:
# WordNet Lemmatizer
lemmatizer = WordNetLemmatizer()
words_to_lemmatize = ["running", "runs", "ran", "better", "foxes", "geese", "loving", "amazing", "dogs"]

print("WORDNET LEMMATIZER:")
print("   Word -> Lemma (as noun, verb, adjective):")
for word in words_to_lemmatize:
    lemma_n = lemmatizer.lemmatize(word, pos='n')  # noun
    lemma_v = lemmatizer.lemmatize(word, pos='v')  # verb
    lemma_a = lemmatizer.lemmatize(word, pos='a')  # adjective
    print(f"   {word:15} -> n: {lemma_n:12} v: {lemma_v:12} a: {lemma_a}")

WORDNET LEMMATIZER:
   Word -> Lemma (as noun, verb, adjective):
   running         -> n: running      v: run          a: running
   runs            -> n: run          v: run          a: runs
   ran             -> n: ran          v: run          a: ran
   better          -> n: better       v: better       a: good
   foxes           -> n: fox          v: fox          a: foxes
   geese           -> n: goose        v: geese        a: geese
   loving          -> n: loving       v: love         a: loving
   amazing         -> n: amazing      v: amaze        a: amazing
   dogs            -> n: dog          v: dog          a: dogs


## 4. Comparison: Stemming vs Lemmatization

In [6]:
comparison_words = ["running", "better", "foxes", "studies", "feet"]
print(f"{'Word':<15} {'Porter':<15} {'Snowball':<15} {'Lemma(v)':<15}")
print("-" * 60)
for word in comparison_words:
    porter = porter_stemmer.stem(word)
    snowball = snowball_stemmer.stem(word)
    lemma = lemmatizer.lemmatize(word, pos='v')
    print(f"{word:<15} {porter:<15} {snowball:<15} {lemma:<15}")

Word            Porter          Snowball        Lemma(v)       
------------------------------------------------------------
running         run             run             run            
better          better          better          better         
foxes           fox             fox             fox            
studies         studi           studi           study          
feet            feet            feet            feet           
