In [None]:
# CADL1: Preprocessing with NLTK & SpaCy

# Install dependencies
!pip install nltk spacy

# --- Import libraries ---
import nltk
# Download everything needed
# Fetch, in order, each NLTK resource this notebook relies on: the Punkt
# tokenizer data (both packagings), the stopword lists and WordNet.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import spacy

# --- Sample text corpus ---
# Two-line sample; the newline between the lines is part of the text and is
# seen by every downstream tokenizer.
text = (
    "Natural Language Processing (NLP) helps computers\n"
    "understand human language like news, tweets, and articles."
)

# print() joins its arguments with a space, so the text starts one column in.
print("Original Text:\n", text)

# --- 1. Tokenization ---
# Split the raw text into tokens with NLTK's word tokenizer.
tokens = word_tokenize(text)
print("\nTokens:", tokens)

# --- 2. Stopword Removal ---
# Build the English stopword set once so every membership test is a hash
# lookup, then keep the tokens whose lowercase form is not a stopword.
# Original casing of the surviving tokens is preserved.
stop_words = set(stopwords.words('english'))
filtered_tokens = [
    candidate
    for candidate in tokens
    if candidate.lower() not in stop_words
]
print("\nAfter Stopword Removal:", filtered_tokens)

# --- 3. Stemming (Porter Stemmer) ---
# Apply the Porter stemmer to every token that survived stopword removal.
ps = PorterStemmer()
stems = list(map(ps.stem, filtered_tokens))
print("\nAfter Stemming:", stems)

# --- 4. Lemmatization (WordNet) ---
# Lemmatize each filtered token with the lemmatizer's default arguments
# (no part-of-speech hint is supplied).
lemmatizer = WordNetLemmatizer()
lemmas = list(map(lemmatizer.lemmatize, filtered_tokens))
print("\nAfter Lemmatization:", lemmas)

# --- 5. Lemmatization with SpaCy (more accurate) ---
# Run the small English pipeline over the raw text and collect each token's
# lemma, reusing the NLTK stopword set for filtering.
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
spacy_lemmas = [
    token.lemma_
    for token in doc
    # The newline inside `text` is tokenized as a whitespace-only token;
    # skip those so '\n' does not show up as a "lemma".
    if not token.is_space and token.text.lower() not in stop_words
]
print("\nSpaCy Lemmas:", spacy_lemmas)




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Original Text:
 Natural Language Processing (NLP) helps computers 
understand human language like news, tweets, and articles.

Tokens: ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'helps', 'computers', 'understand', 'human', 'language', 'like', 'news', ',', 'tweets', ',', 'and', 'articles', '.']

After Stopword Removal: ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'helps', 'computers', 'understand', 'human', 'language', 'like', 'news', ',', 'tweets', ',', 'articles', '.']

After Stemming: ['natur', 'languag', 'process', '(', 'nlp', ')', 'help', 'comput', 'understand', 'human', 'languag', 'like', 'news', ',', 'tweet', ',', 'articl', '.']

After Lemmatization: ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'help', 'computer', 'understand', 'human', 'language', 'like', 'news', ',', 'tweet', ',', 'article', '.']

SpaCy Lemmas: ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'help', 'computer', '\n', 'understand', 'human', 'language', 'like', 'news', ',', 