In [1]:
# Setup: Install and Import
!pip install nltk

import nltk
nltk.download('punkt_tab')



Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting joblib (from nltk)
  Using cached joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2025.7.34-cp313-cp313-macosx_11_0_arm64.whl.metadata (40 kB)
Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Downloading regex-2025.7.34-cp313-cp313-macosx_11_0_arm64.whl (285 kB)
Using cached joblib-1.5.1-py3-none-any.whl (307 kB)
Installing collected packages: regex, joblib, nltk
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [nltk][32m2/3[0m [nltk]b]
[1A[2KSuccessfully installed joblib-1.5.1 nltk-3.9.1 regex-2025.7.34


[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/deeplatiyan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [2]:
# Sample Text
# Two short sentences used as the running example for every step below.
text = (
    "NLTK is great for NLP! "
    "It provides easy-to-use interfaces for over 50 corpora and lexical resources."
)

In [3]:
# 1. Tokenization
from nltk.tokenize import sent_tokenize, word_tokenize

# Break the sample text into sentence-level and word-level tokens.
sentences = sent_tokenize(text)
words = word_tokenize(text)

print("Word Tokens:", words)
print("Sentence Tokens:", sentences)

Word Tokens: ['NLTK', 'is', 'great', 'for', 'NLP', '!', 'It', 'provides', 'easy-to-use', 'interfaces', 'for', 'over', '50', 'corpora', 'and', 'lexical', 'resources', '.']
Sentence Tokens: ['NLTK is great for NLP!', 'It provides easy-to-use interfaces for over 50 corpora and lexical resources.']


In [4]:
# Inspect the container type that word_tokenize returned.
type(words)

list

In [5]:
# Inspect the container type that sent_tokenize returned.
type(sentences)

list

In [14]:

# 2. Cleaning Text
import re

# Remove punctuation: delete every character that is neither a word character
# (letter, digit, underscore) nor whitespace.
# Gotcha: hyphens are deleted outright, so "easy-to-use" becomes "easytouse".
cleaned_text = re.sub(r'[^\w\s]', '', text)

# Remove numbers: delete each run of one or more digits.
cleaned_text = re.sub(r'\d+', '', cleaned_text)

# Deleting a standalone number such as "50" leaves two adjacent spaces behind,
# so collapse every whitespace run to a single space and trim both ends.
cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()

print("Cleaned Text:", cleaned_text)

Cleaned Text: NLTK is great for NLP It provides easytouse interfaces for over  corpora and lexical resources


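| Pattern / argument | Matches                                                    | Effect in `re.sub()`           |
| ------------------ | ---------------------------------------------------------- | ------------------------------ |
| `[^\w\s]`          | Any single character that is not a word character or whitespace | Removes punctuation and symbols |
| `\d+`              | One or more digits                                         | Removes full number blocks     |
| `''`               | Empty string (this is the replacement, not a pattern)      | Means "delete the match"       |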


In [7]:
# Show the type of the value produced by the re.sub cleaning step.
print("Data type of cleaned_text:", type(cleaned_text))

Data type of cleaned_text: <class 'str'>


In [8]:

# 3. Case Normalization
# Lowercase the cleaned string; the stop-word filter in step 4 tokenizes this
# lowercased text rather than the original.
lowercase_text = cleaned_text.lower()
print("Lowercased Text:", lowercase_text)


Lowercased Text: nltk is great for nlp it provides easytouse interfaces for over  corpora and lexical resources


In [9]:
# Inspect the type of the lowercased text.
type(lowercase_text)

str

In [10]:

# 4. Stop Words Removal
from nltk.corpus import stopwords

# Make sure the stop-word corpus is available locally before reading it.
nltk.download('stopwords')

# Held in a set so each membership check during filtering is a hash lookup.
stop_words = set(stopwords.words('english'))
filtered_words = [
    token
    for token in word_tokenize(lowercase_text)
    if token not in stop_words
]
print("Filtered Words:", filtered_words)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/deeplatiyan/nltk_data...


Filtered Words: ['nltk', 'great', 'nlp', 'provides', 'easytouse', 'interfaces', 'corpora', 'lexical', 'resources']


[nltk_data]   Unzipping corpora/stopwords.zip.


In [11]:
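# Equivalent explicit-loop form of the list comprehension used in step 4.
# Kept commented out for reference only; running it in addition to step 4
# without the re-initialisation below would append every word a second time.
# filtered_words = []
# for word in word_tokenize(lowercase_text):
#     if word not in stop_words:
#         filtered_words.append(word)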


In [12]:
# Inspect the container type built by the stop-word filter.
type(filtered_words)

list

In [13]:

# 5. Stemming
from nltk.stem import PorterStemmer

# Porter stemming strips suffixes by rule, so the results need not be real
# dictionary words.
stemmer = PorterStemmer()
stemmed_words = list(map(stemmer.stem, filtered_words))
print("Stemmed Words:", stemmed_words)

Stemmed Words: ['nltk', 'great', 'nlp', 'provid', 'easytous', 'interfac', 'corpora', 'lexic', 'resourc']


In [62]:


# 6. Lemmatization
from nltk.stem import WordNetLemmatizer

# The lemmatizer looks words up in WordNet, so fetch that corpus first.
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
# lemmatize() is called without a part-of-speech argument, so every token is
# treated as a noun: plural nouns are reduced, but verb forms such as
# "provides" pass through unchanged.
lemmatized_words = [lemmatizer.lemmatize(token) for token in filtered_words]
print("Lemmatized Words:", lemmatized_words)


Lemmatized Words: ['nltk', 'great', 'nlp', 'provides', 'easytouse', 'interface', 'corpus', 'lexical', 'resource']


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [63]:
# 7. Part-of-Speech (POS) Tagging
from nltk import pos_tag

# Fetch the English averaged-perceptron tagger data before tagging.
nltk.download('averaged_perceptron_tagger_eng')

# Tag the original tokens from step 1 (cased, punctuation and stop words
# included), not the cleaned/filtered list.
pos_tags = pos_tag(words)
print("POS Tags:", pos_tags)

POS Tags: [('NLTK', 'NNP'), ('is', 'VBZ'), ('great', 'JJ'), ('for', 'IN'), ('NLP', 'NNP'), ('!', '.'), ('It', 'PRP'), ('provides', 'VBZ'), ('easy-to-use', 'JJ'), ('interfaces', 'NNS'), ('for', 'IN'), ('over', 'IN'), ('50', 'CD'), ('corpora', 'NNS'), ('and', 'CC'), ('lexical', 'JJ'), ('resources', 'NNS'), ('.', '.')]


[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [64]:
# 8. Named Entity Recognition (NER)
from nltk import ne_chunk

# ne_chunk needs two resources: the chunker model itself and the `words`
# corpus, which the chunker's feature extraction consults as an English
# word list. Without `words`, a fresh environment raises LookupError here.
nltk.download('maxent_ne_chunker_tab')
nltk.download('words')

# Group the POS-tagged tokens from step 7 into a tree whose labelled subtrees
# (e.g. ORGANIZATION) mark the detected named entities.
ner_tree = ne_chunk(pos_tags)
print("Named Entity Tree:")
print(ner_tree)

[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker_tab is already up-to-date!


Named Entity Tree:
(S
  (ORGANIZATION NLTK/NNP)
  is/VBZ
  great/JJ
  for/IN
  (ORGANIZATION NLP/NNP)
  !/.
  It/PRP
  provides/VBZ
  easy-to-use/JJ
  interfaces/NNS
  for/IN
  over/IN
  50/CD
  corpora/NNS
  and/CC
  lexical/JJ
  resources/NNS
  ./.)
