# Natural Language Processing with NLTK

In [1]:
# importing necessary libraries and packages
import nltk
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]    |

True

## 1. Tokenization

In [2]:
from nltk.tokenize import sent_tokenize
text1 = "Climate change is one of the greatest challenges of our time. The latest report, released on January 10, 2025, states that global temperatures have risen by 1.5°C. Rising sea levels are expected to displace over 200 million people by 2050. Experts predict that the cost of addressing these issues could exceed $10 trillion if immediate action isn’t taken."
text2 = "OMG!! 😲 Just watched the new netflix documentary... it's mind-blowing!!! #Bingewatching #FutureTech"
text3 = "E = mc^2 is one of the most famous equations in physics. This principle forms the basis of modern theoretical physics."
sentences1 = sent_tokenize(text1)
sentences2 = sent_tokenize(text2)
sentences3 = sent_tokenize(text3)
print("Sentences for Text 1:", sentences1)
print("Sentences for Text 2:", sentences2)
print("Sentences for Text 3:", sentences3)

Sentences for Text 1: ['Climate change is one of the greatest challenges of our time.', 'The latest report, released on January 10, 2025, states that global temperatures have risen by 1.5°C.', 'Rising sea levels are expected to displace over 200 million people by 2050.', 'Experts predict that the cost of addressing these issues could exceed $10 trillion if immediate action isn’t taken.']
Sentences for Text 2: ['OMG!!', "😲 Just watched the new netflix documentary... it's mind-blowing!!!", '#Bingewatching #FutureTech']
Sentences for Text 3: ['E = mc^2 is one of the most famous equations in physics.', 'This principle forms the basis of modern theoretical physics.']


In [3]:
# Word Tokenization: splitting sentences into words

from nltk.tokenize import word_tokenize
words1 = word_tokenize(text1)
words2 = word_tokenize(text2)
words3 = word_tokenize(text3)
print("Tokenized Words for Text 1:", words1)
print("Tokenized Words for Text 2:", words2)
print("Tokenized Words for Text 3:", words3)

Tokenized Words for Text 1: ['Climate', 'change', 'is', 'one', 'of', 'the', 'greatest', 'challenges', 'of', 'our', 'time', '.', 'The', 'latest', 'report', ',', 'released', 'on', 'January', '10', ',', '2025', ',', 'states', 'that', 'global', 'temperatures', 'have', 'risen', 'by', '1.5°C', '.', 'Rising', 'sea', 'levels', 'are', 'expected', 'to', 'displace', 'over', '200', 'million', 'people', 'by', '2050', '.', 'Experts', 'predict', 'that', 'the', 'cost', 'of', 'addressing', 'these', 'issues', 'could', 'exceed', '$', '10', 'trillion', 'if', 'immediate', 'action', 'isn', '’', 't', 'taken', '.']
Tokenized Words for Text 2: ['OMG', '!', '!', '😲', 'Just', 'watched', 'the', 'new', 'netflix', 'documentary', '...', 'it', "'s", 'mind-blowing', '!', '!', '!', '#', 'Bingewatching', '#', 'FutureTech']
Tokenized Words for Text 3: ['E', '=', 'mc^2', 'is', 'one', 'of', 'the', 'most', 'famous', 'equations', 'in', 'physics', '.', 'This', 'principle', 'forms', 'the', 'basis', 'of', 'modern', 'theoretical

## 2. Stop Words Removal

In [4]:
# stop words are common words (like the, is, and) that provided little semantic value

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))  # Retrieves a list of common English stop words 

# processing each word and removing any word that is a stop word
filtered_words1 = [word1 for word1 in words1 if word1.lower() not in stop_words] 
print("Filtered words for text 1:", filtered_words1)

filtered_words2 = [word2 for word2 in words2 if word2.lower() not in stop_words] 
print("Filtered words for text 2:", filtered_words2)

filtered_words3 = [word3 for word3 in words3 if word3.lower() not in stop_words] 
print("Filtered words for text 3:", filtered_words3)

Filtered words for text 1: ['Climate', 'change', 'one', 'greatest', 'challenges', 'time', '.', 'latest', 'report', ',', 'released', 'January', '10', ',', '2025', ',', 'states', 'global', 'temperatures', 'risen', '1.5°C', '.', 'Rising', 'sea', 'levels', 'expected', 'displace', '200', 'million', 'people', '2050', '.', 'Experts', 'predict', 'cost', 'addressing', 'issues', 'could', 'exceed', '$', '10', 'trillion', 'immediate', 'action', '’', 'taken', '.']
Filtered words for text 2: ['OMG', '!', '!', '😲', 'watched', 'new', 'netflix', 'documentary', '...', "'s", 'mind-blowing', '!', '!', '!', '#', 'Bingewatching', '#', 'FutureTech']
Filtered words for text 3: ['E', '=', 'mc^2', 'one', 'famous', 'equations', 'physics', '.', 'principle', 'forms', 'basis', 'modern', 'theoretical', 'physics', '.']


## 3. Text Normalization

In [5]:
# Normalization is reducing words into their base or root form. There are two ways of normalization.

### 3.1 Stemming

In [6]:
# removing prefixes of suffixes to approximate the root form of the word.

from nltk.stem import PorterStemmer
ps = PorterStemmer()

stemmed_words1 = [ps.stem(word) for word in filtered_words1]
print("Stemmed Words from text 1:", stemmed_words1)

stemmed_words2 = [ps.stem(word) for word in filtered_words2]
print("Stemmed Words from text 2:", stemmed_words2)

stemmed_words3 = [ps.stem(word) for word in filtered_words3]
print("Stemmed Words from text 3:", stemmed_words3)

Stemmed Words from text 1: ['climat', 'chang', 'one', 'greatest', 'challeng', 'time', '.', 'latest', 'report', ',', 'releas', 'januari', '10', ',', '2025', ',', 'state', 'global', 'temperatur', 'risen', '1.5°c', '.', 'rise', 'sea', 'level', 'expect', 'displac', '200', 'million', 'peopl', '2050', '.', 'expert', 'predict', 'cost', 'address', 'issu', 'could', 'exceed', '$', '10', 'trillion', 'immedi', 'action', '’', 'taken', '.']
Stemmed Words from text 2: ['omg', '!', '!', '😲', 'watch', 'new', 'netflix', 'documentari', '...', "'s", 'mind-blow', '!', '!', '!', '#', 'bingewatch', '#', 'futuretech']
Stemmed Words from text 3: ['e', '=', 'mc^2', 'one', 'famou', 'equat', 'physic', '.', 'principl', 'form', 'basi', 'modern', 'theoret', 'physic', '.']


### 3.2 Lemmatization

In [7]:
# using a vocabulary-based approach to return the base or dictionary form of a word

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

lemmatized_words1 = [lemmatizer.lemmatize(word) for word in filtered_words1]
print("Lemmatized Words from text 1:", lemmatized_words1)

lemmatized_words2 = [lemmatizer.lemmatize(word) for word in filtered_words2]
print("Lemmatized Words from text 2:", lemmatized_words2)

lemmatized_words3 = [lemmatizer.lemmatize(word) for word in filtered_words3]
print("Lemmatized Words from text 3:", lemmatized_words3)

Lemmatized Words from text 1: ['Climate', 'change', 'one', 'greatest', 'challenge', 'time', '.', 'latest', 'report', ',', 'released', 'January', '10', ',', '2025', ',', 'state', 'global', 'temperature', 'risen', '1.5°C', '.', 'Rising', 'sea', 'level', 'expected', 'displace', '200', 'million', 'people', '2050', '.', 'Experts', 'predict', 'cost', 'addressing', 'issue', 'could', 'exceed', '$', '10', 'trillion', 'immediate', 'action', '’', 'taken', '.']
Lemmatized Words from text 2: ['OMG', '!', '!', '😲', 'watched', 'new', 'netflix', 'documentary', '...', "'s", 'mind-blowing', '!', '!', '!', '#', 'Bingewatching', '#', 'FutureTech']
Lemmatized Words from text 3: ['E', '=', 'mc^2', 'one', 'famous', 'equation', 'physic', '.', 'principle', 'form', 'basis', 'modern', 'theoretical', 'physic', '.']


## 4. Parts of Speech (POS) Tagging

In [8]:
# assigns grammatical labels to words helping analyze sentence structure

from nltk import pos_tag

pos_tags1 = pos_tag(words1)
print("POS Tags from text 1:", pos_tags1)

pos_tags2 = pos_tag(words2)
print("POS Tags from text 2:", pos_tags2)

pos_tags3 = pos_tag(words3)
print("POS Tags from text 3:", pos_tags3)
# DT (Determiner), JJ (Adjective), NN (Noun, singular), VBZ (Verb, 3rd person singular present), IN (Preposition), DT (Determiner), JJ (Adjective)

POS Tags from text 1: [('Climate', 'NNP'), ('change', 'NN'), ('is', 'VBZ'), ('one', 'CD'), ('of', 'IN'), ('the', 'DT'), ('greatest', 'JJS'), ('challenges', 'NNS'), ('of', 'IN'), ('our', 'PRP$'), ('time', 'NN'), ('.', '.'), ('The', 'DT'), ('latest', 'JJS'), ('report', 'NN'), (',', ','), ('released', 'VBN'), ('on', 'IN'), ('January', 'NNP'), ('10', 'CD'), (',', ','), ('2025', 'CD'), (',', ','), ('states', 'VBZ'), ('that', 'IN'), ('global', 'JJ'), ('temperatures', 'NNS'), ('have', 'VBP'), ('risen', 'VBN'), ('by', 'IN'), ('1.5°C', 'CD'), ('.', '.'), ('Rising', 'VBG'), ('sea', 'NN'), ('levels', 'NNS'), ('are', 'VBP'), ('expected', 'VBN'), ('to', 'TO'), ('displace', 'VB'), ('over', 'IN'), ('200', 'CD'), ('million', 'CD'), ('people', 'NNS'), ('by', 'IN'), ('2050', 'CD'), ('.', '.'), ('Experts', 'NNS'), ('predict', 'VBP'), ('that', 'IN'), ('the', 'DT'), ('cost', 'NN'), ('of', 'IN'), ('addressing', 'VBG'), ('these', 'DT'), ('issues', 'NNS'), ('could', 'MD'), ('exceed', 'VB'), ('$', '$'), ('10',

## 5. Named Entity Recognition (NER)

In [9]:
# identifies specific entities such as names, dates and locations in a text.
# NER is often used in tasks like extracting information from documents or creating chatbots.

from nltk import ne_chunk
entities = ne_chunk(pos_tags1)
print("Named Entities:", entities)

Named Entities: (S
  (PERSON Climate/NNP)
  change/NN
  is/VBZ
  one/CD
  of/IN
  the/DT
  greatest/JJS
  challenges/NNS
  of/IN
  our/PRP$
  time/NN
  ./.
  The/DT
  latest/JJS
  report/NN
  ,/,
  released/VBN
  on/IN
  January/NNP
  10/CD
  ,/,
  2025/CD
  ,/,
  states/VBZ
  that/IN
  global/JJ
  temperatures/NNS
  have/VBP
  risen/VBN
  by/IN
  1.5°C/CD
  ./.
  Rising/VBG
  sea/NN
  levels/NNS
  are/VBP
  expected/VBN
  to/TO
  displace/VB
  over/IN
  200/CD
  million/CD
  people/NNS
  by/IN
  2050/CD
  ./.
  Experts/NNS
  predict/VBP
  that/IN
  the/DT
  cost/NN
  of/IN
  addressing/VBG
  these/DT
  issues/NNS
  could/MD
  exceed/VB
  $/$
  10/CD
  trillion/CD
  if/IN
  immediate/JJ
  action/NN
  isn/NN
  ’/NNP
  t/NN
  taken/VBN
  ./.)


In [10]:
# S : Represents Sentence, PERSON - NNP : Proper Nouns, VBZ : Verbs, IN : preposition
# GPE : Geopolitical Entity (location) , ./. : end of the sentence

## 6. Frequency Distribution

In [11]:
# Analyze the frequency of words or tokens to understand text patterns.

from nltk.probability import FreqDist
fdist_after = FreqDist(filtered_words1)
print("Most Common Words in text1:", fdist_after.most_common(6))

from nltk.probability import FreqDist
fdist_before = FreqDist(words1)
print("Most Common Words in text1:", fdist_before.most_common(6))


Most Common Words in text1: [('.', 4), (',', 3), ('10', 2), ('Climate', 1), ('change', 1), ('one', 1)]
Most Common Words in text1: [('.', 4), ('of', 3), (',', 3), ('the', 2), ('10', 2), ('that', 2)]


## 7. Synonyms and Antonyms

In [12]:
from nltk.corpus import wordnet 

synonyms = wordnet.synsets('mc^2')

# Check if synonyms were found and print them
if synonyms:
    for syn in synonyms:
        print(f"Word: {syn.name()}, Definition: {syn.definition()}")
else:
    print("No synonyms found.")

No synonyms found.


In [13]:
# Example word
word = "new"

# Find antonyms
antonyms = []
for syn in wordnet.synsets(word):
    for lemma in syn.lemmas():
        if lemma.antonyms():
            antonyms.append(lemma.antonyms()[0].name())

# Display results
if antonyms:
    print(f"Antonyms of '{word}': {antonyms}")
else:
    print(f"No antonyms found for '{word}'.")

Antonyms of 'new': ['old', 'worn']
