In [1]:
import warnings
warnings.filterwarnings("ignore")

### Tokenization

Breaking text into individual words or sentences.

In [2]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

# Both tokenizers load the Punkt models at call time. The notebook only
# downloads 'punkt' in the NER section further down, so on a fresh environment
# this cell would raise LookupError. Fetch it here, quietly, before first use.
nltk.download('punkt', quiet=True)

text = "NLTK is a powerful library for NLP. It provides various functions for text processing."

# Word-level tokens (punctuation becomes its own token) and sentence-level splits.
words = word_tokenize(text)
sentences = sent_tokenize(text)


In [3]:
# Display the word-level tokens produced by word_tokenize in the previous cell.
words

['NLTK',
 'is',
 'a',
 'powerful',
 'library',
 'for',
 'NLP',
 '.',
 'It',
 'provides',
 'various',
 'functions',
 'for',
 'text',
 'processing',
 '.']

In [4]:
# Display the sentence strings produced by sent_tokenize for the same text.
sentences

['NLTK is a powerful library for NLP.',
 'It provides various functions for text processing.']

### Part-of-Speech (POS) Tagging

Assigning grammatical information (e.g., noun, verb, adjective) to each word in a sentence.

In [5]:
import nltk
# Fetch the 'averaged_perceptron_tagger' resource with the NLTK downloader.
# Presumably this is the model pos_tag loads in the next cell — confirm the
# resource name for the installed NLTK version.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [6]:
from nltk import pos_tag

# word_tokenize is imported in the tokenization cell, so that cell must run first.
# NOTE: this rebinds `words`, replacing the token list from the tokenization example.
words = word_tokenize("This is a sample sentence.")
# pos_tag yields one (token, tag) pair per input token.
pos_tags = pos_tag(words)


In [7]:
# Display the (token, POS tag) pairs computed in the previous cell.
pos_tags

[('This', 'DT'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('sample', 'JJ'),
 ('sentence', 'NN'),
 ('.', '.')]

### Stemming and Lemmatization

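Reducing words to their base or root form. Stemming strips suffixes by rule, so the result need not be a dictionary word (e.g. "commun"); lemmatization maps a word to its dictionary form (lemma) and can take the part of speech into account.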

In [8]:
from nltk.stem import PorterStemmer

porter = PorterStemmer()

# Stem several inflections of "play" plus one longer, capitalised noun and
# print one stem per line, in the order listed.
for word in ("play", "playing", "plays", "played", "Communication"):
    print(porter.stem(word))


play
play
play
play
commun


In [9]:
import nltk
from nltk.stem import WordNetLemmatizer

# WordNetLemmatizer looks words up in the WordNet corpus, which no cell in this
# notebook downloads. Fetch it here (quietly) so the cell does not fail with
# LookupError on an environment where the corpus is not installed yet.
nltk.download('wordnet', quiet=True)

lemmatizer = WordNetLemmatizer()

# The second argument 'v' requests the verb lemma. The inflections of "play"
# all reduce to "play"; "Communication" is printed back unchanged by the verb
# lookup, in contrast to the stemmer, which truncates it.
for word in ("plays", "played", "play", "playing", "Communication"):
    print(lemmatizer.lemmatize(word, 'v'))


play
play
play
play
Communication


### Named Entity Recognition (NER)

Identifying named entities (e.g., persons, organizations, locations) in text.

In [10]:
import nltk

# Resources used by the NER example in the next cell: tokenizer models, the POS
# tagger, the named-entity chunker, and the 'words' corpus downloaded alongside
# it. Each resource is requested once; the original cell asked for
# 'maxent_ne_chunker' twice.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [11]:
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk

# Sentence containing a person name and two place names.
text = "Barack Obama was born in Honolulu, Hawaii."

# Pipeline: raw text -> tokens -> (token, POS tag) pairs -> entity-labelled tree.
words = word_tokenize(text)
tagged_words = pos_tag(words)
named_entities = ne_chunk(tagged_words)

# Printing the tree shows every recognised entity as a labelled subtree,
# with the remaining tokens left as plain token/tag leaves.
print(named_entities)

(S
  (PERSON Barack/NNP)
  (PERSON Obama/NNP)
  was/VBD
  born/VBN
  in/IN
  (GPE Honolulu/NNP)
  ,/,
  (GPE Hawaii/NNP)
  ./.)


### Chunking

Grouping words or phrases into chunks based on their POS tags.

In [12]:
from nltk import word_tokenize, pos_tag, RegexpParser

# Example sentence containing several noun phrases
text = "The big black cat sat on the small white mat near the window."

# Chunk grammar with two alternative rules for the NP label:
#   1. an optional determiner (DT), any number of adjectives (JJ), then one NN noun
#   2. one or more consecutive proper nouns (NNP)
grammar = r"""
    NP: {<DT>?<JJ>*<NN>}    # Chunk sequences of DT, JJ, NN
        {<NNP>+}            # Chunk sequences of NNP
"""

# Build a regular-expression chunk parser from the grammar
parser = RegexpParser(grammar)

# Tokenize the text (this rebinds `words` from the earlier cells)
words = word_tokenize(text)

# POS-tag the tokens; the chunk rules above match on these tags
tagged_words = pos_tag(words)

# Apply the chunk rules to the tagged tokens
tree = parser.parse(tagged_words)

# Print the resulting tree; matched phrases appear as (NP ...) subtrees
print(tree)

(S
  (NP The/DT big/JJ black/JJ cat/NN)
  sat/VBD
  on/IN
  (NP the/DT small/JJ white/JJ mat/NN)
  near/IN
  (NP the/DT window/NN)
  ./.)


### WordNet

A lexical database that provides semantic relationships between words.

In [13]:
from nltk.corpus import wordnet

# Look up the WordNet synsets (sense entries) associated with the word "car".
# Requires the WordNet corpus to be available locally.
synsets = wordnet.synsets("car")


In [14]:
# Display the Synset objects found for "car" in the previous cell.
synsets

[Synset('car.n.01'),
 Synset('car.n.02'),
 Synset('car.n.03'),
 Synset('car.n.04'),
 Synset('cable_car.n.01')]

### Sentiment Analysis

Determining the sentiment (positive, negative, or neutral) of a text.

In [15]:
import nltk
# Fetch the 'vader_lexicon' resource with the NLTK downloader; presumably this is
# the lexicon SentimentIntensityAnalyzer loads in the next cell — confirm.
nltk.download('vader_lexicon')


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [16]:
from nltk.sentiment import SentimentIntensityAnalyzer

# Build the analyzer once, then score a single short sentence.
sia = SentimentIntensityAnalyzer()
# polarity_scores returns a dict with 'neg', 'neu', 'pos' and 'compound' entries.
sentiment_score = sia.polarity_scores("NLTK is awesome!")


In [17]:
# Display the score dictionary computed in the previous cell.
sentiment_score

{'neg': 0.0, 'neu': 0.313, 'pos': 0.687, 'compound': 0.6588}