In [None]:
import nltk

In [None]:
# Fetch the NLTK resources this notebook needs (stored in the local nltk_data directory)
nltk.download('punkt_tab')  # tokenizer data (unzipped under tokenizers/)
nltk.download('averaged_perceptron_tagger_eng')  # tagger data (unzipped under taggers/)
nltk.download('averaged_perceptron_tagger') # needed for pos tagging

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
# Sample sentence used for the tokenization and POS-tagging examples
text = "Alice loves painting in the quiet evenings."

In [None]:
from nltk.tokenize import word_tokenize

In [None]:
# Split the sample sentence into word and punctuation tokens
tokens = word_tokenize(text)
tokens  # bare last expression so the notebook displays the token list

['Alice', 'loves', 'painting', 'in', 'the', 'quiet', 'evenings', '.']

In [None]:
# Tag every token with NLTK's pre-trained POS tagger;
# the result is a list of (token, tag) pairs
pos_tags = nltk.pos_tag(tokens)
print(pos_tags)

[('Alice', 'NNP'), ('loves', 'VBZ'), ('painting', 'VBG'), ('in', 'IN'), ('the', 'DT'), ('quiet', 'JJ'), ('evenings', 'NNS'), ('.', '.')]


- NNP: Proper noun, singular
- VBZ: Verb, 3rd person singular present
- VBG: Verb, gerund or present participle
- IN: Preposition or subordinating conjunction
- DT: Determiner
- JJ: Adjective
- NNS: Noun, plural

- Use the pre-trained POS tagger as-is
- Customise only the output of the pre-trained POS tagger with manual rules (the tagger itself is not retrained)

In [None]:
# Define custom rules
def custom_tagger(pos_tags, overrides=None):
    """Apply word-level tag overrides on top of an existing POS tagging.

    Args:
        pos_tags: Iterable of ``(word, tag)`` pairs, e.g. the output of the
            pre-trained tagger.
        overrides: Optional mapping from a word to the tag that word must
            always receive. Matching is case-insensitive. When omitted, the
            two built-in rules are used:
            "painting" -> "NN" and "evenings" -> "NN".

    Returns:
        A new list of ``(word, tag)`` tuples in input order. Words without an
        override keep the tag they came in with; the input is not modified.
    """
    if overrides is None:
        # Built-in rules: "painting" and "evenings" are always tagged as NN.
        overrides = {"painting": "NN", "evenings": "NN"}

    # Normalise keys once so lookups are case-insensitive, like the word match.
    rules = {word.lower(): tag for word, tag in overrides.items()}

    # Default case: keep the tag from the pre-trained tagger.
    return [(word, rules.get(word.lower(), tag)) for word, tag in pos_tags]

In [None]:
# Post-process the pre-trained tagger's output with the custom rules
custom_pos_tags = custom_tagger(pos_tags)
print("Custom POS Tags:", custom_pos_tags)

Custom POS Tags: [('Alice', 'NNP'), ('loves', 'VBZ'), ('painting', 'NN'), ('in', 'IN'), ('the', 'DT'), ('quiet', 'JJ'), ('evenings', 'NN'), ('.', '.')]


In [None]:
from nltk.corpus import treebank
from nltk.tag import UnigramTagger # Frequency-based tagger: predicts each word's most frequent training tag

In [None]:
# Download the Treebank corpus used below as training and evaluation data
nltk.download('treebank')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


True

In [None]:
# Load tagged sentences from the Treebank corpus;
# each sentence is a list of (word, tag) pairs
tagged_sentences = treebank.tagged_sents()
print("Sample Tagged Sentence:", tagged_sentences[0])

Sample Tagged Sentence: [('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]


In [None]:
# Total number of tagged sentences available in the corpus
len(tagged_sentences)

3914

In [None]:
# Hold out 20% of the sentences for testing and train on the remaining 80%.
# The fixed random_state keeps the split identical across runs.
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(
    tagged_sentences,
    test_size=0.2,
    random_state=42,
)

print("Number of Training Sentences:", len(train_data))
print("Number of Testing Sentences:", len(test_data))

Number of Training Sentences: 3131
Number of Testing Sentences: 783


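How a unigram tagger decides (illustrative counts): suppose "painting" occurs 40 times in the training data.
"painting" (40) --> NNP: 32
                    VBZ: 8
Final prediction --> "painting" --> NNP (the most frequent tag for that word)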

In [None]:
# Train a Unigram POS Tagger on the training sentences.
# No backoff tagger is supplied here.
unigram_tagger = UnigramTagger(train_data)

In [None]:
# Score the unigram tagger on the held-out test sentences.
# evaluate() is deprecated in NLTK; accuracy(gold) is its replacement.
accuracy = unigram_tagger.accuracy(test_data)
print("Unigram Tagger Accuracy:", round(100*accuracy,2),"%")

Unigram Tagger Accuracy: 88.27 %


  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  accuracy = unigram_tagger.evaluate(test_data)


In [None]:
from nltk.tag import BigramTagger

In [None]:
# Train a Bigram POS Tagger with Unigram Tagger as backoff:
# unigram_tagger is consulted when the bigram model has no prediction
bigram_tagger = BigramTagger(train_data, backoff=unigram_tagger)

In [None]:
# Evaluate the bigram tagger on the held-out test sentences.
# evaluate() is deprecated in NLTK; accuracy(gold) is its replacement.
accuracy = bigram_tagger.accuracy(test_data)
print("Bigram Tagger Accuracy:", round(100*accuracy,2),"%")

Bigram Tagger Accuracy: 89.12 %


  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  accuracy = bigram_tagger.evaluate(test_data)


In [None]:
from nltk.tag import TrigramTagger

In [None]:
# Train a Trigram POS Tagger with Bigram Tagger as backoff:
# bigram_tagger (which itself backs off to unigram_tagger) is consulted
# when the trigram model has no prediction
trigram_tagger = TrigramTagger(train_data, backoff=bigram_tagger)

In [None]:
# Evaluate the trigram tagger on the held-out test sentences.
# evaluate() is deprecated in NLTK; accuracy(gold) is its replacement.
accuracy = trigram_tagger.accuracy(test_data)
print("Trigram Tagger Accuracy:",round(100*accuracy,2),"%")

Trigram Tagger Accuracy: 89.15 %


  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  accuracy = trigram_tagger.evaluate(test_data)


- Pre-trained models
- Pre-trained models + manual rule-based adjustments (customization; practical only for a few cases)
- Custom-trained models with non-machine-learning, frequency-based logic (UnigramTagger, BigramTagger, TrigramTagger)
- Custom-trained models with machine-learning logic (ClassifierBasedPOSTagger)

In [None]:
# New sentence used to try out the trained taggers
sent = "The capital of India is New Delhi"

In [None]:
# Tokenize the new sentence (tagging happens in the cells that follow).
# NOTE: this rebinds `tokens`, replacing the token list of the first example.
tokens = nltk.word_tokenize(sent)

In [None]:
# Tag with the unigram tagger alone. It was created without a backoff, so a
# word it has no tag for comes back paired with None.
tags = unigram_tagger.tag(tokens)
print("Tagged Sentence:", tags)

Tagged Sentence: [('The', 'DT'), ('capital', 'NN'), ('of', 'IN'), ('India', 'NNP'), ('is', 'VBZ'), ('New', 'NNP'), ('Delhi', None)]


In [None]:
# Check whether "Delhi" occurs anywhere in the corpus. Each sentence that
# contains it is printed exactly once, even if the word repeats in it.
for tagged_sentence in tagged_sentences:
    if any(word == "Delhi" for word, _tag in tagged_sentence):
        print(tagged_sentence)

In [None]:
from nltk.tag.sequential import ClassifierBasedPOSTagger

In [None]:
# Train a Classifier-Based POS Tagger on the same training sentences
# used for the n-gram taggers
classifier_tagger = ClassifierBasedPOSTagger(train=train_data)

In [None]:
# Evaluate the classifier tagger on the held-out test sentences.
# evaluate() is deprecated in NLTK; accuracy(gold) is its replacement.
accuracy = classifier_tagger.accuracy(test_data)
print("Classifier-Based Tagger Accuracy:", round(100*accuracy,2),"%")

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  accuracy = classifier_tagger.evaluate(test_data)


Classifier-Based Tagger Accuracy: 93.75 %


In [None]:
# Tag the same tokens with the classifier-based tagger, for comparison with
# the unigram tagger's result on this sentence
tags = classifier_tagger.tag(tokens)
print("Tagged Sentence:", tags)

Tagged Sentence: [('The', 'DT'), ('capital', 'NN'), ('of', 'IN'), ('India', 'NNP'), ('is', 'VBZ'), ('New', 'NNP'), ('Delhi', 'NNP')]
