<a href="https://colab.research.google.com/github/Abbhiraami/ML_AI_masters/blob/main/practice_books/Syntactical_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk

In [11]:
# Tokenizer data; presumably what word_tokenize relies on further down.
nltk.download("punkt_tab")
# Perceptron POS-tagger resources. Both the "_eng" and the unsuffixed package
# are fetched. NOTE(review): which of the two nltk.pos_tag actually loads
# likely depends on the installed NLTK version; confirm before removing either.
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('averaged_perceptron_tagger') # needed for pos tagging

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [17]:
# Example sentence used by the tokenizing and tagging cells that follow.
# NOTE(review): "quite" looks like a typo for "quiet"; left unchanged because
# the outputs recorded in this notebook were produced from this exact string.
sample_text = "Alice loves painting in the quite and calm evenings."

In [18]:
from nltk.tokenize import word_tokenize

In [19]:
# Break the sample sentence into tokens and display them.
tokens = word_tokenize(sample_text)
tokens

['Alice',
 'loves',
 'painting',
 'in',
 'the',
 'quite',
 'and',
 'calm',
 'evenings',
 '.']

In [20]:
# Pair every token with the tag assigned by NLTK's pos_tag and display the pairs.
pos_tags = nltk.pos_tag(tokens)
pos_tags

[('Alice', 'NNP'),
 ('loves', 'VBZ'),
 ('painting', 'VBG'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('quite', 'NN'),
 ('and', 'CC'),
 ('calm', 'JJ'),
 ('evenings', 'NNS'),
 ('.', '.')]

In [23]:
# Define custom rules
def custom_tagger(pos_tags, overrides=None):
  """Apply word-level tag overrides on top of an existing tagger's output.

  Args:
    pos_tags: iterable of (word, tag) pairs, such as the list returned by
      nltk.pos_tag.
    overrides: optional mapping of word -> replacement tag. Words are matched
      case-insensitively. When omitted, the two demonstration rules are used:
      "painting" -> "NN" and "evenings" -> "NN".

  Returns:
    A new list of (word, tag) tuples. Words without an override keep the tag
    they arrived with; the input sequence is not modified.
  """
  if overrides is None:
    # NOTE(review): "evenings" is plural, so NNS would be the conventional
    # tag; NN is kept so the default behavior matches the original rule.
    overrides = {"painting": "NN", "evenings": "NN"}
  # Normalize keys once so lookups can use the lower-cased word directly.
  rules = {key.lower(): new_tag for key, new_tag in overrides.items()}
  return [(word, rules.get(word.lower(), tag)) for word, tag in pos_tags]

In [25]:
# Run the tagger output through the custom rules and display the result.
custom_pos_tags = custom_tagger(pos_tags)
custom_pos_tags

[('Alice', 'NNP'),
 ('loves', 'VBZ'),
 ('painting', 'NN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('quite', 'NN'),
 ('and', 'CC'),
 ('calm', 'JJ'),
 ('evenings', 'NN'),
 ('.', '.')]

In [28]:
from nltk.corpus import treebank
from nltk.tag import UnigramTagger
# Download the treebank corpus package that the `treebank` reader imported above is used with.
nltk.download('treebank')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


True

In [29]:
# Read the POS-annotated sentences from the treebank corpus and print them.
tagged_sentences = treebank.tagged_sents()
print(tagged_sentences)

[[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')], [('Mr.', 'NNP'), ('Vinken', 'NNP'), ('is', 'VBZ'), ('chairman', 'NN'), ('of', 'IN'), ('Elsevier', 'NNP'), ('N.V.', 'NNP'), (',', ','), ('the', 'DT'), ('Dutch', 'NNP'), ('publishing', 'VBG'), ('group', 'NN'), ('.', '.')], ...]


In [30]:
# Number of tagged sentences loaded from the treebank corpus.
len(tagged_sentences)

3914

In [32]:
from sklearn.model_selection import train_test_split

# Reserve 20% of the tagged sentences for evaluation; random_state is fixed
# so the same split is produced on every run.
train_data, test_data = train_test_split(
  tagged_sentences,
  test_size=0.2,
  random_state=42,
)
print(f'Train Data: {len(train_data)}')
print(f'Test Data: {len(test_data)}')

Train Data: 3131
Test Data: 783


In [33]:
# Fit a unigram tagger on the training sentences.
unigram_tagger = UnigramTagger(train_data)


In [36]:
# Score the unigram tagger on the held-out sentences and report it as a percentage.
accuracy = unigram_tagger.accuracy(test_data)
print(f'Accuracy: {100*accuracy:.2f}%')

Accuracy: 88.27%


In [37]:
from nltk.tag import BigramTagger

In [43]:
# Bigram POS tagger, built with the unigram tagger supplied as its backoff.
# Accuracy is reported on both splits.
bigram_tagger = BigramTagger(train_data, backoff=unigram_tagger)
print(f'Train: {100*bigram_tagger.accuracy(train_data):.2f}%')
print(f'Test: {100*bigram_tagger.accuracy(test_data):.2f}%')


Train: 98.61%
Test: 89.12%


In [45]:
from nltk.tag import TrigramTagger

# Trigram POS tagger with the unigram tagger as its backoff (the bigram
# tagger is not part of this backoff chain). Accuracy is reported on both splits.
trigram_tagger = TrigramTagger(train_data, backoff=unigram_tagger)
print(f'Train: {100*trigram_tagger.accuracy(train_data):.2f}%')
print(f'Test: {100*trigram_tagger.accuracy(test_data):.2f}%')


Train: 99.25%
Test: 88.95%


In [46]:
# Tag a new sentence with the unigram tagger. A word it has no tag for comes
# back paired with None, as the recorded output shows for the last token.
sent = "The capital of India is New Delhi"
tags = unigram_tagger.tag(word_tokenize(sent))
print(tags)

[('The', 'DT'), ('capital', 'NN'), ('of', 'IN'), ('India', 'NNP'), ('is', 'VBZ'), ('New', 'NNP'), ('Delhi', None)]


In [47]:
from nltk.tag.sequential import ClassifierBasedPOSTagger

In [50]:
# Train the classifier-based POS tagger on the same training sentences and
# report its accuracy on both splits.
classifier_tagger = ClassifierBasedPOSTagger(train=train_data)
print(f'Train: {100*classifier_tagger.accuracy(train_data):.2f}%')
print(f'Test: {100*classifier_tagger.accuracy(test_data):.2f}%')

Train: 97.81%
Test: 93.75%


In [51]:
# Tag the same sentence with the classifier-based tagger for comparison.
tags = classifier_tagger.tag(word_tokenize(sent))
print(tags)

[('The', 'DT'), ('capital', 'NN'), ('of', 'IN'), ('India', 'NNP'), ('is', 'VBZ'), ('New', 'NNP'), ('Delhi', 'NNP')]
