In [1]:
pip install nltk




In [2]:
# Importing libraries
import nltk
import numpy as np
import pandas as pd
import pprint, time
import random
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
import math
# reading the Treebank tagged sentences
wsj = list(nltk.corpus.treebank.tagged_sents())
# samples: Each sentence is a list of (word, pos) tuples
wsj[:3]

[[('Pierre', 'NNP'),
  ('Vinken', 'NNP'),
  (',', ','),
  ('61', 'CD'),
  ('years', 'NNS'),
  ('old', 'JJ'),
  (',', ','),
  ('will', 'MD'),
  ('join', 'VB'),
  ('the', 'DT'),
  ('board', 'NN'),
  ('as', 'IN'),
  ('a', 'DT'),
  ('nonexecutive', 'JJ'),
  ('director', 'NN'),
  ('Nov.', 'NNP'),
  ('29', 'CD'),
  ('.', '.')],
 [('Mr.', 'NNP'),
  ('Vinken', 'NNP'),
  ('is', 'VBZ'),
  ('chairman', 'NN'),
  ('of', 'IN'),
  ('Elsevier', 'NNP'),
  ('N.V.', 'NNP'),
  (',', ','),
  ('the', 'DT'),
  ('Dutch', 'NNP'),
  ('publishing', 'VBG'),
  ('group', 'NN'),
  ('.', '.')],
 [('Rudolph', 'NNP'),
  ('Agnew', 'NNP'),
  (',', ','),
  ('55', 'CD'),
  ('years', 'NNS'),
  ('old', 'JJ'),
  ('and', 'CC'),
  ('former', 'JJ'),
  ('chairman', 'NN'),
  ('of', 'IN'),
  ('Consolidated', 'NNP'),
  ('Gold', 'NNP'),
  ('Fields', 'NNP'),
  ('PLC', 'NNP'),
  (',', ','),
  ('was', 'VBD'),
  ('named', 'VBN'),
  ('*-1', '-NONE-'),
  ('a', 'DT'),
  ('nonexecutive', 'JJ'),
  ('director', 'NN'),
  ('of', 'IN'),
  ('this'

In [None]:
#In the list mentioned above, each element corresponds to a sentence and is followed by a full stop ‘.’ which also serves as its POS tag. 
#Therefore, the POS tag ‘.’ signifies the end of a sentence.

#Furthermore, it is not necessary for the corpus to be segmented into sentences. 
#Instead, we can use a list of tuples that contain both the word and its corresponding POS tag.

#To accomplish this, we will convert the original list into a list of (word, tag) tuples.

In [3]:
# converting the list of sents to a list of (word, pos tag) tuples
tagged_words = [tup for sent in wsj for tup in sent]
print(len(tagged_words))
tagged_words[:10]

100676


[('Pierre', 'NNP'),
 ('Vinken', 'NNP'),
 (',', ','),
 ('61', 'CD'),
 ('years', 'NNS'),
 ('old', 'JJ'),
 (',', ','),
 ('will', 'MD'),
 ('join', 'VB'),
 ('the', 'DT')]

Exploratory Data Analysis
How many unique tags are present in the corpus?
What is the most commonly occurring tag in the corpus?
Which tag is most frequently assigned to the words “bank” and “executive”?

In [16]:
print(random.choice(tagged_words))

('principal', 'NN')


In [17]:
# question 1: Find the number of unique POS tags in the corpus
# you can use the set() function on the list of tags to get a unique set of tags, 
# and compute its length
tags = [pair[1] for pair in tagged_words]
unique_tags = set(tags)
len(unique_tags)

46

In [5]:
# question 2: Which is the most frequent tag in the corpus
# to count the frequency of elements in a list, the Counter() class from collections
# module is very useful, as shown below
from collections import Counter
tag_counts = Counter(tags)
tag_counts

Counter({'NNP': 9410,
         ',': 4886,
         'CD': 3546,
         'NNS': 6047,
         'JJ': 5834,
         'MD': 927,
         'VB': 2554,
         'DT': 8165,
         'NN': 13166,
         'IN': 9857,
         '.': 3874,
         'VBZ': 2125,
         'VBG': 1460,
         'CC': 2265,
         'VBD': 3043,
         'VBN': 2134,
         '-NONE-': 6592,
         'RB': 2822,
         'TO': 2179,
         'PRP': 1716,
         'RBR': 136,
         'WDT': 445,
         'VBP': 1321,
         'RP': 216,
         'PRP$': 766,
         'JJS': 182,
         'POS': 824,
         '``': 712,
         'EX': 88,
         "''": 694,
         'WP': 241,
         ':': 563,
         'JJR': 381,
         'WRB': 178,
         '$': 724,
         'NNPS': 244,
         'WP$': 14,
         '-LRB-': 120,
         '-RRB-': 126,
         'PDT': 27,
         'RBS': 35,
         'FW': 4,
         'UH': 3,
         'SYM': 1,
         'LS': 13,
         '#': 16})

In [6]:
# the most common tags can be seen using the most_common() method of Counter
tag_counts.most_common(5)

[('NN', 13166), ('IN', 9857), ('NNP', 9410), ('DT', 8165), ('-NONE-', 6592)]

Thus, NN is the most common tag followed by IN, NNP, DT, -NONE- etc.

In [7]:
# question 3: Which tag is most commonly assigned to the word "bank".
bank = [pair for pair in tagged_words if pair[0].lower() == 'bank']
bank

[('bank', 'NN'),
 ('Bank', 'NNP'),
 ('bank', 'NN'),
 ('Bank', 'NNP'),
 ('bank', 'NN'),
 ('Bank', 'NNP'),
 ('bank', 'NN'),
 ('Bank', 'NNP'),
 ('bank', 'NN'),
 ('bank', 'NN'),
 ('bank', 'NN'),
 ('Bank', 'NNP'),
 ('Bank', 'NNP'),
 ('Bank', 'NNP'),
 ('Bank', 'NNP'),
 ('bank', 'NN'),
 ('bank', 'NN'),
 ('bank', 'NN'),
 ('bank', 'NN'),
 ('bank', 'NN'),
 ('Bank', 'NNP'),
 ('Bank', 'NNP'),
 ('Bank', 'NNP'),
 ('bank', 'NN'),
 ('Bank', 'NNP'),
 ('Bank', 'NNP'),
 ('bank', 'NN'),
 ('Bank', 'NNP'),
 ('Bank', 'NNP'),
 ('bank', 'NN'),
 ('bank', 'NN'),
 ('Bank', 'NNP'),
 ('bank', 'NN'),
 ('bank', 'NN'),
 ('bank', 'NN'),
 ('Bank', 'NNP'),
 ('bank', 'NN'),
 ('bank', 'NN'),
 ('bank', 'NN'),
 ('bank', 'NN'),
 ('bank', 'NN'),
 ('Bank', 'NNP'),
 ('Bank', 'NNP'),
 ('Bank', 'NNP'),
 ('bank', 'NN'),
 ('Bank', 'NNP'),
 ('Bank', 'NNP'),
 ('Bank', 'NNP'),
 ('Bank', 'NNP'),
 ('Bank', 'NNP'),
 ('Bank', 'NNP'),
 ('Bank', 'NNP'),
 ('Bank', 'NNP'),
 ('Bank', 'NNP'),
 ('bank', 'NN'),
 ('bank', 'NN'),
 ('Bank', 'NNP'),
 

In [8]:
# question 3: Which tag is most commonly assigned to the word "executive".
executive = [pair for pair in tagged_words if pair[0].lower() == 'executive']
executive

[('executive', 'NN'),
 ('executive', 'JJ'),
 ('executive', 'JJ'),
 ('executive', 'NN'),
 ('executive', 'NN'),
 ('executive', 'NN'),
 ('executive', 'NN'),
 ('executive', 'NN'),
 ('executive', 'NN'),
 ('executive', 'NN'),
 ('executive', 'NN'),
 ('executive', 'NN'),
 ('executive', 'NN'),
 ('executive', 'JJ'),
 ('executive', 'JJ'),
 ('executive', 'JJ'),
 ('executive', 'NN'),
 ('executive', 'JJ'),
 ('executive', 'NN'),
 ('executive', 'NN'),
 ('executive', 'NN'),
 ('executive', 'NN'),
 ('executive', 'NN'),
 ('executive', 'NN'),
 ('executive', 'NN'),
 ('executive', 'NN'),
 ('executive', 'JJ'),
 ('executive', 'JJ'),
 ('executive', 'JJ'),
 ('executive', 'JJ'),
 ('executive', 'JJ'),
 ('executive', 'NN'),
 ('executive', 'NN'),
 ('executive', 'NN'),
 ('executive', 'NN'),
 ('executive', 'NN'),
 ('executive', 'JJ'),
 ('executive', 'NN'),
 ('executive', 'NN'),
 ('executive', 'NN'),
 ('executive', 'JJ'),
 ('executive', 'JJ'),
 ('executive', 'JJ'),
 ('executive', 'JJ'),
 ('executive', 'JJ'),
 ('executi

In [18]:
# splitting into train and test sets
random.seed(1234)
train_set, test_set = train_test_split(wsj, test_size=0.3)
print(len(train_set))
print(len(test_set))
print(train_set[:2])

2739
1175
[[('The', 'DT'), ('House', 'NNP'), ('and', 'CC'), ('Senate', 'NNP'), ('are', 'VBP'), ('divided', 'VBN'), ('*-6', '-NONE-'), ('over', 'IN'), ('whether', 'IN'), ('the', 'DT'), ('United', 'NNP'), ('Nations', 'NNPS'), ('Population', 'NNP'), ('Fund', 'NNP'), ('will', 'MD'), ('receive', 'VB'), ('any', 'DT'), ('portion', 'NN'), ('of', 'IN'), ('these', 'DT'), ('appropriations', 'NNS'), (',', ','), ('but', 'CC'), ('the', 'DT'), ('size', 'NN'), ('of', 'IN'), ('the', 'DT'), ('increase', 'NN'), ('is', 'VBZ'), ('itself', 'PRP'), ('significant', 'JJ'), ('.', '.')], [('-LRB-', '-LRB-'), ('Fewer', 'JJR'), ('said', 'VBD'), ('0', '-NONE-'), ('conditions', 'NNS'), ('wo', 'MD'), ("n't", 'RB'), ('change', 'VB'), ('.', '.'), ('-RRB-', '-RRB-')]]


In [23]:
#Let’s now try training a lexicon (or a unigram) tagger which assigns the most commonly assigned tag to a word.

#In NLTK, the UnigramTagger() can be used to train such a model.

# Lexicon (or unigram tagger)
unigram_tagger = nltk.UnigramTagger(train_set)
unigram_tagger.accuracy(test_set)

0.8729672722324284

Rule-Based (Regular Expression) Tagger
Let’s now move on to building a rule-based tagger, which utilizes regular expressions. In NLTK, we can use the RegexpTagger() to provide handwritten regular expression patterns for our tagger.

For example, we can specify regexes for various grammatical forms such as gerunds and past tense verbs, 3rd singular present verbs (e.g., creates, moves, makes), modal verbs (e.g., should, would, could), possessive nouns (e.g., partner’s, bank’s), plural nouns (e.g., banks, institutions), cardinal numbers (CD), and so on. In case none of these rules are applicable to a word, we can assign the most frequent tag NN to it.

In [22]:
# specify patterns for tagging
# example from the NLTK book
patterns = [
    (r'.*ing$', 'VBG'),              # gerund
    (r'.*ed$', 'VBD'),               # past tense
    (r'.*es$', 'VBZ'),               # 3rd singular present
    (r'.*ould$', 'MD'),              # modals
    (r'.*\'s$', 'NN$'),              # possessive nouns
    (r'.*s$', 'NNS'),                # plural nouns
    (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers
    (r'.*', 'NN')                    # nouns
]
regexp_tagger = nltk.RegexpTagger(patterns)
regexp_tagger.accuracy(test_set)

0.21586037966932026

Combining Taggers
Let’s explore the possibility of combining the taggers we created earlier. As we saw earlier, the rule-based tagger on its own is not very effective due to the limited number of rules we have written. However, by combining the lexicon and rule-based taggers, we have the potential to create a tagger that performs better than either of the individual ones.

NLTK provides a convenient method to combine taggers using the ‘backup’ argument. In the following code, we create a regex tagger to act as a backup to the lexicon tagger. In other words, if the lexicon tagger is unable to tag a word (e.g., a new word not in the vocabulary), it will use the rule-based tagger to assign a tag. Additionally, note that the rule-based tagger itself is backed up by the ‘NN’ tag.

In [24]:
# rule based tagger
rule_based_tagger = nltk.RegexpTagger(patterns)
# lexicon backed up by the rule-based tagger
lexicon_tagger = nltk.UnigramTagger(train_set, backoff=rule_based_tagger)
lexicon_tagger.accuracy(test_set)

0.9049125671905831

So, as we can observe by combining the taggers our accuracy is increased to 90.49% even higher than the lexicon-based tagger.