In [4]:
import nltk 
nltk.download('tagsets')

[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\balkr\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping help\tagsets.zip.


True

In [1]:
import nltk 
text=nltk.word_tokenize("It is a pleasant day today.")
nltk.pos_tag(text)

[('It', 'PRP'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('pleasant', 'JJ'),
 ('day', 'NN'),
 ('today', 'NN'),
 ('.', '.')]

In [2]:
text = nltk.word_tokenize("They refuse to permit us to obtain the refuse permit")
nltk.pos_tag(text)

[('They', 'PRP'),
 ('refuse', 'VBP'),
 ('to', 'TO'),
 ('permit', 'VB'),
 ('us', 'PRP'),
 ('to', 'TO'),
 ('obtain', 'VB'),
 ('the', 'DT'),
 ('refuse', 'NN'),
 ('permit', 'NN')]

In [4]:
# NLTK may provide the information of tags.
nltk.help.upenn_tagset('NN')

NN: noun, common, singular or mass
    common-carrier cabbage knuckle-duster Casino afghan shed thermostat
    investment slide humour falloff slick wind hyena override subhumanity
    machinist ...


In [6]:
# NLTK may provide the information of tags.
nltk.help.upenn_tagset('NN.*')

NN: noun, common, singular or mass
    common-carrier cabbage knuckle-duster Casino afghan shed thermostat
    investment slide humour falloff slick wind hyena override subhumanity
    machinist ...
NNP: noun, proper, singular
    Motown Venneboerger Czestochwa Ranzer Conchita Trumplane Christos
    Oceanside Escobar Kreisler Sawyer Cougar Yvette Ervin ODI Darryl CTCA
    Shannon A.K.C. Meltex Liverpool ...
NNPS: noun, proper, plural
    Americans Americas Amharas Amityvilles Amusements Anarcho-Syndicalists
    Andalusians Andes Andruses Angels Animals Anthony Antilles Antiques
    Apache Apaches Apocrypha ...
NNS: noun, common, plural
    undergraduates scotches bric-a-brac products bodyguards facets coasts
    divestitures storehouses designs clubs fragrances averages
    subjectivists apprehensions muses factory-jobs ...


In [7]:
nltk.help.upenn_tagset('VB.*')

VB: verb, base form
    ask assemble assess assign assume atone attention avoid bake balkanize
    bank begin behold believe bend benefit bevel beware bless boil bomb
    boost brace break bring broil brush build ...
VBD: verb, past tense
    dipped pleaded swiped regummed soaked tidied convened halted registered
    cushioned exacted snubbed strode aimed adopted belied figgered
    speculated wore appreciated contemplated ...
VBG: verb, present participle or gerund
    telegraphing stirring focusing angering judging stalling lactating
    hankerin' alleging veering capping approaching traveling besieging
    encrypting interrupting erasing wincing ...
VBN: verb, past participle
    multihulled dilapidated aerosolized chaired languished panelized used
    experimented flourished imitated reunifed factored condensed sheared
    unsettled primed dubbed desired ...
VBP: verb, present tense, not 3rd person singular
    predominate wrap resort sue twist spill cure lengthen brush terminate
 

In [9]:
text=nltk.word_tokenize("I cannot bear the pain of bear.")
nltk.pos_tag(text)

[('I', 'PRP'),
 ('can', 'MD'),
 ('not', 'RB'),
 ('bear', 'VB'),
 ('the', 'DT'),
 ('pain', 'NN'),
 ('of', 'IN'),
 ('bear', 'NN'),
 ('.', '.')]

## Tagged Corpora
### Representing Tagged Tokens

In [10]:
# In NLTK, a tagged token is represented as a tuple consisting of a token and its tag.
# We can create this tuple in NLTK using the str2tuple() function

taggedword=nltk.tag.str2tuple('bear/NN')
print(taggedword)
print(taggedword[0])
print(taggedword[1])

('bear', 'NN')
bear
NN


In [8]:
# converts the tuple (word and pos tag) into a word and a tag
from nltk.tag.util import tuple2str
taggedtok = ('People', 'NNP')
tuple2str(taggedtok)

'People/NNP'

In [11]:
sentence='''The/DT sacred/VBN Ganga/NNP flows/VBZ in/IN this/DT
region/NN ./. This/DT is/VBZ a/DT pilgrimage/NN ./. People/NNP from/IN
all/DT over/IN the/DT country/NN visit/NN this/DT place/NN ./. '''

In [12]:
[nltk.tag.str2tuple(t) for t in sentence.split()]

[('The', 'DT'),
 ('sacred', 'VBN'),
 ('Ganga', 'NNP'),
 ('flows', 'VBZ'),
 ('in', 'IN'),
 ('this', 'DT'),
 ('region', 'NN'),
 ('.', '.'),
 ('This', 'DT'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('pilgrimage', 'NN'),
 ('.', '.'),
 ('People', 'NNP'),
 ('from', 'IN'),
 ('all', 'DT'),
 ('over', 'IN'),
 ('the', 'DT'),
 ('country', 'NN'),
 ('visit', 'NN'),
 ('this', 'DT'),
 ('place', 'NN'),
 ('.', '.')]

In [13]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
stop_words = set(stopwords.words('english'))

text="""Tokenization is the first step in text analytics. 
        The process of breaking down a text paragraph into smaller chunks such as words or sentence is called Tokenization.
        Token is a single entity that is building blocks for sentence or paragraph. 
        Does sentence tokenizer break text paragraph into sentences?
        What is fact?"""


tokenized = sent_tokenize(text)
for i in tokenized:
      
    # Word tokenizers is used to find the words and punctuation in a string
    wordsList = nltk.word_tokenize(i)
  
    # removing stop words from wordList
    wordsList = [w for w in wordsList if not w in stop_words] 
  
    #  Using a Tagger. Which is part-of-speech agger or POS-tagger. 
    tagged = nltk.pos_tag(wordsList)
  
    print(tagged)

[('Tokenization', 'NN'), ('first', 'RB'), ('step', 'VB'), ('text', 'JJ'), ('analytics', 'NNS'), ('.', '.')]
[('The', 'DT'), ('process', 'NN'), ('breaking', 'VBG'), ('text', 'NN'), ('paragraph', 'NN'), ('smaller', 'JJR'), ('chunks', 'NNS'), ('words', 'NNS'), ('sentence', 'NN'), ('called', 'VBN'), ('Tokenization', 'NNP'), ('.', '.')]
[('Token', 'NNP'), ('single', 'JJ'), ('entity', 'NN'), ('building', 'NN'), ('blocks', 'NNS'), ('sentence', 'NN'), ('paragraph', 'NN'), ('.', '.')]
[('Does', 'NNP'), ('sentence', 'VB'), ('tokenizer', 'NN'), ('break', 'NN'), ('text', 'NN'), ('paragraph', 'NN'), ('sentences', 'NNS'), ('?', '.')]
[('What', 'WP'), ('fact', 'NN'), ('?', '.')]


In [12]:
nltk.download('indian')

[nltk_data] Downloading package indian to
[nltk_data]     C:\Users\balkr\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\indian.zip.


True

In [14]:
nltk.corpus.indian.tagged_words()

[('মহিষের', 'NN'), ('সন্তান', 'NN'), (':', 'SYM'), ...]

In [15]:
nltk.corpus.brown.tagged_words()

[('The', 'AT'), ('Fulton', 'NP-TL'), ...]

In [16]:
nltk.download('universal_tagset')

[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\balkr\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\universal_tagset.zip.


True

In [16]:
nltk.corpus.brown.tagged_words(tagset='universal')

[('The', 'DET'), ('Fulton', 'NOUN'), ...]

In [17]:
from nltk.corpus import brown
brown_news_tagged = brown.tagged_words(categories='news', tagset='universal')
tag_fd = nltk.FreqDist(tag for (word, tag) in brown_news_tagged)
tag_fd.most_common()

[('NOUN', 30654),
 ('VERB', 14399),
 ('ADP', 12355),
 ('.', 11928),
 ('DET', 11389),
 ('ADJ', 6706),
 ('ADV', 3349),
 ('CONJ', 2717),
 ('PRON', 2535),
 ('PRT', 2264),
 ('NUM', 2166),
 ('X', 92)]

## Default tagging

In [16]:
"""Default tagging provides a baseline for part-of-speech tagging. It simply assigns the same
part-of-speech tag to every token. We do this using the DefaultTagger class. This tagger
is useful as a last-resort tagger, and provides a baseline to measure accuracy improvements."""

'Default tagging provides a baseline for part-of-speech tagging. It simply assigns the same\npart-of-speech tag to every token. We do this using the DefaultTagger class. This tagger\nis useful as a last-resort tagger, and provides a baseline to measure accuracy improvements.'

In [18]:
from nltk.tag import DefaultTagger
tagger = DefaultTagger('VB')
tagger.tag(['Hello', 'World','Students'])

[('Hello', 'VB'), ('World', 'VB'), ('Students', 'VB')]

In [19]:
text="Default tagging provides a baseline for part-of-speech tagging."
text=[nltk.word_tokenize(text)]
tagger = DefaultTagger('NN')
for tg in text:
    print(tagger.tag(tg))    

[('Default', 'NN'), ('tagging', 'NN'), ('provides', 'NN'), ('a', 'NN'), ('baseline', 'NN'), ('for', 'NN'), ('part-of-speech', 'NN'), ('tagging', 'NN'), ('.', 'NN')]


In [20]:
from nltk.corpus import treebank
test_sents = treebank.tagged_sents()[3000:]
tagger.evaluate(test_sents)

0.14331966328512843

## Training a unigram part-of-speech tagger

In [20]:
"""A unigram generally refers to a single token. Therefore, a unigram tagger only uses a single
word as its context for determining the part-of-speech tag."""

'A unigram generally refers to a single token. Therefore, a unigram tagger only uses a single\nword as its context for determining the part-of-speech tag.'

In [21]:
nltk.download('treebank')

[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\balkr\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!


True

In [22]:
"""We use the first 3000 tagged sentences of the treebank corpus as the training set to
initialize the UnigramTagger class. Then, we see the first sentence as a list of words,
and can see how it is transformed by the tag() function into a list of tagged tokens"""

'We use the first 3000 tagged sentences of the treebank corpus as the training set to\ninitialize the UnigramTagger class. Then, we see the first sentence as a list of words,\nand can see how it is transformed by the tag() function into a list of tagged tokens'

In [21]:
from nltk.tag import UnigramTagger
from nltk.corpus import treebank
train_sents = treebank.tagged_sents()[:3000]
tagger = UnigramTagger(train_sents)
treebank.sents()[0]

['Pierre',
 'Vinken',
 ',',
 '61',
 'years',
 'old',
 ',',
 'will',
 'join',
 'the',
 'board',
 'as',
 'a',
 'nonexecutive',
 'director',
 'Nov.',
 '29',
 '.']

In [24]:
tagger.tag(treebank.sents()[0])

[('Pierre', 'NNP'),
 ('Vinken', 'NNP'),
 (',', ','),
 ('61', 'CD'),
 ('years', 'NNS'),
 ('old', 'JJ'),
 (',', ','),
 ('will', 'MD'),
 ('join', 'VB'),
 ('the', 'DT'),
 ('board', 'NN'),
 ('as', 'IN'),
 ('a', 'DT'),
 ('nonexecutive', 'JJ'),
 ('director', 'NN'),
 ('Nov.', 'NNP'),
 ('29', 'CD'),
 ('.', '.')]

In [22]:
test_sents = treebank.tagged_sents()[3000:]
tagger.evaluate(test_sents)

0.8571551910209367

## Minimum frequency cutoff

In [23]:
"""If you'd like to set a minimum frequency threshold, then you can pass a cutoff value to the
UnigramTagger class."""

tagger = UnigramTagger(train_sents, cutoff=3)
tagger.evaluate(test_sents)

0.775350744657889

In [24]:
tagger = UnigramTagger(train_sents, cutoff=2)
tagger.evaluate(test_sents)

0.7951651197927908

## Combining taggers with backoff tagging

In [None]:
"""Backoff tagging is one of the core features of SequentialBackoffTagger. It allows you
to chain taggers together so that if one tagger doesn't know how to tag a word, it can pass
the word on to the next backoff tagger. If that one can't do it, it can pass the word on to the
next backoff tagger, and so on until
there are no backoff taggers left to check."""

In [28]:
tagger1 = DefaultTagger('NN')
tagger2 = UnigramTagger(train_sents, backoff=tagger1)
tagger2.evaluate(test_sents)

0.8741204403194475

### Training and combining ngram taggers

In [29]:
from nltk.tag import BigramTagger, TrigramTagger
bitagger = BigramTagger(train_sents)
bitagger.evaluate(test_sents)

0.11318799913662854

In [30]:
tritagger = TrigramTagger(train_sents)
tritagger.evaluate(test_sents)

0.06902654867256637

In [32]:
def backoff_tagger(train_sents, tagger_classes, backoff=None):
    for cls in tagger_classes:
        backoff = cls(train_sents, backoff=backoff)
    return backoff

In [33]:
backoff = DefaultTagger('NN')
tagger = backoff_tagger(train_sents, [UnigramTagger, BigramTagger,TrigramTagger], backoff=backoff)
tagger.evaluate(test_sents)

0.8806388948845241

In [35]:
import nltk
text = "We are taking nlp classes."
tokens = nltk.word_tokenize(text)
print(tokens)
tag = nltk.pos_tag(tokens)
print(tag)
grammar = "NP: {<DT>?<JJ>*<NN>}"


cp  =nltk.RegexpParser(grammar)
result = cp.parse(tag)
print(result)
result.draw()    # It will draw the pattern graphically which can be seen in Noun Phrase chunking 

['We', 'are', 'taking', 'nlp', 'classes', '.']
[('We', 'PRP'), ('are', 'VBP'), ('taking', 'VBG'), ('nlp', 'JJ'), ('classes', 'NNS'), ('.', '.')]
(S We/PRP are/VBP taking/VBG nlp/JJ classes/NNS ./.)
