In [1]:
from nltk.tokenize import word_tokenize
from nltk import pos_tag

In [2]:
# Short English sentence used as the first POS-tagging example.
sent = "A quick brown fox jumped over the lazy dog."

In [3]:
# Split the sentence into tokens first, then tag each token with its part of speech.
sent_tokens = word_tokenize(sent)
pos_tag(sent_tokens)

[('A', 'DT'),
 ('quick', 'JJ'),
 ('brown', 'NN'),
 ('fox', 'NN'),
 ('jumped', 'VBD'),
 ('over', 'IN'),
 ('the', 'DT'),
 ('lazy', 'JJ'),
 ('dog', 'NN'),
 ('.', '.')]

In [4]:
# Two-line sample: each line is a question fragment, joined by a single newline.
text = (
    ". that the origin of chole bhature (example pictured) is disputed?\n"
    "... , is the first woman master blender in the history of Irish whiskey?"
)

In [5]:
# Tokenize the multi-line sample, then show the (token, tag) pairs.
text_tokens = word_tokenize(text)
pos_tag(text_tokens)

[('.', '.'),
 ('that', 'IN'),
 ('the', 'DT'),
 ('origin', 'NN'),
 ('of', 'IN'),
 ('chole', 'JJ'),
 ('bhature', 'NN'),
 ('(', '('),
 ('example', 'NN'),
 ('pictured', 'VBN'),
 (')', ')'),
 ('is', 'VBZ'),
 ('disputed', 'VBN'),
 ('?', '.'),
 (',', ','),
 ('is', 'VBZ'),
 ('the', 'DT'),
 ('first', 'JJ'),
 ('woman', 'NN'),
 ('master', 'NN'),
 ('blender', 'NN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('history', 'NN'),
 ('of', 'IN'),
 ('Irish', 'NNP'),
 ('whiskey', 'NN'),
 ('?', '.')]

In [6]:
# Keep only the tokens whose POS tag starts with 'VB' (the verb tags).
tagged_tokens = pos_tag(word_tokenize(text))
verbs = [token for token, tag in tagged_tokens if tag.startswith('VB')]

In [7]:
# Display the verbs collected above, in order of appearance (duplicates included).
verbs

['pictured', 'is', 'disputed', 'is']

In [8]:
# Deduplicate the verbs by collecting them into a set.
# Note: this rebinds `verbs` from a list to a set, so ordering is no longer meaningful.
verbs = set(
    token
    for token, tag in pos_tag(word_tokenize(text))
    if tag.startswith('VB')
)
verbs

{'disputed', 'is', 'pictured'}

In [9]:
# Gather every token whose tag starts with 'NN', then report how many there are.
nouns = []
for token, tag in pos_tag(word_tokenize(text)):
    if tag.startswith('NN'):
        nouns.append(token)

len(nouns)

9

In [10]:
# Three-paragraph passage about Saturn; rebinds `text`, which the adjective/noun
# pair extraction below reads.
# NOTE(review): the passage contains a "[27]" marker after "Jupiter." -- presumably a
# citation left over from the source article; confirm whether it should be stripped
# before tokenizing.
text = """Saturn is the sixth planet from the Sun and the second largest in the Solar System, after Jupiter. It is a gas giant, with an average radius of about 9 times that of Earth. It has an eighth of the average density of Earth, but is over 95 times more massive. Even though Saturn is almost as big as Jupiter, Saturn has less than a third of its mass. Saturn orbits the Sun at a distance of 9.59 AU (1,434 million km), with an orbital period of 29.45 years.

Saturn's interior is thought to be composed of a rocky core, surrounded by a deep layer of metallic hydrogen, an intermediate layer of liquid hydrogen and liquid helium, and an outer layer of gas. Saturn has a pale yellow hue, due to ammonia crystals in its upper atmosphere. An electrical current in the metallic hydrogen layer is thought to give rise to Saturn's planetary magnetic field, which is weaker than Earth's, but has a magnetic moment 580 times that of Earth because of Saturn's greater size. Saturn's magnetic field strength is about a twentieth that of Jupiter.[27] The outer atmosphere is generally bland and lacking in contrast, although long-lived features can appear. Wind speeds on Saturn can reach 1,800 kilometres per hour (1,100 miles per hour).

The planet has a bright and extensive system of rings, composed mainly of ice particles, with a smaller amount of rocky debris and dust. At least 274 moons orbit the planet, of which 63 are officially named; these do not include the hundreds of moonlets in the rings. Titan, Saturn's largest moon and the second largest in the Solar System, is larger (but less massive) than the planet Mercury and is the only moon in the Solar System that has a substantial atmosphere"""

In [11]:
# Collect every adjective/noun bigram: a token tagged JJ* immediately followed by
# a token tagged NN*. Each match is stored as a two-element list [adjective, noun].
pos_tags = pos_tag(word_tokenize(text))

# Walk adjacent (current, next) token pairs with zip instead of indexing i + 1.
# Indexing past the end raised IndexError whenever the final token was an adjective;
# zip stops at the last complete pair, and empty or single-token input yields [].
pairs = [
    [word, next_word]
    for (word, tag), (next_word, next_tag) in zip(pos_tags, pos_tags[1:])
    if tag.startswith('JJ') and next_tag.startswith('NN')
]

In [12]:
# Display the collected [adjective, noun] pairs in document order (repeats are kept).
pairs

[['sixth', 'planet'],
 ['average', 'radius'],
 ['average', 'density'],
 ['orbital', 'period'],
 ['rocky', 'core'],
 ['deep', 'layer'],
 ['metallic', 'hydrogen'],
 ['intermediate', 'layer'],
 ['liquid', 'helium'],
 ['outer', 'layer'],
 ['yellow', 'hue'],
 ['upper', 'atmosphere'],
 ['metallic', 'hydrogen'],
 ['magnetic', 'field'],
 ['magnetic', 'moment'],
 ['greater', 'size'],
 ['magnetic', 'field'],
 ['long-lived', 'features'],
 ['extensive', 'system'],
 ['smaller', 'amount'],
 ['rocky', 'debris'],
 ['largest', 'moon'],
 ['planet', 'Mercury'],
 ['only', 'moon'],
 ['substantial', 'atmosphere']]

## POS Tagging for Indian Languages

In [13]:
import nltk
from nltk.tokenize import word_tokenize

# Fetch the 'indian' data package; it backs the `nltk.corpus.indian` reader used below.
nltk.download('indian')

[nltk_data] Downloading package indian to
[nltk_data]     C:\Users\dai\AppData\Roaming\nltk_data...
[nltk_data]   Package indian is already up-to-date!


True

In [14]:
from nltk.corpus import indian

# List the files in the corpus; the recorded output shows one '.pos' file per language.
indian.fileids()

['bangla.pos', 'hindi.pos', 'marathi.pos', 'telugu.pos']

In [15]:
# Peek at the second token of the first sentence in the Hindi file.
first_hindi_sentence = indian.sents('hindi.pos')[0]
first_hindi_sentence[1]

'प्रतिबंध'

In [16]:
# Print each corpus file name followed, on the next line, by its token count.
for fileid in indian.fileids():
    word_count = len(indian.words(fileid))
    print(fileid, word_count, sep='\n')

bangla.pos
10281
hindi.pos
9408
marathi.pos
19066
telugu.pos
9999


In [17]:
# Show the Hindi corpus as sentences of (token, tag) pairs; the repr is truncated after the first few.
indian.tagged_sents('hindi.pos')

[[('पूर्ण', 'JJ'), ('प्रतिबंध', 'NN'), ('हटाओ', 'VFM'), (':', 'SYM'), ('इराक', 'NNP')], [('संयुक्त', 'NNC'), ('राष्ट्र', 'NN'), ('।', 'SYM')], ...]

## Training a TnT Tagger on the Hindi Corpus

In [18]:
from nltk import TnT

# Create an untrained TnT tagger and load the tagged Hindi sentences it will be trained on.
tagger = TnT()
tags = indian.tagged_sents('hindi.pos')

In [19]:
# Train the TnT tagger on the tagged Hindi sentences held in `tags`.
tagger.train(tags)

In [20]:
# Tag an unseen Hindi sentence with the trained tagger.
# In the recorded output, one token comes back tagged 'Unk'.
new_sent = 'आज की रात होना है क्या?'
new_tokens = word_tokenize(new_sent)

tagger.tag(new_tokens)

[('आज', 'NN'),
 ('की', 'PREP'),
 ('रात', 'Unk'),
 ('होना', 'VNN'),
 ('है', 'VAUX'),
 ('क्या', 'QW'),
 ('?', 'PUNC')]