In corpus linguistics, part-of-speech tagging (POS tagging or PoS tagging or POST), also called grammatical tagging is the process of marking up a word in a text (corpus) as corresponding to a particular part of speech,based on both its definition and its context.

In [1]:
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize,sent_tokenize
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [2]:
text='Rohan and Rajat are good friends.'\
     'They both are the players of school volleyball team.'\
     'Last year they got first position in inter state volleyball competition.'\
     'This year also their school is going in the finals of the same competition.'\
     'Both are shortlisted for National team.'\
     'Next year they both will be going to Singapore for a tounament.'\
     'We all are proud of them.'\


In [3]:
tokenized_text=sent_tokenize(text)
print(tokenized_text)

['Rohan and Rajat are good friends.They both are the players of school volleyball team.Last year they got first position in inter state volleyball competition.This year also their school is going in the finals of the same competition.Both are shortlisted for National team.Next year they both will be going to Singapore for a tounament.We all are proud of them.']


In [4]:
stop_words = set(stopwords.words('english')) 
for i in tokenized_text:
  # Word tokenizers is used to find the words  
    # and punctuation in a string 
  tokenized_words=word_tokenize(i)

  
  # removing stop words from wordList 
  wordsList = [w for w in tokenized_words if not w in stop_words]
  #  Using a Tagger. Which is part-of-speech tagger or POS-tagger.  
  tags=nltk.pos_tag(wordsList)
  print(tags)


[('Rohan', 'NNP'), ('Rajat', 'NNP'), ('good', 'JJ'), ('friends.They', 'NN'), ('players', 'NNS'), ('school', 'NN'), ('volleyball', 'NN'), ('team.Last', 'IN'), ('year', 'NN'), ('got', 'VBD'), ('first', 'JJ'), ('position', 'NN'), ('inter', 'NN'), ('state', 'NN'), ('volleyball', 'NN'), ('competition.This', 'NN'), ('year', 'NN'), ('also', 'RB'), ('school', 'NN'), ('going', 'VBG'), ('finals', 'NNS'), ('competition.Both', 'RB'), ('shortlisted', 'VBD'), ('National', 'NNP'), ('team.Next', 'JJ'), ('year', 'NN'), ('going', 'VBG'), ('Singapore', 'NNP'), ('tounament.We', 'NN'), ('proud', 'NN'), ('.', '.')]


In [5]:
import spacy 
  
# Load English tokenizer, tagger,parser, NER and word vectors 
nlp = spacy.load("en_core_web_sm") 
  
# Process whole documents 
  
doc = nlp(text) 
  
# Token and Tag 
for token in doc: 
  print(token, token.pos_) 
  
# You want list of Verb tokens 
print("Verbs:", [token.text for token in doc if token.pos_ == "VERB"]) 

Rohan PROPN
and CCONJ
Rajat PROPN
are AUX
good ADJ
friends NOUN
. PUNCT
They PRON
both DET
are AUX
the DET
players NOUN
of ADP
school NOUN
volleyball NOUN
team NOUN
. PUNCT
Last ADJ
year NOUN
they PRON
got VERB
first ADJ
position NOUN
in ADP
inter PROPN
state PROPN
volleyball PROPN
competition NOUN
. PUNCT
This DET
year NOUN
also ADV
their DET
school NOUN
is AUX
going VERB
in ADP
the DET
finals NOUN
of ADP
the DET
same ADJ
competition NOUN
. PUNCT
Both DET
are AUX
shortlisted VERB
for ADP
National ADJ
team NOUN
. PUNCT
Next ADJ
year NOUN
they PRON
both DET
will VERB
be AUX
going VERB
to ADP
Singapore PROPN
for ADP
a DET
tounament NOUN
. PUNCT
We PRON
all DET
are AUX
proud ADJ
of ADP
them PRON
. PUNCT
Verbs: ['got', 'going', 'shortlisted', 'will', 'going']
