## POS tagging using modified Viterbi

### Data Preparation

In [214]:
#Importing libraries
import nltk
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from collections import Counter
import time
from nltk.tokenize import word_tokenize
from functools import reduce
import random
import pdb
import re

In [2]:
# Reading Treebank tagged sentences
nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))

In [11]:
# splitting Treebank dataset into train & validation set in ration of 95:5 %
train_set, validation_set = train_test_split(nltk_data,train_size=.95,test_size=0.05,random_state=70)

In [12]:
print("Dataset size-> nltk: {}, Training:{}, Validation: {}".format(len(nltk_data),len(train_set),len(validation_set)))

Dataset size-> nltk: 3914, Training:3718, Validation: 196


In [13]:
tagged_train_set=[tup for set in train_set for tup in set]
# Extracging vocabulary and respective Tags sperately
vocabulary=(set([v[0] for v in tagged_train_set]))
tags=set(v[1] for v in tagged_train_set)
print("Length-> Vocabulary: {}, Tags: {}".format(len(vocabulary),len(tags)))
print("Available Tags-> ",tags)

Length-> Vocabulary: 12088, Tags: 12
Available Tags->  {'DET', 'CONJ', 'PRT', '.', 'NOUN', 'X', 'ADP', 'ADJ', 'ADV', 'VERB', 'NUM', 'PRON'}


### Build the vanilla Viterbi based POS tagger

In [14]:
# compute word given tag: Emission Probability
def word_given_tag(word, tag, train_bag = tagged_train_set):
    tag_list = [pair for pair in train_bag if pair[1]==tag]
    count_tag = len(tag_list)
    w_given_tag_list = [pair[0] for pair in tag_list if pair[0]==word]
    count_w_given_tag = len(w_given_tag_list)
    
    return (count_w_given_tag, count_tag)

In [15]:
def t2_given_t1(t2, t1, train_bag = tagged_train_set):
    tags = [pair[1] for pair in train_bag]
    count_t1 = len([t for t in tags if t==t1])
    count_t2_t1 = 0
    for index in range(len(tags)-1):
        if tags[index]==t1 and tags[index+1] == t2:
            count_t2_t1 += 1
    return (count_t2_t1, count_t1)

In [16]:
tags_matrix = np.zeros((len(tags), len(tags)), dtype='float32')
for i, t1 in enumerate(list(tags)):
    for j, t2 in enumerate(list(tags)): 
        tags_matrix[i, j] = t2_given_t1(t2, t1)[0]/t2_given_t1(t2, t1)[1]

In [17]:
# tags_matrix
# convert the matrix to a df for better readability
tags_df = pd.DataFrame(tags_matrix, columns = list(tags), index=list(tags))

In [18]:
tags_df.loc['.', :]

DET     0.173507
CONJ    0.058015
PRT     0.002515
.       0.091783
NOUN    0.222452
X       0.027211
ADP     0.091334
ADJ     0.043197
ADV     0.053345
VERB    0.089807
NUM     0.081275
PRON    0.065469
Name: ., dtype: float32

In [224]:
def Viterbi(words, train_bag = tagged_train_set,backoff=[]):
    state = []
    T = list(set([pair[1] for pair in train_bag]))
    
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
#         state_max=0.0
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]

            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
            state_probability = emission_p * transition_p    
            p.append(state_probability)
        pmax = max(p)
        state_max = None
#         print(str(pmax)+' '+word)
        if pmax==0.0:
#             pdb.set_trace()
            linker=backoff.copy()
            while linker!=[]:
                if state_max==None:
                    state_max=linker.pop()([word])[0][1]
                else:
                    linker.clear()
#                 print(str(pmax)+' '+word)
#         pdb.set_trace()
        # getting state for which probability is maximum
        if state_max==None:
            state_max = T[p.index(pmax)] 
        state.append(state_max)
    return list(zip(words, state))

In [215]:
# def Viterbi(words, train_bag = tagged_train_set,backoff=[]):
#     state = []
#     T = list(set([pair[1] for pair in train_bag]))
    
#     for key, word in enumerate(words):
#         #initialise list of probability column for a given observation
#         p = [] 
# #         state_max=0.0
#         for tag in T:
#             if key == 0:
#                 transition_p = tags_df.loc['.', tag]
#             else:
#                 transition_p = tags_df.loc[state[-1], tag]

#             # compute emission and state probabilities
#             emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
#             state_probability = emission_p * transition_p    
#             p.append(state_probability)
#         pmax = max(p)
#         state_max = T[p.index(pmax)] 
# #         print(str(pmax)+' '+word)
#         if pmax==0.0:
#             pdb.set_trace()
#             if len(backoff)!=0:
# #                 if backoff[0].__name__=='cardinal_tagger':
#                 state_max=backoff[0]([word])[0][1]
#                     if state_max==None and :
#                         status=backoff[0]([word])
# #                 print(str(pmax)+' '+word)
# #         pdb.set_trace()
#         # getting state for which probability is maximum
#         if state_max==None:
#             state_max = T[p.index(pmax)] 
#         state.append(state_max)
#     return list(zip(words, state))

In [None]:
# def Viterbi(words, train_bag = tagged_train_set,backoff=[]):
#     state = []
#     T = list(set([pair[1] for pair in train_bag]))
    
#     for key, word in enumerate(words):
#         #initialise list of probability column for a given observation
#         if (re.search('^[A-Z]+([a-z]{1})?\.?$',word) or re.search('.*[0-9]+.*',word))==None:
#             p = [] 
#             for tag in T:
#                 if key == 0:
#                     transition_p = tags_df.loc['.', tag]
#                 else:
#                     transition_p = tags_df.loc[state[-1], tag]

#                 # compute emission and state probabilities
#                 emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
#                 state_probability = emission_p * transition_p    
#                 p.append(state_probability)

#             pmax = max(p)
#     #         pdb.set_trace()
#             # getting state for which probability is maximum
#             state_max = T[p.index(pmax)] 
#         else:
#             state_max=cardinal_tagger([word])[0][1]
#         state.append(state_max)
#     return list(zip(words, state))


In [None]:
# random.seed(1234)
# # choose random 5 sents
# rndom = [random.randint(1,len(validation_set)) for x in range(5)]
# print(rndom)
# validation_run = [validation_set[i] for i in rndom]
# validation_run_base = [tup for sent in validation_run for tup in sent]
# print(len(validation_run_base))
# sum([len(v) for v in validation_run])
# sum([len(v) for v in validation_set])
len(validation_set)

In [189]:
random.seed(1234)
# choose random 5 sents
rndom = [random.randint(1,len(validation_set)) for x in range(10)]

# list of sents
validation_run = [validation_set[i] for i in rndom]

# list of tagged words
validation_run_base = [tup for sent in validation_run for tup in sent]

# list of untagged words
validation_tagged_words = [tup[0] for sent in validation_run for tup in sent]
validation_tagged_words
# validation_run_base

['So',
 'far',
 ',',
 'Mrs.',
 'Hills',
 'has',
 "n't",
 'deemed',
 'any',
 'cases',
 'bad',
 'enough',
 '*-1',
 'to',
 'merit',
 'an',
 'accelerated',
 'investigation',
 'under',
 'the',
 'so-called',
 'special',
 '301',
 'provision',
 'of',
 'the',
 'act',
 '.',
 'Despite',
 'the',
 'strong',
 'evidence',
 'against',
 'Mrs.',
 'Yeargin',
 ',',
 'popular',
 'sentiment',
 '*ICH*-1',
 'was',
 'so',
 'strong',
 '*ICH*-3',
 'in',
 'her',
 'favor',
 ',',
 'Mrs.',
 'Ward',
 'says',
 '0',
 '*T*-2',
 ',',
 'that',
 '``',
 'I',
 "'m",
 'afraid',
 '0',
 'a',
 'jury',
 'would',
 "n't",
 'have',
 'convicted',
 'her',
 '.',
 'In',
 'CAT',
 'sections',
 'where',
 'students',
 "'",
 'knowledge',
 'of',
 'two-letter',
 'consonant',
 'sounds',
 'is',
 'tested',
 '*-1',
 '*T*-2',
 ',',
 'the',
 'authors',
 'noted',
 'that',
 'Scoring',
 'High',
 'concentrated',
 'on',
 'the',
 'same',
 'sounds',
 'that',
 'the',
 'test',
 'does',
 '*?*',
 '*T*-4',
 '--',
 'to',
 'the',
 'exclusion',
 'of',
 'other',
 '

In [21]:
validation_run_base=[tup for sent in validation_set for tup in sent] # it's a list of tuple with WORD and TAG
validation_tagged_words = [tup[0] for sent in validation_set for tup in sent] # list of TAGS
# # validation_tagged_words

In [190]:
import time
# tagging the test sentences
start = time.time()
tagged_seq = Viterbi(validation_tagged_words)
end = time.time()
difference = end-start

In [191]:
print("Time taken in seconds: ", difference)
# print(tagged_seq)
print(len(tagged_seq),len(validation_run_base))
print(tagged_seq[0],validation_run_base[0])

Time taken in seconds:  37.89800000190735
257 257
('So', 'ADV') ('So', 'ADP')


In [192]:
check = [i for i, j in zip(tagged_seq, validation_run_base) if i == j] 
accuracy = len(check)/len(tagged_seq)
# print(len(check))
print(accuracy)
incorrect_tagged_cases = [j for i, j in enumerate(zip(tagged_seq, validation_run_base)) if j[0]!=j[1]]
# 0.9014084507042254
# 0.9172749391727494
# 0.9221411192214112


0.9027237354085603


In [193]:
incorrect_tagged_cases
# validation_tagged_words
# Viterbi(validation_tagged_words[1:40],backoff=[cardinal_tagger])

[(('So', 'ADV'), ('So', 'ADP')),
 (('deemed', 'DET'), ('deemed', 'VERB')),
 (('enough', 'ADJ'), ('enough', 'ADV')),
 (('merit', 'NOUN'), ('merit', 'VERB')),
 (('accelerated', 'ADJ'), ('accelerated', 'VERB')),
 (('301', 'DET'), ('301', 'NUM')),
 (('favor', 'VERB'), ('favor', 'NOUN')),
 (('that', 'DET'), ('that', 'ADP')),
 (('two-letter', 'DET'), ('two-letter', 'ADJ')),
 (('consonant', 'DET'), ('consonant', 'ADJ')),
 (('sounds', 'VERB'), ('sounds', 'NOUN')),
 (('sounds', 'VERB'), ('sounds', 'NOUN')),
 (('to', 'PRT'), ('to', 'ADP')),
 (('exclusion', 'DET'), ('exclusion', 'NOUN')),
 (('sounds', 'VERB'), ('sounds', 'NOUN')),
 (('Handelsbanken', 'DET'), ('Handelsbanken', 'NOUN')),
 (("C'mon", 'DET'), ("C'mon", 'VERB')),
 (('boyfriends', 'DET'), ('boyfriends', 'NOUN')),
 (('estimates', 'NOUN'), ('estimates', 'VERB')),
 (('total', 'VERB'), ('total', 'NOUN')),
 (('vs.', 'CONJ'), ('vs.', 'ADP')),
 (('%', 'NOUN'), ('%', 'ADJ')),
 (('modification', 'DET'), ('modification', 'NOUN')),
 (('*-130', 'D

In [194]:
tagged_words_validation=[tup for ls in validation_set for tup in ls]
vocabulary_validation=set([t[0] for t in tagged_words_validation])

print(len(vocabulary),' ',len(vocabulary_validation))
print("Vocabulary which is not part of Training Set but part of Validation Set")
UnknownWords=list(vocabulary_validation-vocabulary)
CommonWords=list(vocabulary_validation.intersection(vocabulary))
print("Total Unknown words: {}, sample->{}".format(len(UnknownWords),UnknownWords[0:3]))
print("Vocabulary which is not part of Training Set but part of Validation Set")
print("Total common words: {}, sample->{}".format(len(CommonWords),CommonWords[0:3]))

12088   1820
Vocabulary which is not part of Training Set but part of Validation Set
Total Unknown words: 320, sample->['parts-engineering', 'pride', 'disaster-assistance']
Vocabulary which is not part of Training Set but part of Validation Set
Total common words: 1500, sample->['eliminated', 'fall', 'Kalipharma']


### Solve the problem of unknown words

In [195]:
UnknownWords

#There are some subtle things which can be find which are misclassified by Viterbi algorithm like

# Numbers-> ('3.19', 'ADV'), ('3.19', 'NUM'))
# Wines/ Company Names -> Louisiana-Pacific, Antinori
# Countries Names-> Africa, (('Clarence', 'ADV'), ('Clarence', 'NOUN')), (('American', 'ADJ'), ('American', 'NOUN')),
# Salutation like Sr. Mr. Md.
# Door Numbers/Flat--> 50-State, cary-3 etc.
# Capitalization Rule--> ASSOCIATION, CIA etc.

# We will handle numbers, Salutation, Capitalization, and Flat number via RegexpTagger

['parts-engineering',
 'pride',
 'disaster-assistance',
 'vicissitudes',
 'regulations',
 'Baking',
 'Norwegian',
 'double-A',
 'Advice',
 'deliberating',
 'rendering',
 'ECONOMIC',
 'forecasting',
 'dismayed',
 'tows',
 '7.8',
 'administer',
 'disapproval',
 '*T*-176',
 'responds',
 'dusty',
 'Nine',
 '2.95',
 'sacrificing',
 'sketching',
 'disapproved',
 'English-speaking',
 'inevitable',
 'boots',
 'Four',
 'CERTIFICATES',
 'mentioned',
 'retiring',
 'Majority',
 'tags',
 'BRIEFS',
 'timing',
 'injuring',
 'rectified',
 'commanded',
 'Huntington',
 'newsstand',
 '6.03',
 'Ian',
 'illustrates',
 'vague',
 'Heiwado',
 '70.2',
 'Alurralde',
 'Riviera',
 'diagnosed',
 'APPEARS',
 'percent',
 'resumption',
 'skirmishes',
 'insider',
 'contractors',
 'deck',
 'TRIMMING',
 'modification',
 "C'mon",
 'Cathedral',
 '0.28',
 'Tailors',
 'odd-year',
 'riding',
 'U.S.-Japan',
 'RBC',
 'Heatherington',
 'freeway',
 'alike',
 'Andean',
 'yearly',
 '301',
 'injecting',
 'fingers',
 'inquiring',
 '

In [225]:
def cardinal_tagger(word):
    patterns = [
    (r'^[aA-zZ].*[0-9]+','NOUN'),  #Flat/Door Number, Street Number
    (r'[1-9].?[,\/]?[0-9]*','NUM'),# Any Form of number non-0
    (r'^(0|[*|-|$].*)','X'), #Any special form of number like *T* *a-767, 0
    (r'^[0-9]+\-[aA-zZ]*$','ADJ') #adjective like 100-megabytes 237-Seats
    ]
#     pdb.set_trace()
    regextagger=nltk.RegexpTagger(patterns)
    return regextagger.tag(word)

In [226]:
def english_morphological_tagger(word):
    patterns = [
    (r'^[A-Z]+([a-z]{1,2})?\.?$','NOUN'),# Capitalization rule of English and Salutation
    (r'[aA-zZ]+(ed|ing|es)$', 'VERB'),
    (r'[aA-zZ]+(\'s|s)$', 'NOUN'),             # possessive nouns & plural nouns
    (r'^[A-Z]{1}[a-z]*$','NOUN')
    ]
    print("Englis")
    regextagger=nltk.RegexpTagger(patterns)
    status=regextagger.tag(word)
    return status

In [186]:
print(cardinal_tagger(['cycles'],backoff=[english_morphological_tagger]))

[('cycles', 'VERB')]


In [202]:
start = time.time()
tagged_seq = Viterbi(validation_tagged_words,backoff=[cardinal_tagger])
end = time.time()
difference = end-start

In [203]:
check = [i for i, j in zip(tagged_seq, validation_run_base) if i == j] 
accuracy = len(check)/len(tagged_seq)
# print(len(check))
print(accuracy)
incorrect_tagged_cases = [j for i, j in enumerate(zip(tagged_seq, validation_run_base)) if j[0]!=j[1]]

0.9105058365758755


In [204]:
incorrect_tagged_cases

[(('So', 'ADV'), ('So', 'ADP')),
 (('deemed', 'DET'), ('deemed', 'VERB')),
 (('enough', 'ADJ'), ('enough', 'ADV')),
 (('merit', 'NOUN'), ('merit', 'VERB')),
 (('accelerated', 'ADJ'), ('accelerated', 'VERB')),
 (('favor', 'VERB'), ('favor', 'NOUN')),
 (('that', 'DET'), ('that', 'ADP')),
 (('two-letter', 'DET'), ('two-letter', 'ADJ')),
 (('consonant', 'DET'), ('consonant', 'ADJ')),
 (('sounds', 'VERB'), ('sounds', 'NOUN')),
 (('sounds', 'VERB'), ('sounds', 'NOUN')),
 (('to', 'PRT'), ('to', 'ADP')),
 (('exclusion', 'DET'), ('exclusion', 'NOUN')),
 (('sounds', 'VERB'), ('sounds', 'NOUN')),
 (('Handelsbanken', 'DET'), ('Handelsbanken', 'NOUN')),
 (("C'mon", 'DET'), ("C'mon", 'VERB')),
 (('boyfriends', 'DET'), ('boyfriends', 'NOUN')),
 (('estimates', 'NOUN'), ('estimates', 'VERB')),
 (('total', 'VERB'), ('total', 'NOUN')),
 (('vs.', 'CONJ'), ('vs.', 'ADP')),
 (('%', 'NOUN'), ('%', 'ADJ')),
 (('modification', 'DET'), ('modification', 'NOUN')),
 (('opening', 'NOUN'), ('opening', 'VERB'))]

In [223]:
start = time.time()
tagged_seq = Viterbi(validation_tagged_words,backoff=[cardinal_tagger,english_morphological_tagger])
end = time.time()
difference = end-start

> <ipython-input-222-4272827202b2>(24)Viterbi()
-> linker=backoff.copy()
(Pdb) q


BdbQuit: 

In [None]:
check = [i for i, j in zip(tagged_seq, validation_run_base) if i == j] 
accuracy = len(check)/len(tagged_seq)
# print(len(check))
print(accuracy)
incorrect_tagged_cases = [j for i, j in enumerate(zip(tagged_seq, validation_run_base)) if j[0]!=j[1]]

In [None]:
incorrect_tagged_cases

In [None]:
re.fullmatch('[0-9].?[0-9]*','600,000')


# (('1980s', 'NOUN'), ('1980s', 'NUM')),
# (('190-point', 'NUM'), ('190-point', 'ADJ')),
#   (('237-seat', 'NUM'), ('237-seat', 'ADJ')),
#     (('100-megabyte', 'NUM'), ('100-megabyte', 'ADJ')),

In [None]:
patterns = [
    (r'[1-9].?[,\/]?[0-9]*','NUM'),
    (r'^(0|[*|-|$].*)','X')
    ]
regextagger=nltk.RegexpTagger(patterns)
regextagger.tag(['C-90'])

#### Evaluating tagging accuracy

### Compare the tagging accuracies of the modifications with the vanilla Viterbi algorithm

In [44]:
sent=nltk.corpus.treebank.tagged_sents()[22]
print(nltk.ne_chunk(sent, binary=True))

(S
  The/DT
  (NE U.S./NNP)
  is/VBZ
  one/CD
  of/IN
  the/DT
  few/JJ
  industrialized/VBN
  nations/NNS
  that/WDT
  *T*-7/-NONE-
  does/VBZ
  n't/RB
  have/VB
  a/DT
  higher/JJR
  standard/NN
  of/IN
  regulation/NN
  for/IN
  the/DT
  smooth/JJ
  ,/,
  needle-like/JJ
  fibers/NNS
  such/JJ
  as/IN
  crocidolite/NN
  that/WDT
  *T*-1/-NONE-
  are/VBP
  classified/VBN
  *-5/-NONE-
  as/IN
  amphobiles/NNS
  ,/,
  according/VBG
  to/TO
  (NE Brooke/NNP)
  T./NNP
  Mossman/NNP
  ,/,
  a/DT
  professor/NN
  of/IN
  pathlogy/NN
  at/IN
  the/DT
  (NE University/NNP)
  of/IN
  (NE Vermont/NNP College/NNP)
  of/IN
  (NE Medicine/NNP)
  ./.)


### List down cases which were incorrectly tagged by original POS tagger and got corrected by your modifications

In [42]:
nltk.ne_chunk([incorrect_tagged_cases[0][0]])
# nltk.ne_chunk_sents()

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\jaisa05\\AppData\\Local\\Temp\\tmpcsvhg8v6.png'

Tree('S', [('resumption', 'DET')])

In [53]:
tree=nltk.ne_chunk(nltk.corpus.treebank.tagged_sents()[0])
tree.leaves()
# nltk.ne_chunk(.tagged_sents()[0])

[('Pierre', 'NNP'),
 ('Vinken', 'NNP'),
 (',', ','),
 ('61', 'CD'),
 ('years', 'NNS'),
 ('old', 'JJ'),
 (',', ','),
 ('will', 'MD'),
 ('join', 'VB'),
 ('the', 'DT'),
 ('board', 'NN'),
 ('as', 'IN'),
 ('a', 'DT'),
 ('nonexecutive', 'JJ'),
 ('director', 'NN'),
 ('Nov.', 'NNP'),
 ('29', 'CD'),
 ('.', '.')]

##### Reading Test File

In [55]:
# Readinig the Test File
with open('./Test_sentences.txt','r') as f:
    lines=f.readlines()
# joining lines to form a single string
test_string=reduce(lambda x,y: x+ ' '+y,lines)
test_string=test_string.replace('\n','') #removing extra new line characters
test_string=test_string.strip() #strip white spaces
# Generating word tokens from sentence
test_tokens=word_tokenize(test_string)
test_set=nltk.pos_tag(test_tokens)

In [56]:
print(nltk.pos_tag(word_tokenize(lines[2])))
print(Viterbi(word_tokenize(lines[2])))

[('Google', 'NNP'), ('and', 'CC'), ('Twitter', 'NNP'), ('made', 'VBD'), ('a', 'DT'), ('deal', 'NN'), ('in', 'IN'), ('2015', 'CD'), ('that', 'WDT'), ('gave', 'VBD'), ('Google', 'NNP'), ('access', 'NN'), ('to', 'TO'), ('Twitter', 'NNP'), ("'s", 'POS'), ('firehose', 'NN'), ('.', '.')]
[('Google', 'DET'), ('and', 'CONJ'), ('Twitter', 'DET'), ('made', 'VERB'), ('a', 'DET'), ('deal', 'NOUN'), ('in', 'ADP'), ('2015', 'DET'), ('that', 'ADP'), ('gave', 'VERB'), ('Google', 'DET'), ('access', 'NOUN'), ('to', 'PRT'), ('Twitter', 'DET'), ("'s", 'VERB'), ('firehose', 'DET'), ('.', '.')]


In [75]:
train_sents=Viterbi(word_tokenize(lines[2]))
[train_sents[0:2]]

[[('Google', 'DET'), ('and', 'CONJ')]]

In [66]:
from nltk.corpus import brown
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')
train_sents = brown_tagged_sents[:40]

In [67]:
train_sents

[[('The', 'AT'),
  ('Fulton', 'NP-TL'),
  ('County', 'NN-TL'),
  ('Grand', 'JJ-TL'),
  ('Jury', 'NN-TL'),
  ('said', 'VBD'),
  ('Friday', 'NR'),
  ('an', 'AT'),
  ('investigation', 'NN'),
  ('of', 'IN'),
  ("Atlanta's", 'NP$'),
  ('recent', 'JJ'),
  ('primary', 'NN'),
  ('election', 'NN'),
  ('produced', 'VBD'),
  ('``', '``'),
  ('no', 'AT'),
  ('evidence', 'NN'),
  ("''", "''"),
  ('that', 'CS'),
  ('any', 'DTI'),
  ('irregularities', 'NNS'),
  ('took', 'VBD'),
  ('place', 'NN'),
  ('.', '.')],
 [('The', 'AT'),
  ('jury', 'NN'),
  ('further', 'RBR'),
  ('said', 'VBD'),
  ('in', 'IN'),
  ('term-end', 'NN'),
  ('presentments', 'NNS'),
  ('that', 'CS'),
  ('the', 'AT'),
  ('City', 'NN-TL'),
  ('Executive', 'JJ-TL'),
  ('Committee', 'NN-TL'),
  (',', ','),
  ('which', 'WDT'),
  ('had', 'HVD'),
  ('over-all', 'JJ'),
  ('charge', 'NN'),
  ('of', 'IN'),
  ('the', 'AT'),
  ('election', 'NN'),
  (',', ','),
  ('``', '``'),
  ('deserves', 'VBZ'),
  ('the', 'AT'),
  ('praise', 'NN'),
  ('and', 

In [86]:
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger([train_sents[0:2]], backoff=t0)
t2 = nltk.BigramTagger([train_sents[0:2]], backoff=t1)
t2.tag(["India"])

[('India', 'NN')]

In [91]:
tt2=nltk.BigramTagger([tagged_train_set])

In [94]:
tt2.tag(['Domaine'])

[('Domaine', None)]

In [103]:
tagged_train_set

[('Temple', 'NOUN'),
 (',', '.'),
 ('however', 'ADV'),
 (',', '.'),
 ('harshly', 'ADV'),
 ('criticized', 'VERB'),
 ('Sea', 'NOUN'),
 ('Containers', 'NOUN'),
 ("'", 'PRT'),
 ('plan', 'NOUN'),
 ('yesterday', 'NOUN'),
 (',', '.'),
 ('*-1', 'X'),
 ('characterizing', 'VERB'),
 ('it', 'PRON'),
 ('as', 'ADP'),
 ('a', 'DET'),
 ('``', '.'),
 ('highly', 'ADV'),
 ('conditional', 'ADJ'),
 ('device', 'NOUN'),
 ('designed', 'VERB'),
 ('*', 'X'),
 ('*-2', 'X'),
 ('to', 'PRT'),
 ('entrench', 'VERB'),
 ('management', 'NOUN'),
 (',', '.'),
 ('confuse', 'VERB'),
 ('shareholders', 'NOUN'),
 ('and', 'CONJ'),
 ('prevent', 'VERB'),
 ('them', 'PRON'),
 ('from', 'ADP'),
 ('*-3', 'X'),
 ('accepting', 'VERB'),
 ('our', 'PRON'),
 ('superior', 'ADJ'),
 ('cash', 'NOUN'),
 ('offer', 'NOUN'),
 ('.', '.'),
 ("''", '.'),
 ('It', 'PRON'),
 ('was', 'VERB'),
 ('later', 'ADJ'),
 ('applied', 'VERB'),
 ('to', 'PRT'),
 ('other', 'ADJ'),
 ('new-car', 'NOUN'),
 ('programs', 'NOUN'),
 (',', '.'),
 ('including', 'VERB'),
 ('those

In [106]:
# sentence = [("the", "DT"), ("little", "JJ"), ("yellow", "JJ"),
#             ("dog", "NN"), ("barked", "VBD"), ("at", "IN"), ("the", "DT"), ("cat", "NN")]
sentence = [("Saurabh", "DT")]
grammar = "NP_chunk: {<DT>?<NN><JJ>*}"
cp = nltk.RegexpParser(grammar)
result = cp.parse(sentence)
print(result)

(S Saurabh/DT)


In [114]:
nltk.pos_tag(["America"])

[('America', 'NNP')]

In [115]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [122]:
wordnet_lemmatizer = WordNetLemmatizer()
wordnet_lemmatizer.lemmatize("architects")

'architect'

In [123]:
help(nltk.pos_tag)

Help on function pos_tag in module nltk.tag:

pos_tag(tokens, tagset=None, lang='eng')
    Use NLTK's currently recommended part of speech tagger to
    tag the given list of tokens.
    
        >>> from nltk.tag import pos_tag
        >>> from nltk.tokenize import word_tokenize
        >>> pos_tag(word_tokenize("John's big idea isn't all that bad."))
        [('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is', 'VBZ'),
        ("n't", 'RB'), ('all', 'PDT'), ('that', 'DT'), ('bad', 'JJ'), ('.', '.')]
        >>> pos_tag(word_tokenize("John's big idea isn't all that bad."), tagset='universal')
        [('John', 'NOUN'), ("'s", 'PRT'), ('big', 'ADJ'), ('idea', 'NOUN'), ('is', 'VERB'),
        ("n't", 'ADV'), ('all', 'DET'), ('that', 'DET'), ('bad', 'ADJ'), ('.', '.')]
    
    NB. Use `pos_tag_sents()` for efficient tagging of more than one sentence.
    
    :param tokens: Sequence of tokens to be tagged
    :type tokens: list(str)
    :param tagset: the tagset to be u