In [1]:
import nltk
from nltk.grammar import Nonterminal
from nltk.corpus import treebank
# Make sure the Penn Treebank sample is available locally before reading it.
nltk.download('treebank')
# Parsed (tree-annotated) sentences of the sample; later cells use them as
# the training data for the grammar.
training_set = treebank.parsed_sents()
# Print one annotated sentence to show the bracketed tree format.
print(training_set[1])

[nltk_data] Downloading package treebank to
[nltk_data]     /home/distillery/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
(S
  (NP-SBJ (NNP Mr.) (NNP Vinken))
  (VP
    (VBZ is)
    (NP-PRD
      (NP (NN chairman))
      (PP
        (IN of)
        (NP
          (NP (NNP Elsevier) (NNP N.V.))
          (, ,)
          (NP (DT the) (NNP Dutch) (VBG publishing) (NN group))))))
  (. .))


##### Extract the productions for all annotated training sentences

In [2]:
# Gather every distinct production that appears in the annotated sentences.
# NOTE(review): the set keeps a single copy of each rule, so how often a rule
# occurs in the treebank is not carried into this list -- confirm that is
# intended, given the list is later used to induce a probabilistic grammar.
treebank_productions = list({
    rule
    for tree in training_set
    for rule in tree.productions()
})

# Peek at a handful of the collected rules (set order is arbitrary).
treebank_productions[:10]

[SINV -> VP-TPC-1 VP NP-SBJ-2 .,
 CD -> '454',
 NP -> DT NNP JJ NN,
 NNP -> 'Byron',
 CD -> '2163.2',
 NP-SBJ-4 -> DT NN NNS,
 JJ -> 'consonant',
 ADVP -> RBR,
 VP -> VBG PP-CLR PP-CLR,
 S -> S-ADV , NP-SBJ-64 VP .]

##### Add a lexical production for each (word, POS tag) pair

In [3]:
# Add one lexical production (TAG -> 'word') per tagged token in the corpus.
# The Production is built directly rather than by formatting "(TAG word)" and
# re-parsing it with Tree.fromstring: the string round trip is fragile for
# tokens that contain brackets or whitespace and does needless parsing work
# for every token.
# NOTE: this extends treebank_productions in place, so re-running the cell
# appends the same productions a second time.
treebank_productions.extend(
    nltk.grammar.Production(Nonterminal(tag), [word])
    for word, tag in treebank.tagged_words()
)

##### Build the PCFG-based grammar and the Viterbi parser

In [4]:
# Induce a probabilistic CFG rooted at S from the collected productions.
treebank_grammar = nltk.grammar.induce_pcfg(
    start=Nonterminal('S'),
    productions=treebank_productions,
)

# Parser that searches for the most probable tree under that grammar.
viterbi_parser = nltk.ViterbiParser(grammar=treebank_grammar)

##### Tokenize the sample sentence and parse it

In [5]:
# Sample sentence to analyse with the induced grammar.
sentence = 'The car is quick and it is jumping over the sleeping tree'

# Split into word tokens, then materialise every tree the parser yields.
tokens = nltk.word_tokenize(sentence)
result = [tree for tree in viterbi_parser.parse(tokens)]

##### Display the parse tree for the sample sentence

In [6]:
# `result` already holds the parses computed in the previous cell from the
# same tokens; running the Viterbi parser again would only repeat that
# expensive work, so it is reused here.
if result:
    best_parse = result[0]
    print(best_parse)
    # Opens the tree in a separate graphical window.
    best_parse.draw()
else:
    # The parser yielded no tree; report it instead of failing with an
    # IndexError on result[0].
    print("No parse found for sentence:", sentence)

(S
  (NP-SBJ-99 (DT The) (NN car))
  (VP
    (VBZ is)
    (PRT (JJ quick))
    (S
      (CC and)
      (NP-SBJ (PRP it))
      (VP
        (VBZ is)
        (ADJP-PRD (VBG jumping))
        (PP-2
          (IN over)
          (NP (DT the) (VBG sleeping) (NN tree))))))) (p=1.62473e-43)
