In [1]:
import nltk

In [2]:
cfg_rules = """
S -> NP-SBJ VP STOP 
NP -> CD JJ NN | CD JJ NNS
NP-SBJ -> DT NN NN
VP -> VBZ NP

DT -> 'the' | 'a'
NN -> 'purchase' | 'price' | 'guild' | 'strike'
VBZ -> 'includes' | 'began'
CD -> 'two'
JJ -> 'ancillary'
NNS -> 'companies'
STOP -> '.'
"""
grammar  = nltk.CFG.fromstring(cfg_rules)


In [122]:
cfg_rules = """
S -> NP-SBJ VP STOP | NP NP STOP
NP -> CD JJ NN | CD JJ NNS | NP PP-DIR | DT NN | DT NN CC NN NN | NNP CD | DT NN VBD
PP-DIR -> IN NP
NP-SBJ -> DT NN NN | DT NN
VP -> VBZ NP | VBD NP PP-TMP
PP-TMP -> IN NP


DT -> 'the' | 'a'
NNP -> 'March'
NN -> 'purchase' | 'price' | 'guild' | 'strike' | 'TV' | 'movie' | 'industry' | 'company'
VBZ -> 'includes'
VBD -> 'began' | 'bought'
CD -> 'two' | '1988' | 'one'
CC -> 'and'
JJ -> 'ancillary'
NNS -> 'companies'
IN -> 'in' | 'against'
STOP -> '.'
"""
grammar = nltk.CFG.fromstring(cfg_rules)


In [62]:
grammar.is_flexible_chomsky_normal_form()

True

In [6]:
sentences = [
    "the purchase price includes two ancillary companies .".split(),
    "the guild began a strike against the TV and movie industry in March 1988 .".split(),
]
for s in sentences:
    grammar.check_coverage(s)

In [120]:
cfg_rules = """
S -> S1 STOP
S1 -> NP-SBJ VP
NP -> NP1 NN | NP1 NNS | NNP CD | NP PP-DIR | NP2 NN
NP1 -> CD JJ
NP2 -> NP3 NN
NP3 -> NP4 CC
NP4 -> DT NN
PP-DIR -> IN NP
NP-SBJ -> NP-SBJ1 NN | DT NN
NP-SBJ1 -> DT NN
VP -> VBZ NP | VP1 PP-TMP
VP1 -> VBD NP
PP-TMP -> IN NP

DT -> 'the' | 'a'
NNP -> 'March'
NN -> 'purchase' | 'price' | 'guild' | 'strike' | 'TV' | 'movie' | 'industry' | 'company'
VBZ -> 'includes'
VBD -> 'began' | 'bought'
CD -> 'two' | '1988' | 'one'
CC -> 'and'
JJ -> 'ancillary'
NNS -> 'companies'
IN -> 'in' | 'against'
STOP -> '.'
"""

grammar = nltk.CFG.fromstring(cfg_rules)

sentences = [
    "the purchase price includes two ancillary companies .".split(),
    "the guild began a strike against the TV and movie industry in March 1988 .".split(),
    'the guild bought one ancillary company .'.split()
]
for s in sentences:
    grammar.check_coverage(s)

print(grammar.is_flexible_chomsky_normal_form())
print(grammar.is_chomsky_normal_form())


True
True


In [154]:
from nltk.parse.chart import BottomUpChartParser
parser = BottomUpChartParser(grammar)

sentences = ['the purchase price includes two ancillary companies .'.split(),
                'the guild began a strike against the TV and movie industry in March 1988 .'.split(),
                "the purchase price includes two ancillary companies .".split(),
                'the guild bought one ancillary company .'.split()]

for sent in sentences:
    for p in parser.parse(sent):
        p.draw()

In [8]:
from nltk.corpus import treebank
print(treebank.parsed_sents()[0])
print(treebank.parsed_sents()[1])

(S
  (NP-SBJ
    (NP (NNP Pierre) (NNP Vinken))
    (, ,)
    (ADJP (NP (CD 61) (NNS years)) (JJ old))
    (, ,))
  (VP
    (MD will)
    (VP
      (VB join)
      (NP (DT the) (NN board))
      (PP-CLR (IN as) (NP (DT a) (JJ nonexecutive) (NN director)))
      (NP-TMP (NNP Nov.) (CD 29))))
  (. .))
(S
  (NP-SBJ (NNP Mr.) (NNP Vinken))
  (VP
    (VBZ is)
    (NP-PRD
      (NP (NN chairman))
      (PP
        (IN of)
        (NP
          (NP (NNP Elsevier) (NNP N.V.))
          (, ,)
          (NP (DT the) (NNP Dutch) (VBG publishing) (NN group))))))
  (. .))


In [133]:
from nltk.grammar import CFG, Nonterminal

prods = list({production for sent in treebank.parsed_sents() for production in sent.productions()})
t_grammar = CFG(Nonterminal('S'), prods)

In [134]:
sents = [
    'Mr. Vinken is chairman .'.split(),
    'Stocks rose .'.split(),
    'Alan introduced a plan .'.split()
]

t_parser = BottomUpChartParser(t_grammar)

    

In [131]:
parses = 0
for s in sents[:1]:
    for p in t_parser.parse(s):
        if parses < 5:
            print(p)
        parses += 1
        
print(parses)

(S
  (NP-SBJ-1 (NP (NNP Mr.)) (FRAG (NP (NNP Vinken))))
  (VP (VBZ is) (NP-CLR (NN chairman)))
  (. .))
(S
  (NP-SBJ-1 (NP (NNP Mr.)) (FRAG (NP (NP (NNP Vinken)))))
  (VP (VBZ is) (NP-CLR (NN chairman)))
  (. .))
(S
  (NP-SBJ-1 (NP (NP (NNP Mr.))) (FRAG (NP (NNP Vinken))))
  (VP (VBZ is) (NP-CLR (NN chairman)))
  (. .))
(S
  (NP-SBJ-1 (NP (NP (NNP Mr.))) (FRAG (NP (NP (NNP Vinken)))))
  (VP (VBZ is) (NP-CLR (NN chairman)))
  (. .))
(S
  (NP-SBJ-1 (NP (NNP Mr.)) (FRAG (ADVP (NNP Vinken))))
  (VP (VBZ is) (NP-CLR (NN chairman)))
  (. .))
32852


In [150]:
transitions = {}
total = 0

for sent in treebank.parsed_sents():
    for parse in sent:
        for prod in parse.productions():
            if prod.lhs() == Nonterminal('S'):
                total += 1
                if transitions.get(prod.rhs()):
                    transitions[prod.rhs()] += 1
                else:
                    transitions[prod.rhs()] = 1


In [153]:
filt_trans = {}
over_5 = 0
for k, v in transitions.items():
    if v >= 5:
        filt_trans[k] = (v, v / total)
    

In [145]:
filt_trans = {k: (v, v/ over_5) for k, v in filt_trans.items()}

In [152]:
filt_trans

{(-NONE-,): (477, 0.09187211093990755),
 (NP-SBJ, ADJP-PRD): (46, 0.008859784283513097),
 (NP-SBJ, ADVP, VP): (51, 0.009822804314329739),
 (NP-SBJ, ADVP-MNR, VP): (5, 0.000963020030816641),
 (NP-SBJ, ADVP-TMP, VP): (55, 0.01059322033898305),
 (NP-SBJ, NP-PRD): (47, 0.009052388289676425),
 (NP-SBJ, PP-PRD): (5, 0.000963020030816641),
 (NP-SBJ, RB, VP): (5, 0.000963020030816641),
 (NP-SBJ, VP): (3387, 0.6523497688751926),
 (NP-SBJ, ``, VP): (16, 0.0030816640986132513),
 (NP-SBJ-1, ADVP, VP): (6, 0.0011556240369799693),
 (NP-SBJ-1, ADVP-TMP, VP): (6, 0.0011556240369799693),
 (NP-SBJ-1, VP): (368, 0.07087827426810478),
 (NP-SBJ-1, ``, VP): (5, 0.000963020030816641),
 (NP-SBJ-2, VP): (201, 0.03871340523882897),
 (NP-SBJ-3, VP): (66, 0.012711864406779662),
 (NP-SBJ-4, VP): (16, 0.0030816640986132513),
 (PP, ,, NP-SBJ, VP): (6, 0.0011556240369799693),
 (PP, NP-SBJ, VP): (5, 0.000963020030816641),
 (PP-TMP, ,, NP-SBJ, VP): (5, 0.000963020030816641),
 (PP-TMP, NP-SBJ, VP): (10, 0.00192604006163

In [97]:
from nltk import induce_pcfg
from nltk import InsideChartParser

prods = list({production for sent in treebank.parsed_sents() for production in sent.productions()})
g_pfcg = induce_pcfg(Nonterminal('S'), prods)

In [110]:
p_parser = InsideChartParser(g_pfcg, beam_size=400)


In [111]:
sents = [
    'Mr. Vinken is chairman .'.split(),
    'Stocks rose .'.split(),
    'Alan introduced a plan .'.split()
]

for sent in sents:
    print(sent)
    for p in p_parser.parse(sent):
        print(p)

['Mr.', 'Vinken', 'is', 'chairman', '.']
(S
  (NP-TTL-SBJ (NNP Mr.) (NNP Vinken))
  (VP (VP (VP (VP (VP (VBZ is)) (NP (NN chairman))))))
  (. .)) (p=2.11819e-37)
['Stocks', 'rose', '.']
(S (NP-SBJ-112 (NNS Stocks)) (VP (VBD rose)) (. .)) (p=2.72739e-13)
['Alan', 'introduced', 'a', 'plan', '.']


In [93]:
list(parse)

[ProbabilisticTree('S', [ProbabilisticTree('NP-SBJ-112', [ProbabilisticTree('NNS', ['Stocks']) (p=0.0006939625260235947)]) (p=0.0006939625260235947), ProbabilisticTree('VP', [ProbabilisticTree('VBD', ['rose']) (p=0.0019342359767891683)]) (p=9.102286949596086e-07), ProbabilisticTree('.', ['.']) (p=0.3333333333333333)]) (p=2.727394664997107e-13)]

In [119]:
list(p_parser.parse(['you', 'are', 'sleeping']))

[ProbabilisticTree('S', [ProbabilisticTree('NP-SBJ-3', [ProbabilisticTree('PRP', ['you']) (p=0.04)]) (p=0.0012903225806451613), ProbabilisticTree('ADVP-TMP', [ProbabilisticTree('IN', ['are']) (p=0.008403361344537815)]) (p=0.00021547080370609782), ProbabilisticTree('VP', [ProbabilisticTree('VBG', ['sleeping']) (p=0.0017513134851138354)]) (p=8.241475224065108e-07)]) (p=2.968071686868129e-16),
 ProbabilisticTree('S', [ProbabilisticTree('NP-SBJ-3', [ProbabilisticTree('PRP', ['you']) (p=0.04)]) (p=0.0012903225806451613), ProbabilisticTree('ADVP', [ProbabilisticTree('IN', ['are']) (p=0.008403361344537815)]) (p=0.00014005602240896358), ProbabilisticTree('VP', [ProbabilisticTree('VBG', ['sleeping']) (p=0.0017513134851138354)]) (p=8.241475224065108e-07)]) (p=1.9292465964642834e-16),
 ProbabilisticTree('S', [ProbabilisticTree('NP-SBJ-2', [ProbabilisticTree('PRP', ['you']) (p=0.04)]) (p=0.0006557377049180328), ProbabilisticTree('ADVP', [ProbabilisticTree('IN', ['are']) (p=0.008403361344537815)]) 