-
Notifications
You must be signed in to change notification settings - Fork 0
/
task2_2_3.py
88 lines (64 loc) · 2.81 KB
/
task2_2_3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
from nltk.corpus import LazyCorpusLoader, BracketParseCorpusReader
from nltk import PCFG
from nltk.grammar import ProbabilisticProduction
from nltk import Tree, Nonterminal
from nltk.treetransforms import chomsky_normal_form
def simplify_functional_tag(tag):
if '-' in tag:
tag = tag.split('-')[0]
return tag
def get_tag(tree):
if isinstance(tree, Tree):
return Nonterminal(simplify_functional_tag(tree.label()))
else:
return tree
def tree_to_production(tree):
return ProbabilisticProduction(get_tag(tree), [get_tag(child) for child in tree], **{'prob': 0})
def tree_to_productions(tree):
yield tree_to_production(tree)
for child in tree:
if isinstance(child, Tree):
for prod in tree_to_productions(child):
yield prod
treebank = LazyCorpusLoader('treebank/combined', BracketParseCorpusReader, r'wsj_.*\.mrg')
def get_productions(productions):
probabilities = dict()
productions_to_return = list(set(productions))
for prod in productions:
if str(prod) in probabilities:
probabilities[str(prod)] += 1
else:
probabilities[str(prod)] = 1
amount_of_interior_nodes = len([prod.lhs() for prod in productions if prod.lhs() != Nonterminal('S')])
lhs_of_prods = set([prod.lhs() for prod in productions])
print('this is the amount of interior nodes: {}'.format(amount_of_interior_nodes))
for lhs in lhs_of_prods:
number_of_occurrences = 0
for prob in probabilities:
if prob.startswith(str(lhs) + " "):
number_of_occurrences += probabilities[prob]
for prob in probabilities:
if prob.startswith(str(lhs) + " "):
probabilities[prob] = probabilities[prob] / number_of_occurrences
for index in range(len(productions_to_return)):
prod = productions_to_return[index]
productions_to_return[index] = ProbabilisticProduction(prod.lhs(), prod.rhs(),
**{'prob': probabilities[str(prod)]})
return productions_to_return
def pcfg_learn(treebank, n):
productions = list()
for i in range(n):
for tree in treebank.parsed_sents()[:i+1]:
chomsky_normal_form(tree, factor='right', horzMarkov=1, vertMarkov=1, childChar='|', parentChar='^')
prod_gen = tree_to_productions(tree)
tree_to_append = next(prod_gen)
while tree_to_append:
productions.append(tree_to_append)
try:
tree_to_append = next(prod_gen)
except Exception as e:
tree_to_append = False
productions = get_productions(productions)
return PCFG(Nonterminal('S'), productions)
print(pcfg_learn(treebank, 200))
#print(pcfg_learn(treebank, 400))