Skip to content

Commit

Permalink
Hunter upgraded (mostly for comments and separation-symbol); ISO EBNF…
Browse files Browse the repository at this point in the history
… source cloned in three

git-svn-id: https://slps.svn.sourceforge.net/svnroot/slps@1058 ab42f6e0-554d-0410-b580-99e487e6eeb2
  • Loading branch information
grammarware committed May 26, 2011
1 parent 5c3f0ba commit f470081
Show file tree
Hide file tree
Showing 15 changed files with 270 additions and 15 deletions.
135 changes: 125 additions & 10 deletions topics/grammars/hunter.py
Expand Up @@ -50,6 +50,28 @@ def isQNumber(x):
else:
return reduce(lambda a,b:a and b=='.' or b.isdigit(),x,True)

def removeComments(ts,s,e):
    """Strip comment spans from the token stream.

    ts -- list of tokens
    s  -- start-comment delimiter token
    e  -- end-comment delimiter token

    Assumes comments are never nested (see the call site). Returns a new
    token list with every s...e span removed.
    """
    while s in ts:
        i = ts.index(s)
        # Special case: the comment start symbol is being used as a quoted
        # terminal (e.g. '(*'). Glue the three tokens back into a single
        # terminal token and keep scanning past it.
        # NOTE(review): the guard is i>1, so a quoted start symbol at
        # position 1 (terminal opener at index 0) is not caught -- confirm
        # whether i>0 was intended.
        if i>1 and 'start-terminal-symbol' in config.keys() and ts[i-1:i+2]==[config['start-terminal-symbol'],s,config['end-terminal-symbol']]:
            print('STEP 0: adjusted for the comment starting symbol being used as a terminal.')
            nts = ts[:i-1]
            nts.append(ts[i-1]+ts[i]+ts[i+1])
            nts.extend(ts[i+2:])
            ts = nts
            continue
        j = endOfContext(ts,i,e)
        if j<0:
            # BUG FIX: previously j was reset to i, so the slice below removed
            # nothing and the surrounding while-loop spun forever on a
            # dangling start delimiter. Report the problem and give up on
            # comment stripping instead.
            print('STEP 0 error: mismatched comment delimiters.')
            return ts
        # endOfContext is presumed to return the index just past the closing
        # delimiter -- TODO confirm against its definition.
        ts = ts[:i] + ts[j:]
    return ts

def splitTokenStream(s):
ts = [s[0]]
i = 1
Expand All @@ -65,9 +87,25 @@ def splitTokenStream(s):
ts.append(s[i])
alpha = isAlpha(s[i])
i += 1
return filter(lambda x:x not in [' ',' ',' '],ts)
return list(filter(lambda x:x not in [' ',' ',' '],ts))
# not space, not hard space, not tab; newlines are preserved for now

def reconsiderSpaces(ts,sep,vs):
    """Re-glue tokens of multi-word nonterminal names with single spaces.

    ts  -- token stream; the first token is kept as-is
    vs  -- reserved metasymbol tokens (the configuration values) that must
           always stand alone; a newline is treated as reserved too
    sep -- the concatenate-symbol: adjacent tokens NOT separated by it are
           parts of one multi-word name and get joined with a space

    Returns a new token list.
    """
    nts = [ts[0]]
    vs = list(vs)
    vs.append('\n')
    for x in ts[1:]:
        if x == sep:
            # Concatenation: open a fresh (empty) slot for the next symbol.
            nts.append('')
        elif nts[-1] == '':
            # BUG FIX: an ordinary token right after sep previously fell
            # through to the glue branch below and acquired a spurious
            # leading space ('' + ' ' + x); fill the empty slot directly.
            nts[-1] = x
        elif nts[-1] in vs or x in vs:
            # Reserved metasymbols never merge with their neighbours.
            nts.append(x)
        else:
            # Two ordinary adjacent tokens belong to one multi-word name.
            nts[-1] += ' ' + x
    return nts

def readConfig(f):
global debug
cfg = ET.parse(f)
Expand Down Expand Up @@ -746,18 +784,59 @@ def postfix2confix(p):
q.extend(p[w+1:])
p = q
return p

def useTerminatorToFixProds(ps,ts):
    """Split productions that were glued together, using the terminator.

    ps -- list of productions (token lists); each production appears to
          start with a marker/label element followed by the nonterminal
          name (presumably as built by useDefiningSymbol -- TODO confirm)
    ts -- the terminator-symbol token

    Returns a new list of productions, each cut off just before a
    terminator occurrence. NOTE(review): tokens after the LAST terminator
    in a production are dropped when no further terminator follows --
    confirm this is intended (the last production may lack a terminator).
    """
    # TODO: will not work with labels
    nps = []
    for p in ps:
        while ts in p:
            i = p.index(ts)
            # Everything before the terminator is one finished production.
            nps.append(p[:i])
            # The follow-up production reuses the same leading marker element.
            np = [nps[-1][0]]
            if config['defining-symbol'] not in p[i+1:]:
                # No defining symbol after the terminator: the remainder must
                # be empty (modulo ignorable newlines), otherwise give up and
                # return what has been split so far.
                tail = p[i+1:]
                if 'ignore-extra-newlines' in config.keys():
                    while '\n' in tail:
                        tail.remove('\n')
                if len(tail)>0:
                    print('STEP 4 problem: terminator symbol without proper defining symbol context.',tail)
                    return nps
                else:
                    p = tail
                    continue
            else:
                # Tokens between the terminator and the next defining symbol
                # form the nonterminal name of the following production.
                nt = p[i+1:p.index(config['defining-symbol'])]
                if 'ignore-extra-newlines' in config.keys():
                    while '\n' in nt:
                        nt.remove('\n')
                if len(nt) != 1:
                    # Ambiguous name: keep all tokens joined with spaces.
                    print('STEP 4 problem: cannot determine nonterminal name from',nt)
                    nt = ' '.join(nt)
                else:
                    nt = nt[0]
                np.append(nt)
                # The rest (after the defining symbol) becomes the body of the
                # next production; the while-loop keeps splitting it.
                np.extend(p[p.index(config['defining-symbol'])+1:])
                #print('<<<p<<<',p)
                p = np
                #print('>>>p>>>',p)
    return nps

if __name__ == "__main__":
if len(sys.argv) != 4:
print('Usage:')
print(' extract.py input.txt config.edd output.bgf')
sys.exit(-1)
#f = open('src.grammar.txt','r')
f = open(sys.argv[1],'r')
readConfig(sys.argv[2])
# STEP 0: read the file, remove whitespace (?)
print('STEP 0: reading the file, removing whitespace, getting the configuration.')
tokens = list(splitTokenStream(f.read()))
print('STEP 0: reading the file, removing whitespace and comments.')
tokens = splitTokenStream(f.read())
f.close()
readConfig(sys.argv[2])
if 'start-comment-symbol' in config.keys() and 'end-comment-symbol' in config.keys():
# remove comments
# assumption: comments are never nested!
tokens = removeComments(mapglue(mapglue(tokens,config['start-comment-symbol']),config['end-comment-symbol']),config['start-comment-symbol'],config['end-comment-symbol'])
if debug:
print(tokens)
# STEP 1: assemble terminal symbols
Expand Down Expand Up @@ -796,17 +875,38 @@ def postfix2confix(p):
if debug:
print(tokens)
# STEP 4: slice according to defining-symbol
print('STEP 4: splitting the token stream into productions according to defining-symbol.')
print('STEP 4: splitting the token stream into productions.')
if 'nonterminals-may-contain-spaces' in config.keys() and 'concatenate-symbol' in config.keys():
# can only treat them together, because spaces in names without concatenation symbol are highly ambiguous
# and concatenation symbols are never used if nonterminal names do not have spaces
tokens = reconsiderSpaces(tokens,config['concatenate-symbol'],config.values())
if 'defining-symbol' in config.keys():
prods = useDefiningSymbol(tokens,config['defining-symbol'])
else:
print('STEP 4 skipped, sorry: defining-symbol is not specified.')
# TODO
# STEP 4a.1: [sanity check] Infer terminator-symbol
print('STEP 4: inferring terminator-symbol by looking at the productions.')
if debug:
print(prods)
if 'terminator-symbol' not in config.keys():
print('The grammar is perceived like this:')
for p in prods:
print('\t',p[1],'is defined as',p[2:])
print('STEP 4: inferring terminator-symbol by looking at the productions.')
if 'terminator-symbol' in config.keys():
# we do have the terminator, but suppose we also had definition symbol!
# TODO otherwise
ts = findCommonTail(prods[:-1])
if ts:
need2fix = [-1]
prob = 100
else:
(need2fix,ts,prob) = findMostProbableTail(prods)
if ''.join(ts) == config['terminator-symbol']:
print('STEP 4 confirmed terminator-symbol, congratulations!')
else:
print('STEP 4 would have thought that terminator-symbol is',ts,'and not',config['terminator-symbol'])
# now let's fix productions that were joined together
prods = useTerminatorToFixProds(prods,config['terminator-symbol'])
else:
ts = findCommonTail(prods[:-1])
if ts:
print('STEP 4 successful: inferred terminator-symbol:',ts)
Expand All @@ -824,15 +924,27 @@ def postfix2confix(p):
print('%40s'%p[1],'>>>>>>',p[-2:])
# STEP 4a.2: adjusting the terminator-symbol on the unfit productions
poststep4 = 0
if debug:
print('The grammar is perceived like this:')
for p in prods:
print('\t',p[1],'is defined as',p[2:])

for f in need2fix:
for i in range(0,len(config['terminator-symbol'])):
if prods[f][-len(config['terminator-symbol'])+i:] == config['terminator-symbol'][:len(config['terminator-symbol'])-i]:
prods[f] = prods[f][:-len(config['terminator-symbol'])+i]
prods[f].extend(config['terminator-symbol'])
poststep4 += 1
break
if ''.join(prods[f][-len(config['terminator-symbol'])-1:-1]) == config['terminator-symbol'] and prods[f][-1] == '\n':
prods[f].pop()
poststep4 += 1
if poststep4 > 0:
print('STEP 4 also adjusted',poststep4,'productions that did not quite fit the expectations.')
if debug:
print('The grammar is perceived like this:')
for p in prods:
print('\t',p[1],'is defined as',p[2:])
# STEP 4b: splitting the token stream into productions according to terminator-symbol; inferring defining-symbol
# TODO
prods = [p[:-(len(config['terminator-symbol']))] if p[-(len(config['terminator-symbol'])):] == config['terminator-symbol'] else p for p in prods]
Expand Down Expand Up @@ -880,11 +992,14 @@ def postfix2confix(p):
# STEP X: validating bracketing?
# ...
# RESULT
if 'nonterminals-may-contain-spaces' in config.keys():
#
prods = [[x.replace(' ','_') for x in p] for p in prods]
print('LAST STEP: replacing spaces with underscores for BGF compatibility and readability.')
if debug:
print('RESULT:')
for p in prods:
print(p[0],'is defined as:')
print('\t',p[2:])
print('\t',p[1],'is defined as:',p[2:])
# FINAL STEP: compose BGF
bgf = BGF.Grammar()
for q in prods:
Expand Down
6 changes: 6 additions & 0 deletions topics/grammars/metasyntax/ebnf-iso-1/README.txt
@@ -0,0 +1,6 @@
ISO/IEC 14977 : 1996(E)
Final draft version, SC22/N2249: http://www.cl.cam.ac.uk/~mgk25/iso-14977.pdf

src.8.1.txt:
8.1 The syntax of Extended BNF, pages 8–10

@@ -1,11 +1,6 @@
ISO/IEC 14977 : 1996(E)
Final draft version, SC22/N2249: http://www.cl.cam.ac.uk/~mgk25/iso-14977.pdf

src.8.1.txt:
8.1 The syntax of Extended BNF, pages 8–10

src.8.2.txt:
8.2 Extended BNF used to define itself informally, page 10

src.8.3.txt:
8.3 Extended BNF defined informally, page 10
6 changes: 6 additions & 0 deletions topics/grammars/metasyntax/ebnf-iso-3/Makefile
@@ -0,0 +1,6 @@
# Extract the grammar of section 8.3 of ISO/IEC 14977 ("Extended BNF defined
# informally") with the hunter, then apply post-extraction cleanup and
# refactoring transformations to obtain the final BGF grammar.
extract:
	../../hunter.py src.8.3.txt config.edd ebnf-iso-3.raw.bgf
	${tooldir}/xbgf post-extraction.xbgf ebnf-iso-3.raw.bgf ebnf-iso-3.ext.bgf
	${tooldir}/xbgf refactor.xbgf ebnf-iso-3.ext.bgf ebnf-iso-3.bgf

include ../../Makefile.include
5 changes: 5 additions & 0 deletions topics/grammars/metasyntax/ebnf-iso-3/README.txt
@@ -0,0 +1,5 @@
ISO/IEC 14977 : 1996(E)
Final draft version, SC22/N2249: http://www.cl.cam.ac.uk/~mgk25/iso-14977.pdf

src.8.3.txt:
8.3 Extended BNF defined informally, page 10
21 changes: 21 additions & 0 deletions topics/grammars/metasyntax/ebnf-iso-3/config.edd
@@ -0,0 +1,21 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- EDD (extraction) configuration for hunter.py: the metasymbols of the
     ISO/IEC 14977 EBNF notation as used in section 8.3 of the standard. -->
<edd:config xmlns:edd="http://planet-sl.org/edd">
<!-- core production syntax: name = alt1 / alt2 , ... . -->
<defining-symbol>=</defining-symbol>
<definition-separator-symbol>/</definition-separator-symbol>
<concatenate-symbol>,</concatenate-symbol>
<!-- nonterminal names such as "meta identifier" may contain spaces -->
<nonterminals-may-contain-spaces/>
<terminator-symbol>.</terminator-symbol>
<!-- comments are (* ... *); hunter.py assumes they are never nested -->
<start-comment-symbol>(*</start-comment-symbol>
<end-comment-symbol>*)</end-comment-symbol>
<start-terminal-symbol>’</start-terminal-symbol>
<end-terminal-symbol>’</end-terminal-symbol>
<!-- ISO 14977 alternative bracket pairs: (/ ... /) optional, (: ... :) repeated -->
<start-option-symbol>(/</start-option-symbol>
<end-option-symbol>/)</end-option-symbol>
<start-star-symbol>(:</start-star-symbol>
<end-star-symbol>:)</end-star-symbol>
<ignore-extra-newlines/>
<!-- masking: treat the quoted quote token "’" as the terminal ’ -->
<mask>
<token>"’"</token>
<terminal>’</terminal>
</mask>
</edd:config>
35 changes: 35 additions & 0 deletions topics/grammars/metasyntax/ebnf-iso-3/post-extraction.xbgf
@@ -0,0 +1,35 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Post-extraction cleanup for the ebnf-iso-3 grammar: collapse the
     subtracted character classes (CHARACTER minus a quoted character) into
     plain CHARACTER, and turn the EMPTY nonterminal into a terminal.
     NOTE(review): the subtraction is lost by these replacements -
     presumably acceptable at this abstraction level; confirm. -->
<xbgf:sequence xmlns:xbgf="http://planet-sl.org/xbgf" xmlns:bgf="http://planet-sl.org/bgf">
<!-- CHARACTER - ’’’  =>  CHARACTER -->
<xbgf:replace>
<bgf:expression>
<nonterminal>CHARACTER_-_’’’</nonterminal>
</bgf:expression>
<bgf:expression>
<nonterminal>CHARACTER</nonterminal>
</bgf:expression>
</xbgf:replace>
<!-- CHARACTER - ’"’  =>  CHARACTER -->
<xbgf:replace>
<bgf:expression>
<nonterminal>CHARACTER_-_’"’</nonterminal>
</bgf:expression>
<bgf:expression>
<nonterminal>CHARACTER</nonterminal>
</bgf:expression>
</xbgf:replace>
<!-- CHARACTER - ’?’  =>  CHARACTER -->
<xbgf:replace>
<bgf:expression>
<nonterminal>CHARACTER_-_’?’</nonterminal>
</bgf:expression>
<bgf:expression>
<nonterminal>CHARACTER</nonterminal>
</bgf:expression>
</xbgf:replace>
<!-- the EMPTY nonterminal is really a terminal -->
<xbgf:replace>
<bgf:expression>
<nonterminal>EMPTY</nonterminal>
</bgf:expression>
<bgf:expression>
<terminal>EMPTY</terminal>
</bgf:expression>
</xbgf:replace>
</xbgf:sequence>
72 changes: 72 additions & 0 deletions topics/grammars/metasyntax/ebnf-iso-3/refactor.xbgf
@@ -0,0 +1,72 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Refactoring for the ebnf-iso-3 grammar: each massage folds a sequence
     of the form (X, X*) into the equivalent X+ (transitive closure), for
     X in {SYNTAX_RULE, DIGIT, CHARACTER}. -->
<xbgf:sequence xmlns:xbgf="http://planet-sl.org/xbgf" xmlns:bgf="http://planet-sl.org/bgf">
<!-- SYNTAX_RULE, SYNTAX_RULE*  =>  SYNTAX_RULE+ -->
<xbgf:massage>
<bgf:expression>
<sequence>
<bgf:expression>
<nonterminal>SYNTAX_RULE</nonterminal>
</bgf:expression>
<bgf:expression>
<star>
<bgf:expression>
<nonterminal>SYNTAX_RULE</nonterminal>
</bgf:expression>
</star>
</bgf:expression>
</sequence>
</bgf:expression>
<bgf:expression>
<plus>
<bgf:expression>
<nonterminal>SYNTAX_RULE</nonterminal>
</bgf:expression>
</plus>
</bgf:expression>
</xbgf:massage>
<!-- DIGIT, DIGIT*  =>  DIGIT+ -->
<xbgf:massage>
<bgf:expression>
<sequence>
<bgf:expression>
<nonterminal>DIGIT</nonterminal>
</bgf:expression>
<bgf:expression>
<star>
<bgf:expression>
<nonterminal>DIGIT</nonterminal>
</bgf:expression>
</star>
</bgf:expression>
</sequence>
</bgf:expression>
<bgf:expression>
<plus>
<bgf:expression>
<nonterminal>DIGIT</nonterminal>
</bgf:expression>
</plus>
</bgf:expression>
</xbgf:massage>
<!-- CHARACTER, CHARACTER*  =>  CHARACTER+ -->
<xbgf:massage>
<bgf:expression>
<sequence>
<bgf:expression>
<nonterminal>CHARACTER</nonterminal>
</bgf:expression>
<bgf:expression>
<star>
<bgf:expression>
<nonterminal>CHARACTER</nonterminal>
</bgf:expression>
</star>
</bgf:expression>
</sequence>
</bgf:expression>
<bgf:expression>
<plus>
<bgf:expression>
<nonterminal>CHARACTER</nonterminal>
</bgf:expression>
</plus>
</bgf:expression>
</xbgf:massage>
</xbgf:sequence>

0 comments on commit f470081

Please sign in to comment.