diff --git a/topics/grammars/hunter.py b/topics/grammars/hunter.py index 688541c8..c48f7638 100755 --- a/topics/grammars/hunter.py +++ b/topics/grammars/hunter.py @@ -50,6 +50,28 @@ def isQNumber(x): else: return reduce(lambda a,b:a and b=='.' or b.isdigit(),x,True) +def removeComments(ts,s,e): + while s in ts: + i = ts.index(s) + # special case + if i>1 and 'start-terminal-symbol' in config.keys() and ts[i-1:i+2]==[config['start-terminal-symbol'],s,config['end-terminal-symbol']]: + print('STEP 0: adjusted for the comment starting symbol being used as a terminal.') + nts = ts[:i-1] + nts.append(ts[i-1]+ts[i]+ts[i+1]) + nts.extend(ts[i+2:]) + ts = nts + continue + j = endOfContext(ts,i,e) + if j<0: + print('STEP 0 error: mismatched comment delimiters.') + j = i + nts = ts[:i] + nts.extend(ts[j:]) + #print('<<<',ts) + ts = nts + #print('>>>',ts) + return ts + def splitTokenStream(s): ts = [s[0]] i = 1 @@ -65,9 +87,25 @@ def splitTokenStream(s): ts.append(s[i]) alpha = isAlpha(s[i]) i += 1 - return filter(lambda x:x not in [' ',' ',' '],ts) + return list(filter(lambda x:x not in [' ',' ',' '],ts)) # not space, not hard space, not tab; newlines are preserved for now +def reconsiderSpaces(ts,sep,vs): + nts = [ts[0]] + vs = list(vs) + vs.append('\n') + for x in ts[1:]: + if x == sep: + nts.append('') + elif nts[-1] in vs or x in vs: + if nts[-1]=='': + nts[-1] = x + else: + nts.append(x) + else: + nts[-1] += ' ' + x + return nts + def readConfig(f): global debug cfg = ET.parse(f) @@ -746,6 +784,43 @@ def postfix2confix(p): q.extend(p[w+1:]) p = q return p + +def useTerminatorToFixProds(ps,ts): + # TODO: will not work with labels + nps = [] + for p in ps: + while ts in p: + i = p.index(ts) + nps.append(p[:i]) + np = [nps[-1][0]] + if config['defining-symbol'] not in p[i+1:]: + tail = p[i+1:] + if 'ignore-extra-newlines' in config.keys(): + while '\n' in tail: + tail.remove('\n') + if len(tail)>0: + print('STEP 4 problem: terminator symbol without proper defining symbol context.',tail) + return nps + else: + p = tail + continue + else: + nt = p[i+1:p.index(config['defining-symbol'])] + if 'ignore-extra-newlines' in config.keys(): + while '\n' in nt: + nt.remove('\n') + if len(nt) != 1: + print('STEP 4 problem: cannot determine nonterminal name from',nt) + nt = ' '.join(nt) + else: + nt = nt[0] + np.append(nt) + np.extend(p[p.index(config['defining-symbol'])+1:]) + #print('<<
>>p>>>',p)
+ return nps
+
if __name__ == "__main__":
if len(sys.argv) != 4:
print('Usage:')
@@ -753,11 +828,15 @@ def postfix2confix(p):
sys.exit(-1)
#f = open('src.grammar.txt','r')
f = open(sys.argv[1],'r')
+ readConfig(sys.argv[2])
# STEP 0: read the file, remove whitespace (?)
- print('STEP 0: reading the file, removing whitespace, getting the configuration.')
- tokens = list(splitTokenStream(f.read()))
+ print('STEP 0: reading the file, removing whitespace and comments.')
+ tokens = splitTokenStream(f.read())
f.close()
- readConfig(sys.argv[2])
+ if 'start-comment-symbol' in config.keys() and 'end-comment-symbol' in config.keys():
+ # remove comments
+ # assumption: comments are never nested!
+ tokens = removeComments(mapglue(mapglue(tokens,config['start-comment-symbol']),config['end-comment-symbol']),config['start-comment-symbol'],config['end-comment-symbol'])
if debug:
print(tokens)
# STEP 1: assemble terminal symbols
@@ -796,17 +875,38 @@ def postfix2confix(p):
if debug:
print(tokens)
# STEP 4: slice according to defining-symbol
- print('STEP 4: splitting the token stream into productions according to defining-symbol.')
+ print('STEP 4: splitting the token stream into productions.')
+ if 'nonterminals-may-contain-spaces' in config.keys() and 'concatenate-symbol' in config.keys():
+ # can only treat them together, because spaces in names without concatenation symbol are highly ambiguous
+ # and concatenation symbols are never used if nonterminal names do not have spaces
+ tokens = reconsiderSpaces(tokens,config['concatenate-symbol'],config.values())
if 'defining-symbol' in config.keys():
prods = useDefiningSymbol(tokens,config['defining-symbol'])
else:
print('STEP 4 skipped, sorry: defining-symbol is not specified.')
# TODO
# STEP 4a.1: [sanity check] Infer terminator-symbol
- print('STEP 4: inferring terminator-symbol by looking at the productions.')
if debug:
- print(prods)
- if 'terminator-symbol' not in config.keys():
+ print('The grammar is perceived like this:')
+ for p in prods:
+ print('\t',p[1],'is defined as',p[2:])
+ print('STEP 4: inferring terminator-symbol by looking at the productions.')
+ if 'terminator-symbol' in config.keys():
+ # we do have the terminator, but suppose we also had definition symbol!
+ # TODO otherwise
+ ts = findCommonTail(prods[:-1])
+ if ts:
+ need2fix = [-1]
+ prob = 100
+ else:
+ (need2fix,ts,prob) = findMostProbableTail(prods)
+ if ''.join(ts) == config['terminator-symbol']:
+ print('STEP 4 confirmed terminator-symbol, congratulations!')
+ else:
+ print('STEP 4 would have thought that terminator-symbol is',ts,'and not',config['terminator-symbol'])
+ # now let's fix productions that were joined together
+ prods = useTerminatorToFixProds(prods,config['terminator-symbol'])
+ else:
ts = findCommonTail(prods[:-1])
if ts:
print('STEP 4 successful: inferred terminator-symbol:',ts)
@@ -824,6 +924,11 @@ def postfix2confix(p):
print('%40s'%p[1],'>>>>>>',p[-2:])
# STEP 4a.2: adjusting the terminator-symbol on the unfit productions
poststep4 = 0
+ if debug:
+ print('The grammar is perceived like this:')
+ for p in prods:
+ print('\t',p[1],'is defined as',p[2:])
+
for f in need2fix:
for i in range(0,len(config['terminator-symbol'])):
if prods[f][-len(config['terminator-symbol'])+i:] == config['terminator-symbol'][:len(config['terminator-symbol'])-i]:
@@ -831,8 +936,15 @@ def postfix2confix(p):
prods[f].extend(config['terminator-symbol'])
poststep4 += 1
break
+ if ''.join(prods[f][-len(config['terminator-symbol'])-1:-1]) == config['terminator-symbol'] and prods[f][-1] == '\n':
+ prods[f].pop()
+ poststep4 += 1
if poststep4 > 0:
print('STEP 4 also adjusted',poststep4,'productions that did not quite fit the expectations.')
+ if debug:
+ print('The grammar is perceived like this:')
+ for p in prods:
+ print('\t',p[1],'is defined as',p[2:])
# STEP 4b: splitting the token stream into productions according to terminator-symbol; inferring defining-symbol
# TODO
prods = [p[:-(len(config['terminator-symbol']))] if p[-(len(config['terminator-symbol'])):] == config['terminator-symbol'] else p for p in prods]
@@ -880,11 +992,14 @@ def postfix2confix(p):
# STEP X: validating bracketing?
# ...
# RESULT
+ if 'nonterminals-may-contain-spaces' in config.keys():
+ #
+ prods = [[x.replace(' ','_') for x in p] for p in prods]
+ print('LAST STEP: replacing spaces with underscores for BGF compatibility and readability.')
if debug:
print('RESULT:')
for p in prods:
- print(p[0],'is defined as:')
- print('\t',p[2:])
+ print('\t',p[1],'is defined as:',p[2:])
# FINAL STEP: compose BGF
bgf = BGF.Grammar()
for q in prods:
diff --git a/topics/grammars/metasyntax/ebnf-iso/Makefile b/topics/grammars/metasyntax/ebnf-iso-1/Makefile
similarity index 100%
rename from topics/grammars/metasyntax/ebnf-iso/Makefile
rename to topics/grammars/metasyntax/ebnf-iso-1/Makefile
diff --git a/topics/grammars/metasyntax/ebnf-iso-1/README.txt b/topics/grammars/metasyntax/ebnf-iso-1/README.txt
new file mode 100644
index 00000000..9b6316a0
--- /dev/null
+++ b/topics/grammars/metasyntax/ebnf-iso-1/README.txt
@@ -0,0 +1,6 @@
+ISO/IEC 14977 : 1996(E)
+Final draft version, SC22/N2249: http://www.cl.cam.ac.uk/~mgk25/iso-14977.pdf
+
+src.8.1.txt:
+ 8.1 The syntax of Extended BNF, pages 8–10
+
diff --git a/topics/grammars/metasyntax/ebnf-iso/ebnf.iso.formal.bgf b/topics/grammars/metasyntax/ebnf-iso-1/ebnf.iso.formal.bgf
similarity index 100%
rename from topics/grammars/metasyntax/ebnf-iso/ebnf.iso.formal.bgf
rename to topics/grammars/metasyntax/ebnf-iso-1/ebnf.iso.formal.bgf
diff --git a/topics/grammars/metasyntax/ebnf-iso/ebnf.iso.informal.bgf b/topics/grammars/metasyntax/ebnf-iso-1/ebnf.iso.informal.bgf
similarity index 100%
rename from topics/grammars/metasyntax/ebnf-iso/ebnf.iso.informal.bgf
rename to topics/grammars/metasyntax/ebnf-iso-1/ebnf.iso.informal.bgf
diff --git a/topics/grammars/metasyntax/ebnf-iso/generalize.xbgf b/topics/grammars/metasyntax/ebnf-iso-1/generalize.xbgf
similarity index 100%
rename from topics/grammars/metasyntax/ebnf-iso/generalize.xbgf
rename to topics/grammars/metasyntax/ebnf-iso-1/generalize.xbgf
diff --git a/topics/grammars/metasyntax/ebnf-iso/src.8.1.txt b/topics/grammars/metasyntax/ebnf-iso-1/src.8.1.txt
similarity index 100%
rename from topics/grammars/metasyntax/ebnf-iso/src.8.1.txt
rename to topics/grammars/metasyntax/ebnf-iso-1/src.8.1.txt
diff --git a/topics/grammars/metasyntax/ebnf-iso/README.txt b/topics/grammars/metasyntax/ebnf-iso-2/README.txt
similarity index 59%
rename from topics/grammars/metasyntax/ebnf-iso/README.txt
rename to topics/grammars/metasyntax/ebnf-iso-2/README.txt
index c609c594..f50cea05 100644
--- a/topics/grammars/metasyntax/ebnf-iso/README.txt
+++ b/topics/grammars/metasyntax/ebnf-iso-2/README.txt
@@ -1,11 +1,6 @@
ISO/IEC 14977 : 1996(E)
Final draft version, SC22/N2249: http://www.cl.cam.ac.uk/~mgk25/iso-14977.pdf
-src.8.1.txt:
- 8.1 The syntax of Extended BNF, pages 8–10
-
src.8.2.txt:
8.2 Extended BNF used to define itself informally, page 10
-src.8.3.txt:
- 8.3 Extended BNF defined informally, page 10
diff --git a/topics/grammars/metasyntax/ebnf-iso/src.8.2.txt b/topics/grammars/metasyntax/ebnf-iso-2/src.8.2.txt
similarity index 100%
rename from topics/grammars/metasyntax/ebnf-iso/src.8.2.txt
rename to topics/grammars/metasyntax/ebnf-iso-2/src.8.2.txt
diff --git a/topics/grammars/metasyntax/ebnf-iso-3/Makefile b/topics/grammars/metasyntax/ebnf-iso-3/Makefile
new file mode 100644
index 00000000..b0b77268
--- /dev/null
+++ b/topics/grammars/metasyntax/ebnf-iso-3/Makefile
@@ -0,0 +1,6 @@
+extract:
+ ../../hunter.py src.8.3.txt config.edd ebnf-iso-3.raw.bgf
+ ${tooldir}/xbgf post-extraction.xbgf ebnf-iso-3.raw.bgf ebnf-iso-3.ext.bgf
+ ${tooldir}/xbgf refactor.xbgf ebnf-iso-3.ext.bgf ebnf-iso-3.bgf
+
+include ../../Makefile.include
diff --git a/topics/grammars/metasyntax/ebnf-iso-3/README.txt b/topics/grammars/metasyntax/ebnf-iso-3/README.txt
new file mode 100644
index 00000000..f1d30c65
--- /dev/null
+++ b/topics/grammars/metasyntax/ebnf-iso-3/README.txt
@@ -0,0 +1,5 @@
+ISO/IEC 14977 : 1996(E)
+Final draft version, SC22/N2249: http://www.cl.cam.ac.uk/~mgk25/iso-14977.pdf
+
+src.8.3.txt:
+ 8.3 Extended BNF defined informally, page 10
diff --git a/topics/grammars/metasyntax/ebnf-iso-3/config.edd b/topics/grammars/metasyntax/ebnf-iso-3/config.edd
new file mode 100644
index 00000000..c6109004
--- /dev/null
+++ b/topics/grammars/metasyntax/ebnf-iso-3/config.edd
@@ -0,0 +1,21 @@
+
+