Skip to content

Commit

Permalink
Hunter upgraded (mostly for comments and separation-symbol); ISO EBNF…
Browse files Browse the repository at this point in the history
… source cloned in three

git-svn-id: https://slps.svn.sourceforge.net/svnroot/slps@1058 ab42f6e0-554d-0410-b580-99e487e6eeb2
  • Loading branch information
grammarware committed May 26, 2011
1 parent 5c3f0ba commit f470081
Show file tree
Hide file tree
Showing 15 changed files with 270 additions and 15 deletions.
135 changes: 125 additions & 10 deletions topics/grammars/hunter.py
Expand Up @@ -50,6 +50,28 @@ def isQNumber(x):
else:
return reduce(lambda a,b:a and b=='.' or b.isdigit(),x,True)

def removeComments(ts,s,e):
    """Strip comment spans from the token stream.

    ts -- list of tokens
    s  -- start-comment delimiter token
    e  -- end-comment delimiter token

    Assumes comments are never nested (see the call site). Returns a new
    token list with every s...e span removed.
    """
    while s in ts:
        i = ts.index(s)
        # Special case: the comment start symbol is being used as a quoted
        # terminal (e.g. '(*'). Glue the three tokens back into a single
        # terminal token and keep scanning past it.
        # NOTE(review): the guard is i>1, so a quoted start symbol at
        # position 1 (terminal opener at index 0) is not caught -- confirm
        # whether i>0 was intended.
        if i>1 and 'start-terminal-symbol' in config.keys() and ts[i-1:i+2]==[config['start-terminal-symbol'],s,config['end-terminal-symbol']]:
            print('STEP 0: adjusted for the comment starting symbol being used as a terminal.')
            nts = ts[:i-1]
            nts.append(ts[i-1]+ts[i]+ts[i+1])
            nts.extend(ts[i+2:])
            ts = nts
            continue
        j = endOfContext(ts,i,e)
        if j<0:
            # BUG FIX: previously j was reset to i, so the slice below removed
            # nothing and the surrounding while-loop spun forever on a
            # dangling start delimiter. Report the problem and give up on
            # comment stripping instead.
            print('STEP 0 error: mismatched comment delimiters.')
            return ts
        # endOfContext is presumed to return the index just past the closing
        # delimiter -- TODO confirm against its definition.
        ts = ts[:i] + ts[j:]
    return ts

def splitTokenStream(s):
ts = [s[0]]
i = 1
Expand All @@ -65,9 +87,25 @@ def splitTokenStream(s):
ts.append(s[i])
alpha = isAlpha(s[i])
i += 1
return filter(lambda x:x not in [' ',' ',' '],ts)
return list(filter(lambda x:x not in [' ',' ',' '],ts))
# not space, not hard space, not tab; newlines are preserved for now

def reconsiderSpaces(ts,sep,vs):
    """Re-glue tokens of multi-word nonterminal names with single spaces.

    ts  -- token stream; the first token is kept as-is
    vs  -- reserved metasymbol tokens (the configuration values) that must
           always stand alone; a newline is treated as reserved too
    sep -- the concatenate-symbol: adjacent tokens NOT separated by it are
           parts of one multi-word name and get joined with a space

    Returns a new token list.
    """
    nts = [ts[0]]
    vs = list(vs)
    vs.append('\n')
    for x in ts[1:]:
        if x == sep:
            # Concatenation: open a fresh (empty) slot for the next symbol.
            nts.append('')
        elif nts[-1] == '':
            # BUG FIX: an ordinary token right after sep previously fell
            # through to the glue branch below and acquired a spurious
            # leading space ('' + ' ' + x); fill the empty slot directly.
            nts[-1] = x
        elif nts[-1] in vs or x in vs:
            # Reserved metasymbols never merge with their neighbours.
            nts.append(x)
        else:
            # Two ordinary adjacent tokens belong to one multi-word name.
            nts[-1] += ' ' + x
    return nts

def readConfig(f):
global debug
cfg = ET.parse(f)
Expand Down Expand Up @@ -746,18 +784,59 @@ def postfix2confix(p):
q.extend(p[w+1:])
p = q
return p

def useTerminatorToFixProds(ps,ts):
    """Split productions that were glued together, using the terminator.

    ps -- list of productions (token lists); each production appears to
          start with a marker/label element followed by the nonterminal
          name (presumably as built by useDefiningSymbol -- TODO confirm)
    ts -- the terminator-symbol token

    Returns a new list of productions, each cut off just before a
    terminator occurrence. NOTE(review): tokens after the LAST terminator
    in a production are dropped when no further terminator follows --
    confirm this is intended (the last production may lack a terminator).
    """
    # TODO: will not work with labels
    nps = []
    for p in ps:
        while ts in p:
            i = p.index(ts)
            # Everything before the terminator is one finished production.
            nps.append(p[:i])
            # The follow-up production reuses the same leading marker element.
            np = [nps[-1][0]]
            if config['defining-symbol'] not in p[i+1:]:
                # No defining symbol after the terminator: the remainder must
                # be empty (modulo ignorable newlines), otherwise give up and
                # return what has been split so far.
                tail = p[i+1:]
                if 'ignore-extra-newlines' in config.keys():
                    while '\n' in tail:
                        tail.remove('\n')
                if len(tail)>0:
                    print('STEP 4 problem: terminator symbol without proper defining symbol context.',tail)
                    return nps
                else:
                    p = tail
                    continue
            else:
                # Tokens between the terminator and the next defining symbol
                # form the nonterminal name of the following production.
                nt = p[i+1:p.index(config['defining-symbol'])]
                if 'ignore-extra-newlines' in config.keys():
                    while '\n' in nt:
                        nt.remove('\n')
                if len(nt) != 1:
                    # Ambiguous name: keep all tokens joined with spaces.
                    print('STEP 4 problem: cannot determine nonterminal name from',nt)
                    nt = ' '.join(nt)
                else:
                    nt = nt[0]
                np.append(nt)
                # The rest (after the defining symbol) becomes the body of the
                # next production; the while-loop keeps splitting it.
                np.extend(p[p.index(config['defining-symbol'])+1:])
                #print('<<<p<<<',p)
                p = np
                #print('>>>p>>>',p)
    return nps

if __name__ == "__main__":
if len(sys.argv) != 4:
print('Usage:')
print(' extract.py input.txt config.edd output.bgf')
sys.exit(-1)
#f = open('src.grammar.txt','r')
f = open(sys.argv[1],'r')
readConfig(sys.argv[2])
# STEP 0: read the file, remove whitespace (?)
print('STEP 0: reading the file, removing whitespace, getting the configuration.')
tokens = list(splitTokenStream(f.read()))
print('STEP 0: reading the file, removing whitespace and comments.')
tokens = splitTokenStream(f.read())
f.close()
readConfig(sys.argv[2])
if 'start-comment-symbol' in config.keys() and 'end-comment-symbol' in config.keys():
# remove comments
# assumption: comments are never nested!
tokens = removeComments(mapglue(mapglue(tokens,config['start-comment-symbol']),config['end-comment-symbol']),config['start-comment-symbol'],config['end-comment-symbol'])
if debug:
print(tokens)
# STEP 1: assemble terminal symbols
Expand Down Expand Up @@ -796,17 +875,38 @@ def postfix2confix(p):
if debug:
print(tokens)
# STEP 4: slice according to defining-symbol
print('STEP 4: splitting the token stream into productions according to defining-symbol.')
print('STEP 4: splitting the token stream into productions.')
if 'nonterminals-may-contain-spaces' in config.keys() and 'concatenate-symbol' in config.keys():
# can only treat them together, because spaces in names without concatenation symbol are highly ambiguous
# and concatenation symbols are never used if nonterminal names do not have spaces
tokens = reconsiderSpaces(tokens,config['concatenate-symbol'],config.values())
if 'defining-symbol' in config.keys():
prods = useDefiningSymbol(tokens,config['defining-symbol'])
else:
print('STEP 4 skipped, sorry: defining-symbol is not specified.')
# TODO
# STEP 4a.1: [sanity check] Infer terminator-symbol
print('STEP 4: inferring terminator-symbol by looking at the productions.')
if debug:
print(prods)
if 'terminator-symbol' not in config.keys():
print('The grammar is perceived like this:')
for p in prods:
print('\t',p[1],'is defined as',p[2:])
print('STEP 4: inferring terminator-symbol by looking at the productions.')
if 'terminator-symbol' in config.keys():
# we do have the terminator, but suppose we also had definition symbol!
# TODO otherwise
ts = findCommonTail(prods[:-1])
if ts:
need2fix = [-1]
prob = 100
else:
(need2fix,ts,prob) = findMostProbableTail(prods)
if ''.join(ts) == config['terminator-symbol']:
print('STEP 4 confirmed terminator-symbol, congratulations!')
else:
print('STEP 4 would have thought that terminator-symbol is',ts,'and not',config['terminator-symbol'])
# now let's fix productions that were joined together
prods = useTerminatorToFixProds(prods,config['terminator-symbol'])
else:
ts = findCommonTail(prods[:-1])
if ts:
print('STEP 4 successful: inferred terminator-symbol:',ts)
Expand All @@ -824,15 +924,27 @@ def postfix2confix(p):
print('%40s'%p[1],'>>>>>>',p[-2:])
# STEP 4a.2: adjusting the terminator-symbol on the unfit productions
poststep4 = 0
if debug:
print('The grammar is perceived like this:')
for p in prods:
print('\t',p[1],'is defined as',p[2:])

for f in need2fix:
for i in range(0,len(config['terminator-symbol'])):
if prods[f][-len(config['terminator-symbol'])+i:] == config['terminator-symbol'][:len(config['terminator-symbol'])-i]:
prods[f] = prods[f][:-len(config['terminator-symbol'])+i]
prods[f].extend(config['terminator-symbol'])
poststep4 += 1
break
if ''.join(prods[f][-len(config['terminator-symbol'])-1:-1]) == config['terminator-symbol'] and prods[f][-1] == '\n':
prods[f].pop()
poststep4 += 1
if poststep4 > 0:
print('STEP 4 also adjusted',poststep4,'productions that did not quite fit the expectations.')
if debug:
print('The grammar is perceived like this:')
for p in prods:
print('\t',p[1],'is defined as',p[2:])
# STEP 4b: splitting the token stream into productions according to terminator-symbol; inferring defining-symbol
# TODO
prods = [p[:-(len(config['terminator-symbol']))] if p[-(len(config['terminator-symbol'])):] == config['terminator-symbol'] else p for p in prods]
Expand Down Expand Up @@ -880,11 +992,14 @@ def postfix2confix(p):
# STEP X: validating bracketing?
# ...
# RESULT
if 'nonterminals-may-contain-spaces' in config.keys():
#
prods = [[x.replace(' ','_') for x in p] for p in prods]
print('LAST STEP: replacing spaces with underscores for BGF compatibility and readability.')
if debug:
print('RESULT:')
for p in prods:
print(p[0],'is defined as:')
print('\t',p[2:])
print('\t',p[1],'is defined as:',p[2:])
# FINAL STEP: compose BGF
bgf = BGF.Grammar()
for q in prods:
Expand Down
6 changes: 6 additions & 0 deletions topics/grammars/metasyntax/ebnf-iso-1/README.txt
@@ -0,0 +1,6 @@
ISO/IEC 14977 : 1996(E)
Final draft version, SC22/N2249: http://www.cl.cam.ac.uk/~mgk25/iso-14977.pdf

src.8.1.txt:
8.1 The syntax of Extended BNF, pages 8–10

@@ -1,11 +1,6 @@
ISO/IEC 14977 : 1996(E)
Final draft version, SC22/N2249: http://www.cl.cam.ac.uk/~mgk25/iso-14977.pdf

src.8.1.txt:
8.1 The syntax of Extended BNF, pages 8–10

src.8.2.txt:
8.2 Extended BNF used to define itself informally, page 10

src.8.3.txt:
8.3 Extended BNF defined informally, page 10
6 changes: 6 additions & 0 deletions topics/grammars/metasyntax/ebnf-iso-3/Makefile
@@ -0,0 +1,6 @@
# Extract the grammar of section 8.3 of ISO/IEC 14977 ("Extended BNF defined
# informally") with the hunter, then apply post-extraction cleanup and
# refactoring transformations to obtain the final BGF grammar.
extract:
	../../hunter.py src.8.3.txt config.edd ebnf-iso-3.raw.bgf
	${tooldir}/xbgf post-extraction.xbgf ebnf-iso-3.raw.bgf ebnf-iso-3.ext.bgf
	${tooldir}/xbgf refactor.xbgf ebnf-iso-3.ext.bgf ebnf-iso-3.bgf

include ../../Makefile.include
5 changes: 5 additions & 0 deletions topics/grammars/metasyntax/ebnf-iso-3/README.txt
@@ -0,0 +1,5 @@
ISO/IEC 14977 : 1996(E)
Final draft version, SC22/N2249: http://www.cl.cam.ac.uk/~mgk25/iso-14977.pdf

src.8.3.txt:
8.3 Extended BNF defined informally, page 10
21 changes: 21 additions & 0 deletions topics/grammars/metasyntax/ebnf-iso-3/config.edd
@@ -0,0 +1,21 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- EDD (extraction) configuration for hunter.py: the metasymbols of the
     ISO/IEC 14977 EBNF notation as used in section 8.3 of the standard. -->
<edd:config xmlns:edd="http://planet-sl.org/edd">
<!-- core production syntax: name = alt1 / alt2 , ... . -->
<defining-symbol>=</defining-symbol>
<definition-separator-symbol>/</definition-separator-symbol>
<concatenate-symbol>,</concatenate-symbol>
<!-- nonterminal names such as "meta identifier" may contain spaces -->
<nonterminals-may-contain-spaces/>
<terminator-symbol>.</terminator-symbol>
<!-- comments are (* ... *); hunter.py assumes they are never nested -->
<start-comment-symbol>(*</start-comment-symbol>
<end-comment-symbol>*)</end-comment-symbol>
<start-terminal-symbol>’</start-terminal-symbol>
<end-terminal-symbol>’</end-terminal-symbol>
<!-- ISO 14977 alternative bracket pairs: (/ ... /) optional, (: ... :) repeated -->
<start-option-symbol>(/</start-option-symbol>
<end-option-symbol>/)</end-option-symbol>
<start-star-symbol>(:</start-star-symbol>
<end-star-symbol>:)</end-star-symbol>
<ignore-extra-newlines/>
<!-- masking: treat the quoted quote token "’" as the terminal ’ -->
<mask>
<token>"’"</token>
<terminal>’</terminal>
</mask>
</edd:config>
35 changes: 35 additions & 0 deletions topics/grammars/metasyntax/ebnf-iso-3/post-extraction.xbgf
@@ -0,0 +1,35 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Post-extraction cleanup for the ebnf-iso-3 grammar: collapse the
     subtracted character classes (CHARACTER minus a quoted character) into
     plain CHARACTER, and turn the EMPTY nonterminal into a terminal.
     NOTE(review): the subtraction is lost by these replacements -
     presumably acceptable at this abstraction level; confirm. -->
<xbgf:sequence xmlns:xbgf="http://planet-sl.org/xbgf" xmlns:bgf="http://planet-sl.org/bgf">
<!-- CHARACTER - ’’’  =>  CHARACTER -->
<xbgf:replace>
<bgf:expression>
<nonterminal>CHARACTER_-_’’’</nonterminal>
</bgf:expression>
<bgf:expression>
<nonterminal>CHARACTER</nonterminal>
</bgf:expression>
</xbgf:replace>
<!-- CHARACTER - ’"’  =>  CHARACTER -->
<xbgf:replace>
<bgf:expression>
<nonterminal>CHARACTER_-_’"’</nonterminal>
</bgf:expression>
<bgf:expression>
<nonterminal>CHARACTER</nonterminal>
</bgf:expression>
</xbgf:replace>
<!-- CHARACTER - ’?’  =>  CHARACTER -->
<xbgf:replace>
<bgf:expression>
<nonterminal>CHARACTER_-_’?’</nonterminal>
</bgf:expression>
<bgf:expression>
<nonterminal>CHARACTER</nonterminal>
</bgf:expression>
</xbgf:replace>
<!-- the EMPTY nonterminal is really a terminal -->
<xbgf:replace>
<bgf:expression>
<nonterminal>EMPTY</nonterminal>
</bgf:expression>
<bgf:expression>
<terminal>EMPTY</terminal>
</bgf:expression>
</xbgf:replace>
</xbgf:sequence>
72 changes: 72 additions & 0 deletions topics/grammars/metasyntax/ebnf-iso-3/refactor.xbgf
@@ -0,0 +1,72 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Refactoring for the ebnf-iso-3 grammar: each massage folds a sequence
     of the form (X, X*) into the equivalent X+ (transitive closure), for
     X in {SYNTAX_RULE, DIGIT, CHARACTER}. -->
<xbgf:sequence xmlns:xbgf="http://planet-sl.org/xbgf" xmlns:bgf="http://planet-sl.org/bgf">
<!-- SYNTAX_RULE, SYNTAX_RULE*  =>  SYNTAX_RULE+ -->
<xbgf:massage>
<bgf:expression>
<sequence>
<bgf:expression>
<nonterminal>SYNTAX_RULE</nonterminal>
</bgf:expression>
<bgf:expression>
<star>
<bgf:expression>
<nonterminal>SYNTAX_RULE</nonterminal>
</bgf:expression>
</star>
</bgf:expression>
</sequence>
</bgf:expression>
<bgf:expression>
<plus>
<bgf:expression>
<nonterminal>SYNTAX_RULE</nonterminal>
</bgf:expression>
</plus>
</bgf:expression>
</xbgf:massage>
<!-- DIGIT, DIGIT*  =>  DIGIT+ -->
<xbgf:massage>
<bgf:expression>
<sequence>
<bgf:expression>
<nonterminal>DIGIT</nonterminal>
</bgf:expression>
<bgf:expression>
<star>
<bgf:expression>
<nonterminal>DIGIT</nonterminal>
</bgf:expression>
</star>
</bgf:expression>
</sequence>
</bgf:expression>
<bgf:expression>
<plus>
<bgf:expression>
<nonterminal>DIGIT</nonterminal>
</bgf:expression>
</plus>
</bgf:expression>
</xbgf:massage>
<!-- CHARACTER, CHARACTER*  =>  CHARACTER+ -->
<xbgf:massage>
<bgf:expression>
<sequence>
<bgf:expression>
<nonterminal>CHARACTER</nonterminal>
</bgf:expression>
<bgf:expression>
<star>
<bgf:expression>
<nonterminal>CHARACTER</nonterminal>
</bgf:expression>
</star>
</bgf:expression>
</sequence>
</bgf:expression>
<bgf:expression>
<plus>
<bgf:expression>
<nonterminal>CHARACTER</nonterminal>
</bgf:expression>
</plus>
</bgf:expression>
</xbgf:massage>
</xbgf:sequence>

0 comments on commit f470081

Please sign in to comment.