From 50e5fac7e3e748ce5d2b631361c0a1defa0e4248 Mon Sep 17 00:00:00 2001
From: grammarware
Date: Fri, 27 May 2011 13:03:06 +0000
Subject: [PATCH] extending Hunter with indentation juggling facilities; +1 Eiffel grammar

git-svn-id: https://slps.svn.sourceforge.net/svnroot/slps@1065 ab42f6e0-554d-0410-b580-99e487e6eeb2
---
 topics/grammars/eiffel/bezault/Makefile       |   6 +
 topics/grammars/eiffel/bezault/complete.xbgf  | 212 ++++++++++++++++++
 topics/grammars/eiffel/bezault/config.edd     |  28 +++
 .../grammars/eiffel/bezault/post-extract.xbgf |  48 ++++
 topics/grammars/hunter.py                     |  94 ++++++--
 5 files changed, 370 insertions(+), 18 deletions(-)
 create mode 100644 topics/grammars/eiffel/bezault/Makefile
 create mode 100644 topics/grammars/eiffel/bezault/complete.xbgf
 create mode 100644 topics/grammars/eiffel/bezault/config.edd
 create mode 100644 topics/grammars/eiffel/bezault/post-extract.xbgf

diff --git a/topics/grammars/eiffel/bezault/Makefile b/topics/grammars/eiffel/bezault/Makefile
new file mode 100644
index 00000000..55a967a8
--- /dev/null
+++ b/topics/grammars/eiffel/bezault/Makefile
@@ -0,0 +1,6 @@
+extract:
+	../../hunter.py src.the.syntax.txt config.edd eiffel.raw.bgf
+	${tooldir}/xbgf post-extract.xbgf eiffel.raw.bgf eiffel.ext.bgf
+	${tooldir}/xbgf complete.xbgf eiffel.ext.bgf eiffel.bgf
+
+include ../../Makefile.include
diff --git a/topics/grammars/eiffel/bezault/complete.xbgf b/topics/grammars/eiffel/bezault/complete.xbgf
new file mode 100644
index 00000000..e8ec9140
--- /dev/null
+++ b/topics/grammars/eiffel/bezault/complete.xbgf
@@ -0,0 +1,212 @@
+Identifier
+Letter
+Letter
+Decimal_digit
+_
+Integer
+Decimal_digit
+Decimal_digit
+Decimal_digit
+Decimal_digit
+_
+Decimal_digit
+Decimal_digit
+Decimal_digit
diff --git a/topics/grammars/eiffel/bezault/config.edd b/topics/grammars/eiffel/bezault/config.edd
new file mode 100644
index 00000000..c4504744
--- /dev/null
+++ b/topics/grammars/eiffel/bezault/config.edd
@@ -0,0 +1,28 @@
+@@@0-1
+@@@1-0
+|
+[
+]
+{
+...}
+{
+...}+
+_
+Result
+Current
+True
+False
diff --git a/topics/grammars/eiffel/bezault/post-extract.xbgf b/topics/grammars/eiffel/bezault/post-extract.xbgf
new file mode 100644
index 00000000..eb9c360b
--- /dev/null
+++ b/topics/grammars/eiffel/bezault/post-extract.xbgf
@@ -0,0 +1,48 @@
+Unique
+Unique
+Unique
+Precursor
+Precursor
+Precursor
+Strip
+Strip
+Strip
+empty
diff --git a/topics/grammars/hunter.py b/topics/grammars/hunter.py
index 9f3cd656..a23c3a08 100755
--- a/topics/grammars/hunter.py
+++ b/topics/grammars/hunter.py
@@ -12,6 +12,7 @@
 masked = {}
 always_terminals = []
 always_nonterminals = []
+ignore_tokens = []
 
 special = \
 [
@@ -87,7 +88,10 @@ def splitTokenStream(s):
             ts.append(s[i])
             alpha = isAlpha(s[i])
         i += 1
-    return list(filter(lambda x:x not in [' ',' ',' '],ts))
+    if 'tabulation-symbol' in config.keys():
+        ts = mapglue(ts,config['tabulation-symbol'])
+    #return list(filter(lambda x:x not in [' ',' ',' '],['TABULATION-SYMBOL' if t == config['tabulation-symbol'] else t for t in ts]))
+    return ['\t' if t=='TABULATION' else t for t in filter(lambda x:x not in [' ',' ',' '],['TABULATION' if t == config['tabulation-symbol'] else t for t in ts])]
     # not space, not hard space, not tab; newlines are preserved for now
 
 def reconsiderSpaces(ts,sep,vs):
@@ -95,7 +99,7 @@ def reconsiderSpaces(ts,sep,vs):
     vs = list(vs)
     vs.append('\n')
     for x in ts[1:]:
-        if x == '\n' and 'ignore-extra-newlines' in config.keys():
+        if x in ignore_tokens:
             continue
         if x == sep:
             nts.append('')
@@ -127,6 +131,15 @@ def readConfig(f):
         if e.tag=='decompose-symbols':
             for x in e.findall('except'):
                 always_terminals.append(x.text)
+        if e.tag=='ignore':
+            for x in e.findall('*'):
+                if x.tag=='newline':
+                    ignore_tokens.append('\n')
+                    ignore_tokens.append('@@@0-0')
+                elif x.tag=='same-indentation':
+                    ignore_tokens.append('@@@1-1')
+                else:
+                    ignore_tokens.append(x.text)
 
     if debug:
         print('Ok',config)
@@ -745,7 +758,8 @@ def balanceProd(p):
         else:
             fail = True
         if fail:
-            print('Cannot balance a production :-(')
+            print('STEP 6: Cannot balance a production, reverting',oldpi,'to a terminal.')
+            p[i] = config['start-terminal-symbol']+config[oldpi.lower()]+config['end-terminal-symbol']
             i += 1
         else:
             print('STEP 6: Rebalanced ambiguity of',oldpi,'with',p[i])
@@ -797,9 +811,9 @@ def useTerminatorToFixProds(ps,ts):
                 np = [nps[-1][0]]
             if config['defining-symbol'] not in p[i+1:]:
                 tail = p[i+1:]
-                if 'ignore-extra-newlines' in config.keys():
-                    while '\n' in tail:
-                        tail.remove('\n')
+                for x in ignore_tokens:
+                    while x in tail:
+                        tail.remove(x)
                 if len(tail)>0:
                     print('STEP 4 problem: terminator symbol without proper defining symbol context.',tail)
                 return nps
@@ -808,9 +822,9 @@
                 continue
             else:
                 nt = p[i+1:p.index(config['defining-symbol'])]
-                if 'ignore-extra-newlines' in config.keys():
-                    while '\n' in nt:
-                        nt.remove('\n')
+                for x in ignore_tokens:
+                    while x in nt:
+                        nt.remove(x)
                 if len(nt) != 1:
                     print('STEP 4 problem: cannot determine nonterminal name from',nt)
                     nt = ' '.join(nt)
@@ -823,6 +837,29 @@
     #print('>>>p>>>',p)
     return nps
 
+def considerIndentation(ts):
+    nts = ['@@@0-']
+    oldlevel = level = 0
+    # ['A', '\n', '\t', 'B', '\n', '\t', 'C', 'D', 'E', 'F', '\n', '\t', '\t', 'G', '\n', '\t', 'H', '\n', 'X', '\n', '\t', 'Y', '\n', '\t', 'Z', '\n', 'B', '\n', '\t', 'K', '\n', 'C', '\n', '\t', 'L', '\n']
+    for t in ts:
+        if t == '\n':
+            if nts[-1][:3] == '@@@':
+                nts[-1] += '0'
+            oldlevel = level
+            nts.append('@@@'+str(oldlevel)+'-')
+            level = 0
+        elif t == '\t':
+            level += 1
+        elif nts[-1][:3]=='@@@':
+            nts[-1] += str(level)
+            oldlevel = level
+            nts.append(t)
+        else:
+            nts.append(t)
+    if nts[-1][:3] == '@@@':
+        nts[-1] += '0'
+    return nts
+
 if __name__ == "__main__":
     if len(sys.argv) != 4:
         print('Usage:')
@@ -844,7 +881,7 @@
     # STEP 1: assemble terminal symbols
     print('STEP 1: assembling terminal symbols according to start-terminal-symbol and end-terminal-symbol.')
     for k in masked.keys():
-        if len(k)>1:
+        if len(k)>1 and k.find('@@@')<0:
             print('STEP 1: going to glue tokens that resemble masked terminal', k.replace('\n','\\n'))
             tokens = mapglue(tokens,k)
     if 'start-terminal-symbol' in config.keys() and 'end-terminal-symbol' in config.keys():
@@ -870,14 +907,21 @@
     # STEP 3: assembling composite metasymbols together
     print('STEP 3: assembling metasymbols according to their possible values.')
     tokens = assembleQualifiedNumbers(tokens)
-    for k in config.keys():
-        if len(config[k])>1:
-            print('STEP 3: going to glue tokens that resemble metasymbol', config[k].replace('\n','\\n'))
-            tokens = mapglue(tokens,config[k])
+    for k in config.values():
+        if len(k)>1 and k.find('\n')<0:
+            print('STEP 3: going to glue tokens that resemble metasymbol', k.replace('\n','\\n'))
+            tokens = mapglue(tokens,k)
     if debug:
         print(tokens)
     # STEP 4: slice according to defining-symbol
     print('STEP 4: splitting the token stream into productions.')
+    if 'consider-indentation' in config.keys():
+        # rewrite tokens with tabulation
+        tokens = considerIndentation(tokens)
+        if debug:
+            print('After considering indentation:',tokens)
+    if debug:
+        print(tokens)
     if 'nonterminals-may-contain-spaces' in config.keys() and 'concatenate-symbol' in config.keys():
         # can only treat them together, because spaces in names without concatenation symbol are highly ambiguous
         # and concatenation symbols are never used if nonterminal names do not have spaces
@@ -930,7 +974,6 @@
         print('The grammar is perceived like this:')
         for p in prods:
             print('\t',p[1],'is defined as',p[2:])
-
     for f in need2fix:
         for i in range(0,len(config['terminator-symbol'])):
             if prods[f][-len(config['terminator-symbol'])+i:] == config['terminator-symbol'][:len(config['terminator-symbol'])-i]:
@@ -941,6 +984,12 @@
         if ''.join(prods[f][-len(config['terminator-symbol'])-1:-1]) == config['terminator-symbol'] and prods[f][-1] == '\n':
             prods[f].pop()
             poststep4 += 1
+    ii = list(filter(lambda x:x[:3]=='@@@',ignore_tokens))
+    if 'consider-indentation' in config.keys() and len(ii)>0:
+        # rewrite tokens with tabulation
+        #tokens = considerIndentation(tokens)
+        for x in ii:
+            prods = [list(filter(lambda y:y!=x,p)) for p in prods]
     if poststep4 > 0:
         print('STEP 4 also adjusted',poststep4,'productions that did not quite fit the expectations.')
     if debug:
@@ -961,8 +1010,16 @@
     if not step5:
         print('STEP 5 skipped: sorry, no metasymbols specified.')
     # STEP 6: validating metasymbols
+    if debug:
+        print('The grammar is perceived like this:')
+        for p in prods:
+            print('\t',p[1],'is defined as',p[2:])
     prods = list(map(postfix2confix,prods))
     prods = list(map(balanceProd,prods))
+    if debug:
+        print('The grammar is perceived like this:')
+        for p in prods:
+            print('\t',p[1],'is defined as',p[2:])
     # STEP 7: various commands
     print('STEP 7: executing special extraction commands.')
     step7 = False
@@ -970,10 +1027,11 @@
     if debug:
         print('Defined are',defined)
     defined.append(config['defining-symbol'])
-    if 'ignore-extra-newlines' in config.keys():
-        print('STEP 7: ignoring extra newlines.')
+    if len(ignore_tokens)>0:
+        print('STEP 7: ignoring extra tokens.')
         step7 = True
-        prods = [list(filter(lambda y:y!='\n',p)) for p in prods]
+        for x in ignore_tokens:
+            prods = [list(filter(lambda y:y!=x,p)) for p in prods]
         #prods = list(map(lambda x:filter(lambda y:y!='\n',x),prods))
     if 'decompose-symbols' in config.keys():
         print('STEP 7 (part of rule 4): decomposing compound symbols.')
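
Note on the new indentation handling: considerIndentation rewrites the token stream produced by splitTokenStream so that layout becomes explicit @@@<from>-<to> indentation-transition markers; the new <ignore> element handled in readConfig maps newline to '\n' plus '@@@0-0', same-indentation to '@@@1-1', and anything else to its text, all collected in ignore_tokens, while the Eiffel config.edd refers to @@@0-1 and @@@1-0. The following is a minimal, self-contained sketch of that marker scheme only; the helper name indentation_markers and the sample token list are illustrative and do not appear in hunter.py.

    # Sketch of the indentation-to-marker rewriting introduced by this patch.
    # The real code is considerIndentation() in topics/grammars/hunter.py;
    # this standalone copy only illustrates the '@@@from-to' marker idea.
    def indentation_markers(tokens):
        out = ['@@@0-']                      # pending marker: "came from level 0, going to ..."
        old = level = 0
        for t in tokens:
            if t == '\n':
                if out[-1].startswith('@@@'):
                    out[-1] += '0'           # line held no real tokens; close the marker
                old = level
                out.append('@@@' + str(old) + '-')
                level = 0                    # start counting the next line's tabs
            elif t == '\t':
                level += 1
            elif out[-1].startswith('@@@'):  # first real token of a line
                out[-1] += str(level)        # close the pending marker with the new level
                old = level
                out.append(t)
            else:
                out.append(t)
        if out[-1].startswith('@@@'):
            out[-1] += '0'
        return out

    print(indentation_markers(['A', '\n', '\t', 'B', '\n', '\t', 'C', '\n', 'D', '\n']))
    # ['@@@0-0', 'A', '@@@0-1', 'B', '@@@1-1', 'C', '@@@1-0', 'D', '@@@0-0']

With an indentation-based layout (a header token, an indented body, then a dedent), the @@@0-1 and @@@1-0 transitions presumably give Hunter concrete tokens to anchor production boundaries on, while @@@0-0 and @@@1-1 can simply be dropped via <ignore>.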