From a50a7e5abce905bdfb6778cc790936a10dfd30cd Mon Sep 17 00:00:00 2001
From: grammarware <vadim@grammarware.net>
Date: Wed, 10 Sep 2008 21:00:54 +0000
Subject: [PATCH] enabling JLS 1

git-svn-id: https://slps.svn.sourceforge.net/svnroot/slps@251 ab42f6e0-554d-0410-b580-99e487e6eeb2
---
 topics/extraction/html2bgf/Makefile    |  5 +-
 topics/extraction/html2bgf/README.txt  |  8 +++
 topics/extraction/html2bgf/html2bgf.py | 80 ++++++++++++++++++++------
 topics/extraction/html2bgf/xpathpre.py | 13 +++++
 4 files changed, 88 insertions(+), 18 deletions(-)
 create mode 100644 topics/extraction/html2bgf/README.txt
 create mode 100755 topics/extraction/html2bgf/xpathpre.py
diff --git a/topics/extraction/html2bgf/Makefile b/topics/extraction/html2bgf/Makefile
index 3d5c838b..bf177bac 100644
--- a/topics/extraction/html2bgf/Makefile
+++ b/topics/extraction/html2bgf/Makefile
@@ -1,10 +1,13 @@
 build:
 
 test:
+	python xpathpre.py ../../java/jls1/syntax.html >jls1.html
+	python html2bgf.py jls1.html jls1.bgf
 	python html2bgf.py ../../java/jls2/syntax.html jls2.bgf
 	python html2bgf.py ../../java/jls3/syntax.html jls3.bgf
+	../../../shared/tools/checkxml bgf jls1.bgf
 	../../../shared/tools/checkxml bgf jls2.bgf
 	../../../shared/tools/checkxml bgf jls3.bgf
 
 clean:
-	rm -f *.bgf
+	rm -f *.bgf jls1.html
diff --git a/topics/extraction/html2bgf/README.txt b/topics/extraction/html2bgf/README.txt
new file mode 100644
index 00000000..eaabecb4
--- /dev/null
+++ b/topics/extraction/html2bgf/README.txt
@@ -0,0 +1,8 @@
+This extractor tries to squeeze a BGF from the hypertext documentation.
+In order to do so, it applies a number of heuristics and fixes the most
+commonly encountered problems in manually created language specifications.
+
+It works equally well with Java Language Specification 2 and 3, but requires
+an additional action for Java Language Specification 1. This action virtually
+equals running some sort of //pre XPath query, but the HTML was too dirty
+and non-well-formed to use the real xpath utility.
diff --git a/topics/extraction/html2bgf/html2bgf.py b/topics/extraction/html2bgf/html2bgf.py
index 40099962..5df65afc 100755
--- a/topics/extraction/html2bgf/html2bgf.py
+++ b/topics/extraction/html2bgf/html2bgf.py
@@ -105,16 +105,22 @@ def traverse(c):
    line += map2expr(alt)
   return line+'</choice></bgf:expression>'
 
-def addProduction(name,choices):
+def addProduction(name,choices,oneof):
  bs = []
- for s in range(0,len(choices)):
-  ss = []
-  for i in range(0,len(choices[s][0])):
-   if choices[s][1][i]:
-    ss.append(choices[s][0][i])
-   else:
-    ss.append('"'+choices[s][0][i]+'"')
-  bs.append(ss)
+ if oneof:
+  # concatenate all choices
+  for c in choices:
+   for s in c[0]:
+    bs.append(['"'+s+'"'])
+ else:
+  for s in range(0,len(choices)):
+   ss = []
+   for i in range(0,len(choices[s][0])):
+    if choices[s][1][i]:
+     ss.append(choices[s][0][i])
+    else:
+     ss.append('"'+choices[s][0][i]+'"')
+   bs.append(ss)
  prods[name]=bs
 
 def serialiseT(name,choices):
@@ -162,6 +168,29 @@ def parseLine(line):
    emph[0] = True
    line = line[7:]
    continue
+  if line.find('<sub><i>opt</i></sub>')==0:
+   last = tokens.pop()
+   lastf = flags.pop()
+   tokens.extend(['[',last,']'])
+   flags.extend([True,lastf,True])
+   line = line[21:]
+   continue
+  if line.find('<sub><i>opt')==0:
+   last = tokens.pop()
+   lastf = flags.pop()
+   tokens.extend(['[',last,']'])
+   flags.extend([True,lastf,True])
+   line = line[11:]
+   continue
+  if line.find('</sub>')==0:
+   line = line[6:]
+   continue
+  if line.find('<sub>')==0:
+   line = line[5:]
+   continue
+  if line.find('<')==0:
+   print 'Found unknown tag while parsing "'+line+'", skipping!'
+   line = line[line.index('>')+1:]
   else:
    if line.find('<')>0:
     extra = line[:line.index('<')].strip().split()
@@ -190,6 +219,7 @@ def ifContinuation(s):
  return True
 
 def readGrammar(fn):
+ oneof = False
  src = open(fn,'r')
  grammar = False
  name = ''
@@ -197,7 +227,7 @@ def readGrammar(fn):
  for line in src:
   if line.find('<pre>')>=0 or line.find('</pre>')>=0:
    if grammar:
-    addProduction(name,choices)
+    addProduction(name,choices,oneof)
    else:
     # dummy parse line for the sake of <i>/<em>
     a,b=parseLine(line.split('<pre>')[1])
@@ -214,17 +244,26 @@ def readGrammar(fn):
      # new definition
      if choices:
       # flush the current one
-      addProduction(name,choices)
+      addProduction(name,choices,oneof)
      choices = []
      name = a[0]
+     oneof = False
     elif len(a)==4 and a[0]==a[2] and a[1]==':' and a[-1]==':':
      # new mingled definition
      if choices:
       # flush the current one
-      addProduction(name,choices)
+      addProduction(name,choices,oneof)
      choices = []
      name = a[0]
+     oneof = False
      print name,'double-declared, fixed'
+    elif len(a)==4 and a[1]==':' and a[2]=='one' and a[3]=='of':
+     # new "one-of" definition
+     if choices:
+      addProduction(name,choices,oneof)
+     choices = []
+     name = a[0]
+     oneof = True
     elif cont:
      # line continuation
      print 'Line continuation enforced while parsing',name
@@ -271,7 +310,7 @@ def automatedImprove():
   newprods = []
   for bs in prods[nt]:
    for i in range(0,len(bs)):
-    if bs[i]=='"|"' and len(bs)>1:
+    if bs[i]=='"|"' and len(bs)>1 and nt.find('Or')<0:
      print 'Terminal to nonterminal heuristic fix:',bs[i],'in',nt,'(suspicious context)'
      bs[i] = '|'
     elif bs[i][0]=='"':
@@ -279,6 +318,13 @@ def automatedImprove():
       print 'Terminal to nonterminal heuristic fix:',bs[i],'in',nt,'(familiar name)'
       bs[i]=bs[i][1:-1]
       continue
+     if bs[i]=='"opt"':
+      print 'Structural heuristic fix:',bs[i],'in',nt,'(changed to BNF optional)'
+      newbs=bs[:i-1]
+      newbs.extend(['[',bs[i-1],']'])
+      newbs.extend(bs[i+1:])
+      bs = newbs
+      continue
      if bs[i].find('&')<0:
       bs[i] = breakWords(nt,bs[i])
      continue
@@ -294,17 +340,17 @@ def automatedImprove():
       # () is not BNF bracketing
       bs[i]='"("'
       bs[i+1]='")"'
-      print 'Bracketing heuristic fix in',nt,'(empty group)'
+      print 'Structural heuristic fix in',nt,'(empty group)'
      if i+2<len(bs) and bs[i+2]==')':
       # (x) is not BNF bracketing
       bs[i]='"("'
       bs[i+2]='")"'
-      print 'Bracketing heuristic fix in',nt,'(singleton group)'
+      print 'Structural heuristic fix in',nt,'(singleton group)'
      if i+4<len(bs) and bs[i+4]==')' and ((bs[i+1]=='[' and bs[i+3]==']') or (bs[i+1]=='{' and bs[i+3]=='}')):
       # ([x]) or ({x}) is not BNF bracketing either
       bs[i]='"("'
       bs[i+4]='")"'
-      print 'Bracketing heuristic fix in',nt,'(singleton complex group)'
+      print 'Structural heuristic fix in',nt,'(singleton complex group)'
    newprods.append(fixBrackets(nt,' '.join(bs).split()))
   prods[nt]=newprods
  pass
@@ -327,7 +373,7 @@ def fixBracketPair(nt,arr,left,right):
  if cx==0:
   return arr
  else:
-  print 'Bracketing heuristic fix in',nt,
+  print 'Structural  heuristic fix in',nt,
   #print arr,'->'
   arr.reverse()
   while(cx>0):
diff --git a/topics/extraction/html2bgf/xpathpre.py b/topics/extraction/html2bgf/xpathpre.py
new file mode 100755
index 00000000..04eae5e1
--- /dev/null
+++ b/topics/extraction/html2bgf/xpathpre.py
@@ -0,0 +1,13 @@
+#!/usr/bin/python
+import sys
+
+f = open(sys.argv[1],'r')
+grammar = False
+print '<pre>'
+for chunk in ''.join(f.readlines()).split('<pre>'):
+ if chunk.find('19.2')>0:
+  grammar = True
+ elif grammar:
+  print chunk.split('</pre>')[0].replace('<br>','').replace('&#32;',' ')
+print '</pre>'
+f.close()