From a50a7e5abce905bdfb6778cc790936a10dfd30cd Mon Sep 17 00:00:00 2001 From: grammarware Date: Wed, 10 Sep 2008 21:00:54 +0000 Subject: [PATCH] enabling JLS 1 git-svn-id: https://slps.svn.sourceforge.net/svnroot/slps@251 ab42f6e0-554d-0410-b580-99e487e6eeb2 --- topics/extraction/html2bgf/Makefile | 5 +- topics/extraction/html2bgf/README.txt | 8 +++ topics/extraction/html2bgf/html2bgf.py | 80 ++++++++++++++++++++------ topics/extraction/html2bgf/xpathpre.py | 13 +++++ 4 files changed, 88 insertions(+), 18 deletions(-) create mode 100644 topics/extraction/html2bgf/README.txt create mode 100755 topics/extraction/html2bgf/xpathpre.py diff --git a/topics/extraction/html2bgf/Makefile b/topics/extraction/html2bgf/Makefile index 3d5c838b..bf177bac 100644 --- a/topics/extraction/html2bgf/Makefile +++ b/topics/extraction/html2bgf/Makefile @@ -1,10 +1,13 @@ build: test: + python xpathpre.py ../../java/jls1/syntax.html >jls1.html + python html2bgf.py jls1.html jls1.bgf python html2bgf.py ../../java/jls2/syntax.html jls2.bgf python html2bgf.py ../../java/jls3/syntax.html jls3.bgf + ../../../shared/tools/checkxml bgf jls1.bgf ../../../shared/tools/checkxml bgf jls2.bgf ../../../shared/tools/checkxml bgf jls3.bgf clean: - rm -f *.bgf + rm -f *.bgf jls1.html diff --git a/topics/extraction/html2bgf/README.txt b/topics/extraction/html2bgf/README.txt new file mode 100644 index 00000000..eaabecb4 --- /dev/null +++ b/topics/extraction/html2bgf/README.txt @@ -0,0 +1,8 @@ +This extractor tries to squeeze a BGF from the hypertext documentation. +In order to do so, it applies a number of heuristics and fixes the most +commonly encountered problems in manually created language specifications. + +It works equally well with Java Language Standard 2 and 3, but requires +an additional action for Java Language Standard 1. This action virtually +equals running some sort of //pre XPath query, but the HTML was too dirty +and non-well-formed to use the real xpath utility. 
diff --git a/topics/extraction/html2bgf/html2bgf.py b/topics/extraction/html2bgf/html2bgf.py index 40099962..5df65afc 100755 --- a/topics/extraction/html2bgf/html2bgf.py +++ b/topics/extraction/html2bgf/html2bgf.py @@ -105,16 +105,22 @@ def traverse(c): line += map2expr(alt) return line+'' -def addProduction(name,choices): +def addProduction(name,choices,oneof): bs = [] - for s in range(0,len(choices)): - ss = [] - for i in range(0,len(choices[s][0])): - if choices[s][1][i]: - ss.append(choices[s][0][i]) - else: - ss.append('"'+choices[s][0][i]+'"') - bs.append(ss) + if oneof: + # concatenate all choices + for c in choices: + for s in c[0]: + bs.append(['"'+s+'"']) + else: + for s in range(0,len(choices)): + ss = [] + for i in range(0,len(choices[s][0])): + if choices[s][1][i]: + ss.append(choices[s][0][i]) + else: + ss.append('"'+choices[s][0][i]+'"') + bs.append(ss) prods[name]=bs def serialiseT(name,choices): @@ -162,6 +168,29 @@ def parseLine(line): emph[0] = True line = line[7:] continue + if line.find('opt')==0: + last = tokens.pop() + lastf = flags.pop() + tokens.extend(['[',last,']']) + flags.extend([True,lastf,True]) + line = line[21:] + continue + if line.find('opt')==0: + last = tokens.pop() + lastf = flags.pop() + tokens.extend(['[',last,']']) + flags.extend([True,lastf,True]) + line = line[11:] + continue + if line.find('')==0: + line = line[6:] + continue + if line.find('')==0: + line = line[5:] + continue + if line.find('<')==0: + print 'Found unknown tag while parsing "'+line+'", skipping!' + line = line[line.index('>')+1:] else: if line.find('<')>0: extra = line[:line.index('<')].strip().split() @@ -190,6 +219,7 @@ def ifContinuation(s): return True def readGrammar(fn): + oneof = False src = open(fn,'r') grammar = False name = '' @@ -197,7 +227,7 @@ def readGrammar(fn): for line in src: if line.find('
')>=0 or line.find('
')>=0: if grammar: - addProduction(name,choices) + addProduction(name,choices,oneof) else: # dummy parse line for the sake of / a,b=parseLine(line.split('
')[1])
@@ -214,17 +244,26 @@ def readGrammar(fn):
      # new definition
      if choices:
       # flush the current one
-      addProduction(name,choices)
+      addProduction(name,choices,oneof)
      choices = []
      name = a[0]
+     oneof = False
     elif len(a)==4 and a[0]==a[2] and a[1]==':' and a[-1]==':':
      # new mingled definition
      if choices:
       # flush the current one
-      addProduction(name,choices)
+      addProduction(name,choices,oneof)
      choices = []
      name = a[0]
+     oneof = False
      print name,'double-declared, fixed'
+    elif len(a)==4 and a[1]==':' and a[2]=='one' and a[3]=='of':
+     # new "one-of" definition
+     if choices:
+      addProduction(name,choices,oneof)
+     choices = []
+     name = a[0]
+     oneof = True
     elif cont:
      # line continuation
      print 'Line continuation enforced while parsing',name
@@ -271,7 +310,7 @@ def automatedImprove():
   newprods = []
   for bs in prods[nt]:
    for i in range(0,len(bs)):
-    if bs[i]=='"|"' and len(bs)>1:
+    if bs[i]=='"|"' and len(bs)>1 and nt.find('Or')<0:
      print 'Terminal to nonterminal heuristic fix:',bs[i],'in',nt,'(suspicious context)'
      bs[i] = '|'
     elif bs[i][0]=='"':
@@ -279,6 +318,13 @@ def automatedImprove():
       print 'Terminal to nonterminal heuristic fix:',bs[i],'in',nt,'(familiar name)'
       bs[i]=bs[i][1:-1]
       continue
+     if bs[i]=='"opt"':
+      print 'Structural heuristic fix:',bs[i],'in',nt,'(changed to BNF optional)'
+      newbs=bs[:i-1]
+      newbs.extend(['[',bs[i-1],']'])
+      newbs.extend(bs[i+1:])
+      bs = newbs
+      continue
      if bs[i].find('&')<0:
       bs[i] = breakWords(nt,bs[i])
      continue
@@ -294,17 +340,17 @@ def automatedImprove():
       # () is not BNF bracketing
       bs[i]='"("'
       bs[i+1]='")"'
-      print 'Bracketing heuristic fix in',nt,'(empty group)'
+      print 'Structural heuristic fix in',nt,'(empty group)'
      if i+2'
   arr.reverse()
   while(cx>0):
diff --git a/topics/extraction/html2bgf/xpathpre.py b/topics/extraction/html2bgf/xpathpre.py
new file mode 100755
index 00000000..04eae5e1
--- /dev/null
+++ b/topics/extraction/html2bgf/xpathpre.py
@@ -0,0 +1,13 @@
+#!/usr/bin/python
+import sys
+
+f = open(sys.argv[1],'r')
+grammar = False
+print '
'
+for chunk in ''.join(f.readlines()).split('
'):
+ if chunk.find('19.2')>0:
+  grammar = True
+ elif grammar:
+  print chunk.split('
')[0].replace('
','').replace(' ',' ') +print '
' +f.close()