general preprocessor; LCI advancements; much smarter extractor

git-svn-id: https://slps.svn.sourceforge.net/svnroot/slps@257 ab42f6e0-554d-0410-b580-99e487e6eeb2
grammarware · Sep 11, 2008 · 35e33d4 · 35e33d4
1 parent 1799766
commit 35e33d4
Show file tree

Hide file tree

Showing 8 changed files with 221 additions and 39 deletions.
diff --git a/shared/tools/jls2bgf b/shared/tools/jls2bgf
@@ -8,14 +8,14 @@ SLPS=${PWD}
 cd ${LOCAL1}
 
 if [ $# -lt 2 ]; then
-    echo "This tool extracts a BGF from Java Language Standard 1"
+    echo "This tool extracts a BGF from Java Language Standard that needs pre-processing"
     echo "Usage: $0 <input-hypertext-document> <output-bgf> [options]"
     exit 1
 elif [ ! -r $1 ]; then
     echo "Oops: $1 not found or not readable."
     exit 1
 else
-    python ${SLPS}/topics/extraction/html2bgf/xpathpre.py $1 > $1.fixed
+    python ${SLPS}/topics/extraction/html2bgf/xpathpre.py LALR -Difficulties <$1 > $1.fixed
     python ${SLPS}/topics/extraction/html2bgf/html2bgf.py $1.fixed $2 $3
     rm -f $1.fixed
 fi
diff --git a/topics/extraction/html2bgf/html2bgf.py b/topics/extraction/html2bgf/html2bgf.py
@@ -3,6 +3,7 @@
 
 #global
 emph = [False]
+pessimistic = [False,0]
 prods = {}
 
 def serialise(name,choices):
@@ -144,6 +145,11 @@ def parseLine(line):
  flags = []
  while line:
   line = line.strip()
+  if pessimistic[0]:
+   if line=='<hr>':
+    pessimistic[0]=False
+   line = ''
+   continue
   if line.find('</i>')==0:
    emph[0] = False
    line = line[4:]
@@ -169,17 +175,21 @@ def parseLine(line):
    line = line[7:]
    continue
   if line.find('<sub><i>opt</i></sub>')==0:
-   last = tokens.pop()
-   lastf = flags.pop()
-   tokens.extend(['[',last,']'])
-   flags.extend([True,lastf,True])
+   tokens.append('?????')
+   flags.append(True)
+   #last = tokens.pop()
+   #lastf = flags.pop()
+   #tokens.extend(['[',last,']'])
+   #flags.extend([True,lastf,True])
    line = line[21:]
    continue
   if line.find('<sub><i>opt')==0:
-   last = tokens.pop()
-   lastf = flags.pop()
-   tokens.extend(['[',last,']'])
-   flags.extend([True,lastf,True])
+   tokens.append('?????')
+   flags.append(True)
+   #last = tokens.pop()
+   #lastf = flags.pop()
+   #tokens.extend(['[',last,']'])
+   #flags.extend([True,lastf,True])
    line = line[11:]
    continue
   if line.find('</sub>')==0:
@@ -188,6 +198,15 @@ def parseLine(line):
   if line.find('<sub>')==0:
    line = line[5:]
    continue
+  if line.find('<hr>')==0:
+   line = line.replace('<hr>','')
+   pessimistic[0] = False
+   continue
+  if line.find('<a')==0:
+   #print 'Anchor found, skipping everything that is left of this snippet.'
+   pessimistic[0] = True
+   pessimistic[1] += 1
+   continue
   if line.find('<')==0:
    print 'Found unknown tag while parsing "'+line+'", skipping!'
    line = line[line.index('>')+1:]
@@ -274,6 +293,8 @@ def readGrammar(fn):
      # add choice branch
      choices.append([a,b])
  src.close()
+ if pessimistic[1]:
+  print 'Skipped',pessimistic[1],'anchor-containing snippets'
 
 def printGrammarText(fn):
  ext = open(fn,'w')
@@ -312,6 +333,11 @@ def automatedImprove():
    for i in range(0,len(bs)):
     if not bs[i]:
      continue
+    if bs[i]=='?????':
+     # Change to classic EBNF
+     bs[i-1] = '[ '+bs[i-1]
+     bs[i]   = ']'
+     continue
     if bs[i]=='"|"' and len(bs)>1 and nt.find('OrExpression')<0:
      print 'Terminal to nonterminal heuristic fix:',bs[i],'in',nt,'(suspicious context)'
      bs[i] = '|'
@@ -355,23 +381,86 @@ def automatedImprove():
       print 'Structural heuristic fix in',nt,'(singleton complex group)'
    newprods.append(fixBrackets(nt,' '.join(bs).split()))
   prods[nt]=newprods
+ pass
+
+def glueSymbols():
  for nt in prods.keys():
   newprods = []
   for bs in prods[nt]:
-   for i in range(0,len(bs)):
+   for i in range(0,len(bs)-1):
     if not bs[i]:
      continue
-    if bs[i][0]=='"':
-     if i+1<len(bs) and bs[i+1][0]=='"' and len(bs[i+1])==3 and bs[i+1][1].isalpha():
-      bs[i]='"'+bs[i][1:-1]+bs[i+1][1]+'"'
-      bs[i+1]=''
-      print 'Multiple terminals heuristic fix:',bs[i],'in',nt,'(2 to 1)'
-      continue
-     if i+1<len(bs) and bs[i+1][0]=='"' and len(bs[i])==3 and bs[i][1].isalpha():
-      bs[i]='"'+bs[i][1]+bs[i+1][1:-1]+'"'
-      bs[i+1]=''
-      print 'Multiple terminals heuristic fix:',bs[i],'in',nt,'(2 to 1)'
-      continue
+    if bs[i][0]=='"' and len(bs[i])==3 and bs[i][1].isalpha():
+     if bs[i+1][0]=='"':
+      # "N" "ame"
+      test = bs[i][1]+bs[i+1][1:-1]
+     else:
+      # "N" ame
+      test = bs[i][1]+bs[i+1]
+     if test.isalnum():
+      if test in prods.keys():
+       print 'Multiple terminals heuristic fix:','"'+test+'"','in',nt,'(2 to 1)'
+       bs[i] = test
+       bs[i+1]=''
+       print 'Terminal to nonterminal heuristic fix:',bs[i],'in',nt,'(familiar name)'
+      elif not (bs[i+1][0].isupper() or bs[i+1] in prods.keys()):
+       print 'Multiple terminals heuristic fix:','"'+test+'"','in',nt,'(2 to 1)'
+       bs[i] = '"'+test+'"'
+       bs[i+1]=''
+    elif bs[i][0]!='"' and len(bs[i])==1 and bs[i][0].isalpha():
+     if bs[i+1][0]=='"':
+      # N "ame"
+      test = bs[i][0]+bs[i+1][1:-1]
+     else:
+      # N ame
+      test = bs[i][0]+bs[i+1]
+     if test.isalnum():
+      if test in prods.keys():
+       print 'Multiple terminals heuristic fix:','"'+test+'"','in',nt,'(2 to 1)'
+       bs[i] = test
+       bs[i+1]=''
+       print 'Terminal to nonterminal heuristic fix:',bs[i],'in',nt,'(familiar name)'
+      elif not (bs[i+1][0].isupper() or bs[i+1] in prods.keys()):
+       print 'Multiple terminals heuristic fix:','"'+test+'"','in',nt,'(2 to 1)'
+       bs[i] = '"'+test+'"'
+       bs[i+1]=''
+   for i in range(1,len(bs)):
+    if not bs[i]:
+     continue
+    if bs[i][0]=='"' and len(bs[i])==3 and bs[i][1].isalpha():
+     if bs[i-1][0]=='"':
+      # "continu" "e"
+      test = bs[i-1][1:-1]+bs[i][1]
+     else:
+      # continu "e"
+      test = bs[i-1]+bs[i][1]
+     if test.isalnum():
+      if test in prods.keys():
+       print 'Multiple terminals heuristic fix:','"'+test+'"','in',nt,'(2 to 1)'
+       bs[i] = test
+       bs[i-1]=''
+       print 'Terminal to nonterminal heuristic fix:',bs[i],'in',nt,'(familiar name)'
+      elif bs[i-1] not in prods.keys():
+       print 'Multiple terminals heuristic fix:','"'+test+'"','in',nt,'(2 to 1)'
+       bs[i]='"'+test+'"'
+       bs[i-1]=''
+    elif bs[i][0]!='"' and len(bs[i])==1 and bs[i][0].isalpha():
+     if bs[i-1][0]=='"':
+      # "continu" e
+      test = bs[i-1][1:-1]+bs[i][0]
+     else:
+      # continu e
+      test = bs[i-1]+bs[i][0]
+     if test.isalnum():
+      if test in prods.keys():
+       print 'Multiple terminals heuristic fix:','"'+test+'"','in',nt,'(2 to 1)'
+       bs[i] = test
+       bs[i-1]=''
+       print 'Terminal to nonterminal heuristic fix:',bs[i],'in',nt,'(familiar name)'
+      elif bs[i-1] not in prods.keys():
+       print 'Multiple terminals heuristic fix:','"'+test+'"','in',nt,'(2 to 1)'
+       bs[i]='"'+test+'"'
+       bs[i-1]=''
    newprods.append(' '.join(bs).split())
   prods[nt]=newprods
  pass
@@ -429,6 +518,7 @@ def fixBracketPair(nt,arr,left,right):
   print 'Reading the HTML document...'
   readGrammar(sys.argv[1])
   print 'Massaging the grammar...'
+  glueSymbols()
   automatedImprove()
   print 'Writing the extracted grammar...'
   if sys.argv[-1]=='-bnf':

diff --git a/topics/extraction/html2bgf/xpathpre.py b/topics/extraction/html2bgf/xpathpre.py
@@ -1,13 +1,49 @@
 #!/usr/bin/python
 import sys
 
-f = open(sys.argv[1],'r')
-grammar = False
-print '<pre>'
-for chunk in ''.join(f.readlines()).split('<pre>'):
- if chunk.find('19.2')>0:
-  grammar = True
- elif grammar:
-  print chunk.split('</pre>')[0].replace('<br>','').replace('&#32;',' ')
-print '</pre>'
-f.close()
+yes = []
+no = []
+
+def checkSection(text,tagN,includeFlag):
+ for chapter in text.split('<h'+`tagN`+'>')[1:]:
+  grammar = includeFlag
+  content = chapter.split('</h'+`tagN`+'>')
+  for kw in yes:
+   if content[0].find(kw)>=0:
+    grammar = True
+  for kw in no:
+   if content[0].find(kw)>=0:
+    grammar = False
+  if grammar and content[1].find('<h')==-1:
+   for chunk in content[1].split('<pre>')[1:]:
+    print chunk.split('</pre>')[0].replace('<br>','').replace('&#32;',' ')
+    print '<hr>'
+  else:
+   #print 'Going deeper than',content[0].split()[0]
+   if grammar:
+    for chunk in content[1].split('<h'+`tagN+1`+'>')[0].split('<pre>')[1:]:
+     print chunk.split('</pre>')[0].replace('<br>','').replace('&#32;',' ')
+     print '<hr>'
+   checkSection(content[1],tagN+1,grammar)
+
+if len(sys.argv)<2:
+ print '''This tool simulates a particular XPath query that it can execute upon a badly composed HTML.
+
+Usage:
+	python xpathpre.py keyword [keyword ...] <input >output
+
+It will read the input, looking for sections (<h?>) that contain keywords in the title.
+Once found, it will output the content of all <pre> tags from such sections.
+Keywords can be positive or negative, with positive being default.'''
+else:
+ for kw in sys.argv[1:]:
+  if kw[0]=='-':
+   no.append(kw[1:])
+  elif kw[0]=='+':
+   yes.append(kw[1:])
+  else:
+   yes.append(kw)
+ print '<pre>'
+ checkSection(''.join(sys.stdin.readlines()),1,False)
+ print '</pre>'
+
diff --git a/topics/java/lci/Makefile b/topics/java/lci/Makefile
@@ -3,7 +3,7 @@ validator = ../../../shared/tools/checkxml
 build:
 
 diff: test
-	gdt bgf/jls1.prepare1.refactorStatements.structure1.bgf bgf/jls2.prepare2.bgf
+	gdt bgf/jls1.prepare1.refactorStatements.structure1.addFeatures1to2.bgf bgf/jls2.prepare2.bgf
 
 test:
 	python ../../convergence/lci/lci.py java.lcf architecture

diff --git a/topics/java/lci/java.lcf b/topics/java/lci/java.lcf
@@ -89,6 +89,7 @@
       <perform>prepare1</perform>
       <perform>refactorStatements</perform>
       <perform>structure1</perform>
+      <perform>addFeatures1to2</perform>
     </branch>
     <branch>
       <input>jls2</input>

diff --git a/topics/java/lci/xbgf/addFeatures1to2.xbgf b/topics/java/lci/xbgf/addFeatures1to2.xbgf
@@ -0,0 +1,14 @@
+<xbgf:sequence
+  xmlns:bgf="http://planet-sl.org/bgf"
+  xmlns:xbgf="http://planet-sl.org/xbgf">
+
+  <xbgf:add>
+    <bgf:production>
+      <nonterminal>Modifier</nonterminal>
+      <bgf:expression>
+        <terminal>strictfp</terminal>
+      </bgf:expression>
+    </bgf:production>
+  </xbgf:add>
+
+</xbgf:sequence>
diff --git a/topics/java/lci/xbgf/refactorStatements.xbgf b/topics/java/lci/xbgf/refactorStatements.xbgf
@@ -75,7 +75,7 @@
     <to>Statement</to>
   </xbgf:unite>
   <xbgf:inline>Finally</xbgf:inline>
-  <xbgf:fold>
+  <!--xbgf:fold>
     <bgf:production>
       <nonterminal>DoStatement</nonterminal>
       <bgf:expression>
@@ -98,7 +98,7 @@
         </sequence>
       </bgf:expression>
     </bgf:production>
-  </xbgf:fold>
+  </xbgf:fold-->
   <xbgf:project>
     <bgf:production>
       <nonterminal>ContinueStatement</nonterminal>
@@ -148,9 +148,6 @@
   <xbgf:inline>SwitchBlock</xbgf:inline>
 
 
-  <!--xbgf:restrict>
-
-  </xbgf:restrict-->
   <xbgf:undefine>StatementExpression</xbgf:undefine>
   <xbgf:define>
     <bgf:production>
@@ -161,4 +158,18 @@
     </bgf:production>
   </xbgf:define>
 
+  <!--xbgf:lassoc>
+    <bgf:production>
+      <nonterminal>SwitchBlockStatementGroups</nonterminal>
+      <bgf:expression>
+        <star>
+          <bgf:expression>
+            <nonterminal>SwitchBlockStatementGroup</nonterminal>
+          </bgf:expression>
+        </star>
+      </bgf:expression>
+    </bgf:production>
+  </xbgf:lassoc-->
+
+  <!--...-->
 </xbgf:sequence>
diff --git a/topics/java/lci/xbgf/structure1.xbgf b/topics/java/lci/xbgf/structure1.xbgf
@@ -89,5 +89,35 @@
 
 
   <xbgf:inline>PackageDeclaration</xbgf:inline>
+  <xbgf:unite>
+    <add>PrimaryNoNewArray</add>
+    <to>Primary</to>
+  </xbgf:unite>
+  <!--xbgf:inline>PrimaryNoNewArray</xbgf:inline-->
+  <xbgf:inline>ArrayCreationExpression</xbgf:inline>
 
-</xbgf:sequence>
+  <!--xbgf:extract>
+    <bgf:production>
+      <nonterminal>ClassOrInterfaceDeclaration</nonterminal>
+      <bgf:expression>
+        <sequence>
+          <bgf:expression>
+            <nonterminal>ModifiersOpt</nonterminal>
+          </bgf:expression>
+          <bgf:expression>
+            <choice>
+              <bgf:expression>
+                <nonterminal>ClassDeclaration</nonterminal>
+              </bgf:expression>
+              <bgf:expression>
+                <nonterminal>InterfaceDeclaration</nonterminal>
+              </bgf:expression>
+            </choice>
+          </bgf:expression>
+        </sequence>
+      </bgf:expression>
+    </bgf:production>
+  </xbgf:extract-->
+
+  <!--...-->
+</xbgf:sequence>