Skip to content

Commit

Permalink
general preprocessor; LCI advancements; much smarter extractor
Browse files Browse the repository at this point in the history
git-svn-id: https://slps.svn.sourceforge.net/svnroot/slps@257 ab42f6e0-554d-0410-b580-99e487e6eeb2
  • Loading branch information
grammarware committed Sep 11, 2008
1 parent 1799766 commit 35e33d4
Show file tree
Hide file tree
Showing 8 changed files with 221 additions and 39 deletions.
4 changes: 2 additions & 2 deletions shared/tools/jls2bgf
Expand Up @@ -8,14 +8,14 @@ SLPS=${PWD}
cd ${LOCAL1}

if [ $# -lt 2 ]; then
echo "This tool extracts a BGF from Java Language Standard 1"
echo "This tool extracts a BGF from Java Language Standard that needs pre-processing"
echo "Usage: $0 <input-hypertext-document> <output-bgf> [options]"
exit 1
elif [ ! -r $1 ]; then
echo "Oops: $1 not found or not readable."
exit 1
else
python ${SLPS}/topics/extraction/html2bgf/xpathpre.py $1 > $1.fixed
python ${SLPS}/topics/extraction/html2bgf/xpathpre.py LALR -Difficulties <$1 > $1.fixed
python ${SLPS}/topics/extraction/html2bgf/html2bgf.py $1.fixed $2 $3
rm -f $1.fixed
fi
130 changes: 110 additions & 20 deletions topics/extraction/html2bgf/html2bgf.py
Expand Up @@ -3,6 +3,7 @@

#global
emph = [False]
pessimistic = [False,0]
prods = {}

def serialise(name,choices):
Expand Down Expand Up @@ -144,6 +145,11 @@ def parseLine(line):
flags = []
while line:
line = line.strip()
if pessimistic[0]:
if line=='<hr>':
pessimistic[0]=False
line = ''
continue
if line.find('</i>')==0:
emph[0] = False
line = line[4:]
Expand All @@ -169,17 +175,21 @@ def parseLine(line):
line = line[7:]
continue
if line.find('<sub><i>opt</i></sub>')==0:
last = tokens.pop()
lastf = flags.pop()
tokens.extend(['[',last,']'])
flags.extend([True,lastf,True])
tokens.append('?????')
flags.append(True)
#last = tokens.pop()
#lastf = flags.pop()
#tokens.extend(['[',last,']'])
#flags.extend([True,lastf,True])
line = line[21:]
continue
if line.find('<sub><i>opt')==0:
last = tokens.pop()
lastf = flags.pop()
tokens.extend(['[',last,']'])
flags.extend([True,lastf,True])
tokens.append('?????')
flags.append(True)
#last = tokens.pop()
#lastf = flags.pop()
#tokens.extend(['[',last,']'])
#flags.extend([True,lastf,True])
line = line[11:]
continue
if line.find('</sub>')==0:
Expand All @@ -188,6 +198,15 @@ def parseLine(line):
if line.find('<sub>')==0:
line = line[5:]
continue
if line.find('<hr>')==0:
line = line.replace('<hr>','')
pessimistic[0] = False
continue
if line.find('<a')==0:
#print 'Anchor found, skipping everything that is left of this snippet.'
pessimistic[0] = True
pessimistic[1] += 1
continue
if line.find('<')==0:
print 'Found unknown tag while parsing "'+line+'", skipping!'
line = line[line.index('>')+1:]
Expand Down Expand Up @@ -274,6 +293,8 @@ def readGrammar(fn):
# add choice branch
choices.append([a,b])
src.close()
if pessimistic[1]:
print 'Skipped',pessimistic[1],'anchor-containing snippets'

def printGrammarText(fn):
ext = open(fn,'w')
Expand Down Expand Up @@ -312,6 +333,11 @@ def automatedImprove():
for i in range(0,len(bs)):
if not bs[i]:
continue
if bs[i]=='?????':
# Change to classic EBNF
bs[i-1] = '[ '+bs[i-1]
bs[i] = ']'
continue
if bs[i]=='"|"' and len(bs)>1 and nt.find('OrExpression')<0:
print 'Terminal to nonterminal heuristic fix:',bs[i],'in',nt,'(suspicious context)'
bs[i] = '|'
Expand Down Expand Up @@ -355,23 +381,86 @@ def automatedImprove():
print 'Structural heuristic fix in',nt,'(singleton complex group)'
newprods.append(fixBrackets(nt,' '.join(bs).split()))
prods[nt]=newprods
pass

def glueSymbols():
for nt in prods.keys():
newprods = []
for bs in prods[nt]:
for i in range(0,len(bs)):
for i in range(0,len(bs)-1):
if not bs[i]:
continue
if bs[i][0]=='"':
if i+1<len(bs) and bs[i+1][0]=='"' and len(bs[i+1])==3 and bs[i+1][1].isalpha():
bs[i]='"'+bs[i][1:-1]+bs[i+1][1]+'"'
bs[i+1]=''
print 'Multiple terminals heuristic fix:',bs[i],'in',nt,'(2 to 1)'
continue
if i+1<len(bs) and bs[i+1][0]=='"' and len(bs[i])==3 and bs[i][1].isalpha():
bs[i]='"'+bs[i][1]+bs[i+1][1:-1]+'"'
bs[i+1]=''
print 'Multiple terminals heuristic fix:',bs[i],'in',nt,'(2 to 1)'
continue
if bs[i][0]=='"' and len(bs[i])==3 and bs[i][1].isalpha():
if bs[i+1][0]=='"':
# "N" "ame"
test = bs[i][1]+bs[i+1][1:-1]
else:
# "N" ame
test = bs[i][1]+bs[i+1]
if test.isalnum():
if test in prods.keys():
print 'Multiple terminals heuristic fix:','"'+test+'"','in',nt,'(2 to 1)'
bs[i] = test
bs[i+1]=''
print 'Terminal to nonterminal heuristic fix:',bs[i],'in',nt,'(familiar name)'
elif not (bs[i+1][0].isupper() or bs[i+1] in prods.keys()):
print 'Multiple terminals heuristic fix:','"'+test+'"','in',nt,'(2 to 1)'
bs[i] = '"'+test+'"'
bs[i+1]=''
elif bs[i][0]!='"' and len(bs[i])==1 and bs[i][0].isalpha():
if bs[i+1][0]=='"':
# N "ame"
test = bs[i][0]+bs[i+1][1:-1]
else:
# N ame
test = bs[i][0]+bs[i+1]
if test.isalnum():
if test in prods.keys():
print 'Multiple terminals heuristic fix:','"'+test+'"','in',nt,'(2 to 1)'
bs[i] = test
bs[i+1]=''
print 'Terminal to nonterminal heuristic fix:',bs[i],'in',nt,'(familiar name)'
elif not (bs[i+1][0].isupper() or bs[i+1] in prods.keys()):
print 'Multiple terminals heuristic fix:','"'+test+'"','in',nt,'(2 to 1)'
bs[i] = '"'+test+'"'
bs[i+1]=''
for i in range(1,len(bs)):
if not bs[i]:
continue
if bs[i][0]=='"' and len(bs[i])==3 and bs[i][1].isalpha():
if bs[i-1][0]=='"':
# "continu" "e"
test = bs[i-1][1:-1]+bs[i][1]
else:
# continu "e"
test = bs[i-1]+bs[i][1]
if test.isalnum():
if test in prods.keys():
print 'Multiple terminals heuristic fix:','"'+test+'"','in',nt,'(2 to 1)'
bs[i] = test
bs[i-1]=''
print 'Terminal to nonterminal heuristic fix:',bs[i],'in',nt,'(familiar name)'
elif bs[i-1] not in prods.keys():
print 'Multiple terminals heuristic fix:','"'+test+'"','in',nt,'(2 to 1)'
bs[i]='"'+test+'"'
bs[i-1]=''
elif bs[i][0]!='"' and len(bs[i])==1 and bs[i][0].isalpha():
if bs[i-1][0]=='"':
# "continu" e
test = bs[i-1][1:-1]+bs[i][0]
else:
# continu e
test = bs[i-1]+bs[i][0]
if test.isalnum():
if test in prods.keys():
print 'Multiple terminals heuristic fix:','"'+test+'"','in',nt,'(2 to 1)'
bs[i] = test
bs[i-1]=''
print 'Terminal to nonterminal heuristic fix:',bs[i],'in',nt,'(familiar name)'
elif bs[i-1] not in prods.keys():
print 'Multiple terminals heuristic fix:','"'+test+'"','in',nt,'(2 to 1)'
bs[i]='"'+test+'"'
bs[i-1]=''
newprods.append(' '.join(bs).split())
prods[nt]=newprods
pass
Expand Down Expand Up @@ -429,6 +518,7 @@ def fixBracketPair(nt,arr,left,right):
print 'Reading the HTML document...'
readGrammar(sys.argv[1])
print 'Massaging the grammar...'
glueSymbols()
automatedImprove()
print 'Writing the extracted grammar...'
if sys.argv[-1]=='-bnf':
Expand Down
56 changes: 46 additions & 10 deletions topics/extraction/html2bgf/xpathpre.py
@@ -1,13 +1,49 @@
#!/usr/bin/python
import sys

f = open(sys.argv[1],'r')
grammar = False
print '<pre>'
for chunk in ''.join(f.readlines()).split('<pre>'):
if chunk.find('19.2')>0:
grammar = True
elif grammar:
print chunk.split('</pre>')[0].replace('<br>','').replace('&#32;',' ')
print '</pre>'
f.close()
# Keyword lists filled from the command line before checkSection() runs:
# a title containing a keyword from `yes` switches inclusion on, a keyword
# from `no` switches it off again (negative keywords win).
yes = []
no = []

def _printSnippets(html):
    """Print the content of every <pre> element found in html.

    <br> tags are dropped and &#32; entities become plain spaces. Each
    snippet is followed by an <hr> line so that a consumer can tell the
    snippets apart.
    """
    for chunk in html.split('<pre>')[1:]:
        print(chunk.split('</pre>')[0].replace('<br>','').replace('&#32;',' '))
        print('<hr>')

def checkSection(text,tagN,includeFlag):
    """Print the <pre> snippets of all selected sections of an HTML text.

    text        -- HTML source to scan.
    tagN        -- heading level to split on (1 for <h1>, 2 for <h2>, ...).
    includeFlag -- whether sections at this level are included by default,
                   i.e. whether the enclosing section was included.

    A section is included when its title contains a keyword from `yes`, or
    when it inherits inclusion from its parent; a keyword from `no` in the
    title excludes it. For an included section, only the part before its
    first subsection heading is printed here. The subsections are handled
    by the recursive call, which inherits this section's decision.
    """
    # str() instead of backticks: identical for ints, and valid on Python 3.
    opening = '<h'+str(tagN)+'>'
    closing = '</h'+str(tagN)+'>'
    deeper = '<h'+str(tagN+1)+'>'
    for chapter in text.split(opening)[1:]:
        content = chapter.split(closing)
        if len(content)<2:
            # Without a closing tag the title cannot be told apart from the
            # body; report on stderr (stdout carries the result) and go on.
            sys.stderr.write('Warning: unclosed '+opening+' heading skipped\n')
            continue
        title = content[0]
        body = content[1]
        grammar = includeFlag
        if any(kw in title for kw in yes):
            grammar = True
        if any(kw in title for kw in no):
            grammar = False
        if grammar:
            # If there is no subsection heading, this is the whole body.
            _printSnippets(body.split(deeper)[0])
        # A no-op when the body contains no heading of the next level.
        checkSection(body,tagN+1,grammar)

# Script entry point: with no arguments print the usage text, otherwise
# collect the keywords and filter the HTML document read from stdin.
if len(sys.argv)<2:
    print('''This tool simulates a particular XPath query that it can execute upon a badly composed HTML.
Usage:
python xpathpre.py keyword [keyword ...] <input >output
It will read the input, looking for sections (<h?>) that contain keywords in the title.
Once found, it will output the content of all <pre> tags from such sections.
Keywords can be positive or negative, with positive being default.''')
else:
    for kw in sys.argv[1:]:
        # A leading '-' marks a negative keyword, a leading '+' (or no
        # sign at all) a positive one. Slicing keeps '' from raising.
        if kw[:1]=='-':
            target = no
            kw = kw[1:]
        elif kw[:1]=='+':
            target = yes
            kw = kw[1:]
        else:
            target = yes
        # An empty keyword is a substring of every title, so a bare '-'
        # or '+' would exclude or include all sections: ignore it.
        if kw:
            target.append(kw)
    # The selected snippets are wrapped into one <pre> element.
    print('<pre>')
    checkSection(sys.stdin.read(),1,False)
    print('</pre>')

2 changes: 1 addition & 1 deletion topics/java/lci/Makefile
Expand Up @@ -3,7 +3,7 @@ validator = ../../../shared/tools/checkxml
build:

diff: test
gdt bgf/jls1.prepare1.refactorStatements.structure1.bgf bgf/jls2.prepare2.bgf
gdt bgf/jls1.prepare1.refactorStatements.structure1.addFeatures1to2.bgf bgf/jls2.prepare2.bgf

test:
python ../../convergence/lci/lci.py java.lcf architecture
Expand Down
1 change: 1 addition & 0 deletions topics/java/lci/java.lcf
Expand Up @@ -89,6 +89,7 @@
<perform>prepare1</perform>
<perform>refactorStatements</perform>
<perform>structure1</perform>
<perform>addFeatures1to2</perform>
</branch>
<branch>
<input>jls2</input>
Expand Down
14 changes: 14 additions & 0 deletions topics/java/lci/xbgf/addFeatures1to2.xbgf
@@ -0,0 +1,14 @@
<xbgf:sequence
xmlns:bgf="http://planet-sl.org/bgf"
xmlns:xbgf="http://planet-sl.org/xbgf">

<!-- A single xbgf:add step: its production defines the nonterminal
Modifier as the terminal "strictfp".
NOTE(review): judging by the file name, this lifts the JLS1 grammar
towards JLS2; confirm against the java.lcf branch that performs it. -->
<xbgf:add>
<bgf:production>
<nonterminal>Modifier</nonterminal>
<bgf:expression>
<terminal>strictfp</terminal>
</bgf:expression>
</bgf:production>
</xbgf:add>

</xbgf:sequence>
21 changes: 16 additions & 5 deletions topics/java/lci/xbgf/refactorStatements.xbgf
Expand Up @@ -75,7 +75,7 @@
<to>Statement</to>
</xbgf:unite>
<xbgf:inline>Finally</xbgf:inline>
<xbgf:fold>
<!--xbgf:fold>
<bgf:production>
<nonterminal>DoStatement</nonterminal>
<bgf:expression>
Expand All @@ -98,7 +98,7 @@
</sequence>
</bgf:expression>
</bgf:production>
</xbgf:fold>
</xbgf:fold-->
<xbgf:project>
<bgf:production>
<nonterminal>ContinueStatement</nonterminal>
Expand Down Expand Up @@ -148,9 +148,6 @@
<xbgf:inline>SwitchBlock</xbgf:inline>


<!--xbgf:restrict>

</xbgf:restrict-->
<xbgf:undefine>StatementExpression</xbgf:undefine>
<xbgf:define>
<bgf:production>
Expand All @@ -161,4 +158,18 @@
</bgf:production>
</xbgf:define>

<!--xbgf:lassoc>
<bgf:production>
<nonterminal>SwitchBlockStatementGroups</nonterminal>
<bgf:expression>
<star>
<bgf:expression>
<nonterminal>SwitchBlockStatementGroup</nonterminal>
</bgf:expression>
</star>
</bgf:expression>
</bgf:production>
</xbgf:lassoc-->

<!--...-->
</xbgf:sequence>
32 changes: 31 additions & 1 deletion topics/java/lci/xbgf/structure1.xbgf
Expand Up @@ -89,5 +89,35 @@


<xbgf:inline>PackageDeclaration</xbgf:inline>
<xbgf:unite>
<add>PrimaryNoNewArray</add>
<to>Primary</to>
</xbgf:unite>
<!--xbgf:inline>PrimaryNoNewArray</xbgf:inline-->
<xbgf:inline>ArrayCreationExpression</xbgf:inline>

</xbgf:sequence>
<!--xbgf:extract>
<bgf:production>
<nonterminal>ClassOrInterfaceDeclaration</nonterminal>
<bgf:expression>
<sequence>
<bgf:expression>
<nonterminal>ModifiersOpt</nonterminal>
</bgf:expression>
<bgf:expression>
<choice>
<bgf:expression>
<nonterminal>ClassDeclaration</nonterminal>
</bgf:expression>
<bgf:expression>
<nonterminal>InterfaceDeclaration</nonterminal>
</bgf:expression>
</choice>
</bgf:expression>
</sequence>
</bgf:expression>
</bgf:production>
</xbgf:extract-->

<!--...-->
</xbgf:sequence>

0 comments on commit 35e33d4

Please sign in to comment.