Skip to content

Commit

Permalink
smarter!
Browse files Browse the repository at this point in the history
git-svn-id: https://slps.svn.sourceforge.net/svnroot/slps@273 ab42f6e0-554d-0410-b580-99e487e6eeb2
  • Loading branch information
grammarware committed Sep 15, 2008
1 parent 663cb89 commit 09b7441
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 7 deletions.
3 changes: 2 additions & 1 deletion topics/extraction/html2bgf/getpre.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,9 @@ def processSection(text,tagN,cx,p):
subsections=content[1].split('<h'+`tagN+1`+'>')
for pre in subsections[0].split('<pre>')[1:]:
if max:
print 'Matched <pre> in',content[0].split()[0]
p.write(pre.split('</pre>')[0].replace('<br>','').replace('&#32;',' '))
p.write('\n<hr>')
p.write('\n<hr>\n')
max -= 1
else:
#print 'Skipped <pre> in',content[0].split()[0]
Expand Down
27 changes: 21 additions & 6 deletions topics/extraction/html2bgf/html2bgf.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@ def preprocess(line):
return l2.replace('&gt ; ','&gt;').replace('&lt ; ','&lt;').replace('&amp ; ','&amp;')

def parseLine(line):
oldline = line[:]
tokens = []
flags = []
while line:
Expand All @@ -163,8 +164,14 @@ def parseLine(line):
line = line[5:]
continue
if line.find('<em>')==0:
emph[0] = True
line = line[4:]
if emph[0] and tokens and oldline.find(tokens[-1]+'<em>'+line[4:line.index('>')])>=0:
print 'Token-breaking <em> tag endangers',
line = tokens.pop()+line[4:]
print line.split()[0].split('<')[0]
flags.pop()
else:
emph[0] = True
line = line[4:]
continue
if line.find('<code>')==0:
emph[0] = False
Expand Down Expand Up @@ -228,22 +235,29 @@ def cleanup(line):
return line.replace('<!-- </i> -->','').replace(' ','\t')
#.replace('<code>','"').replace('</code>','"')

def ifContinuation(s):
def ifContinuation(s,olds):
if not s:
return False
if s[0]=='\t' and s[1]!='\t':
if s[0]=='\t' and (s[1]!='\t' or s[1]==olds[1]):
return False
if s[0]==' ':
i=0
while s[i]==' ' and olds[i]==' ':
i+=1
if olds[i]!=' ' and s[i]==' ' and s[i+1]!=' ':
# one space indentation equals line continuation
return True
return False
if s[0]=='<':
return ifContinuation(s[s.index('>')+1:])
return ifContinuation(s[s.index('>')+1:],olds)
return True

def readGrammar(fn):
oneof = False
src = open(fn,'r')
grammar = False
name = ''
oldline = ''
choices = []
for line in src:
if line.find('<pre>')>=0 or line.find('</pre>')>=0:
Expand All @@ -255,7 +269,8 @@ def readGrammar(fn):
grammar = not grammar
continue
if grammar:
cont = ifContinuation(line)
cont = ifContinuation(line,oldline)
oldline = line
line = preprocess(cleanup(line))
#print 'Parsing "'+line+'"...'
a,b=parseLine(line)
Expand Down

0 comments on commit 09b7441

Please sign in to comment.