Skip to content

Commit

Permalink
extractor made more flexible
Browse files Browse the repository at this point in the history
git-svn-id: https://slps.svn.sourceforge.net/svnroot/slps@523 ab42f6e0-554d-0410-b580-99e487e6eeb2
  • Loading branch information
grammarware committed Mar 30, 2009
1 parent b4ac484 commit 4e63d94
Showing 1 changed file with 15 additions and 6 deletions.
21 changes: 15 additions & 6 deletions topics/extraction/html2bgf/getpre.py
Expand Up @@ -13,8 +13,12 @@ def processSection(text,tagN,cx,p):
# p - file output
max = cx
content = text.split('</h'+`tagN`+'>')
+ if content[0].find('<a name')==0:
+ title = content[0].split('>')[1].split()[0].split('<')[0]
+ else:
+ title = content[0].split()[0]
for kw in yes:
- if content[0].split()[0]==kw:
+ if title==kw:
max = counter[kw]
break
for kw in no:
Expand All @@ -25,15 +29,15 @@ def processSection(text,tagN,cx,p):
if max:
if pattern.has_key(kw):
if pattern[kw][len(pattern[kw])-max]=='-':
- #print 'Not including one <pre> in',kw
+ print 'Not including one <pre> in',kw
max -= 1
continue
- #print 'Matched <pre> in',content[0].split()[0]
+ print 'Matched <pre> in',title
p.write(pre.split('</pre>')[0].replace('<br>','').replace('&#32;',' '))
p.write('\n<hr>\n')
max -= 1
else:
- #print 'Skipped <pre> in',content[0].split()[0]
+ print 'Skipped <pre> in',title
pass
for ss in subsections[1:]:
processSection(ss,tagN+1,max,p)
Expand Down Expand Up @@ -77,7 +81,12 @@ def processSection(text,tagN,cx,p):
out = open(sys.argv[3],'w')
out.write('<pre>')
#checkSection(''.join(open(sys.argv[2],'r').readlines()),1,False,out,'start')
- for toplevel in ''.join(open(sys.argv[2],'r').readlines()).split('<h1>')[1:]:
- processSection(toplevel,1,0,out)
+ content = ''.join(open(sys.argv[2],'r').readlines())
+ if content.find('<h1>')<0:
+ for toplevel in content.split('<h2>')[1:]:
+ processSection(toplevel,2,0,out)
+ else:
+ for toplevel in content.split('<h1>')[1:]:
+ processSection(toplevel,1,0,out)
out.write('</pre>')
out.close()

0 comments on commit 4e63d94

Please sign in to comment.