/
parsing.py
82 lines (68 loc) · 2.87 KB
/
parsing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
from helpers import *
from HTMLParser import HTMLParser
################## PARSER ##############################
class MyHTMLParser(HTMLParser):
    """Collect the text of an HTML document, split into captioned sections.

    Results accumulate in ``self.snipdic``:

    * ``"captionlist"`` -- captions (sanitized text) in order of appearance.
    * ``"snipsecdict"`` -- maps a section key to the concatenated snippets
      of that section; text before the first caption goes to ``'Header'``.
    * ``"sniplist"``    -- every non-caption snippet in document order.
    * ``"sniptext"``    -- all active text, captions included, without the
      italic markers.

    Text is only recorded once a start tag whose source text contains
    ``BODY`` exactly once has been seen (the match is case-sensitive).
    """

    # A text node that sanitizes to exactly one of these opens a new section.
    SECTION_KEYWORDS = ("Grammatik", "Wortschatz", u"Übungen")
    # Entity references that are dropped instead of being copied to the output.
    IGNORED_ENTITIES = ("shy", "nbsp")
    # Markers wrapped around text that follows an <i> start tag.
    ITALIC_START = u'!!?i?!!'
    ITALIC_END = u'!!?/i?!!'

    def __init__(self):
        """Set up the empty result structure and the parser state."""
        HTMLParser.__init__(self)
        self.snipdic = {
            "captionlist": [],
            "snipsecdict": {'Header': ''},
            "sniplist": [],
            "sniptext": '',
        }
        self.section = 'Header'
        self.active = False
        self.inlist = []
        self.refs = ''
        # Initialised here so handle_data() cannot hit an AttributeError when
        # active text arrives before any start tag has set the markers.
        self.starttag = u''
        self.endtag = u''

    def handle_starttag(self, tag, attributes):
        """Remember the markers to wrap around the text following ``tag``.

        Only ``i`` produces markers; every other start tag clears them.
        """
        # NOTE(review): there is no handle_endtag(), so the markers stay in
        # effect after </i> until the next start tag. Text directly after a
        # closing </i> is therefore still wrapped -- confirm whether intended.
        if tag == 'i':
            self.starttag = self.ITALIC_START
            self.endtag = self.ITALIC_END
        else:
            self.starttag = u''
            self.endtag = u''

    def handle_entityref(self, name):
        """Append ``&name;`` to the current section and to the latest snippet.

        References listed in ``IGNORED_ENTITIES`` are skipped.
        """
        if name in self.IGNORED_ENTITIES:
            return
        entity = u"&" + name + ";"
        sections = self.snipdic["snipsecdict"]
        sections[self.section] = sections[self.section] + entity
        sniplist = self.snipdic["sniplist"]
        if sniplist:
            sniplist[-1] += entity
        else:
            # No snippet recorded yet: start one instead of raising IndexError.
            sniplist.append(entity)

    def handle_data(self, text):
        """Record a text node, or open a new section if it is a caption.

        A caption is appended to ``"captionlist"`` and resets the text of its
        section; it is not added to ``"sniplist"``. Any other text is appended
        to the current section and to ``"sniplist"``.
        """
        # get_starttag_text() returns None until the first start tag was seen.
        starttag_text = self.get_starttag_text() or ''
        if starttag_text.count("BODY") == 1:
            self.active = True
        if not self.active or text == '':
            return

        snippet = unicode(text)
        self.snipdic["sniptext"] += snippet
        snippet = self.starttag + snippet + self.endtag

        # Captions are detected via keywords rather than via <H1>..<H6> tags:
        # the headings are used too inconsistently in the raw documents. The
        # drawback is that a keyword may also show up on its own elsewhere.
        sec = sanitize_snippet(text)
        if sec != '' and self._is_caption(sec):
            # The section key is derived from the marked-up snippet, so it may
            # differ from the caption when the text is wrapped in markers.
            self.section = sanitize_snippet(snippet)
            self.snipdic["captionlist"].append(sec)
            self.snipdic["snipsecdict"][self.section] = ''
            return

        sections = self.snipdic["snipsecdict"]
        sections[self.section] = sections[self.section] + snippet
        self.snipdic["sniplist"].append(snippet)

    def _is_caption(self, sec):
        """Return True if the sanitized, non-empty text ``sec`` is a caption.

        That is either one of ``SECTION_KEYWORDS`` or a short ``Text <number>``
        label: starts with ``Text``, characters 4-7 are digits (spaces
        allowed), at most 9 characters long and without a colon.
        """
        if sec in self.SECTION_KEYWORDS:
            return True
        return (sec[0:4] == "Text"
                and sec[4:8].replace(" ", "").isdigit()
                and len(sec) <= 9
                and ":" not in sec)
##################### //PARSER #######################