diff --git a/technologies/HTMLFactExtractor/example/company.html b/technologies/HTMLFactExtractor/example/company.html
new file mode 100644
index 000000000..70a1b785f
--- /dev/null
+++ b/technologies/HTMLFactExtractor/example/company.html
@@ -0,0 +1,32 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+Company
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/technologies/HTMLFactExtractor/extractor.py b/technologies/HTMLFactExtractor/extractor.py
old mode 100644
new mode 100755
index aee778db2..f6ba06503
--- a/technologies/HTMLFactExtractor/extractor.py
+++ b/technologies/HTMLFactExtractor/extractor.py
@@ -1,68 +1,93 @@
+#! /usr/bin/env python
from HTMLParser import HTMLParser
import json
+import re
+import sys
-class MyHTMLParser(HTMLParser):
+
+class FragmentHTMLParser(HTMLParser):
tree_stack = []
fragments = {"fragments": []}
- def get_line_number(self, tag):
+ def get_line_number(self):
return self.getpos()[0]
- def get_current_position(self):
- position = self.fragments
- for fragment_name in self.tree_stack:
- for fragment in position["fragments"]:
- if fragment["name"] == fragment_name:
- position = fragment
- break
- return position
-
- def count_tags(self, fragment_name):
+ def count_fragments(self, new_fragment):
fragment_count = 0
for fragment in self.get_current_position()["fragments"]:
- if fragment["name"] == fragment_name:
+ if fragment["name"] == new_fragment["name"] and fragment["classifier"] == new_fragment["classifier"]:
fragment_count += 1
return fragment_count
- def add_fragment(self, new_fragment):
- self.get_current_position()["fragments"].append(new_fragment)
+ def get_current_position(self):
+ position = self.fragments
+ for fragment_id in self.tree_stack:
+ position = position["fragments"][fragment_id]
+ return position
+ def get_fragment_by_name_and_classifier(self, new_fragment):
+ fragment_list = self.get_current_position()["fragments"]
+ for fragment in fragment_list:
+ if fragment["name"] == new_fragment["name"] and fragment["classifier"] == new_fragment["classifier"]:
+ return fragment
+
+ def create_index(self, new_fragment):
+ old_index = self.count_fragments(new_fragment)
+ if old_index == 1:
+ self.get_fragment_by_name_and_classifier(new_fragment)["index"] = 1
+ new_fragment['index'] = 2
+ elif old_index > 1:
+ new_fragment['index'] = old_index + 1
+
+ def add_fragment(self, new_fragment, push_tree_stack):
+ self.create_index(new_fragment)
+ count_of_fragments = len(self.get_current_position()["fragments"])
+ self.get_current_position()["fragments"].append(new_fragment)
+ if push_tree_stack:
+ self.tree_stack.append(count_of_fragments)
#Begin Override Functions
def handle_starttag(self, tag, attrs):
- self.tree_stack.append(str(tag))
- #print(self.count_tags(str(tag)))
- self.add_fragment({"name": str(tag), "startLine": self.get_line_number(tag), "fragments": [], "classifier": "tag"})
-
+ self.add_fragment(
+ {"name": str(tag), "startLine": self.get_line_number(), "fragments": [], "classifier": "tag"}, True)
+ for attribute in attrs:
+ attribute_fragment = {"name": str(attribute[0]), "startLine": self.get_line_number(),
+ "endLine": self.get_line_number(), "fragments": [], "classifier": "attribute",
+ "value": str(attribute[1])}
+ self.get_current_position()["fragments"].append(attribute_fragment)
#debugBegin
- #print(self.get_line_number(tag))
- print(self.tree_stack)
+ #print("Beg " + str(tag) + str(self.tree_stack))
def handle_endtag(self, tag):
- self.get_current_position()["endLine"] = self.get_line_number(tag)
+ self.get_current_position()["endLine"] = self.get_line_number()
self.tree_stack.pop()
#debugBegin
- #print(self.get_line_number(tag))
- #print(self.tree_stack)
+ #print("End " + str(tag) + str(self.tree_stack))
def handle_data(self, data):
- pass
+ if re.search("\S", data):
+ end_line = self.get_line_number() + len(data.splitlines()) - 1
+ text_fragment = {"name": "text", "startLine": self.get_line_number(), "endLine": end_line,
+ "fragments": [], "classifier": "text"}
+ self.add_fragment(text_fragment, False)
def get_fragments(self):
return self.fragments
-# instantiate the parser and fed it some HTML
-parser = MyHTMLParser()
-parser.feed('\n'
- '\n'
- 'Test\n'
- '\n'
- '\n'
- 'Parse me!
\n'
- 'Parse me too!
\n'
- 'Parse me tooo!
\n'
- '\n'
- '\n')
+parser = FragmentHTMLParser()
+parser.feed(sys.stdin.read())
+
+#parser.feed('\n'
+# '\n'
+# 'Test
\n'
+# ''
+# '\n'
+# '\n'
+# 'Parse me!
\n'
+# 'Parse me too!
\n'
+# 'Hey Parse me tooo!
\n'
+# '\n'
+# '\n')
print(json.dumps(parser.get_fragments()))
\ No newline at end of file