diff --git a/technologies/HTMLFactExtractor/example/company.html b/technologies/HTMLFactExtractor/example/company.html new file mode 100644 index 000000000..70a1b785f --- /dev/null +++ b/technologies/HTMLFactExtractor/example/company.html @@ -0,0 +1,32 @@ + + + + + + + + + + +
+ + +

Company

+ + + + + + +

+

+

+ + +
#! /usr/bin/env python
"""Extract a nested "fragment" tree from an HTML document.

When run as a script, HTML is read from standard input and a JSON document
of the form ``{"fragments": [...]}`` is printed to standard output.  Every
fragment is a dict with the keys ``name``, ``classifier`` (``"tag"``,
``"attribute"`` or ``"text"``), ``startLine``, ``endLine`` and a nested
``fragments`` list.  Attribute fragments additionally carry ``value``, and
fragments that share name and classifier with a sibling carry a 1-based
``index``.
"""
import json
import re
import sys

try:
    from HTMLParser import HTMLParser  # Python 2
except ImportError:
    from html.parser import HTMLParser  # Python 3

# Elements that never have an end tag (the HTML "void elements").  They must
# not stay on the open-element stack, otherwise every following node would be
# nested beneath them.
VOID_ELEMENTS = frozenset([
    "area", "base", "br", "col", "embed", "hr", "img", "input",
    "link", "meta", "param", "source", "track", "wbr",
])

# Matches any non-whitespace character; used to skip whitespace-only text.
_NON_WHITESPACE = re.compile(r"\S")


class FragmentHTMLParser(HTMLParser):
    """HTML parser that records tags, attributes and text as nested fragments.

    ``fragments`` is the root container (``{"fragments": [...]}``).
    ``tree_stack`` holds, for every currently open element, its index inside
    its parent's ``fragments`` list; following those indices from the root
    leads to the element new fragments are attached to.
    """

    def __init__(self, *args, **kwargs):
        # HTMLParser is an old-style class on Python 2, so super() cannot be
        # used here.
        HTMLParser.__init__(self, *args, **kwargs)
        # Per-instance state: as class attributes these mutable containers
        # would be shared by every parser instance.
        self.tree_stack = []
        self.fragments = {"fragments": []}

    def get_line_number(self):
        """Return the parser's current line number as reported by getpos()."""
        return self.getpos()[0]

    def _open_fragments(self):
        """Return the open element fragments, outermost first."""
        path = []
        position = self.fragments
        for fragment_id in self.tree_stack:
            position = position["fragments"][fragment_id]
            path.append(position)
        return path

    def _matching_siblings(self, new_fragment):
        """Return fragments at the current position that share the name and
        classifier of ``new_fragment``."""
        return [
            fragment
            for fragment in self.get_current_position()["fragments"]
            if fragment["name"] == new_fragment["name"]
            and fragment["classifier"] == new_fragment["classifier"]
        ]

    def count_fragments(self, new_fragment):
        """Count fragments at the current position with the same name and
        classifier as ``new_fragment``."""
        return len(self._matching_siblings(new_fragment))

    def get_current_position(self):
        """Return the fragment that new children are attached to.

        This is the innermost open element, or the root container when no
        element is open.
        """
        position = self.fragments
        for fragment_id in self.tree_stack:
            position = position["fragments"][fragment_id]
        return position

    def get_fragment_by_name_and_classifier(self, new_fragment):
        """Return the first fragment at the current position with the same
        name and classifier as ``new_fragment``, or None if there is none."""
        matches = self._matching_siblings(new_fragment)
        return matches[0] if matches else None

    def create_index(self, new_fragment):
        """Assign a 1-based ``index`` to ``new_fragment`` if it has siblings
        with the same name and classifier.

        A fragment that is unique among its siblings gets no index.  When the
        second one arrives, the first is retroactively given index 1.
        """
        siblings = self._matching_siblings(new_fragment)
        if not siblings:
            return
        if len(siblings) == 1:
            siblings[0]["index"] = 1
        new_fragment["index"] = len(siblings) + 1

    def add_fragment(self, new_fragment, push_tree_stack):
        """Attach ``new_fragment`` at the current position.

        If ``push_tree_stack`` is true the fragment becomes the new current
        position, i.e. following fragments are added as its children.
        """
        self.create_index(new_fragment)
        siblings = self.get_current_position()["fragments"]
        siblings.append(new_fragment)
        if push_tree_stack:
            self.tree_stack.append(len(siblings) - 1)

    # Begin override functions

    def handle_starttag(self, tag, attrs):
        """Open a tag fragment and record its attributes as child fragments."""
        line = self.get_line_number()
        element = {"name": tag, "startLine": line, "fragments": [], "classifier": "tag"}
        self.add_fragment(element, True)
        for name, value in attrs:
            element["fragments"].append({
                "name": name,
                "startLine": line,
                "endLine": line,
                "fragments": [],
                "classifier": "attribute",
                # The parser reports None for attributes written without a
                # value (e.g. <input disabled>); emit an empty string rather
                # than the text "None".
                "value": "" if value is None else value,
            })
        if tag in VOID_ELEMENTS:
            # No end tag will follow, so close the element right away.
            element["endLine"] = line
            self.tree_stack.pop()

    def handle_endtag(self, tag):
        """Close the nearest open element named ``tag``.

        Elements opened inside it that were never closed explicitly are
        closed on the same line.  End tags without a matching open element
        are ignored.
        """
        open_fragments = self._open_fragments()
        for depth in range(len(open_fragments) - 1, -1, -1):
            if open_fragments[depth]["name"] == tag:
                break
        else:
            return
        line = self.get_line_number()
        for fragment in open_fragments[depth:]:
            fragment["endLine"] = line
        del self.tree_stack[depth:]

    def handle_data(self, data):
        """Record text that contains at least one non-whitespace character."""
        if not _NON_WHITESPACE.search(data):
            return
        start_line = self.get_line_number()
        end_line = start_line + len(data.splitlines()) - 1
        self.add_fragment(
            {"name": "text", "startLine": start_line, "endLine": end_line,
             "fragments": [], "classifier": "text"},
            False)

    def get_fragments(self):
        """Return the root container ``{"fragments": [...]}``."""
        return self.fragments


def main():
    """Parse HTML from stdin and print the fragment tree as JSON."""
    parser = FragmentHTMLParser()
    parser.feed(sys.stdin.read())
    # Force processing of any text the parser is still buffering.
    parser.close()
    print(json.dumps(parser.get_fragments()))


if __name__ == "__main__":
    main()