Skip to content

Commit

Permalink
Release HTMLFactExtractor: handles tags, attributes and text
Browse files Browse the repository at this point in the history
  • Loading branch information
todeslord committed Jan 9, 2014
1 parent a94f90a commit 8333283
Show file tree
Hide file tree
Showing 2 changed files with 94 additions and 37 deletions.
32 changes: 32 additions & 0 deletions technologies/HTMLFactExtractor/example/company.html
@@ -0,0 +1,32 @@
<!DOCTYPE HTML>
<!-- Company view -->
<html>
<head>

<link rel="stylesheet" type="text/css" href="style.css" />
<title></title>
</head>
<body>

<center>
<script type="text/javascript" src="head.js"></script>

<h3>Company</h3>

<script type="text/javascript" src="company.js"></script>
<script type="text/javascript" src="companyController.js"></script>
<script type="text/javascript" src="companyView.js"></script>
<script type="text/javascript" src="companyModel.js"></script>

<div id="content" name="content"></div><br>
<div id="reset" name="reset"></div><br>
<div id="test" name="test"></div><br>

<script type="text/javascript">
model.loadCompany();

</script>
</center>

</body>
</html>
99 changes: 62 additions & 37 deletions technologies/HTMLFactExtractor/extractor.py 100644 → 100755
@@ -1,68 +1,93 @@
#! /usr/bin/env python
from HTMLParser import HTMLParser
import json
import re
import sys

class FragmentHTMLParser(HTMLParser):
    """Collect the tags, attributes and text of an HTML document as a tree.

    Every node ("fragment") is a dict with the keys ``name``, ``classifier``
    (``"tag"``, ``"attribute"`` or ``"text"``), ``startLine`` and a
    ``fragments`` list holding its children.  Tags and text also get an
    ``endLine``; attributes additionally carry a ``value``.  Siblings that
    share the same name and classifier are numbered with a 1-based ``index``.

    The root of the tree is ``{"fragments": [...]}`` and is returned by
    :meth:`get_fragments`.

    Note: end tags are not matched against the name of the open tag; an end
    tag simply closes whichever element is currently open.
    """

    # Elements that never have an end tag.  They must not be pushed onto the
    # tree stack, otherwise all following content would be nested below them.
    VOID_ELEMENTS = frozenset([
        "area", "base", "br", "col", "embed", "hr", "img", "input",
        "link", "meta", "param", "source", "track", "wbr",
    ])

    def __init__(self, *args, **kwargs):
        """Create a parser with its own, empty fragment tree."""
        # Explicit base call (not super()) keeps this working where
        # HTMLParser is an old-style class.
        HTMLParser.__init__(self, *args, **kwargs)
        # Per-instance state; as class attributes these mutable containers
        # would be shared by every parser instance.
        # tree_stack: child indices leading from the root to the open tag.
        self.tree_stack = []
        self.fragments = {"fragments": []}

    def get_line_number(self, tag=None):
        """Return the line component of the parser's current position.

        ``tag`` is ignored; it is accepted only so that callers written
        against the older one-argument form keep working.
        """
        return self.getpos()[0]

    def get_current_position(self):
        """Return the fragment addressed by ``tree_stack``.

        Starting at the root, each stack entry is used as an index into the
        ``fragments`` list of the node reached so far.  With an empty stack
        the root itself is returned.
        """
        position = self.fragments
        for child_index in self.tree_stack:
            position = position["fragments"][child_index]
        return position

    def _matching_fragments(self, new_fragment):
        """Return current-level fragments sharing name and classifier."""
        return [
            fragment
            for fragment in self.get_current_position()["fragments"]
            if fragment["name"] == new_fragment["name"]
            and fragment["classifier"] == new_fragment["classifier"]
        ]

    def count_fragments(self, new_fragment):
        """Count current-level fragments with the same name and classifier."""
        return len(self._matching_fragments(new_fragment))

    def get_fragment_by_name_and_classifier(self, new_fragment):
        """Return the first current-level match for ``new_fragment``.

        Returns ``None`` when no sibling has the same name and classifier.
        """
        matches = self._matching_fragments(new_fragment)
        return matches[0] if matches else None

    def create_index(self, new_fragment):
        """Assign a 1-based ``index`` to ``new_fragment`` if it has twins.

        A fragment without equally named siblings gets no index.  When the
        second one arrives, the first is retroactively given index 1 and the
        new one index 2; later ones continue the numbering.
        """
        old_index = self.count_fragments(new_fragment)
        if old_index == 1:
            self.get_fragment_by_name_and_classifier(new_fragment)["index"] = 1
            new_fragment["index"] = 2
        elif old_index > 1:
            new_fragment["index"] = old_index + 1

    def add_fragment(self, new_fragment, push_tree_stack=False):
        """Append ``new_fragment`` below the currently open fragment.

        When ``push_tree_stack`` is true the new fragment becomes the current
        position, i.e. its list index is pushed onto ``tree_stack``.
        """
        self.create_index(new_fragment)
        siblings = self.get_current_position()["fragments"]
        siblings.append(new_fragment)
        if push_tree_stack:
            self.tree_stack.append(len(siblings) - 1)

    #Begin Override Functions
    def handle_starttag(self, tag, attrs):
        """Record a tag fragment and one attribute fragment per attribute."""
        line = self.get_line_number()
        tag_fragment = {"name": str(tag), "startLine": line, "fragments": [],
                        "classifier": "tag"}
        is_void = tag in self.VOID_ELEMENTS
        if is_void:
            # No end tag will follow, so the element ends where it starts.
            tag_fragment["endLine"] = line
        self.add_fragment(tag_fragment, not is_void)
        for attribute in attrs:
            # Attributes are children of the tag they belong to.
            tag_fragment["fragments"].append(
                {"name": str(attribute[0]), "startLine": line,
                 "endLine": line, "fragments": [],
                 "classifier": "attribute", "value": str(attribute[1])})

    def handle_endtag(self, tag):
        """Close the currently open tag fragment and record its end line.

        End tags of void elements and end tags arriving while no element is
        open are ignored instead of popping from the stack.
        """
        if tag in self.VOID_ELEMENTS or not self.tree_stack:
            return
        self.get_current_position()["endLine"] = self.get_line_number()
        self.tree_stack.pop()

    def handle_data(self, data):
        """Record a text fragment for data containing non-whitespace."""
        if not re.search(r"\S", data):
            return
        start_line = self.get_line_number()
        # The text spans one line per line of data.
        end_line = start_line + len(data.splitlines()) - 1
        text_fragment = {"name": "text", "startLine": start_line,
                         "endLine": end_line, "fragments": [],
                         "classifier": "text"}
        self.add_fragment(text_fragment, False)

    def get_fragments(self):
        """Return the root of the fragment tree built so far."""
        return self.fragments


# Instantiate the parser and feed it the HTML document read from stdin.
parser = FragmentHTMLParser()
parser.feed(sys.stdin.read())
# close() makes the parser process any input it is still buffering.
parser.close()

# Sample input for manual testing:
#parser.feed('<html>\n'
#            '<head>\n'
#            '<title><h1>Test</h1></title>\n'
#            '<script src="source/sfd/fs.js" lang="JS"></script>'
#            '</head>\n'
#            '<body>\n'
#            '<h1>Parse me!</h1>\n'
#            '<h1>Parse me too!</h1>\n'
#            '<h1>Hey <b>P</b><b>a</b>rse me tooo!</h1>\n'
#            '</body>\n'
#            '</html>\n')

# Emit the fragment tree as JSON on stdout.
print(json.dumps(parser.get_fragments()))

0 comments on commit 8333283

Please sign in to comment.