Skip to content

Commit

Permalink
Improved implicit close tag heuristic detection when parsing malforme…
Browse files Browse the repository at this point in the history
…d HTML.

Fixes an issue where appending / prepending rows to a table (or  to similar implicit
element structures) would create a redundant wrapping elements.

Fixes jhy#21
  • Loading branch information
jhy committed Jul 2, 2011
1 parent 5599adf commit 0081d16
Show file tree
Hide file tree
Showing 6 changed files with 87 additions and 25 deletions.
6 changes: 6 additions & 0 deletions CHANGES
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,12 @@ jsoup changelog
* Added Element#parents and Elements#parents to retrieve an element's ancestor chain
<http://github.com/jhy/jsoup/issues/issue/20>

* Fixes an issue where appending / prepending rows to a table (or to similar implicit
element structures) would create a redundant wrapping elements
<http://github.com/jhy/jsoup/issues/issue/21>

* Improved implicit close tag heuristic detection when parsing malformed HTML

* Fixes an issue where text content after a script (or other data-node) was
incorrectly added to the data node.
<http://github.com/jhy/jsoup/issues/issue/22>
Expand Down
8 changes: 3 additions & 5 deletions src/main/java/org/jsoup/nodes/Element.java
Original file line number Diff line number Diff line change
Expand Up @@ -267,8 +267,7 @@ public Element prependText(String text) {
public Element append(String html) {
Validate.notNull(html);

Element fragment = Parser.parseBodyFragment(html, baseUri).body();
// TODO: must parse without implicit elements, so you can e.g. add <td> to a <tr> (without creating a whole new table)
Element fragment = Parser.parseBodyFragmentRelaxed(html, baseUri()).body();
for (Node node : fragment.childNodes()) {
node.parentNode = null;
appendChild(node);
Expand All @@ -285,8 +284,7 @@ public Element append(String html) {
public Element prepend(String html) {
Validate.notNull(html);

Element fragment = Parser.parseBodyFragment(html, baseUri).body();
// TODO: must parse without implicit elements, so you can e.g. add <td> to a <tr> (without creating a whole new table)
Element fragment = Parser.parseBodyFragmentRelaxed(html, baseUri()).body();
List<Node> nodes = fragment.childNodes();
for (int i = nodes.size() - 1; i >= 0; i--) {
Node node = nodes.get(i);
Expand All @@ -313,7 +311,7 @@ public Element empty() {
public Element wrap(String html) {
Validate.notEmpty(html);

Element wrapBody = Parser.parseBodyFragment(html, baseUri).body();
Element wrapBody = Parser.parseBodyFragmentRelaxed(html, baseUri).body();
Elements wrapChildren = wrapBody.children();
Element wrap = wrapChildren.first();
if (wrap == null) // nothing to wrap with; noop
Expand Down
24 changes: 21 additions & 3 deletions src/main/java/org/jsoup/parser/Parser.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ public class Parser {
private final TokenQueue tq;
private final Document doc;
private String baseUri;
private boolean relaxed = false;

private Parser(String html, String baseUri, boolean isBodyFragment) {
Validate.notNull(html);
Expand Down Expand Up @@ -63,6 +64,19 @@ public static Document parseBodyFragment(String bodyHtml, String baseUri) {
return parser.parse();
}

/**
Parse a fragment of HTML into the {@code body} of a Document, with relaxed parsing enabled. Relaxed, in this
context, means that implicit tags are not automatically created when missing.
@param bodyHtml fragment of HTML
@param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
@return Document, with empty head, and HTML parsed into body
*/
public static Document parseBodyFragmentRelaxed(String bodyHtml, String baseUri) {
Parser parser = new Parser(bodyHtml, baseUri, true);
parser.relaxed = true;
return parser.parse();
}

private Document parse() {
while (!tq.isEmpty()) {
if (tq.matches("<!--")) {
Expand Down Expand Up @@ -213,7 +227,7 @@ private Element addChildToParent(Element child, boolean isEmptyElement) {
Tag childTag = child.tag();
boolean validAncestor = stackHasValidParent(childTag);

if (!validAncestor) {
if (!validAncestor && !relaxed) {
// create implicit parent around this child
Tag parentTag = childTag.getImplicitParent();
Element implicit = new Element(parentTag, baseUri);
Expand Down Expand Up @@ -241,11 +255,15 @@ private Element addChildToParent(Element child, boolean isEmptyElement) {
private boolean stackHasValidParent(Tag childTag) {
if (stack.size() == 1 && childTag.equals(htmlTag))
return true; // root is valid for html node


if (childTag.requiresSpecificParent())
return stack.getLast().tag().isValidParent(childTag);

// otherwise, look up the stack for valid ancestors
for (int i = stack.size() -1; i >= 0; i--) {
Element el = stack.get(i);
Tag parent2 = el.tag();
if (parent2.isValidParent(childTag)) {
if (parent2.isValidAncestor(childTag)) {
return true;
}
}
Expand Down
53 changes: 36 additions & 17 deletions src/main/java/org/jsoup/parser/Tag.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ public class Tag {
private boolean optionalClosing = false; // If tag is open, and another seen, close previous tag
private boolean empty = false; // can hold nothing; e.g. img
private boolean preserveWhitespace = false; // for pre, textarea, script etc
private List<Tag> ancestors;
private List<Tag> ancestors; // elements must be a descendant of one of these ancestors
private Tag parent; // if not null, elements must be a direct child of parent

private Tag(String tagName) {
this.tagName = tagName.toLowerCase();
Expand Down Expand Up @@ -95,6 +96,10 @@ boolean canContain(Tag child) {
if (this.tagName.equals("dd") && child.tagName.equals("dt"))
return false;

// don't allow children to contain their parent (directly)
if (this.requiresSpecificParent() && this.getImplicitParent().equals(child))
return false;

return true;
}

Expand Down Expand Up @@ -150,7 +155,15 @@ Tag getImplicitParent() {
return (!ancestors.isEmpty()) ? ancestors.get(0) : null;
}

boolean requiresSpecificParent() {
return this.parent != null;
}

boolean isValidParent(Tag child) {
return this.equals(child.parent);
}

boolean isValidAncestor(Tag child) {
if (child.ancestors.isEmpty())
return true; // HTML tag

Expand Down Expand Up @@ -217,8 +230,8 @@ public String toString() {
createBlock("TITLE").setAncestor("HEAD", "BODY").setContainDataOnly();
createInline("BASE").setAncestor("HEAD", "BODY").setEmpty();

createBlock("FRAME").setAncestor("FRAMESET").setEmpty();
createBlock("NOFRAMES").setAncestor("FRAMESET").setContainDataOnly();
createBlock("FRAME").setParent("FRAMESET").setEmpty();
createBlock("NOFRAMES").setParent("FRAMESET").setContainDataOnly();



Expand Down Expand Up @@ -281,34 +294,34 @@ public String toString() {
createInline("TEXTAREA").setAncestor("FORM").setContainDataOnly();
createInline("LABEL").setAncestor("FORM").setOptionalClosing(); // not self
createInline("BUTTON").setAncestor("FORM"); // bunch of excludes not defined
createInline("OPTGROUP").setAncestor("SELECT"); // only contain option
createInline("OPTION").setAncestor("SELECT").setContainDataOnly();
createInline("OPTGROUP").setParent("SELECT"); // only contain option
createInline("OPTION").setParent("SELECT").setContainDataOnly();
createBlock("FIELDSET").setAncestor("FORM");
createInline("LEGEND").setAncestor("FIELDSET");

// other
createInline("AREA").setEmpty(); // not an inline per-se
createInline("PARAM").setAncestor("OBJECT").setEmpty();
createInline("PARAM").setParent("OBJECT").setEmpty();
createBlock("INS"); // only within body
createBlock("DEL"); // only within body

createBlock("DL");
createInline("DT").setAncestor("DL").setOptionalClosing(); // only within DL.
createInline("DD").setAncestor("DL").setOptionalClosing(); // only within DL.
createInline("DT").setParent("DL").setOptionalClosing(); // only within DL.
createInline("DD").setParent("DL").setOptionalClosing(); // only within DL.

createBlock("LI").setAncestor("UL", "OL").setOptionalClosing(); // only within OL or UL.

// tables
createBlock("TABLE"); // specific list of only includes (tr, td, thead etc) not implemented
createBlock("CAPTION").setAncestor("TABLE");
createBlock("THEAD").setAncestor("TABLE").setOptionalClosing(); // just TR
createBlock("TFOOT").setAncestor("TABLE").setOptionalClosing(); // just TR
createBlock("TBODY").setAncestor("TABLE").setOptionalClosing(); // optional / implicit open too. just TR
createBlock("COLGROUP").setAncestor("TABLE").setOptionalClosing(); // just COL
createBlock("COL").setAncestor("COLGROUP").setEmpty();
createBlock("TR").setAncestor("TABLE").setOptionalClosing(); // just TH, TD
createBlock("TH").setAncestor("TR").setOptionalClosing();
createBlock("TD").setAncestor("TR").setOptionalClosing();
createBlock("CAPTION").setParent("TABLE");
createBlock("THEAD").setParent("TABLE").setOptionalClosing(); // just TR
createBlock("TFOOT").setParent("TABLE").setOptionalClosing(); // just TR
createBlock("TBODY").setParent("TABLE").setOptionalClosing(); // optional / implicit open too. just TR
createBlock("COLGROUP").setParent("TABLE").setOptionalClosing(); // just COL
createBlock("COL").setParent("COLGROUP").setEmpty();
createBlock("TR").setParent("TABLE").setOptionalClosing(); // just TH, TD
createBlock("TH").setParent("TR").setOptionalClosing();
createBlock("TD").setParent("TR").setOptionalClosing();
}

private static Tag createBlock(String tagName) {
Expand Down Expand Up @@ -371,4 +384,10 @@ private Tag setAncestor(String... tagNames) {
}
return this;
}

private Tag setParent(String tagName) {
parent = Tag.valueOf(tagName);
setAncestor(tagName);
return this;
}
}
16 changes: 16 additions & 0 deletions src/test/java/org/jsoup/nodes/ElementTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,22 @@ public class ElementTest {
assertEquals("<html><head></head><body><div id=\"1\"><p>Hello</p><p>there</p><p class=\"second\">now</p></div></body></html>",
TextUtil.stripNewlines(doc.html()));
}

@Test public void testAppendRowToTable() {
Document doc = Jsoup.parse("<table><tr><td>1</td></tr></table>");
Element table = doc.select("table").first();
table.append("<tr><td>2</td></tr>");

assertEquals("<table><tr><td>1</td></tr><tr><td>2</td></tr></table>", TextUtil.stripNewlines(doc.body().html()));
}

@Test public void testPrependRowToTable() {
Document doc = Jsoup.parse("<table><tr><td>1</td></tr></table>");
Element table = doc.select("table").first();
table.prepend("<tr><td>2</td></tr>");

assertEquals("<table><tr><td>2</td></tr><tr><td>1</td></tr></table>", TextUtil.stripNewlines(doc.body().html()));
}

@Test public void testPrependElement() {
Document doc = Jsoup.parse("<div id=1><p>Hello</p></div>");
Expand Down
5 changes: 5 additions & 0 deletions src/test/java/org/jsoup/parser/ParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,11 @@ public class ParserTest {
assertEquals("<table><tr><td>Hello</td><td><p>There</p><p>now</p></td></tr></table>", TextUtil.stripNewlines(doc.body().html()));
}

@Test public void handlesNestedImplicitTable() {
Document doc = Jsoup.parse("<table><td>1</td></tr> <td>2</td></tr> <td> <table><td>3</td> <td>4</td></table> <tr><td>5</table>");
assertEquals("<table><tr><td>1</td></tr> <tr><td>2</td></tr> <tr><td> <table><tr><td>3</td> <td>4</td></tr></table> </td></tr><tr><td>5</td></tr></table>", TextUtil.stripNewlines(doc.body().html()));
}

@Test public void handlesBaseTags() {
String h = "<a href=1>#</a><base href='/2/'><a href='3'>#</a><base href='http://bar'><a href=4>#</a>";
Document doc = Jsoup.parse(h, "http://foo/");
Expand Down

0 comments on commit 0081d16

Please sign in to comment.