Skip to content
This repository has been archived by the owner on May 24, 2022. It is now read-only.

Commit

Permalink
Improved implicit close tag heuristic detection when parsing malforme…
Browse files Browse the repository at this point in the history
…d HTML.

Fixes an issue where appending / prepending rows to a table (or  to similar implicit
element structures) would create a redundant wrapping elements.

Fixes jhy#21
  • Loading branch information
jhy committed Jun 13, 2010
1 parent e5716c0 commit 3e8cf58
Show file tree
Hide file tree
Showing 6 changed files with 87 additions and 25 deletions.
6 changes: 6 additions & 0 deletions CHANGES
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,12 @@ jsoup changelog
* Added Element#parents and Elements#parents to retrieve an element's ancestor chain
<http://github.com/jhy/jsoup/issues/issue/20>

* Fixes an issue where appending / prepending rows to a table (or to similar implicit
element structures) would create a redundant wrapping elements
<http://github.com/jhy/jsoup/issues/issue/21>

* Improved implicit close tag heuristic detection when parsing malformed HTML

* Fixes an issue where text content after a script (or other data-node) was
incorrectly added to the data node.
<http://github.com/jhy/jsoup/issues/issue/22>
Expand Down
8 changes: 3 additions & 5 deletions src/main/java/org/jsoup/nodes/Element.java
Original file line number Diff line number Diff line change
Expand Up @@ -267,8 +267,7 @@ public Element prependText(String text) {
public Element append(String html) {
Validate.notNull(html);

Element fragment = Parser.parseBodyFragment(html, baseUri).body();
// TODO: must parse without implicit elements, so you can e.g. add <td> to a <tr> (without creating a whole new table)
Element fragment = Parser.parseBodyFragmentRelaxed(html, baseUri()).body();
for (Node node : fragment.childNodes()) {
node.parentNode = null;
appendChild(node);
Expand All @@ -285,8 +284,7 @@ public Element append(String html) {
public Element prepend(String html) {
Validate.notNull(html);

Element fragment = Parser.parseBodyFragment(html, baseUri).body();
// TODO: must parse without implicit elements, so you can e.g. add <td> to a <tr> (without creating a whole new table)
Element fragment = Parser.parseBodyFragmentRelaxed(html, baseUri()).body();
List<Node> nodes = fragment.childNodes();
for (int i = nodes.size() - 1; i >= 0; i--) {
Node node = nodes.get(i);
Expand All @@ -313,7 +311,7 @@ public Element empty() {
public Element wrap(String html) {
Validate.notEmpty(html);

Element wrapBody = Parser.parseBodyFragment(html, baseUri).body();
Element wrapBody = Parser.parseBodyFragmentRelaxed(html, baseUri).body();
Elements wrapChildren = wrapBody.children();
Element wrap = wrapChildren.first();
if (wrap == null) // nothing to wrap with; noop
Expand Down
24 changes: 21 additions & 3 deletions src/main/java/org/jsoup/parser/Parser.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ public class Parser {
private final TokenQueue tq;
private final Document doc;
private String baseUri;
private boolean relaxed = false;

private Parser(String html, String baseUri, boolean isBodyFragment) {
Validate.notNull(html);
Expand Down Expand Up @@ -63,6 +64,19 @@ public static Document parseBodyFragment(String bodyHtml, String baseUri) {
return parser.parse();
}

/**
Parse a fragment of HTML into the {@code body} of a Document, with relaxed parsing enabled. Relaxed, in this
context, means that implicit tags are not automatically created when missing.
@param bodyHtml fragment of HTML
@param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
@return Document, with empty head, and HTML parsed into body
*/
public static Document parseBodyFragmentRelaxed(String bodyHtml, String baseUri) {
Parser parser = new Parser(bodyHtml, baseUri, true);
parser.relaxed = true;
return parser.parse();
}

private Document parse() {
while (!tq.isEmpty()) {
if (tq.matches("<!--")) {
Expand Down Expand Up @@ -213,7 +227,7 @@ private Element addChildToParent(Element child, boolean isEmptyElement) {
Tag childTag = child.tag();
boolean validAncestor = stackHasValidParent(childTag);

if (!validAncestor) {
if (!validAncestor && !relaxed) {
// create implicit parent around this child
Tag parentTag = childTag.getImplicitParent();
Element implicit = new Element(parentTag, baseUri);
Expand Down Expand Up @@ -241,11 +255,15 @@ private Element addChildToParent(Element child, boolean isEmptyElement) {
private boolean stackHasValidParent(Tag childTag) {
if (stack.size() == 1 && childTag.equals(htmlTag))
return true; // root is valid for html node


if (childTag.requiresSpecificParent())
return stack.getLast().tag().isValidParent(childTag);

// otherwise, look up the stack for valid ancestors
for (int i = stack.size() -1; i >= 0; i--) {
Element el = stack.get(i);
Tag parent2 = el.tag();
if (parent2.isValidParent(childTag)) {
if (parent2.isValidAncestor(childTag)) {
return true;
}
}
Expand Down
53 changes: 36 additions & 17 deletions src/main/java/org/jsoup/parser/Tag.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ public class Tag {
private boolean optionalClosing = false; // If tag is open, and another seen, close previous tag
private boolean empty = false; // can hold nothing; e.g. img
private boolean preserveWhitespace = false; // for pre, textarea, script etc
private List<Tag> ancestors;
private List<Tag> ancestors; // elements must be a descendant of one of these ancestors
private Tag parent; // if not null, elements must be a direct child of parent

private Tag(String tagName) {
this.tagName = tagName.toLowerCase();
Expand Down Expand Up @@ -95,6 +96,10 @@ boolean canContain(Tag child) {
if (this.tagName.equals("dd") && child.tagName.equals("dt"))
return false;

// don't allow children to contain their parent (directly)
if (this.requiresSpecificParent() && this.getImplicitParent().equals(child))
return false;

return true;
}

Expand Down Expand Up @@ -150,7 +155,15 @@ Tag getImplicitParent() {
return (!ancestors.isEmpty()) ? ancestors.get(0) : null;
}

boolean requiresSpecificParent() {
return this.parent != null;
}

boolean isValidParent(Tag child) {
return this.equals(child.parent);
}

boolean isValidAncestor(Tag child) {
if (child.ancestors.isEmpty())
return true; // HTML tag

Expand Down Expand Up @@ -217,8 +230,8 @@ public String toString() {
createBlock("TITLE").setAncestor("HEAD", "BODY").setContainDataOnly();
createInline("BASE").setAncestor("HEAD", "BODY").setEmpty();

createBlock("FRAME").setAncestor("FRAMESET").setEmpty();
createBlock("NOFRAMES").setAncestor("FRAMESET").setContainDataOnly();
createBlock("FRAME").setParent("FRAMESET").setEmpty();
createBlock("NOFRAMES").setParent("FRAMESET").setContainDataOnly();



Expand Down Expand Up @@ -281,34 +294,34 @@ public String toString() {
createInline("TEXTAREA").setAncestor("FORM").setContainDataOnly();
createInline("LABEL").setAncestor("FORM").setOptionalClosing(); // not self
createInline("BUTTON").setAncestor("FORM"); // bunch of excludes not defined
createInline("OPTGROUP").setAncestor("SELECT"); // only contain option
createInline("OPTION").setAncestor("SELECT").setContainDataOnly();
createInline("OPTGROUP").setParent("SELECT"); // only contain option
createInline("OPTION").setParent("SELECT").setContainDataOnly();
createBlock("FIELDSET").setAncestor("FORM");
createInline("LEGEND").setAncestor("FIELDSET");

// other
createInline("AREA").setEmpty(); // not an inline per-se
createInline("PARAM").setAncestor("OBJECT").setEmpty();
createInline("PARAM").setParent("OBJECT").setEmpty();
createBlock("INS"); // only within body
createBlock("DEL"); // only within body

createBlock("DL");
createInline("DT").setAncestor("DL").setOptionalClosing(); // only within DL.
createInline("DD").setAncestor("DL").setOptionalClosing(); // only within DL.
createInline("DT").setParent("DL").setOptionalClosing(); // only within DL.
createInline("DD").setParent("DL").setOptionalClosing(); // only within DL.

createBlock("LI").setAncestor("UL", "OL").setOptionalClosing(); // only within OL or UL.

// tables
createBlock("TABLE"); // specific list of only includes (tr, td, thead etc) not implemented
createBlock("CAPTION").setAncestor("TABLE");
createBlock("THEAD").setAncestor("TABLE").setOptionalClosing(); // just TR
createBlock("TFOOT").setAncestor("TABLE").setOptionalClosing(); // just TR
createBlock("TBODY").setAncestor("TABLE").setOptionalClosing(); // optional / implicit open too. just TR
createBlock("COLGROUP").setAncestor("TABLE").setOptionalClosing(); // just COL
createBlock("COL").setAncestor("COLGROUP").setEmpty();
createBlock("TR").setAncestor("TABLE").setOptionalClosing(); // just TH, TD
createBlock("TH").setAncestor("TR").setOptionalClosing();
createBlock("TD").setAncestor("TR").setOptionalClosing();
createBlock("CAPTION").setParent("TABLE");
createBlock("THEAD").setParent("TABLE").setOptionalClosing(); // just TR
createBlock("TFOOT").setParent("TABLE").setOptionalClosing(); // just TR
createBlock("TBODY").setParent("TABLE").setOptionalClosing(); // optional / implicit open too. just TR
createBlock("COLGROUP").setParent("TABLE").setOptionalClosing(); // just COL
createBlock("COL").setParent("COLGROUP").setEmpty();
createBlock("TR").setParent("TABLE").setOptionalClosing(); // just TH, TD
createBlock("TH").setParent("TR").setOptionalClosing();
createBlock("TD").setParent("TR").setOptionalClosing();
}

private static Tag createBlock(String tagName) {
Expand Down Expand Up @@ -371,4 +384,10 @@ private Tag setAncestor(String... tagNames) {
}
return this;
}

private Tag setParent(String tagName) {
parent = Tag.valueOf(tagName);
setAncestor(tagName);
return this;
}
}
16 changes: 16 additions & 0 deletions src/test/java/org/jsoup/nodes/ElementTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,22 @@ public class ElementTest {
assertEquals("<html><head></head><body><div id=\"1\"><p>Hello</p><p>there</p><p class=\"second\">now</p></div></body></html>",
TextUtil.stripNewlines(doc.html()));
}

@Test public void testAppendRowToTable() {
Document doc = Jsoup.parse("<table><tr><td>1</td></tr></table>");
Element table = doc.select("table").first();
table.append("<tr><td>2</td></tr>");

assertEquals("<table><tr><td>1</td></tr><tr><td>2</td></tr></table>", TextUtil.stripNewlines(doc.body().html()));
}

@Test public void testPrependRowToTable() {
Document doc = Jsoup.parse("<table><tr><td>1</td></tr></table>");
Element table = doc.select("table").first();
table.prepend("<tr><td>2</td></tr>");

assertEquals("<table><tr><td>2</td></tr><tr><td>1</td></tr></table>", TextUtil.stripNewlines(doc.body().html()));
}

@Test public void testPrependElement() {
Document doc = Jsoup.parse("<div id=1><p>Hello</p></div>");
Expand Down
5 changes: 5 additions & 0 deletions src/test/java/org/jsoup/parser/ParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,11 @@ public class ParserTest {
assertEquals("<table><tr><td>Hello</td><td><p>There</p><p>now</p></td></tr></table>", TextUtil.stripNewlines(doc.body().html()));
}

@Test public void handlesNestedImplicitTable() {
Document doc = Jsoup.parse("<table><td>1</td></tr> <td>2</td></tr> <td> <table><td>3</td> <td>4</td></table> <tr><td>5</table>");
assertEquals("<table><tr><td>1</td></tr> <tr><td>2</td></tr> <tr><td> <table><tr><td>3</td> <td>4</td></tr></table> </td></tr><tr><td>5</td></tr></table>", TextUtil.stripNewlines(doc.body().html()));
}

@Test public void handlesBaseTags() {
String h = "<a href=1>#</a><base href='/2/'><a href='3'>#</a><base href='http://bar'><a href=4>#</a>";
Document doc = Jsoup.parse(h, "http://foo/");
Expand Down

0 comments on commit 3e8cf58

Please sign in to comment.