Skip to content

Commit

Permalink
feature #30892 [DomCrawler] Improve Crawler HTML5 parser need detecti…
Browse files Browse the repository at this point in the history
…on (tgalopin)

This PR was merged into the 4.3-dev branch.

Discussion
----------

[DomCrawler] Improve Crawler HTML5 parser need detection

| Q             | A
| ------------- | ---
| Branch?       | master
| Bug fix?      | kind of
| New feature?  | no
| BC breaks?    | no
| Deprecations? | no
| Tests pass?   | yes
| Fixed tickets | -
| License       | MIT
| Doc PR        | -

Live from #eu-fossa

Follow up of #29306

This PR introduces a better detection mechanism for choosing whether or not to parse with the HTML5 parser, and fixes a subcrawler parsing issue as well.

@stof I'd be super interested in your review :) !

Commits
-------

9bbdab6 [DomCrawler] Improve Crawler HTML5 parser need detection
  • Loading branch information
fabpot committed Apr 6, 2019
2 parents 09dee17 + 9bbdab6 commit f82f1c0
Show file tree
Hide file tree
Showing 4 changed files with 59 additions and 63 deletions.
25 changes: 12 additions & 13 deletions src/Symfony/Component/DomCrawler/Crawler.php
Expand Up @@ -61,24 +61,15 @@ class Crawler implements \Countable, \IteratorAggregate
private $html5Parser;

/**
* @param mixed $node A Node to use as the base for the crawling
* @param string $uri The current URI
* @param string $baseHref The base href value
* @param bool|null $useHtml5Parser Whether the Crawler should use the HTML5 parser or the native DOM parser
* @param mixed $node A Node to use as the base for the crawling
* @param string $uri The current URI
* @param string $baseHref The base href value
*/
public function __construct($node = null, string $uri = null, string $baseHref = null, bool $useHtml5Parser = null)
public function __construct($node = null, string $uri = null, string $baseHref = null)
{
$this->uri = $uri;
$this->baseHref = $baseHref ?: $uri;

if ($useHtml5Parser && !class_exists(HTML5::class)) {
throw new \LogicException('Using the DomCrawler HTML5 parser requires the html5-php library. Try running "composer require masterminds/html5".');
}

if ($useHtml5Parser ?? class_exists(HTML5::class)) {
$this->html5Parser = new HTML5(['disable_html_ns' => true]);
}

$this->add($node);
}

Expand Down Expand Up @@ -198,6 +189,13 @@ public function addContent($content, $type = null)
*/
public function addHtmlContent($content, $charset = 'UTF-8')
{
// Use HTML5 parser if the content is HTML5 and the library is available
if (!$this->html5Parser
&& class_exists(HTML5::class)
&& '<!doctype html>' === strtolower(substr(ltrim($content), 0, 15))) {
$this->html5Parser = new HTML5(['disable_html_ns' => true]);
}

$dom = null !== $this->html5Parser ? $this->parseHtml5($content, $charset) : $this->parseXhtml($content, $charset);
$this->addDocument($dom);

Expand Down Expand Up @@ -1219,6 +1217,7 @@ private function createSubCrawler($nodes)
$crawler->isHtml = $this->isHtml;
$crawler->document = $this->document;
$crawler->namespaces = $this->namespaces;
$crawler->html5Parser = $this->html5Parser;

return $crawler;
}
Expand Down
73 changes: 33 additions & 40 deletions src/Symfony/Component/DomCrawler/Tests/AbstractCrawlerTest.php
Expand Up @@ -16,14 +16,12 @@

abstract class AbstractCrawlerTest extends TestCase
{
/**
* @param mixed $node
* @param string|null $uri
* @param string|null $baseHref
*
* @return Crawler
*/
abstract public function createCrawler($node = null, string $uri = null, string $baseHref = null);
abstract public function getDoctype(): string;

protected function createCrawler($node = null, string $uri = null, string $baseHref = null)
{
return new Crawler($node, $uri, $baseHref);
}

public function testConstructor()
{
Expand Down Expand Up @@ -74,7 +72,7 @@ public function testAdd()
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->add() adds nodes from a \DOMNode');

$crawler = $this->createCrawler();
$crawler->add('<html><body>Foo</body></html>');
$crawler->add($this->getDoctype().'<html><body>Foo</body></html>');
$this->assertEquals('Foo', $crawler->filterXPath('//body')->text(), '->add() adds nodes from a string');
}

Expand All @@ -94,22 +92,21 @@ public function testAddInvalidType()
public function testAddMultipleDocumentNode()
{
$crawler = $this->createTestCrawler();
$crawler->addHtmlContent('<html><div class="foo"></html>', 'UTF-8');
$crawler->addHtmlContent($this->getDoctype().'<html><div class="foo"></html>', 'UTF-8');
}

public function testAddHtmlContent()
{
$crawler = $this->createCrawler();
$crawler->addHtmlContent('<html><div class="foo"></html>', 'UTF-8');
$crawler->addHtmlContent($this->getDoctype().'<html><div class="foo"></html>', 'UTF-8');

$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addHtmlContent() adds nodes from an HTML string');
}

public function testAddHtmlContentWithBaseTag()
{
$crawler = $this->createCrawler();

$crawler->addHtmlContent('<html><head><base href="http://symfony.com"></head><a href="/contact"></a></html>', 'UTF-8');
$crawler->addHtmlContent($this->getDoctype().'<html><head><base href="http://symfony.com"></head><a href="/contact"></a></html>', 'UTF-8');

$this->assertEquals('http://symfony.com', $crawler->filterXPath('//base')->attr('href'), '->addHtmlContent() adds nodes from an HTML string');
$this->assertEquals('http://symfony.com/contact', $crawler->filterXPath('//a')->link()->getUri(), '->addHtmlContent() adds nodes from an HTML string');
Expand All @@ -121,15 +118,15 @@ public function testAddHtmlContentWithBaseTag()
public function testAddHtmlContentCharset()
{
$crawler = $this->createCrawler();
$crawler->addHtmlContent('<html><div class="foo">Tiếng Việt</html>', 'UTF-8');
$crawler->addHtmlContent($this->getDoctype().'<html><div class="foo">Tiếng Việt</html>', 'UTF-8');

$this->assertEquals('Tiếng Việt', $crawler->filterXPath('//div')->text());
}

public function testAddHtmlContentInvalidBaseTag()
{
$crawler = $this->createCrawler(null, 'http://symfony.com');
$crawler->addHtmlContent('<html><head><base target="_top"></head><a href="/contact"></a></html>', 'UTF-8');
$crawler->addHtmlContent($this->getDoctype().'<html><head><base target="_top"></head><a href="/contact"></a></html>', 'UTF-8');

$this->assertEquals('http://symfony.com/contact', current($crawler->filterXPath('//a')->links())->getUri(), '->addHtmlContent() correctly handles a non-existent base tag href attribute');
}
Expand All @@ -141,55 +138,55 @@ public function testAddHtmlContentCharsetGbk()
{
$crawler = $this->createCrawler();
//gbk encode of <html><p>中文</p></html>
$crawler->addHtmlContent(base64_decode('PGh0bWw+PHA+1tDOxDwvcD48L2h0bWw+'), 'gbk');
$crawler->addHtmlContent($this->getDoctype().base64_decode('PGh0bWw+PHA+1tDOxDwvcD48L2h0bWw+'), 'gbk');

$this->assertEquals('中文', $crawler->filterXPath('//p')->text());
}

public function testAddXmlContent()
{
$crawler = $this->createCrawler();
$crawler->addXmlContent('<html><div class="foo"></div></html>', 'UTF-8');
$crawler->addXmlContent($this->getDoctype().'<html><div class="foo"></div></html>', 'UTF-8');

$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addXmlContent() adds nodes from an XML string');
}

public function testAddXmlContentCharset()
{
$crawler = $this->createCrawler();
$crawler->addXmlContent('<html><div class="foo">Tiếng Việt</div></html>', 'UTF-8');
$crawler->addXmlContent($this->getDoctype().'<html><div class="foo">Tiếng Việt</div></html>', 'UTF-8');

$this->assertEquals('Tiếng Việt', $crawler->filterXPath('//div')->text());
}

public function testAddContent()
{
$crawler = $this->createCrawler();
$crawler->addContent('<html><div class="foo"></html>', 'text/html; charset=UTF-8');
$crawler->addContent($this->getDoctype().'<html><div class="foo"></html>', 'text/html; charset=UTF-8');
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addContent() adds nodes from an HTML string');

$crawler = $this->createCrawler();
$crawler->addContent('<html><div class="foo"></html>', 'text/html; charset=UTF-8; dir=RTL');
$crawler->addContent($this->getDoctype().'<html><div class="foo"></html>', 'text/html; charset=UTF-8; dir=RTL');
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addContent() adds nodes from an HTML string with extended content type');

$crawler = $this->createCrawler();
$crawler->addContent('<html><div class="foo"></html>');
$crawler->addContent($this->getDoctype().'<html><div class="foo"></html>');
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addContent() uses text/html as the default type');

$crawler = $this->createCrawler();
$crawler->addContent('<html><div class="foo"></div></html>', 'text/xml; charset=UTF-8');
$crawler->addContent($this->getDoctype().'<html><div class="foo"></div></html>', 'text/xml; charset=UTF-8');
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addContent() adds nodes from an XML string');

$crawler = $this->createCrawler();
$crawler->addContent('<html><div class="foo"></div></html>', 'text/xml');
$crawler->addContent($this->getDoctype().'<html><div class="foo"></div></html>', 'text/xml');
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addContent() adds nodes from an XML string');

$crawler = $this->createCrawler();
$crawler->addContent('foo bar', 'text/plain');
$this->assertCount(0, $crawler, '->addContent() does nothing if the type is not (x|ht)ml');

$crawler = $this->createCrawler();
$crawler->addContent('<html><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /><span>中文</span></html>');
$crawler->addContent($this->getDoctype().'<html><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /><span>中文</span></html>');
$this->assertEquals('中文', $crawler->filterXPath('//span')->text(), '->addContent() guess wrong charset');
}

Expand All @@ -199,7 +196,7 @@ public function testAddContent()
public function testAddContentNonUtf8()
{
$crawler = $this->createCrawler();
$crawler->addContent(iconv('UTF-8', 'SJIS', '<html><head><meta charset="Shift_JIS"></head><body>日本語</body></html>'));
$crawler->addContent(iconv('UTF-8', 'SJIS', $this->getDoctype().'<html><head><meta charset="Shift_JIS"></head><body>日本語</body></html>'));
$this->assertEquals('日本語', $crawler->filterXPath('//body')->text(), '->addContent() can recognize "Shift_JIS" in html5 meta charset tag');
}

Expand Down Expand Up @@ -314,7 +311,7 @@ public function testAttr()
public function testMissingAttrValueIsNull()
{
$crawler = $this->createCrawler();
$crawler->addContent('<html><div non-empty-attr="sample value" empty-attr=""></div></html>', 'text/html; charset=UTF-8');
$crawler->addContent($this->getDoctype().'<html><div non-empty-attr="sample value" empty-attr=""></div></html>', 'text/html; charset=UTF-8');
$div = $crawler->filterXPath('//div');

$this->assertEquals('sample value', $div->attr('non-empty-attr'), '->attr() reads non-empty attributes correctly');
Expand Down Expand Up @@ -670,7 +667,6 @@ public function testSelectButton()
public function testSelectButtonWithSingleQuotesInNameAttribute()
{
$html = <<<'HTML'
<!DOCTYPE html>
<html lang="en">
<body>
<div id="action">
Expand All @@ -683,15 +679,14 @@ public function testSelectButtonWithSingleQuotesInNameAttribute()
</html>
HTML;

$crawler = $this->createCrawler($html);
$crawler = $this->createCrawler($this->getDoctype().$html);

$this->assertCount(1, $crawler->selectButton('Click \'Here\''));
}

public function testSelectButtonWithDoubleQuotesInNameAttribute()
{
$html = <<<'HTML'
<!DOCTYPE html>
<html lang="en">
<body>
<div id="action">
Expand All @@ -704,7 +699,7 @@ public function testSelectButtonWithDoubleQuotesInNameAttribute()
</html>
HTML;

$crawler = $this->createCrawler($html);
$crawler = $this->createCrawler($this->getDoctype().$html);

$this->assertCount(1, $crawler->selectButton('Click "Here"'));
}
Expand Down Expand Up @@ -763,7 +758,6 @@ public function testImage()
public function testSelectLinkAndLinkFiltered()
{
$html = <<<'HTML'
<!DOCTYPE html>
<html lang="en">
<body>
<div id="action">
Expand All @@ -776,7 +770,7 @@ public function testSelectLinkAndLinkFiltered()
</html>
HTML;

$crawler = $this->createCrawler($html);
$crawler = $this->createCrawler($this->getDoctype().$html);
$filtered = $crawler->filterXPath("descendant-or-self::*[@id = 'login-form']");

$this->assertCount(0, $filtered->selectLink('Login'));
Expand All @@ -793,7 +787,7 @@ public function testSelectLinkAndLinkFiltered()

public function testChaining()
{
$crawler = $this->createCrawler('<div name="a"><div name="b"><div name="c"></div></div></div>');
$crawler = $this->createCrawler($this->getDoctype().'<div name="a"><div name="b"><div name="c"></div></div></div>');

$this->assertEquals('a', $crawler->filterXPath('//div')->filterXPath('div')->filterXPath('div')->attr('name'));
}
Expand Down Expand Up @@ -965,7 +959,6 @@ public function testChildren()
public function testFilteredChildren()
{
$html = <<<'HTML'
<!DOCTYPE html>
<html lang="en">
<body>
<div id="foo">
Expand All @@ -981,7 +974,7 @@ public function testFilteredChildren()
</html>
HTML;

$crawler = $this->createCrawler($html);
$crawler = $this->createCrawler($this->getDoctype().$html);
$foo = $crawler->filter('#foo');

$this->assertEquals(3, $foo->children()->count());
Expand Down Expand Up @@ -1018,7 +1011,7 @@ public function testParents()
*/
public function testBaseTag($baseValue, $linkValue, $expectedUri, $currentUri = null, $description = '')
{
$crawler = $this->createCrawler('<html><base href="'.$baseValue.'"><a href="'.$linkValue.'"></a></html>', $currentUri);
$crawler = $this->createCrawler($this->getDoctype().'<html><base href="'.$baseValue.'"><a href="'.$linkValue.'"></a></html>', $currentUri);
$this->assertEquals($expectedUri, $crawler->filterXPath('//a')->link()->getUri(), $description);
}

Expand All @@ -1038,7 +1031,7 @@ public function getBaseTagData()
*/
public function testBaseTagWithForm($baseValue, $actionValue, $expectedUri, $currentUri = null, $description = null)
{
$crawler = $this->createCrawler('<html><base href="'.$baseValue.'"><form method="post" action="'.$actionValue.'"><button type="submit" name="submit"/></form></html>', $currentUri);
$crawler = $this->createCrawler($this->getDoctype().'<html><base href="'.$baseValue.'"><form method="post" action="'.$actionValue.'"><button type="submit" name="submit"/></form></html>', $currentUri);
$this->assertEquals($expectedUri, $crawler->filterXPath('//button')->form()->getUri(), $description);
}

Expand Down Expand Up @@ -1113,7 +1106,7 @@ public function testEvaluateThrowsAnExceptionIfDocumentIsEmpty()
public function testInheritedClassCallChildrenWithoutArgument()
{
$dom = new \DOMDocument();
$dom->loadHTML('
$dom->loadHTML($this->getDoctype().'
<html>
<body>
<a href="foo">Foo</a>
Expand Down Expand Up @@ -1165,15 +1158,15 @@ public function testInheritedClassCallChildrenWithoutArgument()
public function testAddHtmlContentUnsupportedCharset()
{
$crawler = $this->createCrawler();
$crawler->addHtmlContent(file_get_contents(__DIR__.'/Fixtures/windows-1250.html'), 'Windows-1250');
$crawler->addHtmlContent($this->getDoctype().file_get_contents(__DIR__.'/Fixtures/windows-1250.html'), 'Windows-1250');

$this->assertEquals('Žťčýů', $crawler->filterXPath('//p')->text());
}

public function createTestCrawler($uri = null)
{
$dom = new \DOMDocument();
$dom->loadHTML('
$dom->loadHTML($this->getDoctype().'
<html>
<body>
<a href="foo">Foo</a>
Expand Down
14 changes: 10 additions & 4 deletions src/Symfony/Component/DomCrawler/Tests/Html5ParserCrawlerTest.php
Expand Up @@ -11,12 +11,18 @@

namespace Symfony\Component\DomCrawler\Tests;

use Symfony\Component\DomCrawler\Crawler;

class Html5ParserCrawlerTest extends AbstractCrawlerTest
{
public function createCrawler($node = null, string $uri = null, string $baseHref = null)
public function getDoctype(): string
{
return '<!DOCTYPE html>';
}

public function testAddHtml5()
{
return new Crawler($node, $uri, $baseHref, true);
// Ensure a bug specific to the DOM extension is fixed (see https://github.com/symfony/symfony/issues/28596)
$crawler = $this->createCrawler();
$crawler->add($this->getDoctype().'<html><body><h1><p>Foo</p></h1></body></html>');
$this->assertEquals('Foo', $crawler->filterXPath('//h1')->text(), '->add() adds nodes from a string');
}
}
Expand Up @@ -11,13 +11,11 @@

namespace Symfony\Component\DomCrawler\Tests;

use Symfony\Component\DomCrawler\Crawler;

class NativeParserCrawlerTest extends AbstractCrawlerTest
{
public function createCrawler($node = null, string $uri = null, string $baseHref = null)
public function getDoctype(): string
{
return new Crawler($node, $uri, $baseHref, false);
return '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">';
}

public function testAddHtmlContentWithErrors()
Expand All @@ -26,7 +24,7 @@ public function testAddHtmlContentWithErrors()

$crawler = $this->createCrawler();
$crawler->addHtmlContent(<<<'EOF'
<!DOCTYPE html>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html>
<head>
</head>
Expand All @@ -51,7 +49,7 @@ public function testAddXmlContentWithErrors()

$crawler = $this->createCrawler();
$crawler->addXmlContent(<<<'EOF'
<!DOCTYPE html>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html>
<head>
</head>
Expand Down

0 comments on commit f82f1c0

Please sign in to comment.