Skip to content

Commit

Permalink
Improve consistency between PHP and Libxml versions, add support for …
Browse files Browse the repository at this point in the history
…UTF8 characters
  • Loading branch information
loranmutafov committed Mar 13, 2017
1 parent 9ab2b79 commit 35a7b00
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 8 deletions.
3 changes: 1 addition & 2 deletions composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,7 @@
}
],
"require": {
"php": ">=5.5.0,<8.0",
"lib-libxml": ">=2.7.8"
"php": ">=5.5.0,<8.0"
},
"require-dev": {
"phpunit/phpunit": "~4.8"
Expand Down
33 changes: 29 additions & 4 deletions src/Amara/Varcon/HtmlCrawler.php
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,15 @@ public function setXpathExpressions(array $xpathExpressions)
*/
public function crawlAndModify($content, callable $callable)
{
$dom = new DOMDocument();
$dom->loadHTML($content, LIBXML_HTML_NOIMPLIED|LIBXML_HTML_NODEFDTD);
$document = new DOMDocument();
$document->loadHTML(mb_convert_encoding(
sprintf('<div>%s</div>', $content),
'HTML-ENTITIES',
'UTF-8'
));
$this->stripDoctypeHtmlBodyAndHeadElements($document);

$xpath = new DOMXPath($dom);
$xpath = new DOMXPath($document);

$textNodes = $xpath->query(implode('|', $this->xpathExpressions));

Expand All @@ -51,6 +56,26 @@ public function crawlAndModify($content, callable $callable)
$textNode->nodeValue = $callable($textNode->nodeValue);
}

return $dom->saveHTML($dom->documentElement);
return $document->saveHTML($document->documentElement);
}

/**
 * Reduces the parsed document to just the children of its first <div> element.
 *
 * The first <div> in the document is detached, every remaining top-level node of the document is
 * removed, and the children of the detached <div> are then moved to the top level of the document.
 * This is a workaround meant to give the same result as loading with the LIBXML_HTML_NOIMPLIED and
 * LIBXML_HTML_NODEFDTD options, without depending on the PHP/Libxml setup supporting those flags.
 *
 * @param DOMDocument $document Document to rewrite in place; it is expected to contain at least
 *                              one <div> element
 */
private function stripDoctypeHtmlBodyAndHeadElements(DOMDocument $document)
{
    // Detach the first <div> so that it survives the clean-up of the document below.
    $wrapper = $document->getElementsByTagName('div')->item(0);
    $wrapper->parentNode->removeChild($wrapper);

    // Drop everything that is still attached directly to the document.
    for ($node = $document->firstChild; $node !== null; $node = $document->firstChild) {
        $document->removeChild($node);
    }

    // Appending a child moves it out of the wrapper, so the wrapper's first child advances each pass.
    for ($child = $wrapper->firstChild; $child !== null; $child = $wrapper->firstChild) {
        $document->appendChild($child);
    }
}
}
4 changes: 2 additions & 2 deletions tests/Amara/Varcon/HtmlTranslatorTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@ public function testTranslatePreservesWhitespace()
$htmlTranslator = new HtmlTranslator();

$this->assertSame(
'<p>Colour <strong>pyjama</strong> test</p>',
'<p>Colour <strong>pyjama</strong> паралелепипед</p>', // Tests UTF8 characters as well
$htmlTranslator->translate(
'<p>Color <strong>pajama</strong> test</p>',
'<p>Color <strong>pajama</strong> паралелепипед</p>',
'A',
'B'
)
Expand Down

0 comments on commit 35a7b00

Please sign in to comment.