diff --git a/src/Amara/Varcon/HtmlCrawler.php b/src/Amara/Varcon/HtmlCrawler.php index ebf7114..1ac8d1e 100644 --- a/src/Amara/Varcon/HtmlCrawler.php +++ b/src/Amara/Varcon/HtmlCrawler.php @@ -3,7 +3,7 @@ namespace Amara\Varcon; use DOMDocument; -use DOMText; +use DOMNode; use DOMXPath; /** @@ -40,8 +40,8 @@ public function setXpathExpressions(array $xpathExpressions) public function crawlAndModify($content, callable $callable) { $document = new DOMDocument(); - $document->loadHTML(mb_convert_encoding( - sprintf('
%s
', $content), + @$document->loadHTML(mb_convert_encoding( + sprintf('
%s
', $content), /** @see stripDoctypeHtmlBodyAndHeadElements */ 'HTML-ENTITIES', 'UTF-8' )); @@ -51,7 +51,7 @@ public function crawlAndModify($content, callable $callable) $textNodes = $xpath->query(implode('|', $this->xpathExpressions)); - /** @var DOMText $textNode */ + /** @var DOMNode $textNode */ foreach ($textNodes as $textNode) { $textNode->nodeValue = $callable($textNode->nodeValue); } @@ -63,17 +63,23 @@ public function crawlAndModify($content, callable $callable) * This method is a short hack to avoid incompatibilities between different PHP and Libxml setups. It has the same * effect as passing the LIBXML_HTML_NOIMPLIED and LIBXML_HTML_NODEFDTD flags to loadHtml's options * + * It works by, first of all, wrapping all of the contents in a div, and then extracting only them back to the + * DOM document. This way, we can get rid of the Doctype and all tags so kindly inserted by loadHtml + * * @param DOMDocument $document */ private function stripDoctypeHtmlBodyAndHeadElements(DOMDocument $document) { + // First step - extract the div wrapper from the document $container = $document->getElementsByTagName('div')->item(0); $container = $container->parentNode->removeChild($container); + // Remove all document children while ($document->firstChild) { $document->removeChild($document->firstChild); } + // Append the div wrapper's children as children of the document while ($container->firstChild) { $document->appendChild($container->firstChild); } diff --git a/src/Amara/Varcon/TranslatorInterface.php b/src/Amara/Varcon/TranslatorInterface.php index 334ea15..f407c76 100644 --- a/src/Amara/Varcon/TranslatorInterface.php +++ b/src/Amara/Varcon/TranslatorInterface.php @@ -7,8 +7,25 @@ */ interface TranslatorInterface { + /** + * In case of multiple translations available, don't translate (recommended) + * + * @var int + */ const QUESTIONABLE_IGNORE = 0; + + /** + * In case of multiple translations available, use the first + * + * @var int + */ const QUESTIONABLE_INCLUDE = 1; + + /** + * In case of multiple translations available, mark them ?like/so? (useful for debugging) + * + * @var int + */ const QUESTIONABLE_MARK = 2; /** diff --git a/tests/Amara/Varcon/HtmlTranslatorTest.php b/tests/Amara/Varcon/HtmlTranslatorTest.php index f21e855..a386b64 100644 --- a/tests/Amara/Varcon/HtmlTranslatorTest.php +++ b/tests/Amara/Varcon/HtmlTranslatorTest.php @@ -32,11 +32,19 @@ public function testTranslate($html, $translatedHtml) { $translator = $this->prophesize(Translator::class); $translator->translate( - Argument::any(), - Argument::any(), - Argument::any(), - Argument::any() - )->willReturn('Translated'); // Keep in mind we ignore whitespace this way + Argument::type('string'), + 'A', + 'B', + 0 + )->will(function ($arguments) { + $string = array_shift($arguments); + + return str_replace( + ['Text', 'text'], + ['Translated', 'translated'], + $string + ); + }); $htmlTranslator = new HtmlTranslator($translator->reveal()); @@ -50,24 +58,25 @@ public function provideTranslate() { return [ [ - $html = '

Text text text

', - $translatedHtml = '

Translated

', + $html = '

Text & text text

', + $translatedHtml = '

Translated & translated translated

', ], [ - $html = '

Text text text

', - $translatedHtml = '

TranslatedTranslatedTranslated

', + // Some day, this will stay as • ..some ..day + $html = '

• Text & text: text

', + $translatedHtml = '

• Translated & translated: translated

', ], [ $html = 'Text text text', - $translatedHtml = 'Translated', + $translatedHtml = 'Translated translated translated', ], [ $html = '', - $translatedHtml = '', + $translatedHtml = '', ], [ $html = '', - $translatedHtml = '', + $translatedHtml = '', ], ]; }