Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add an HTML crawler and translator #12

Merged
merged 6 commits into from
Mar 14, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 98 additions & 0 deletions src/Amara/Varcon/HtmlCrawler.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
<?php

namespace Amara\Varcon;

use DOMDocument;
use DOMNode;
use DOMXPath;

/**
* An extractor of HTML text nodes as well as attributes, which are known to be important for SEO
*/
class HtmlCrawler
{
    /**
     * XPath expressions selecting the nodes whose values are handed to the callable
     *
     * The defaults select every text node, the alt and title attributes of images and the content of the
     * description meta tag
     *
     * @var array
     */
    private $xpathExpressions = [
        '//text()',
        '//img/@alt',
        '//img/@title',
        '//meta[@name="description"]/@content',
    ];

    /**
     * Replace the XPath expressions used to select the nodes to modify
     *
     * @param array $xpathExpressions
     */
    public function setXpathExpressions(array $xpathExpressions)
    {
        $this->xpathExpressions = $xpathExpressions;
    }

    /**
     * Crawl the HTML content provided and apply a callable (usually a wrapper of the Translator's translate function)
     * to the value of every node matched by the configured XPath expressions
     *
     * The original content is returned untouched when the callable did not change any value, so unmodified input
     * is never re-serialised.
     *
     * @param string   $content  HTML fragment, expected to be UTF-8 encoded
     * @param callable $callable Receives a node value (string) and returns its replacement (string)
     *
     * @return string
     */
    public function crawlAndModify($content, callable $callable)
    {
        // An empty union is not a valid XPath expression, and with nothing to select nothing can change
        if ([] === $this->xpathExpressions) {
            return $content;
        }

        $document = new DOMDocument();
        // Errors are silenced on purpose: the parser may raise warnings for markup it does not accept, and the
        // content is still processed on a best-effort basis
        @$document->loadHTML(mb_convert_encoding(
            sprintf('<div>%s</div>', $content), /** @see stripDoctypeHtmlBodyAndHeadElements */
            'HTML-ENTITIES',
            'UTF-8'
        ));
        $this->stripDoctypeHtmlBodyAndHeadElements($document);

        $xpath = new DOMXPath($document);
        $nodes = $xpath->query(implode('|', $this->xpathExpressions));

        // query() returns false for an expression it cannot evaluate; there is nothing to iterate in that case
        if (false === $nodes) {
            return $content;
        }

        $isContentModified = false;

        /** @var DOMNode $node */
        foreach ($nodes as $node) {
            $originalNodeValue = $node->nodeValue;
            $modifiedNodeValue = (string) $callable($originalNodeValue);

            // Strict comparison is required here: a loose one compares numeric strings by their numeric value,
            // so a change such as "1e3" => "1000" would be considered "no change" and silently dropped
            if ($originalNodeValue !== $modifiedNodeValue) {
                $isContentModified = true;

                $this->assignNodeValue($node, $modifiedNodeValue);
            }
        }

        if (false === $isContentModified) {
            return $content;
        }

        return trim($document->saveHTML());
    }

    /**
     * Assign a literal string as the value of a node
     *
     * Writing to nodeValue of an attribute or element makes libxml parse the value for entity references, which
     * truncates values containing a literal "&". Attributes are therefore written through their owner element and
     * elements receive a text node, both of which treat the value as plain text. Any other node type is written
     * through nodeValue.
     *
     * @param DOMNode $node
     * @param string  $value
     */
    private function assignNodeValue(DOMNode $node, $value)
    {
        if (XML_ATTRIBUTE_NODE === $node->nodeType && null !== $node->ownerElement) {
            $node->ownerElement->setAttribute($node->nodeName, $value);

            return;
        }

        if (XML_ELEMENT_NODE === $node->nodeType) {
            while ($node->firstChild) {
                $node->removeChild($node->firstChild);
            }
            $node->appendChild($node->ownerDocument->createTextNode($value));

            return;
        }

        $node->nodeValue = $value;
    }

    /**
     * This method is a short hack to avoid incompatibilities between different PHP and Libxml setups. It has the same
     * effect as passing the LIBXML_HTML_NOIMPLIED and LIBXML_HTML_NODEFDTD flags to loadHtml's options
     *
     * It works by, first of all, wrapping all of the contents in a div, and then extracting only them back to the
     * DOM document. This way, we can get rid of the Doctype and all tags so kindly inserted by loadHtml
     *
     * @param DOMDocument $document
     */
    private function stripDoctypeHtmlBodyAndHeadElements(DOMDocument $document)
    {
        // First step - extract the div wrapper from the document
        $container = $document->getElementsByTagName('div')->item(0);

        // Without a wrapper there is nothing to unwrap; leave the document as parsed
        if (null === $container || null === $container->parentNode) {
            return;
        }

        $container = $container->parentNode->removeChild($container);

        // Remove all document children
        while ($document->firstChild) {
            $document->removeChild($document->firstChild);
        }

        // Append the div wrapper's children as children of the document
        while ($container->firstChild) {
            $document->appendChild($container->firstChild);
        }
    }
}
48 changes: 48 additions & 0 deletions src/Amara/Varcon/HtmlTranslator.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
<?php

namespace Amara\Varcon;

/**
* Glues the HtmlCrawler with a Translator
*/
class HtmlTranslator implements TranslatorInterface
{
    /**
     * Translator applied to every string the crawler extracts
     *
     * @var TranslatorInterface
     */
    private $translator;

    /**
     * Crawler that locates the strings inside the HTML and writes the results back
     *
     * @var HtmlCrawler
     */
    private $htmlCrawler;

    /**
     * Both collaborators are optional; a default instance is created for each one that is omitted
     *
     * @param TranslatorInterface $translator
     * @param HtmlCrawler $htmlCrawler
     */
    public function __construct(TranslatorInterface $translator = null, HtmlCrawler $htmlCrawler = null)
    {
        $this->translator = null !== $translator ? $translator : new Translator();
        $this->htmlCrawler = null !== $htmlCrawler ? $htmlCrawler : new HtmlCrawler();
    }

    /**
     * {@inheritdoc}
     *
     * The string is treated as HTML: the crawler hands each extracted value to the wrapped translator together
     * with the spelling and questionable arguments received here.
     */
    public function translate($htmlContent, $fromSpelling, $toSpelling, $questionable = self::QUESTIONABLE_IGNORE)
    {
        $translator = $this->translator;

        return $this->htmlCrawler->crawlAndModify(
            $htmlContent,
            function ($text) use ($translator, $fromSpelling, $toSpelling, $questionable) {
                return $translator->translate($text, $fromSpelling, $toSpelling, $questionable);
            }
        );
    }
}
15 changes: 2 additions & 13 deletions src/Amara/Varcon/Translator.php
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,8 @@
/**
* A translator/converter for different variations of the English spellings
*/
class Translator
class Translator implements TranslatorInterface
{
const QUESTIONABLE_IGNORE = 0;
const QUESTIONABLE_INCLUDE = 1;
const QUESTIONABLE_MARK = 2;

/**
* @var TranslationProviderInterface
*/
Expand All @@ -31,14 +27,7 @@ public function __construct(TranslationProviderInterface $provider = null)
}

/**
* Translate/convert a string to another spelling
*
* @param string $string
* @param string $fromSpelling
* @param string $toSpelling
* @param int $questionable
*
* @return string
* {@inheritdoc}
*/
public function translate($string, $fromSpelling, $toSpelling, $questionable = self::QUESTIONABLE_IGNORE)
{
Expand Down
42 changes: 42 additions & 0 deletions src/Amara/Varcon/TranslatorInterface.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
<?php

namespace Amara\Varcon;

/**
* An interface ensuring any future translators implement the translate method in the expected way
*/
interface TranslatorInterface
{
/**
* Questionable mode: in case of multiple translations available, don't translate (recommended)
*
* This is the default value of the $questionable parameter of translate()
*
* @var int
*/
const QUESTIONABLE_IGNORE = 0;

/**
* Questionable mode: in case of multiple translations available, use the first one
*
* @var int
*/
const QUESTIONABLE_INCLUDE = 1;

/**
* Questionable mode: in case of multiple translations available, mark them ?like/so? (useful for debugging)
*
* @var int
*/
const QUESTIONABLE_MARK = 2;

/**
* Translate/convert a string to another spelling
*
* @param string $string The text to translate
* @param string $fromSpelling Identifier of the spelling the text is converted from
* @param string $toSpelling Identifier of the spelling the text is converted to
* @param int $questionable One of the QUESTIONABLE_* constants, selecting what happens when multiple
*                          translations are available; defaults to QUESTIONABLE_IGNORE
*
* @return string The translated text
*/
public function translate($string, $fromSpelling, $toSpelling, $questionable = self::QUESTIONABLE_IGNORE);
}
55 changes: 55 additions & 0 deletions tests/Amara/Varcon/HtmlCrawlerTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
<?php

namespace Amara\Varcon\Tests;

use Amara\Varcon\HtmlCrawler;

class HtmlCrawlerTest extends \PHPUnit_Framework_TestCase
{
    /**
     * Only the nodes matched by the custom expressions may be rewritten; everything else stays as it was
     *
     * @dataProvider provideCrawlAndModifyWithChangedXpathExpressions
     *
     * @param string $html
     * @param string $changedHtml
     */
    public function testCrawlAndModifyWithChangedXpathExpressions($html, $changedHtml)
    {
        $crawler = new HtmlCrawler();
        $crawler->setXpathExpressions([
            '//span/@unsupported', // Asserts unsupported attributes also get translated
            '//strong/text()',
        ]);

        // Replaces whatever value it is given, so every matched node ends up modified
        $replaceWithChanged = function () {
            return 'Changed';
        };

        $actualHtml = $crawler->crawlAndModify($html, $replaceWithChanged);

        $this->assertSame($changedHtml, $actualHtml);
    }

    /**
     * Data sets of [input HTML, expected HTML]
     *
     * @return array
     */
    public function provideCrawlAndModifyWithChangedXpathExpressions()
    {
        $dataSets = [];

        // Text inside <strong> is matched, the surrounding text is not
        $dataSets[] = [
            '<p>Text <strong>text</strong> text</p>',
            '<p>Text <strong>Changed</strong> text</p>',
        ];

        // The custom attribute is matched, the span's own text is not
        $dataSets[] = [
            '<span unsupported="Text">Text text text</span>',
            '<span unsupported="Changed">Text text text</span>',
        ];

        // Nothing is matched, so the content comes back unchanged
        $dataSets[] = [
            '<span>Unchanged</span>',
            '<span>Unchanged</span>',
        ];

        return $dataSets;
    }
}
82 changes: 82 additions & 0 deletions tests/Amara/Varcon/HtmlTranslatorTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
<?php

namespace Amara\Varcon\Tests;

use Amara\Varcon\HtmlTranslator;
use Amara\Varcon\Translator;
use Prophecy\Argument;

class HtmlTranslatorTest extends \PHPUnit_Framework_TestCase
{
    /**
     * Runs the HTML translator with its default collaborators on content that also holds UTF-8 characters
     */
    public function testTranslatePreservesWhitespace()
    {
        $htmlTranslator = new HtmlTranslator();

        $translatedHtml = $htmlTranslator->translate(
            '<p>Color <strong>pajama</strong> паралелепипед</p>',
            'A',
            'B'
        );

        // Tests UTF8 characters as well
        $expectedHtml = '<p>Colour <strong>pyjama</strong> &#1087;&#1072;&#1088;&#1072;&#1083;&#1077;&#1083;&#1077;&#1087;&#1080;&#1087;&#1077;&#1076;</p>';

        $this->assertSame($expectedHtml, $translatedHtml);
    }

    /**
     * Uses a prophesized translator, so only the wiring between crawler and translator is under test
     *
     * @dataProvider provideTranslate
     *
     * @param string $html
     * @param string $translatedHtml
     */
    public function testTranslate($html, $translatedHtml)
    {
        $translatorProphecy = $this->prophesize(Translator::class);

        // The closure is deliberately not static: Prophecy binds the promise callback to the prophecy
        $fakeTranslation = function ($arguments) {
            return str_replace(
                ['Text', 'text'],
                ['Translated', 'translated'],
                $arguments[0]
            );
        };

        $translatorProphecy
            ->translate(Argument::type('string'), 'A', 'B', Translator::QUESTIONABLE_IGNORE)
            ->will($fakeTranslation);

        $htmlTranslator = new HtmlTranslator($translatorProphecy->reveal());
        $actualHtml = $htmlTranslator->translate($html, 'A', 'B');

        $this->assertSame($translatedHtml, $actualHtml);
    }

    /**
     * Data sets of [input HTML, expected translated HTML]
     *
     * @return array
     */
    public function provideTranslate()
    {
        $dataSets = [];

        // Several sibling elements and a bare ampersand
        $dataSets[] = [
            '<p>Text & text text</p><p>More text</p><p>More text</p>',
            '<p>Translated &amp; translated translated</p><p>More translated</p><p>More translated</p>',
        ];

        // Named entities and nested elements
        $dataSets[] = [
            '<p>&bull; Text &amp; <strong>text</strong>: text</p>',
            '<p>&bull; Translated &amp; <strong>translated</strong>: translated</p>',
        ];

        // Image alt attribute
        $dataSets[] = [
            '<img src="#" alt="Text text text">',
            '<img src="#" alt="Translated translated translated">',
        ];

        // Image title attribute
        $dataSets[] = [
            '<img src="#" title="Text text text">',
            '<img src="#" title="Translated translated translated">',
        ];

        // Meta description content
        $dataSets[] = [
            '<meta name="description" content="Text text text">',
            '<meta name="description" content="Translated translated translated">',
        ];

        return $dataSets;
    }
}
Loading