Skip to content

Commit

Permalink
EZP-29289: Migrating ezxmltext with invalid name or id attributes
Browse files Browse the repository at this point in the history
  • Loading branch information
vidarl committed Jun 21, 2018
1 parent 7ce20f8 commit 2c5efc1
Show file tree
Hide file tree
Showing 5 changed files with 122 additions and 10 deletions.
14 changes: 10 additions & 4 deletions bundle/Command/ConvertXmlTextToRichTextCommand.php
Expand Up @@ -64,7 +64,13 @@ protected function configure()
'disable-duplicate-id-check',
null,
InputOption::VALUE_NONE,
'Disable the check for duplicate html ids in every attribute. This might increase execution time on large databases'
'Disable the check for duplicate html ids in every attribute. This might decrease execution time on large databases'
)
->addOption(
'disable-id-value-check',
null,
InputOption::VALUE_NONE,
'Disable the check for non-validating id/name values. This might decrease execution time on large databases'
)
->addOption(
'test-content-object',
Expand Down Expand Up @@ -123,7 +129,7 @@ protected function execute(InputInterface $input, OutputInterface $output)
$dryRun = true;
}

$this->convertFields($dryRun, $testContentId, !$input->getOption('disable-duplicate-id-check'), $output);
$this->convertFields($dryRun, $testContentId, !$input->getOption('disable-duplicate-id-check'), !$input->getOption('disable-id-value-check'), $output);
}

protected function getContentTypeIds($contentTypeIdentifiers)
Expand Down Expand Up @@ -330,7 +336,7 @@ protected function updateFieldRow($dryRun, $id, $version, $datatext)
}
}

protected function convertFields($dryRun, $contentId, $checkDuplicateIds, OutputInterface $output)
protected function convertFields($dryRun, $contentId, $checkDuplicateIds, $checkIdValues, OutputInterface $output)
{
$count = $this->getRowCountOfContentObjectAttributes('ezxmltext', $contentId);

Expand All @@ -345,7 +351,7 @@ protected function convertFields($dryRun, $contentId, $checkDuplicateIds, Output
$inputValue = $row['data_text'];
}

$converted = $this->converter->convert($this->createDocument($inputValue), $checkDuplicateIds, $row['id']);
$converted = $this->converter->convert($this->createDocument($inputValue), $checkDuplicateIds, $checkIdValues, $row['id']);

$this->updateFieldRow($dryRun, $row['id'], $row['version'], $converted);

Expand Down
49 changes: 44 additions & 5 deletions lib/FieldType/XmlText/Converter/RichText.php
Expand Up @@ -157,9 +157,44 @@ protected function reportNonUniqueIds(DOMDocument $document, $contentFieldId)
$id = $node->attributes->getNamedItem('id')->nodeValue;
// id has format "duplicated_id_foo_bar_idm45226413447104" where "foo_bar" is the duplicated id
$duplicatedId = substr($id, strlen('duplicated_id_'), strrpos($id, '_') - strlen('duplicated_id_'));
if ($this->logger !== null) {
$this->logger->warning("Duplicated id in original ezxmltext for contentobject_attribute.id=$contentFieldId, automatically generated new id : $duplicatedId --> $id");
}
$this->logger->warning("Duplicated id in original ezxmltext for contentobject_attribute.id=$contentFieldId, automatically generated new id : $duplicatedId --> $id");
}
}

/**
 * Rewrites xml:id values in the converted document that are not made up of whitelisted characters.
 *
 * An id is left untouched when its first character is an ASCII letter or '_' and every following
 * character is an ASCII letter, a digit, '_' or '-'. Every other non-empty id is prefixed with
 * "rewrite_", each byte outside the whitelist is replaced by '_', and a warning is logged.
 *
 * NOTE(review): distinct invalid ids can be rewritten to the same value (e.g. "a@name" and "a#name"
 * both become "rewrite_a_name"); uniqueness of the rewritten ids is not re-checked here.
 *
 * @param DOMDocument $document document whose xml:id attributes are checked; modified in place
 * @param null|int $contentFieldId contentobject_attribute.id used in the log message, "[unknown]" is logged when null
 */
protected function validateAttributeValues(DOMDocument $document, $contentFieldId)
{
    $xpath = new DOMXPath($document);

    // Characters allowed as the first character of an id.
    $whitelist1st = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_';
    // XPath translate() removes characters of the 2nd argument which have no counterpart in the 3rd argument,
    // so the replacement string must be exactly as long as the whitelist. Derive it instead of hand-counting.
    $replaceStr1st = str_repeat('a', strlen($whitelist1st));

    // Characters allowed after the first character of an id.
    $whitelist = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_-';
    // Empty replacement string : translate() simply strips every whitelisted character.
    $replaceStr = '';

    /*
     * We want to pick elements which have an id value
     *  #1 not starting with a..z, A..Z or '_'
     *  #2 containing anything but a..z, A..Z, 0..9, '_' or '-' after the 1st character
     * No XPath 2.0 (regular expressions) at our disposal, so:
     *  1st line : take the 1st character of the id (substring), turn it into 'a' if it is whitelisted (translate),
     *             and select the element if the result does not start with 'a' (not + starts-with).
     *  2nd line : take the remaining part of the id (substring), remove every whitelisted character (translate),
     *             and select the element if anything is left (string-length).
     *  3rd line : because of the not() in the 1st line, elements without any xml:id would also match,
     *             so make sure only elements having a non-empty xml:id are selected.
     */
    $nodes = $xpath->query("//*[
        (
            not(starts-with(translate(substring(@xml:id, 1, 1), '$whitelist1st', '$replaceStr1st'), 'a'))
            or string-length(translate(substring(@xml:id, 2), '$whitelist', '$replaceStr')) > 0
        ) and string-length(@xml:id) > 0]");

    if ($contentFieldId === null) {
        $contentFieldId = '[unknown]';
    }

    foreach ($nodes as $node) {
        // Look up the attribute once; it is read, rewritten and reported below.
        $idAttribute = $node->attributes->getNamedItem('id');
        $orgValue = $idAttribute->nodeValue;

        // The "rewrite_" prefix guarantees a valid 1st character. The pattern has no "u" modifier,
        // so a multibyte character is replaced by one '_' per byte.
        $newValue = preg_replace("/[^$whitelist]/", '_', 'rewrite_' . $orgValue);
        $idAttribute->nodeValue = $newValue;

        $this->logger->warning("Replaced non-validating id value in richtext for contentobject_attribute.id=$contentFieldId, changed from : $orgValue --> $newValue");
    }
}

Expand Down Expand Up @@ -317,10 +352,11 @@ protected function checkEmptyEmbedTags(DOMDocument $inputDocument)
*
* @param DOMDocument $inputDocument
* @param bool $checkDuplicateIds
* @param bool $checkIdValues
* @param null|int $contentFieldId
* @return string
*/
public function convert(DOMDocument $inputDocument, $checkDuplicateIds = false, $contentFieldId = null)
public function convert(DOMDocument $inputDocument, $checkDuplicateIds = false, $checkIdValues = false, $contentFieldId = null)
{
$this->removeComments($inputDocument);

Expand All @@ -329,6 +365,9 @@ public function convert(DOMDocument $inputDocument, $checkDuplicateIds = false,
if ($checkDuplicateIds) {
$this->reportNonUniqueIds($convertedDocument, $contentFieldId);
}
if ($checkIdValues) {
$this->validateAttributeValues($convertedDocument, $contentFieldId);
}

// Needed by some disabled output escaping (eg. legacy ezxml paragraph <line/> elements)
$convertedDocumentNormalized = new DOMDocument();
Expand All @@ -339,7 +378,7 @@ public function convert(DOMDocument $inputDocument, $checkDuplicateIds = false,

$result = $convertedDocumentNormalized->saveXML();

if (!empty($errors) && $this->logger !== null) {
if (!empty($errors)) {
$this->logger->error(
"Validation errors when converting ezxmltext for contentobject_attribute.id=$contentFieldId",
['result' => $result, 'errors' => $errors, 'xmlString' => $inputDocument->saveXML()]
Expand Down
2 changes: 1 addition & 1 deletion tests/lib/FieldType/Converter/RichTextTest.php
Expand Up @@ -137,7 +137,7 @@ public function testConvert($inputFilePath, $outputFilePath)
$richText = new RichText($apiRepositoryStub);
$richText->setImageContentTypes([27]);

$result = $richText->convert($inputDocument, true);
$result = $richText->convert($inputDocument, true, true);

$convertedDocument = $this->createDocument($result, false);
$expectedDocument = $this->createDocument($outputFilePath);
Expand Down
@@ -0,0 +1,33 @@
<?xml version="1.0" encoding="utf-8"?>
<section
xmlns:image="http://ez.no/namespaces/ezpublish3/image/"
xmlns:xhtml="http://ez.no/namespaces/ezpublish3/xhtml/"
xmlns:custom="http://ez.no/namespaces/ezpublish3/custom/">
<paragraph align="justify">Here is an anchor
<anchor name="1name"/>
</paragraph>
<paragraph align="justify">Here is an anchor
<anchor name="n1ame"/>
</paragraph>
<paragraph align="justify">Here is an anchor
<anchor name="-1name"/>
</paragraph>
<paragraph align="justify">Here is an anchor
<anchor name="_name"/>
</paragraph>
<paragraph align="justify">Here is an anchor
<anchor name="aname"/>
</paragraph>
<paragraph align="justify">Here is an anchor
<anchor name="#aname"/>
</paragraph>
<paragraph align="justify">Here is an anchor
<anchor name="a@name"/>
</paragraph>
<paragraph align="justify">Here is an anchor
<anchor name="an£ame"/>
</paragraph>
<paragraph align="justify">Here is an anchor
<anchor name="aname["/>
</paragraph>
</section>
@@ -0,0 +1,34 @@
<?xml version="1.0" encoding="UTF-8"?>
<section
xmlns="http://docbook.org/ns/docbook"
xmlns:xlink="http://www.w3.org/1999/xlink"
xmlns:ezxhtml="http://ez.no/xmlns/ezpublish/docbook/xhtml"
xmlns:ezcustom="http://ez.no/xmlns/ezpublish/docbook/custom" version="5.0-variant ezpublish-1.0">
<para ezxhtml:textalign="justify">Here is an anchor
<anchor xml:id="rewrite_1name"/>
</para>
<para ezxhtml:textalign="justify">Here is an anchor
<anchor xml:id="n1ame"/>
</para>
<para ezxhtml:textalign="justify">Here is an anchor
<anchor xml:id="rewrite_-1name"/>
</para>
<para ezxhtml:textalign="justify">Here is an anchor
<anchor xml:id="_name"/>
</para>
<para ezxhtml:textalign="justify">Here is an anchor
<anchor xml:id="aname"/>
</para>
<para ezxhtml:textalign="justify">Here is an anchor
<anchor xml:id="rewrite__aname"/>
</para>
<para ezxhtml:textalign="justify">Here is an anchor
<anchor xml:id="rewrite_a_name"/>
</para>
<para ezxhtml:textalign="justify">Here is an anchor
<anchor xml:id="rewrite_an__ame"/>
</para>
<para ezxhtml:textalign="justify">Here is an anchor
<anchor xml:id="rewrite_aname_"/>
</para>
</section>

0 comments on commit 2c5efc1

Please sign in to comment.