Skip to content

Commit

Permalink
Made page-save HTML formatting much more efficient
Browse files Browse the repository at this point in the history
Replaced the existing xpath-heavy system with a more manual traversal
approach. Fixes following slow areas of old system:
- Old system would repeat ID-setting action for elements (Headers could
  be processed up to three times).
- Old system had a few very open xpath queries for headers.
- Old system would update links on every ID change, which triggers it's
  own xpath query for links, leading to exponential scaling issues.

New system only does one xpath query for links when changes are needed.
Added test to cover.

For BookStackApp#3932
  • Loading branch information
ssddanbrown committed Feb 22, 2023
1 parent c803961 commit 3149575
Show file tree
Hide file tree
Showing 2 changed files with 73 additions and 45 deletions.
99 changes: 54 additions & 45 deletions app/Entities/Tools/PageContent.php
Expand Up @@ -19,20 +19,15 @@

class PageContent
{
protected Page $page;

/**
* PageContent constructor.
*/
public function __construct(Page $page)
{
$this->page = $page;
public function __construct(
protected Page $page
) {
}

/**
* Update the content of the page with new provided HTML.
*/
public function setNewHTML(string $html)
public function setNewHTML(string $html): void
{
$html = $this->extractBase64ImagesFromHtml($html);
$this->page->html = $this->formatHtml($html);
Expand All @@ -43,7 +38,7 @@ public function setNewHTML(string $html)
/**
* Update the content of the page with new provided Markdown content.
*/
public function setNewMarkdown(string $markdown)
public function setNewMarkdown(string $markdown): void
{
$markdown = $this->extractBase64ImagesFromMarkdown($markdown);
$this->page->markdown = $markdown;
Expand All @@ -57,7 +52,7 @@ public function setNewMarkdown(string $markdown)
*/
protected function extractBase64ImagesFromHtml(string $htmlText): string
{
if (empty($htmlText) || strpos($htmlText, 'data:image') === false) {
if (empty($htmlText) || !str_contains($htmlText, 'data:image')) {
return $htmlText;
}

Expand Down Expand Up @@ -91,7 +86,7 @@ protected function extractBase64ImagesFromHtml(string $htmlText): string
* Attempting to capture the whole data uri using regex can cause PHP
* PCRE limits to be hit with larger, multi-MB, files.
*/
protected function extractBase64ImagesFromMarkdown(string $markdown)
protected function extractBase64ImagesFromMarkdown(string $markdown): string
{
$matches = [];
$contentLength = strlen($markdown);
Expand Down Expand Up @@ -183,32 +178,13 @@ protected function formatHtml(string $htmlText): string
$childNodes = $body->childNodes;
$xPath = new DOMXPath($doc);

// Set ids on top-level nodes
// Map to hold used ID references
$idMap = [];
foreach ($childNodes as $index => $childNode) {
[$oldId, $newId] = $this->setUniqueId($childNode, $idMap);
if ($newId && $newId !== $oldId) {
$this->updateLinks($xPath, '#' . $oldId, '#' . $newId);
}
}
// Map to hold changing ID references
$changeMap = [];

// Set ids on nested header nodes
$nestedHeaders = $xPath->query('//body//*//h1|//body//*//h2|//body//*//h3|//body//*//h4|//body//*//h5|//body//*//h6');
foreach ($nestedHeaders as $nestedHeader) {
[$oldId, $newId] = $this->setUniqueId($nestedHeader, $idMap);
if ($newId && $newId !== $oldId) {
$this->updateLinks($xPath, '#' . $oldId, '#' . $newId);
}
}

// Ensure no duplicate ids within child items
$idElems = $xPath->query('//body//*//*[@id]');
foreach ($idElems as $domElem) {
[$oldId, $newId] = $this->setUniqueId($domElem, $idMap);
if ($newId && $newId !== $oldId) {
$this->updateLinks($xPath, '#' . $oldId, '#' . $newId);
}
}
$this->updateIdsRecursively($body, 0, $idMap, $changeMap);
$this->updateLinks($xPath, $changeMap);

// Generate inner html as a string
$html = '';
Expand All @@ -223,20 +199,53 @@ protected function formatHtml(string $htmlText): string
}

/**
* Update the all links to the $old location to instead point to $new.
* For the given DOMNode, traverse its children recursively and update IDs
* where required (Top-level, headers & elements with IDs).
* Will update the provided $changeMap array with changes made, where keys are the old
* ids and the corresponding values are the new ids.
*/
protected function updateIdsRecursively(DOMNode $element, int $depth, array &$idMap, array &$changeMap): void
{
/* @var DOMNode $child */
foreach ($element->childNodes as $child) {
if ($child instanceof DOMElement && ($depth === 0 || in_array($child->nodeName, ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']) || $child->getAttribute('id'))) {
[$oldId, $newId] = $this->setUniqueId($child, $idMap);
if ($newId && $newId !== $oldId && !isset($idMap[$oldId])) {
$changeMap[$oldId] = $newId;
}
}

if ($child->hasChildNodes()) {
$this->updateIdsRecursively($child, $depth + 1, $idMap, $changeMap);
}
}
}

/**
* Update the all links in the given xpath to apply requires changes within the
* given $changeMap array.
*/
protected function updateLinks(DOMXPath $xpath, string $old, string $new)
protected function updateLinks(DOMXPath $xpath, array $changeMap): void
{
$old = str_replace('"', '', $old);
$matchingLinks = $xpath->query('//body//*//*[@href="' . $old . '"]');
foreach ($matchingLinks as $domElem) {
$domElem->setAttribute('href', $new);
if (empty($changeMap)) {
return;
}

$links = $xpath->query('//body//*//*[@href]');
/** @var DOMElement $domElem */
foreach ($links as $domElem) {
$href = ltrim($domElem->getAttribute('href'), '#');
$newHref = $changeMap[$href] ?? null;
if ($newHref) {
$domElem->setAttribute('href', '#' . $newHref);
}
}
}

/**
* Set a unique id on the given DOMElement.
* A map for existing ID's should be passed in to check for current existence.
* A map for existing ID's should be passed in to check for current existence,
* and this will be updated with any new IDs set upon elements.
* Returns a pair of strings in the format [old_id, new_id].
*/
protected function setUniqueId(DOMNode $element, array &$idMap): array
Expand All @@ -247,7 +256,7 @@ protected function setUniqueId(DOMNode $element, array &$idMap): array

// Stop if there's an existing valid id that has not already been used.
$existingId = $element->getAttribute('id');
if (strpos($existingId, 'bkmrk') === 0 && !isset($idMap[$existingId])) {
if (str_starts_with($existingId, 'bkmrk') && !isset($idMap[$existingId])) {
$idMap[$existingId] = true;

return [$existingId, $existingId];
Expand All @@ -258,7 +267,7 @@ protected function setUniqueId(DOMNode $element, array &$idMap): array
// the same content is passed through.
$contentId = 'bkmrk-' . mb_substr(strtolower(preg_replace('/\s+/', '-', trim($element->nodeValue))), 0, 20);
$newId = urlencode($contentId);
$loopIndex = 0;
$loopIndex = 1;

while (isset($idMap[$newId])) {
$newId = urlencode($contentId . '-' . $loopIndex);
Expand Down
19 changes: 19 additions & 0 deletions tests/Entity/PageContentTest.php
Expand Up @@ -732,4 +732,23 @@ public function test_non_breaking_spaces_are_preserved()

$this->assertStringContainsString('<p id="bkmrk-%C2%A0">&nbsp;</p>', $page->refresh()->html);
}

public function test_page_save_with_many_headers_and_links_is_reasonable()
{
$page = $this->entities->page();

$content = '';
for ($i = 0; $i < 500; $i++) {
$content .= "<table><tbody><tr><td><h5 id='header-{$i}'>Simple Test</h5><a href='#header-{$i}'></a></td></tr></tbody></table>";
}

$time = time();
$this->asEditor()->put($page->getUrl(), [
'name' => $page->name,
'html' => $content,
])->assertRedirect();

$timeElapsed = time() - $time;
$this->assertLessThan(3, $timeElapsed);
}
}

0 comments on commit 3149575

Please sign in to comment.