Skip to content

Commit fdbe9ba

Browse files
author
epriestley
committedFeb 4, 2020
Improve Remarkup parsing performance for certain large input blocks
Summary: Fixes T13487. In PHI1628, an install has a 4MB remarkup corpus which takes a long time to render. This is broadly expected, but a few reasonable improvements fell out of running it through the profiler. Test Plan: - Saw local cold-cache end-to-end rendering time drop from 12s to 4s for the highly secret input corpus. - Verified output has the same hashes before/after. - Ran all remarkup unit tests. Maniphest Tasks: T13487 Differential Revision: https://secure.phabricator.com/D20968
1 parent 0e82bd0 commit fdbe9ba

File tree

3 files changed

+146
-77
lines changed

3 files changed

+146
-77
lines changed
 

‎src/infrastructure/markup/blockrule/PhutilRemarkupNoteBlockRule.php

+22-16
Original file line numberDiff line numberDiff line change
@@ -100,22 +100,28 @@ public function markupText($text, $children) {
100100
}
101101

102102
private function getRegEx() {
103-
$words = array(
104-
'NOTE',
105-
'IMPORTANT',
106-
'WARNING',
107-
);
108-
109-
foreach ($words as $k => $word) {
110-
$words[$k] = preg_quote($word, '/');
103+
static $regex;
104+
105+
if ($regex === null) {
106+
$words = array(
107+
'NOTE',
108+
'IMPORTANT',
109+
'WARNING',
110+
);
111+
112+
foreach ($words as $k => $word) {
113+
$words[$k] = preg_quote($word, '/');
114+
}
115+
$words = implode('|', $words);
116+
117+
$regex =
118+
'/^(?:'.
119+
'(?:\((?P<hideword>'.$words.')\))'.
120+
'|'.
121+
'(?:(?P<showword>'.$words.'):))\s*'.
122+
'/';
111123
}
112-
$words = implode('|', $words);
113-
114-
return
115-
'/^(?:'.
116-
'(?:\((?P<hideword>'.$words.')\))'.
117-
'|'.
118-
'(?:(?P<showword>'.$words.'):))\s*'.
119-
'/';
124+
125+
return $regex;
120126
}
121127
}

‎src/infrastructure/markup/remarkup/PhutilRemarkupEngine.php

+86-36
Original file line numberDiff line numberDiff line change
@@ -153,33 +153,54 @@ private function splitTextIntoBlocks($text, $depth = 0) {
153153
$block_rules = $this->blockRules;
154154
$blocks = array();
155155
$cursor = 0;
156-
$prev_block = array();
156+
157+
$can_merge = array();
158+
foreach ($block_rules as $key => $block_rule) {
159+
if ($block_rule instanceof PhutilRemarkupDefaultBlockRule) {
160+
$can_merge[$key] = true;
161+
}
162+
}
163+
164+
$last_block = null;
165+
$last_block_key = -1;
166+
167+
// See T13487. For very large inputs, block separation can dominate
168+
// runtime. This is written somewhat clumsily to attempt to handle
169+
// very large inputs as gracefully as is practical.
157170

158171
while (isset($text[$cursor])) {
159172
$starting_cursor = $cursor;
160-
foreach ($block_rules as $block_rule) {
173+
foreach ($block_rules as $block_key => $block_rule) {
161174
$num_lines = $block_rule->getMatchingLineCount($text, $cursor);
162175

163176
if ($num_lines) {
164-
if ($blocks) {
165-
$prev_block = last($blocks);
166-
}
167-
168-
$curr_block = array(
177+
$current_block = array(
169178
'start' => $cursor,
170179
'num_lines' => $num_lines,
171180
'rule' => $block_rule,
172-
'is_empty' => self::isEmptyBlock($text, $cursor, $num_lines),
181+
'empty' => self::isEmptyBlock($text, $cursor, $num_lines),
173182
'children' => array(),
183+
'merge' => isset($can_merge[$block_key]),
174184
);
175185

176-
if ($prev_block
177-
&& self::shouldMergeBlocks($text, $prev_block, $curr_block)) {
178-
$blocks[last_key($blocks)]['num_lines'] += $curr_block['num_lines'];
179-
$blocks[last_key($blocks)]['is_empty'] =
180-
$blocks[last_key($blocks)]['is_empty'] && $curr_block['is_empty'];
186+
$should_merge = self::shouldMergeParagraphBlocks(
187+
$text,
188+
$last_block,
189+
$current_block);
190+
191+
if ($should_merge) {
192+
$last_block['num_lines'] =
193+
($last_block['num_lines'] + $current_block['num_lines']);
194+
195+
$last_block['empty'] =
196+
($last_block['empty'] && $current_block['empty']);
197+
198+
$blocks[$last_block_key] = $last_block;
181199
} else {
182-
$blocks[] = $curr_block;
200+
$blocks[] = $current_block;
201+
202+
$last_block = $current_block;
203+
$last_block_key++;
183204
}
184205

185206
$cursor += $num_lines;
@@ -192,9 +213,20 @@ private function splitTextIntoBlocks($text, $depth = 0) {
192213
}
193214
}
194215

216+
// See T13487. It's common for blocks to be small, and this loop seems to
217+
// measure as faster if we manually concatenate blocks than if we
218+
// "array_slice()" and "implode()" blocks. This is a bit muddy.
219+
195220
foreach ($blocks as $key => $block) {
196-
$lines = array_slice($text, $block['start'], $block['num_lines']);
197-
$blocks[$key]['text'] = implode('', $lines);
221+
$min = $block['start'];
222+
$max = $min + $block['num_lines'];
223+
224+
$lines = '';
225+
for ($ii = $min; $ii < $max; $ii++) {
226+
$lines .= $text[$ii];
227+
}
228+
229+
$blocks[$key]['text'] = $lines;
198230
}
199231

200232
// Stop splitting child blocks apart if we get too deep. This arrests
@@ -246,30 +278,48 @@ private function flattenOutput(array $output) {
246278
return $output;
247279
}
248280

249-
private static function shouldMergeBlocks($text, $prev_block, $curr_block) {
250-
$block_rules = ipull(array($prev_block, $curr_block), 'rule');
281+
private static function shouldMergeParagraphBlocks(
282+
$text,
283+
$last_block,
284+
$current_block) {
251285

252-
$default_rule = 'PhutilRemarkupDefaultBlockRule';
253-
try {
254-
assert_instances_of($block_rules, $default_rule);
286+
// If we're at the beginning of the input, we can't merge.
287+
if ($last_block === null) {
288+
return false;
289+
}
255290

256-
// If the last block was empty keep merging
257-
if ($prev_block['is_empty']) {
258-
return true;
259-
}
291+
// If the previous block wasn't a default block, we can't merge.
292+
if (!$last_block['merge']) {
293+
return false;
294+
}
260295

261-
// If this line is blank keep merging
262-
if ($curr_block['is_empty']) {
263-
return true;
264-
}
296+
// If the current block isn't a default block, we can't merge.
297+
if (!$current_block['merge']) {
298+
return false;
299+
}
265300

266-
// If the current line and the last line have content, keep merging
267-
if (strlen(trim($text[$curr_block['start'] - 1]))) {
268-
if (strlen(trim($text[$curr_block['start']]))) {
269-
return true;
270-
}
271-
}
272-
} catch (Exception $e) {}
301+
// If the last block was empty, we definitely want to merge.
302+
if ($last_block['empty']) {
303+
return true;
304+
}
305+
306+
// If this block is empty, we definitely want to merge.
307+
if ($current_block['empty']) {
308+
return true;
309+
}
310+
311+
// Check if the last line of the previous block or the first line of this
312+
// block have any non-whitespace text. If they both do, we're going to
313+
// merge.
314+
315+
// If either of them are a blank line or a line with only whitespace, we
316+
// do not merge: this means we've found a paragraph break.
317+
318+
$tail = $text[$current_block['start'] - 1];
319+
$head = $text[$current_block['start']];
320+
if (strlen(trim($tail)) && strlen(trim($head))) {
321+
return true;
322+
}
273323

274324
return false;
275325
}

‎src/infrastructure/markup/rule/PhabricatorObjectRemarkupRule.php

+38-25
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22

33
abstract class PhabricatorObjectRemarkupRule extends PhutilRemarkupRule {
44

5+
private $referencePattern;
6+
private $embedPattern;
7+
58
const KEY_RULE_OBJECT = 'rule.object';
69
const KEY_MENTIONED_OBJECTS = 'rule.object.mentioned';
710

@@ -192,38 +195,48 @@ public function apply($text) {
192195
}
193196

194197
private function getObjectEmbedPattern() {
195-
$prefix = $this->getObjectNamePrefix();
196-
$prefix = preg_quote($prefix);
197-
$id = $this->getObjectIDPattern();
198+
if ($this->embedPattern === null) {
199+
$prefix = $this->getObjectNamePrefix();
200+
$prefix = preg_quote($prefix);
201+
$id = $this->getObjectIDPattern();
198202

199-
return '(\B{'.$prefix.'('.$id.')([,\s](?:[^}\\\\]|\\\\.)*)?}\B)u';
203+
$this->embedPattern =
204+
'(\B{'.$prefix.'('.$id.')([,\s](?:[^}\\\\]|\\\\.)*)?}\B)u';
205+
}
206+
207+
return $this->embedPattern;
200208
}
201209

202210
private function getObjectReferencePattern() {
203-
$prefix = $this->getObjectNamePrefix();
204-
$prefix = preg_quote($prefix);
205-
206-
$id = $this->getObjectIDPattern();
207-
208-
// If the prefix starts with a word character (like "D"), we want to
209-
// require a word boundary so that we don't match "XD1" as "D1". If the
210-
// prefix does not start with a word character, we want to require no word
211-
// boundary for the same reasons. Test if the prefix starts with a word
212-
// character.
213-
if ($this->getObjectNamePrefixBeginsWithWordCharacter()) {
214-
$boundary = '\\b';
215-
} else {
216-
$boundary = '\\B';
217-
}
211+
if ($this->referencePattern === null) {
212+
$prefix = $this->getObjectNamePrefix();
213+
$prefix = preg_quote($prefix);
214+
215+
$id = $this->getObjectIDPattern();
216+
217+
// If the prefix starts with a word character (like "D"), we want to
218+
// require a word boundary so that we don't match "XD1" as "D1". If the
219+
// prefix does not start with a word character, we want to require no word
220+
// boundary for the same reasons. Test if the prefix starts with a word
221+
// character.
222+
if ($this->getObjectNamePrefixBeginsWithWordCharacter()) {
223+
$boundary = '\\b';
224+
} else {
225+
$boundary = '\\B';
226+
}
218227

219-
// The "(?<![#@-])" prevents us from linking "#abcdef" or similar, and
220-
// "ABC-T1" (see T5714), and from matching "@T1" as a task (it is a user)
221-
// (see T9479).
228+
// The "(?<![#@-])" prevents us from linking "#abcdef" or similar, and
229+
// "ABC-T1" (see T5714), and from matching "@T1" as a task (it is a user)
230+
// (see T9479).
222231

223-
// The "\b" allows us to link "(abcdef)" or similar without linking things
224-
// in the middle of words.
232+
// The "\b" allows us to link "(abcdef)" or similar without linking things
233+
// in the middle of words.
234+
235+
$this->referencePattern =
236+
'((?<![#@-])'.$boundary.$prefix.'('.$id.')(?:#([-\w\d]+))?(?!\w))u';
237+
}
225238

226-
return '((?<![#@-])'.$boundary.$prefix.'('.$id.')(?:#([-\w\d]+))?(?!\w))u';
239+
return $this->referencePattern;
227240
}
228241

229242

0 commit comments

Comments
 (0)
Failed to load comments.