@@ -153,33 +153,54 @@ private function splitTextIntoBlocks($text, $depth = 0) {
153
153
$ block_rules = $ this ->blockRules ;
154
154
$ blocks = array ();
155
155
$ cursor = 0 ;
156
- $ prev_block = array ();
156
+
157
+ $ can_merge = array ();
158
+ foreach ($ block_rules as $ key => $ block_rule ) {
159
+ if ($ block_rule instanceof PhutilRemarkupDefaultBlockRule) {
160
+ $ can_merge [$ key ] = true ;
161
+ }
162
+ }
163
+
164
+ $ last_block = null ;
165
+ $ last_block_key = -1 ;
166
+
167
+ // See T13487. For very large inputs, block separation can dominate
168
+ // runtime. This is written somewhat clumsily to attempt to handle
169
+ // very large inputs as gracefully as is practical.
157
170
158
171
while (isset ($ text [$ cursor ])) {
159
172
$ starting_cursor = $ cursor ;
160
- foreach ($ block_rules as $ block_rule ) {
173
+ foreach ($ block_rules as $ block_key => $ block_rule ) {
161
174
$ num_lines = $ block_rule ->getMatchingLineCount ($ text , $ cursor );
162
175
163
176
if ($ num_lines ) {
164
- if ($ blocks ) {
165
- $ prev_block = last ($ blocks );
166
- }
167
-
168
- $ curr_block = array (
177
+ $ current_block = array (
169
178
'start ' => $ cursor ,
170
179
'num_lines ' => $ num_lines ,
171
180
'rule ' => $ block_rule ,
172
- 'is_empty ' => self ::isEmptyBlock ($ text , $ cursor , $ num_lines ),
181
+ 'empty ' => self ::isEmptyBlock ($ text , $ cursor , $ num_lines ),
173
182
'children ' => array (),
183
+ 'merge ' => isset ($ can_merge [$ block_key ]),
174
184
);
175
185
176
- if ($ prev_block
177
- && self ::shouldMergeBlocks ($ text , $ prev_block , $ curr_block )) {
178
- $ blocks [last_key ($ blocks )]['num_lines ' ] += $ curr_block ['num_lines ' ];
179
- $ blocks [last_key ($ blocks )]['is_empty ' ] =
180
- $ blocks [last_key ($ blocks )]['is_empty ' ] && $ curr_block ['is_empty ' ];
186
+ $ should_merge = self ::shouldMergeParagraphBlocks (
187
+ $ text ,
188
+ $ last_block ,
189
+ $ current_block );
190
+
191
+ if ($ should_merge ) {
192
+ $ last_block ['num_lines ' ] =
193
+ ($ last_block ['num_lines ' ] + $ current_block ['num_lines ' ]);
194
+
195
+ $ last_block ['empty ' ] =
196
+ ($ last_block ['empty ' ] && $ current_block ['empty ' ]);
197
+
198
+ $ blocks [$ last_block_key ] = $ last_block ;
181
199
} else {
182
- $ blocks [] = $ curr_block ;
200
+ $ blocks [] = $ current_block ;
201
+
202
+ $ last_block = $ current_block ;
203
+ $ last_block_key ++;
183
204
}
184
205
185
206
$ cursor += $ num_lines ;
@@ -192,9 +213,20 @@ private function splitTextIntoBlocks($text, $depth = 0) {
192
213
}
193
214
}
194
215
216
+ // See T13487. It's common for blocks to be small, and this loop seems to
217
+ // measure as faster if we manually concatenate blocks than if we
218
+ // "array_slice()" and "implode()" blocks. This is a bit muddy.
219
+
195
220
foreach ($ blocks as $ key => $ block ) {
196
- $ lines = array_slice ($ text , $ block ['start ' ], $ block ['num_lines ' ]);
197
- $ blocks [$ key ]['text ' ] = implode ('' , $ lines );
221
+ $ min = $ block ['start ' ];
222
+ $ max = $ min + $ block ['num_lines ' ];
223
+
224
+ $ lines = '' ;
225
+ for ($ ii = $ min ; $ ii < $ max ; $ ii ++) {
226
+ $ lines .= $ text [$ ii ];
227
+ }
228
+
229
+ $ blocks [$ key ]['text ' ] = $ lines ;
198
230
}
199
231
200
232
// Stop splitting child blocks apart if we get too deep. This arrests
@@ -246,30 +278,48 @@ private function flattenOutput(array $output) {
246
278
return $ output ;
247
279
}
248
280
249
- private static function shouldMergeBlocks ($ text , $ prev_block , $ curr_block ) {
250
- $ block_rules = ipull (array ($ prev_block , $ curr_block ), 'rule ' );
281
+ private static function shouldMergeParagraphBlocks (
282
+ $ text ,
283
+ $ last_block ,
284
+ $ current_block ) {
251
285
252
- $ default_rule = 'PhutilRemarkupDefaultBlockRule ' ;
253
- try {
254
- assert_instances_of ($ block_rules , $ default_rule );
286
+ // If we're at the beginning of the input, we can't merge.
287
+ if ($ last_block === null ) {
288
+ return false ;
289
+ }
255
290
256
- // If the last block was empty keep merging
257
- if ($ prev_block [ ' is_empty ' ]) {
258
- return true ;
259
- }
291
+ // If the previous block wasn't a default block, we can't merge.
292
+ if (! $ last_block [ ' merge ' ]) {
293
+ return false ;
294
+ }
260
295
261
- // If this line is blank keep merging
262
- if ($ curr_block [ ' is_empty ' ]) {
263
- return true ;
264
- }
296
+ // If the current block isn't a default block, we can't merge.
297
+ if (! $ current_block [ ' merge ' ]) {
298
+ return false ;
299
+ }
265
300
266
- // If the current line and the last line have content, keep merging
267
- if (strlen (trim ($ text [$ curr_block ['start ' ] - 1 ]))) {
268
- if (strlen (trim ($ text [$ curr_block ['start ' ]]))) {
269
- return true ;
270
- }
271
- }
272
- } catch (Exception $ e ) {}
301
+ // If the last block was empty, we definitely want to merge.
302
+ if ($ last_block ['empty ' ]) {
303
+ return true ;
304
+ }
305
+
306
+ // If this block is empty, we definitely want to merge.
307
+ if ($ current_block ['empty ' ]) {
308
+ return true ;
309
+ }
310
+
311
+ // Check if the last line of the previous block and the first line of this
312
+ // block have any non-whitespace text. If they both do, we're going to
313
+ // merge.
314
+
315
+ // If either of them is a blank line or a line with only whitespace, we
316
+ // do not merge: this means we've found a paragraph break.
317
+
318
+ $ tail = $ text [$ current_block ['start ' ] - 1 ];
319
+ $ head = $ text [$ current_block ['start ' ]];
320
+ if (strlen (trim ($ tail )) && strlen (trim ($ head ))) {
321
+ return true ;
322
+ }
273
323
274
324
return false ;
275
325
}
0 commit comments