Skip to content

Commit 8eefaba

Browse files
committed
fix `regexp-match' performance for short matches on long strings
More specifically, for a string of length N and a match that only looks at the first M characters, the complexity of `regexp-match' is now O(M) instead of O(N). This allows `regexp-split' to be O(N) for a string instead of O(N^2). Also, fixed a bug in non-greedy matching that could affect both long strings and input ports.
1 parent 09b4a55 commit 8eefaba

File tree

7 files changed

+268
-96
lines changed

7 files changed

+268
-96
lines changed

collects/tests/racket/rx.rktl

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,13 +256,36 @@
256256
;; CL-PPCRE, which probably is from Perl originally.
257257
;; The tests have been modified to avoid various incompatibilities.
258258

259+
;; make-reluctant-port : bytes -> input-port
;; Builds a custom input port (named 'reluctant-bytes) over the byte
;; string `bstr` whose read and peek procedures each deliver at most
;; one byte per call.
(define (make-reluctant-port bstr)
260+
;; Handing out a single character at a time stresses
261+
;; the regexp matcher's lazy reading of a port:
262+
;; `pos` is the index in `bstr` of the next byte that a read will consume.
(define pos 0)
263+
(define len (bytes-length bstr))
264+
(make-input-port
265+
'reluctant-bytes
266+
;; Read procedure: at or past the end, answer eof; otherwise store the
;; byte at `pos` into slot 0 of the destination `s`, advance `pos`,
;; and report that 1 byte was delivered.
(lambda (s)
267+
(if (pos . >= . len)
268+
eof
269+
(begin
270+
(bytes-set! s 0 (bytes-ref bstr pos))
271+
(set! pos (add1 pos))
272+
1)))
273+
;; Peek procedure: same single-byte delivery, but looks `skip` bytes
;; past `pos` and leaves `pos` unchanged. The `evt` argument is ignored.
(lambda (s skip evt)
274+
(if ((+ pos skip) . >= . len)
275+
eof
276+
(begin
277+
(bytes-set! s 0 (bytes-ref bstr (+ pos skip)))
278+
1)))
279+
;; Close procedure: nothing to release, so `void` is supplied.
void))
280+
259281
(map (lambda (t)
260282
(if (pair? t)
261283
(begin
262284
(test (caddr t) regexp-match (byte-pregexp (car t)) (cadr t))
263285
(test (caddr t) regexp-match (byte-pregexp (car t)) (bytes-append #"xxxxxxxxxx" (cadr t)) 10)
264286
(test (caddr t) regexp-match (byte-pregexp (car t)) (bytes-append (cadr t) #"xxxxxxxxxx") 0 (bytes-length (cadr t)))
265287
(test (caddr t) regexp-match (byte-pregexp (car t)) (open-input-bytes (cadr t)))
288+
(test (caddr t) regexp-match (byte-pregexp (car t)) (make-reluctant-port (cadr t)))
266289
(test (and (caddr t)
267290
(map (lambda (v)
268291
(and v (bytes->string/latin-1 v)))

collects/tests/racket/string.rktl

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -289,4 +289,22 @@
289289
(test "x y" string-join '("x" "y") " ")
290290
(test "x" string-join '("x") " "))
291291

292+
;; String splitting can take longer than byte-string splitting,
293+
;; but it should have the same computational complexity.
294+
;; Times `regexp-split` on N zero-filled bytes and on an N-character
;; string, then checks that the string case is not more than 100x
;; slower than the bytes case once GC time is subtracted from each.
(let ()
295+
(define N 100000)
296+
;; time-apply yields the result list plus CPU, real, and GC
;; milliseconds; only the CPU and GC figures are used below.
(define-values (b bcpu breal bgc)
297+
(time-apply
298+
(lambda () (regexp-split #rx#"." (make-bytes N)))
299+
null))
300+
(define-values (s scpu sreal sgc)
301+
(time-apply
302+
(lambda () (regexp-split #rx"." (make-string N)))
303+
null))
304+
;; The `and` form produces #f when the string split is within the 100x
;; budget, and otherwise produces the message string, so the value
;; handed to `test` alongside the expected #f carries the explanation.
;; NOTE(review): if (- bcpu bgc) measures as 0 ms, any positive string
;; time exceeds the budget -- confirm the bytes baseline is reliably
;; nonzero on platforms with a coarse CPU timer.
(test #f
305+
'split
306+
(and ((* 100 (- bcpu bgc)) . < . (- scpu sgc))
307+
"suspiciously long time for regexp string split")))
308+
309+
292310
(report-errs)

src/racket/src/mzmark_regexp.inc

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ static int mark_regwork_MARK(void *p, struct NewGC *gc) {
4343
gcMARK2(r->counters, gc);
4444
gcMARK2(r->peekskip, gc);
4545
gcMARK2(r->prefix, gc);
46+
gcMARK2(r->lazy_string, gc);
4647
gcMARK2(r->rewind_stack, gc);
4748
return
4849
gcBYTES_TO_WORDS(sizeof(Regwork));
@@ -60,6 +61,7 @@ static int mark_regwork_FIXUP(void *p, struct NewGC *gc) {
6061
gcFIXUP2(r->counters, gc);
6162
gcFIXUP2(r->peekskip, gc);
6263
gcFIXUP2(r->prefix, gc);
64+
gcFIXUP2(r->lazy_string, gc);
6365
gcFIXUP2(r->rewind_stack, gc);
6466
return
6567
gcBYTES_TO_WORDS(sizeof(Regwork));
@@ -69,3 +71,28 @@ static int mark_regwork_FIXUP(void *p, struct NewGC *gc) {
6971
#define mark_regwork_IS_CONST_SIZE 1
7072

7173

74+
/* Size callback for rx_lazy_str_t records: returns sizeof(rx_lazy_str_t)
   converted with gcBYTES_TO_WORDS. Neither `p` nor `gc` is read here.
   NOTE(review): this file appears to be generated from the
   mark_lazy_string template in src/racket/src/mzmarksrc.c -- confirm
   before editing it by hand. */
static int mark_lazy_string_SIZE(void *p, struct NewGC *gc) {
75+
return
76+
gcBYTES_TO_WORDS(sizeof(rx_lazy_str_t));
77+
}
78+
79+
/* Mark callback for rx_lazy_str_t records: applies gcMARK2 to the `s`
   and `chars` fields of the record at `p`, then returns the record
   size in words (sizeof(rx_lazy_str_t) via gcBYTES_TO_WORDS).
   No other field of the record is passed to gcMARK2 here. */
static int mark_lazy_string_MARK(void *p, struct NewGC *gc) {
80+
rx_lazy_str_t *ls = (rx_lazy_str_t *)p;
81+
gcMARK2(ls->s, gc);
82+
gcMARK2(ls->chars, gc);
83+
return
84+
gcBYTES_TO_WORDS(sizeof(rx_lazy_str_t));
85+
}
86+
87+
/* Fixup callback for rx_lazy_str_t records: applies gcFIXUP2 to the
   same two fields that the MARK callback handles (`s` and `chars`),
   then returns the record size in words. The field list here must
   stay identical to the one in mark_lazy_string_MARK. */
static int mark_lazy_string_FIXUP(void *p, struct NewGC *gc) {
88+
rx_lazy_str_t *ls = (rx_lazy_str_t *)p;
89+
gcFIXUP2(ls->s, gc);
90+
gcFIXUP2(ls->chars, gc);
91+
return
92+
gcBYTES_TO_WORDS(sizeof(rx_lazy_str_t));
93+
}
94+
95+
#define mark_lazy_string_IS_ATOMIC 0
96+
#define mark_lazy_string_IS_CONST_SIZE 1
97+
98+

src/racket/src/mzmarksrc.c

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2177,11 +2177,21 @@ mark_regwork {
21772177
gcMARK2(r->counters, gc);
21782178
gcMARK2(r->peekskip, gc);
21792179
gcMARK2(r->prefix, gc);
2180+
gcMARK2(r->lazy_string, gc);
21802181
gcMARK2(r->rewind_stack, gc);
21812182
size:
21822183
gcBYTES_TO_WORDS(sizeof(Regwork));
21832184
}
21842185

2186+
mark_lazy_string {
2187+
mark:
2188+
rx_lazy_str_t *ls = (rx_lazy_str_t *)p;
2189+
gcMARK2(ls->s, gc);
2190+
gcMARK2(ls->chars, gc);
2191+
size:
2192+
gcBYTES_TO_WORDS(sizeof(rx_lazy_str_t));
2193+
}
2194+
21852195
END regexp;
21862196

21872197
/**********************************************************************/

0 commit comments

Comments
 (0)