Skip to content

Commit

Permalink
Merge pull request #66 from blackwhale/master
Browse files Browse the repository at this point in the history
std.regex fix .* optimization issues (issue 6076)
  • Loading branch information
andralex committed May 31, 2011
2 parents 06dd205 + c7be162 commit cbe8659
Show file tree
Hide file tree
Showing 2 changed files with 147 additions and 35 deletions.
9 changes: 8 additions & 1 deletion changelog.dd
Expand Up @@ -14,19 +14,26 @@ $(VERSION 053, ddd mm, 2011, =================================================,
$(LI Added unsigned to std.traits)
)
$(LIBBUGSFIXED
$(LI $(BUGZILLA 4367): std.regex: Captures is not a random access range)
$(LI $(BUGZILLA 4574): std.regex: breaks with empty string regex)
$(LI $(BUGZILLA 4644): assertExceptionThrown to assert that a particular exception was thrown)
$(LI $(BUGZILLA 4944): Missing tzname even though we have tzset)
$(LI $(BUGZILLA 5019): In std.regex, empty capture at end of string causes error)
$(LI $(BUGZILLA 5169): Add(?:) Non-capturing parentheses group support to std.regex)
$(LI $(BUGZILLA 5451): Three ideas for RedBlackTree)
$(LI $(BUGZILLA 5474): unaryFun byRef is borked for custom parameter name)
$(LI $(BUGZILLA 5485): TLS sections handled incorrectly in FreeBSD)
$(LI $(BUGZILLA 5511): std.regex optional capture with no-match cause error)
$(LI $(BUGZILLA 5616): std.datetime: not cross-platform)
$(LI $(BUGZILLA 5654): BigInt returns ZERO with strings of single digit number with leading zeros)
$(LI $(BUGZILLA 5661): std.algorithm.move does not work on elaborate struct)
$(LI $(BUGZILLA 5661): std.algorithm.move does not work on elaborate struct)
$(LI $(BUGZILLA 5731): std.datetime.SysTime prints UTC offsets backwards)
$(LI $(BUGZILLA 5761): std.datetime: Date.this(int day) conversion fails for Dec 30 of leap years)
$(LI $(BUGZILLA 5780): [patch] std.traits.hasIndirections incorrectly handles static arrays)
$(LI $(BUGZILLA 5781): std.datetime: On Windows, times off by one hour in some years due to DST rule changes)
$(LI $(BUGZILLA 5794): std.datetime StopWatch (and perhaps benchmark) examples need a small fix)
$(LI $(BUGZILLA 5857): std.regex (...){n,m} is bogus when (...) contains repetitions)
$(LI $(BUGZILLA 6076): std.regex: "c.*|d" matches "mm")
)

)
173 changes: 139 additions & 34 deletions std/regex.d
Expand Up @@ -345,48 +345,149 @@ Returns the number of parenthesized captures
debug(std_regex) writefln("error: %s", msg);
throw new Exception(msg);
}
// Adjust jump offsets after instructions have been removed at 'place'.
//
// Walks the compiled program one instruction at a time, starting at offset 0
// and stopping at REend (or when the end of prog is reached). For every
// instruction that carries a uint offset operand, the operand is patched by
// 'change' when the instruction lies on the relevant side of 'place' and
// pc + operand lands past 'place'.
//
// Params:
//   prog   = program bytes, patched in place
//   place  = position in prog at which bytes were removed
//   change = number of bytes that were removed
void fixup(ubyte[] prog, size_t place, uint change)
{
    for (size_t pc = 0; pc < prog.length;)
    {
        switch (prog[pc])
        {
        case REend:
            return;

        case REcounter: //jump forward
            // Only instructions located before the removal point are patched.
            if (pc < place)
            {
                // The offset operand is the second uint after the opcode.
                auto dest = cast(uint *)&prog[pc + 1 + uint.sizeof];
                if (pc + *dest > place)
                    *dest -= change;
            }
            pc += 1 + 2*uint.sizeof;
            break;

        case REloop, REloopg: //jump back
            // Only instructions located after the removal point are patched.
            if (pc > place)
            {
                // The offset operand is the third uint after the opcode.
                auto dest = cast(uint *)&prog[pc + 1 + 2*uint.sizeof];
                if (pc + *dest > place)
                    *dest += change;
            }
            pc += 1 + 3*uint.sizeof;
            break;

        case REneglookahead: //jump or call forward
        case RElookahead:
        case REor:
        case REgoto:
            if (pc < place)
            {
                // The offset operand immediately follows the opcode.
                auto dest = cast(uint *)&prog[pc + 1];
                if (pc + *dest > place)
                    *dest -= change;
            }
            pc += 1 + uint.sizeof;
            break;

        // Single-byte instructions: nothing to patch, just step over.
        case REret:
        case REanychar:
        case REanystarg:
        case REanystar:
        case REbol:
        case REeol:
        case REwordboundary:
        case REnotwordboundary:
        case REdigit:
        case REnotdigit:
        case REspace:
        case REnotspace:
        case REword:
        case REnotword:
            pc++;
            break;

        // Opcode plus one operand byte.
        case REchar:
        case REichar:
        case REbackref:
            pc += 2;
            break;

        // Opcode plus one dchar operand.
        case REdchar:
        case REidchar:
            pc += 1 + dchar.sizeof;
            break;

        // Opcode, size_t byte length, then that many bytes of payload.
        case REstring:
        case REistring:
            auto len = *cast(size_t *)&prog[pc + 1];
            assert(len % E.sizeof == 0);
            pc += 1 + size_t.sizeof + len;
            break;

        // Opcode, two ushorts (the second is the payload length), then payload.
        case REtestbit:
        case REbit:
        case REnotbit:
            auto pu = cast(ushort *)&prog[pc + 1];
            auto len = pu[1];
            pc += 1 + 2 * ushort.sizeof + len;
            break;

        // Opcode, uint payload length, then payload.
        case RErange:
        case REnotrange:
            auto len = *cast(uint *)&prog[pc + 1];
            pc += 1 + uint.sizeof + len;
            break;

        case REsave:
            pc += 1 + uint.sizeof;
            break;

        default:
            // Unknown opcode: report its numeric value before failing.
            // writefln (not writeln) is required for "%d" to be treated as
            // a format specifier.
            writefln("%d", prog[pc]);
            assert(0);
        }
    }
}
//Fixup counter numbers, simplify instructions
private void postprocess(ubyte[] prog)
{
uint counter = 0;
size_t len;
ushort* pu;
nCounters = 0;

size_t pc = 0;
for (;;)
{
switch (prog.front)
switch (prog[pc])
{
case REend:
return;

case REcounter:
size_t offs = 1 + 2*uint.sizeof;
size_t offs = pc + 1 + 2*uint.sizeof;
bool anyloop = counter == 0 && prog[offs] == REanychar
&& (prog[offs+1] == REloop || prog[offs+1] == REloopg);
uint* puint = cast(uint*)&prog[offs+2];
if (anyloop && puint[0] == 0 && puint[1] == inf)
{
prog[0] = prog[offs+1] == REloop ? REanystar : REanystarg;
std.array.replaceInPlace(
prog, 1,
2*(1 + uint.sizeof) + 1 + 3*uint.sizeof,
cast(ubyte[])[]);
prog.popFront();
prog[pc] = prog[offs+1] == REloop ? REanystar : REanystarg;
uint change = 2*(1 + uint.sizeof) + 1 + 3*uint.sizeof - 1;
std.array.replaceInPlace(prog, pc + 1,
pc + change + 1, cast(ubyte[])[]);
fixup(prog, pc, change);
pc++;
}
else
{
*cast(uint*)&prog[1] = counter;
*cast(uint *)&prog[pc+1] = counter;
counter++;
nCounters = max(nCounters, counter);
prog.popFrontN(1 + 2*uint.sizeof);
pc += 1 + 2*uint.sizeof;
}
break;

case REloop, REloopg:
counter--;
prog.popFrontN(1 + 3*uint.sizeof);
pc += 1 + 3*uint.sizeof;
break;

case REret:
Expand All @@ -401,57 +502,52 @@ Returns the number of parenthesized captures
case REnotspace:
case REword:
case REnotword:
prog.popFront();
pc++;
break;

case REbackref:
prog.popFrontN(2);
break;

case REchar:
case REichar:
prog.popFrontN(2);
pc += 2;
break;

case REdchar:
case REidchar:
prog.popFrontN(1+dchar.sizeof);
pc += 1 + dchar.sizeof;
break;

case REstring:
case REistring:
len = *cast(size_t *)&prog[1];
len = *cast(size_t *)&prog[pc+1];
assert(len % E.sizeof == 0);
prog.popFrontN(1 + size_t.sizeof + len);
pc += 1 + size_t.sizeof + len;
break;

case REtestbit:
case REbit:
case REnotbit:
pu = cast(ushort *)&prog[1];
pu = cast(ushort *)&prog[pc+1];
len = pu[1];
prog.popFrontN(1 + 2 * ushort.sizeof + len);
pc += 1 + 2 * ushort.sizeof + len;
break;

case RErange:
case REnotrange:
len = *cast(uint *)&prog[1];
prog.popFrontN(1 + uint.sizeof + len);
len = *cast(uint *)&prog[pc+1];
pc += 1 + uint.sizeof + len;
break;


case REneglookahead:
case RElookahead:
case REor:
case REgoto:
prog.popFrontN(1 + uint.sizeof);
pc += 1 + uint.sizeof;
break;

case REsave:
prog.popFrontN(1 + uint.sizeof);
pc += 1 + uint.sizeof;
break;

case REneglookahead:
case RElookahead:
prog.popFrontN(1 + uint.sizeof);
break;
default:
assert(0);
}
Expand Down Expand Up @@ -1459,7 +1555,7 @@ struct RegexMatch(Range = string)
// Engine
alias .Regex!(Unqual!E) Regex;
private alias Regex.regmatch_t regmatch_t;
enum stackSize = 640*1024;
enum stackSize = 32*1024;
/**
Get or set the engine of the match.
*/
Expand Down Expand Up @@ -1965,7 +2061,7 @@ Returns $(D hit) (converted to $(D string) if necessary).
auto stateSize = (counters.empty ? 0 : (curCounter+1)*uint.sizeof)
+ matchesToSave*regmatch_t.sizeof;
if (memory.length < lastState + stateSize + StateTail.sizeof)
memory.length += memory.length/2; //reallocates on heap
memory.length += memory.length; //reallocates on heap
auto matchPtr = cast(regmatch_t*)&memory[lastState];
matchPtr[0..matchesToSave] = pmatch[1..matchesToSave+1];
if (!counters.empty)
Expand Down Expand Up @@ -2240,7 +2336,7 @@ Returns $(D hit) (converted to $(D string) if necessary).
src = input.length;
else
{
auto p = memchr(&input[src],'\n', input.length-src);
auto p = memchr(input.ptr+src,'\n', input.length-src);
src = p ? p - &input[src] : input.length;
}
while (src > ss)
Expand Down Expand Up @@ -3489,3 +3585,12 @@ unittest
assert(pres == array(retro(heads)));
assert(posts == tails);
}

//issue 6076
//regression on .*: "c.*|d" must not match input containing neither 'c' nor 'd'
unittest
{
    auto noHit = match("mm", regex("c.*|d"));
    assert(noHit.empty);
}

0 comments on commit cbe8659

Please sign in to comment.