Skip to content

Commit 42c640a

Browse files
hollmmax (Max Hollmann)
authored and committed
std.unicode: DFA-based UTF8 validate
1 parent 9a3540d commit 42c640a

File tree

1 file changed

+136
-91
lines changed

1 file changed

+136
-91
lines changed

lib/std/unicode.zig

Lines changed: 136 additions & 91 deletions
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,128 @@ pub fn utf8ValidateSlice(input: []const u8) bool {
231231
}
232232

233233
fn utf8ValidateSliceImpl(input: []const u8, comptime surrogates: Surrogates) bool {
234+
// Table-driven DFA that validates UTF-8 while walking the input backwards
// (last byte first): trailing continuation bytes are consumed before the lead
// byte that owns them, and the lead byte is then checked against what was seen.
//
// Every `State` value doubles as a bit offset. `shift_table[byte]` packs, for
// each state `s`, the successor `step(byte, s)` at bit offset
// `@intFromEnum(s)`. A single transition is therefore
// `shift_table[byte] >> current_state`, with the successor state in the low
// 5 bits of the result.
const DFA = struct {
    // Partition of the 256 byte values into the groups `step` distinguishes.
    const ByteClass = enum {
        // base ASCII (0x00...0x7f)
        ascii,
        // continuation bytes, split into the sub-ranges `step` tells apart
        cont1, // 0x80...0x8f
        cont2, // 0x90...0x9f
        cont3, // 0xa0...0xbf
        // starting bytes of 2-byte codepoint
        two1, // 0xc0...0xc1, no transition in `step` accepts it
        two2, // 0xc2...0xdf
        // starting bytes of 3-byte codepoint
        three1, // 0xe0
        three2, // 0xe1...0xec and 0xee...0xef
        three3, // 0xed
        // starting bytes of 4-byte codepoint
        four1, // 0xf0
        four2, // 0xf1...0xf3
        four3, // 0xf4
        four4, // 0xf5...0xff, no transition in `step` accepts it
    };

    // Maps a raw byte to its `ByteClass`.
    pub fn byte_class(byte: u8) ByteClass {
        return switch (byte) {
            0x00...0x7f => .ascii,
            0x80...0x8f => .cont1,
            0x90...0x9f => .cont2,
            0xa0...0xbf => .cont3,
            0xc0...0xc1 => .two1,
            0xc2...0xdf => .two2,
            0xe0 => .three1,
            0xe1...0xec, 0xee...0xef => .three2,
            0xed => .three3,
            0xf0 => .four1,
            0xf1...0xf3 => .four2,
            0xf4 => .four3,
            0xf5...0xff => .four4,
        };
    }

    // DFA states. The integer value of each state is the bit offset of its
    // 5-bit slot inside a `shift_table` entry, so the values must not be
    // changed independently of one another. Some slots overlap (for example
    // offsets 0 and 3); the comptime assertion in `shift_table` verifies
    // that every slot still reads back the intended successor.
    const State = enum(u5) {
        // Not inside a multi-byte sequence.
        ok = 8,
        // One continuation byte consumed.
        one = 13,
        // Two continuation bytes consumed; the one nearer the lead byte was
        // in 0x80...0x9f.
        two1 = 23,
        // Two continuation bytes consumed; the one nearer the lead byte was
        // in 0xa0...0xbf.
        two2 = 18,
        // Three continuation bytes consumed; the one nearest the lead byte
        // was in 0x80...0x8f.
        three1 = 3,
        // Three continuation bytes consumed; the one nearest the lead byte
        // was in 0x90...0xbf.
        three2 = 28,
        // Invalid input seen. Offset 0 makes this state sticky: shifting an
        // entry by 0 yields the slot for `.fail`, which always holds `.fail`.
        fail = 0,
    };

    const start: State = .ok;
    // A slice constant must point at const data; the previous
    // `[]State = .{.ok}` only compiled because it was never referenced.
    const accept: []const State = &.{.ok};
    const fail: ?State = .fail;

    // Reference transition function: the state reached from `state` after
    // consuming `byte`. Only evaluated while building `shift_table`.
    fn step(byte: u8, state: State) State {
        const class = byte_class(byte);
        return switch (state) {
            .ok => switch (class) {
                .ascii => .ok,
                .cont1, .cont2, .cont3 => .one,
                else => .fail,
            },
            .one => switch (class) {
                .two2 => .ok,
                .cont1, .cont2 => .two1,
                .cont3 => .two2,
                else => .fail,
            },
            .two1 => switch (class) {
                .three2, .three3 => .ok,
                .cont1 => .three1,
                .cont2, .cont3 => .three2,
                else => .fail,
            },
            .two2 => switch (class) {
                .three1, .three2 => .ok,
                // 0xed followed by a byte in 0xa0...0xbf is accepted only
                // when the caller allows encoded surrogate halves.
                .three3 => switch (surrogates) {
                    .cannot_encode_surrogate_half => .fail,
                    .can_encode_surrogate_half => .ok,
                },
                .cont1 => .three1,
                .cont2, .cont3 => .three2,
                else => .fail,
            },
            .three1 => switch (class) {
                .four2, .four3 => .ok,
                else => .fail,
            },
            .three2 => switch (class) {
                .four1, .four2 => .ok,
                else => .fail,
            },
            .fail => .fail,
        };
    }

    // One packed u32 per byte value, built at comptime from `step`.
    const shift_table = blk: {
        @setEvalBranchQuota(30000);
        var table = [_]u32{0} ** 256;
        for (&table, 0..) |*entry, byte| {
            for (std.enums.values(State)) |state| {
                entry.* |= @truncate(@as(u32, @intFromEnum(step(byte, state))) << @intFromEnum(state));
            }
            // Make sure the states didn't overlap and destroy themselves
            for (std.enums.values(State)) |state| {
                std.debug.assert(@as(u5, @truncate(entry.* >> @intFromEnum(state))) == @intFromEnum(step(byte, state)));
            }
        }
        break :blk table;
    };
};
355+
234356
var remaining = input;
235357

236358
if (std.simd.suggestVectorLength(u8)) |chunk_len| {
@@ -248,101 +370,24 @@ fn utf8ValidateSliceImpl(input: []const u8, comptime surrogates: Surrogates) boo
248370
}
249371
}
250372

251-
// default lowest and highest continuation byte
252-
const lo_cb = 0b10000000;
253-
const hi_cb = 0b10111111;
254-
255-
const min_non_ascii_codepoint = 0x80;
256-
257-
// The first nibble is used to identify the continuation byte range to
258-
// accept. The second nibble is the size.
259-
const xx = 0xF1; // invalid: size 1
260-
const as = 0xF0; // ASCII: size 1
261-
const s1 = 0x02; // accept 0, size 2
262-
const s2 = switch (surrogates) {
263-
.cannot_encode_surrogate_half => 0x13, // accept 1, size 3
264-
.can_encode_surrogate_half => 0x03, // accept 0, size 3
265-
};
266-
const s3 = 0x03; // accept 0, size 3
267-
const s4 = switch (surrogates) {
268-
.cannot_encode_surrogate_half => 0x23, // accept 2, size 3
269-
.can_encode_surrogate_half => 0x03, // accept 0, size 3
270-
};
271-
const s5 = 0x34; // accept 3, size 4
272-
const s6 = 0x04; // accept 0, size 4
273-
const s7 = 0x44; // accept 4, size 4
274-
275-
// Information about the first byte in a UTF-8 sequence.
276-
const first = comptime ([_]u8{as} ** 128) ++ ([_]u8{xx} ** 64) ++ [_]u8{
277-
xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1,
278-
s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1,
279-
s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3,
280-
s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx,
281-
};
282-
283-
const n = remaining.len;
284-
var i: usize = 0;
285-
while (i < n) {
286-
const first_byte = remaining[i];
287-
if (first_byte < min_non_ascii_codepoint) {
288-
i += 1;
289-
continue;
290-
}
291-
292-
const info = first[first_byte];
293-
if (info == xx) {
294-
return false; // Illegal starter byte.
295-
}
296-
297-
const size = info & 7;
298-
if (i + size > n) {
299-
return false; // Short or invalid.
300-
}
301-
302-
// Figure out the acceptable low and high continuation bytes, starting
303-
// with our defaults.
304-
var accept_lo: u8 = lo_cb;
305-
var accept_hi: u8 = hi_cb;
306-
307-
switch (info >> 4) {
308-
0 => {},
309-
1 => accept_lo = 0xA0,
310-
2 => accept_hi = 0x9F,
311-
3 => accept_lo = 0x90,
312-
4 => accept_hi = 0x8F,
313-
else => unreachable,
373+
var state: u32 = @intFromEnum(DFA.State.ok);
374+
// Manually unrolled to insert early return.
375+
const UNROLL = 8;
376+
while (remaining.len > UNROLL) {
377+
for (0..UNROLL) |i| {
378+
const byte = remaining[remaining.len - 1 - i];
379+
state = DFA.shift_table[byte] >> @truncate(state);
314380
}
315-
316-
const c1 = remaining[i + 1];
317-
if (c1 < accept_lo or accept_hi < c1) {
381+
remaining = remaining[0 .. remaining.len - UNROLL];
382+
if (@as(u5, @truncate(state)) == @intFromEnum(DFA.State.fail)) {
318383
return false;
319384
}
320-
321-
switch (size) {
322-
2 => i += 2,
323-
3 => {
324-
const c2 = remaining[i + 2];
325-
if (c2 < lo_cb or hi_cb < c2) {
326-
return false;
327-
}
328-
i += 3;
329-
},
330-
4 => {
331-
const c2 = remaining[i + 2];
332-
if (c2 < lo_cb or hi_cb < c2) {
333-
return false;
334-
}
335-
const c3 = remaining[i + 3];
336-
if (c3 < lo_cb or hi_cb < c3) {
337-
return false;
338-
}
339-
i += 4;
340-
},
341-
else => unreachable,
342-
}
343385
}
344-
345-
return true;
386+
for (0..remaining.len) |i| {
387+
const byte = remaining[remaining.len - 1 - i];
388+
state = DFA.shift_table[byte] >> @truncate(state);
389+
}
390+
return @as(u5, @truncate(state)) == @intFromEnum(DFA.State.ok);
346391
}
347392

348393
/// Utf8View iterates the code points of a utf-8 encoded string.

0 commit comments

Comments
 (0)