Skip to content

Commit e0d3885

Browse files
committed
std.unicode: DFA-based UTF8 validate
1 parent 9a3540d commit e0d3885

File tree

1 file changed

+161
-92
lines changed

1 file changed

+161
-92
lines changed

lib/std/unicode.zig

Lines changed: 161 additions & 92 deletions
Original file line numberDiff line numberDiff line change
@@ -231,11 +231,157 @@ pub fn utf8ValidateSlice(input: []const u8) bool {
231231
}
232232

233233
fn utf8ValidateSliceImpl(input: []const u8, comptime surrogates: Surrogates) bool {
234+
// Table-driven DFA that validates UTF-8 one byte at a time.
//
// The transitions in `step` consume a sequence back to front: from `.ok`
// only ASCII and continuation bytes are accepted, and a lead byte is only
// accepted after the matching number of continuation bytes has been
// consumed. Input therefore has to be fed in reverse byte order.
//
// Every state is identified by a bit offset into a `u32` (see
// `offset_from_state`). `shift_table[byte]` stores, for each state, the
// offset of its successor at the bit position of the state itself, so one
// transition is `shift_table[byte] >> current_offset`, which leaves the
// successor offset in the low 5 bits of the result.
const DFA = struct {
    /// Partition of the 256 byte values into the classes `step` distinguishes.
    const ByteClass = enum {
        // base ASCII: 0x00...0x7f
        ascii,
        // continuation bytes: 0x80...0x8f, 0x90...0x9f, 0xa0...0xbf
        cont1,
        cont2,
        cont3,
        // starting bytes of 2-byte codepoint
        // two1 (0xc0...0xc1) is never accepted by `step`; two2 is 0xc2...0xdf
        two1,
        two2,
        // starting bytes of 3-byte codepoint
        // three1 is 0xe0, three3 is 0xed, three2 is every other 0xe_ byte
        three1,
        three2,
        three3,
        // starting bytes of 4-byte codepoint
        // four1 is 0xf0, four2 is 0xf1...0xf3, four3 is 0xf4,
        // four4 (0xf5...0xff) is never accepted by `step`
        four1,
        four2,
        four3,
        four4,
    };

    /// Maps a raw byte to its `ByteClass`.
    pub fn byte_class(byte: u8) ByteClass {
        return switch (byte) {
            0x00...0x7f => .ascii,
            0x80...0x8f => .cont1,
            0x90...0x9f => .cont2,
            0xa0...0xbf => .cont3,
            0xc0...0xc1 => .two1,
            0xc2...0xdf => .two2,
            0xe0 => .three1,
            0xe1...0xec, 0xee...0xef => .three2,
            0xed => .three3,
            0xf0 => .four1,
            0xf1...0xf3 => .four2,
            0xf4 => .four3,
            0xf5...0xff => .four4,
        };
    }

    /// DFA states. Because bytes are consumed in reverse, a state records how
    /// many continuation bytes have been consumed since the last complete
    /// sequence and the class of the most recently consumed one.
    const State = enum {
        // at a sequence boundary
        ok,
        // one continuation byte consumed
        one,
        // two continuation bytes consumed; the latest was cont1 or cont2
        two1,
        // two continuation bytes consumed; the latest was cont3
        two2,
        // three continuation bytes consumed; the latest was cont1
        three1,
        // three continuation bytes consumed; the latest was cont2 or cont3
        three2,
        // invalid input; absorbing
        fail,
    };

    /// Returns the bit offset that identifies `state` inside a packed
    /// `shift_table` entry. Inverse of `state_from_offset`.
    fn offset_from_state(state: State) u5 {
        return switch (state) {
            .ok => 8,
            .one => 13,
            .two1 => 23,
            .two2 => 18,
            .three1 => 3,
            .three2 => 28,
            .fail => 0,
        };
    }

    /// Returns the state identified by `offset`. Inverse of
    /// `offset_from_state`; any other offset is unreachable.
    fn state_from_offset(offset: u5) State {
        return switch (offset) {
            8 => .ok,
            13 => .one,
            23 => .two1,
            18 => .two2,
            3 => .three1,
            28 => .three2,
            0 => .fail,
            else => unreachable,
        };
    }

    /// State before any byte has been consumed.
    const start: State = .ok;
    /// States in which the consumed input is accepted.
    /// A tuple literal cannot initialise a mutable slice, so this is a
    /// const slice over a pointer to the literal.
    const accept: []const State = &.{.ok};
    /// Absorbing failure state.
    const fail: ?State = .fail;

    /// Computes the successor of `state` after consuming `byte`.
    /// Any combination not listed explicitly leads to `.fail`.
    fn step(byte: u8, state: State) State {
        const class = byte_class(byte);
        return switch (state) {
            .ok => switch (class) {
                .ascii => .ok,
                .cont1, .cont2, .cont3 => .one,
                else => .fail,
            },
            .one => switch (class) {
                .two2 => .ok,
                .cont1, .cont2 => .two1,
                .cont3 => .two2,
                else => .fail,
            },
            .two1 => switch (class) {
                .three2, .three3 => .ok,
                .cont1 => .three1,
                .cont2, .cont3 => .three2,
                else => .fail,
            },
            .two2 => switch (class) {
                .three1, .three2 => .ok,
                // 0xed followed by a cont3 byte is only accepted when the
                // caller allows surrogate halves.
                .three3 => switch (surrogates) {
                    .cannot_encode_surrogate_half => .fail,
                    .can_encode_surrogate_half => .ok,
                },
                .cont1 => .three1,
                .cont2, .cont3 => .three2,
                else => .fail,
            },
            .three1 => switch (class) {
                .four2, .four3 => .ok,
                else => .fail,
            },
            .three2 => switch (class) {
                .four1, .four2 => .ok,
                else => .fail,
            },
            .fail => .fail,
        };
    }

    /// One packed `u32` per byte value, built at comptime: for every state
    /// `s`, the successor offset is OR-ed in at bit `offset_from_state(s)`.
    const shift_table = blk: {
        @setEvalBranchQuota(30000);
        var t = [_]u32{0} ** 256;
        for (&t, 0..) |*r, c| {
            for (std.enums.values(State)) |s| {
                r.* |= @truncate(@as(u32, offset_from_state(step(c, s))) << offset_from_state(s));
            }
            // The offsets are not all 5 bits apart, so packed fields can
            // share bits. Make sure the states didn't overlap and destroy
            // themselves: every field must still read back unchanged.
            for (std.enums.values(State)) |s| {
                std.debug.assert(@as(u5, @truncate(r.* >> offset_from_state(s))) == offset_from_state(step(c, s)));
            }
        }
        break :blk t;
    };
};
380+
234381
var remaining = input;
235382

236383
if (std.simd.suggestVectorLength(u8)) |chunk_len| {
237384
const Chunk = @Vector(chunk_len, u8);
238-
239385
// Fast path. Check for and skip ASCII characters at the start of the input.
240386
while (remaining.len >= chunk_len) {
241387
const chunk: Chunk = remaining[0..chunk_len].*;
@@ -248,101 +394,24 @@ fn utf8ValidateSliceImpl(input: []const u8, comptime surrogates: Surrogates) boo
248394
}
249395
}
250396

251-
// default lowest and highest continuation byte
252-
const lo_cb = 0b10000000;
253-
const hi_cb = 0b10111111;
254-
255-
const min_non_ascii_codepoint = 0x80;
256-
257-
// The first nibble is used to identify the continuation byte range to
258-
// accept. The second nibble is the size.
259-
const xx = 0xF1; // invalid: size 1
260-
const as = 0xF0; // ASCII: size 1
261-
const s1 = 0x02; // accept 0, size 2
262-
const s2 = switch (surrogates) {
263-
.cannot_encode_surrogate_half => 0x13, // accept 1, size 3
264-
.can_encode_surrogate_half => 0x03, // accept 0, size 3
265-
};
266-
const s3 = 0x03; // accept 0, size 3
267-
const s4 = switch (surrogates) {
268-
.cannot_encode_surrogate_half => 0x23, // accept 2, size 3
269-
.can_encode_surrogate_half => 0x03, // accept 0, size 3
270-
};
271-
const s5 = 0x34; // accept 3, size 4
272-
const s6 = 0x04; // accept 0, size 4
273-
const s7 = 0x44; // accept 4, size 4
274-
275-
// Information about the first byte in a UTF-8 sequence.
276-
const first = comptime ([_]u8{as} ** 128) ++ ([_]u8{xx} ** 64) ++ [_]u8{
277-
xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1,
278-
s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1,
279-
s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3,
280-
s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx,
281-
};
282-
283-
const n = remaining.len;
284-
var i: usize = 0;
285-
while (i < n) {
286-
const first_byte = remaining[i];
287-
if (first_byte < min_non_ascii_codepoint) {
288-
i += 1;
289-
continue;
290-
}
291-
292-
const info = first[first_byte];
293-
if (info == xx) {
294-
return false; // Illegal starter byte.
295-
}
296-
297-
const size = info & 7;
298-
if (i + size > n) {
299-
return false; // Short or invalid.
397+
var state: u32 = DFA.offset_from_state(DFA.State.ok);
398+
// Manually unrolled to insert early return.
399+
const UNROLL = 8;
400+
while (remaining.len > UNROLL) {
401+
for (0..UNROLL) |i| {
402+
const byte = remaining[remaining.len - 1 - i];
403+
state = DFA.shift_table[byte] >> @truncate(state);
300404
}
301-
302-
// Figure out the acceptable low and high continuation bytes, starting
303-
// with our defaults.
304-
var accept_lo: u8 = lo_cb;
305-
var accept_hi: u8 = hi_cb;
306-
307-
switch (info >> 4) {
308-
0 => {},
309-
1 => accept_lo = 0xA0,
310-
2 => accept_hi = 0x9F,
311-
3 => accept_lo = 0x90,
312-
4 => accept_hi = 0x8F,
313-
else => unreachable,
314-
}
315-
316-
const c1 = remaining[i + 1];
317-
if (c1 < accept_lo or accept_hi < c1) {
405+
remaining = remaining[0..remaining.len - UNROLL];
406+
if (@as(u5, @truncate(state)) == DFA.offset_from_state(DFA.State.fail)) {
318407
return false;
319408
}
320-
321-
switch (size) {
322-
2 => i += 2,
323-
3 => {
324-
const c2 = remaining[i + 2];
325-
if (c2 < lo_cb or hi_cb < c2) {
326-
return false;
327-
}
328-
i += 3;
329-
},
330-
4 => {
331-
const c2 = remaining[i + 2];
332-
if (c2 < lo_cb or hi_cb < c2) {
333-
return false;
334-
}
335-
const c3 = remaining[i + 3];
336-
if (c3 < lo_cb or hi_cb < c3) {
337-
return false;
338-
}
339-
i += 4;
340-
},
341-
else => unreachable,
342-
}
343409
}
344-
345-
return true;
410+
for (0..remaining.len) |i| {
411+
const byte = remaining[remaining.len - 1 - i];
412+
state = DFA.shift_table[byte] >> @truncate(state);
413+
}
414+
return @as(u5, @truncate(state)) == DFA.offset_from_state(DFA.State.ok);
346415
}
347416

348417
/// Utf8View iterates the code points of a utf-8 encoded string.

0 commit comments

Comments
 (0)