@@ -231,6 +231,128 @@ pub fn utf8ValidateSlice(input: []const u8) bool {
231
231
}
232
232
233
233
fn utf8ValidateSliceImpl (input : []const u8 , comptime surrogates : Surrogates ) bool {
234
/// UTF-8 validator expressed as a shift-based DFA that consumes the input
/// BACKWARDS (last byte first), so continuation bytes are seen before the
/// lead byte that introduces them.
///
/// Every `State` tag doubles as a bit offset into a 32-bit `shift_table`
/// row: the 5-bit field found at that offset is the successor state. One
/// transition is therefore `shift_table[byte] >> state`, keeping only the
/// low 5 bits of the result.
const DFA = struct {
    /// Partition of all 256 byte values into the classes the automaton
    /// distinguishes.
    const ByteClass = enum {
        // base ASCII (0x00...0x7f)
        ascii,
        // continuation bytes: 0x80...0x8f, 0x90...0x9f, 0xa0...0xbf
        cont1,
        cont2,
        cont3,
        // starting bytes of 2-byte codepoint: 0xc0...0xc1 (never accepted), 0xc2...0xdf
        two1,
        two2,
        // starting bytes of 3-byte codepoint: 0xe0, 0xe1...0xec + 0xee...0xef, 0xed
        three1,
        three2,
        three3,
        // starting bytes of 4-byte codepoint: 0xf0, 0xf1...0xf3, 0xf4, 0xf5...0xff (never accepted)
        four1,
        four2,
        four3,
        four4,
    };

    /// Maps a byte to its `ByteClass`. The switch is exhaustive over `u8`.
    fn byteClass(byte: u8) ByteClass {
        return switch (byte) {
            0x00...0x7f => .ascii,
            0x80...0x8f => .cont1,
            0x90...0x9f => .cont2,
            0xa0...0xbf => .cont3,
            0xc0...0xc1 => .two1,
            0xc2...0xdf => .two2,
            0xe0 => .three1,
            0xe1...0xec, 0xee...0xef => .three2,
            0xed => .three3,
            0xf0 => .four1,
            0xf1...0xf3 => .four2,
            0xf4 => .four3,
            0xf5...0xff => .four4,
        };
    }

    /// Automaton states. The tag value is the bit offset of this state's
    /// 5-bit successor field inside a `shift_table` row.
    ///
    /// The fields at offsets 0 and 3 share bits 3..4, and the field at
    /// offset 28 would need a (non-existent) bit 32. That is harmless only
    /// because the successors of `fail`, `three1` and `three2` are always
    /// `fail` (0) or `ok` (8), which leave those bits clear. The comptime
    /// check in `shift_table` enforces this for every byte.
    const State = enum(u5) {
        /// Between code points; both the start and the accepting state.
        ok = 8,
        /// One continuation byte consumed, lead byte still expected.
        one = 13,
        /// Two continuation bytes consumed; the earlier one is 0x80...0x9f.
        two1 = 23,
        /// Two continuation bytes consumed; the earlier one is 0xa0...0xbf.
        two2 = 18,
        /// Three continuation bytes consumed; the earliest is 0x80...0x8f.
        three1 = 3,
        /// Three continuation bytes consumed; the earliest is 0x90...0xbf.
        three2 = 28,
        /// Absorbing error state.
        fail = 0,
    };

    const start: State = .ok;
    // Pointer to an array literal so that the declaration is well-typed;
    // a bare tuple literal does not coerce to a slice.
    const accept: []const State = &.{.ok};
    const fail: ?State = .fail;

    /// Returns the state reached from `state` after consuming `byte`,
    /// where `byte` PRECEDES everything consumed so far in the input.
    fn step(byte: u8, state: State) State {
        const class = byteClass(byte);
        return switch (state) {
            .ok => switch (class) {
                .ascii => .ok,
                .cont1, .cont2, .cont3 => .one,
                // A lead byte with no continuation bytes after it.
                else => .fail,
            },
            .one => switch (class) {
                .two2 => .ok,
                .cont1, .cont2 => .two1,
                .cont3 => .two2,
                else => .fail,
            },
            // Byte following the lead is 0x80...0x9f: 0xe0 is rejected here.
            .two1 => switch (class) {
                .three2, .three3 => .ok,
                .cont1 => .three1,
                .cont2, .cont3 => .three2,
                else => .fail,
            },
            // Byte following the lead is 0xa0...0xbf: 0xed is accepted only
            // when surrogate halves may be encoded.
            .two2 => switch (class) {
                .three1, .three2 => .ok,
                .three3 => switch (surrogates) {
                    .cannot_encode_surrogate_half => .fail,
                    .can_encode_surrogate_half => .ok,
                },
                .cont1 => .three1,
                .cont2, .cont3 => .three2,
                else => .fail,
            },
            // Byte following the lead is 0x80...0x8f: 0xf0 is rejected.
            .three1 => switch (class) {
                .four2, .four3 => .ok,
                else => .fail,
            },
            // Byte following the lead is 0x90...0xbf: 0xf4 is rejected.
            .three2 => switch (class) {
                .four1, .four2 => .ok,
                else => .fail,
            },
            .fail => .fail,
        };
    }

    /// One packed row per byte value: for each state `s`, the 5 bits
    /// starting at bit `@intFromEnum(s)` hold `step(byte, s)`.
    const shift_table = blk: {
        @setEvalBranchQuota(30000);
        var table = [_]u32{0} ** 256;
        for (&table, 0..) |*row, index| {
            const byte: u8 = @intCast(index);
            for (std.enums.values(State)) |from| {
                // `<<` on a u32 discards bits shifted past bit 31, so no
                // extra truncation is needed.
                row.* |= @as(u32, @intFromEnum(step(byte, from))) << @intFromEnum(from);
            }
            // Make sure the overlapping fields did not corrupt each other.
            for (std.enums.values(State)) |from| {
                std.debug.assert(@as(u5, @truncate(row.* >> @intFromEnum(from))) == @intFromEnum(step(byte, from)));
            }
        }
        break :blk table;
    };
};
355
+
234
356
var remaining = input ;
235
357
236
358
if (std .simd .suggestVectorLength (u8 )) | chunk_len | {
@@ -248,101 +370,24 @@ fn utf8ValidateSliceImpl(input: []const u8, comptime surrogates: Surrogates) boo
248
370
}
249
371
}
250
372
251
- // default lowest and highest continuation byte
252
- const lo_cb = 0b10000000 ;
253
- const hi_cb = 0b10111111 ;
254
-
255
- const min_non_ascii_codepoint = 0x80 ;
256
-
257
- // The first nibble is used to identify the continuation byte range to
258
- // accept. The second nibble is the size.
259
- const xx = 0xF1 ; // invalid: size 1
260
- const as = 0xF0 ; // ASCII: size 1
261
- const s1 = 0x02 ; // accept 0, size 2
262
- const s2 = switch (surrogates ) {
263
- .cannot_encode_surrogate_half = > 0x13 , // accept 1, size 3
264
- .can_encode_surrogate_half = > 0x03 , // accept 0, size 3
265
- };
266
- const s3 = 0x03 ; // accept 0, size 3
267
- const s4 = switch (surrogates ) {
268
- .cannot_encode_surrogate_half = > 0x23 , // accept 2, size 3
269
- .can_encode_surrogate_half = > 0x03 , // accept 0, size 3
270
- };
271
- const s5 = 0x34 ; // accept 3, size 4
272
- const s6 = 0x04 ; // accept 0, size 4
273
- const s7 = 0x44 ; // accept 4, size 4
274
-
275
- // Information about the first byte in a UTF-8 sequence.
276
- const first = comptime ([_ ]u8 {as } ** 128 ) ++ ([_ ]u8 {xx } ** 64 ) ++ [_ ]u8 {
277
- xx , xx , s1 , s1 , s1 , s1 , s1 , s1 , s1 , s1 , s1 , s1 , s1 , s1 , s1 , s1 ,
278
- s1 , s1 , s1 , s1 , s1 , s1 , s1 , s1 , s1 , s1 , s1 , s1 , s1 , s1 , s1 , s1 ,
279
- s2 , s3 , s3 , s3 , s3 , s3 , s3 , s3 , s3 , s3 , s3 , s3 , s3 , s4 , s3 , s3 ,
280
- s5 , s6 , s6 , s6 , s7 , xx , xx , xx , xx , xx , xx , xx , xx , xx , xx , xx ,
281
- };
282
-
283
- const n = remaining .len ;
284
- var i : usize = 0 ;
285
- while (i < n ) {
286
- const first_byte = remaining [i ];
287
- if (first_byte < min_non_ascii_codepoint ) {
288
- i += 1 ;
289
- continue ;
290
- }
291
-
292
- const info = first [first_byte ];
293
- if (info == xx ) {
294
- return false ; // Illegal starter byte.
295
- }
296
-
297
- const size = info & 7 ;
298
- if (i + size > n ) {
299
- return false ; // Short or invalid.
300
- }
301
-
302
- // Figure out the acceptable low and high continuation bytes, starting
303
- // with our defaults.
304
- var accept_lo : u8 = lo_cb ;
305
- var accept_hi : u8 = hi_cb ;
306
-
307
- switch (info >> 4 ) {
308
- 0 = > {},
309
- 1 = > accept_lo = 0xA0 ,
310
- 2 = > accept_hi = 0x9F ,
311
- 3 = > accept_lo = 0x90 ,
312
- 4 = > accept_hi = 0x8F ,
313
- else = > unreachable ,
373
+ var state : u32 = @intFromEnum (DFA .State .ok );
374
+ // Manually unrolled to insert early return.
375
+ const UNROLL = 8 ;
376
+ while (remaining .len > UNROLL ) {
377
+ for (0.. UNROLL ) | i | {
378
+ const byte = remaining [remaining .len - 1 - i ];
379
+ state = DFA .shift_table [byte ] >> @truncate (state );
314
380
}
315
-
316
- const c1 = remaining [i + 1 ];
317
- if (c1 < accept_lo or accept_hi < c1 ) {
381
+ remaining = remaining [0 .. remaining .len - UNROLL ];
382
+ if (@as (u5 , @truncate (state )) == @intFromEnum (DFA .State .fail )) {
318
383
return false ;
319
384
}
320
-
321
- switch (size ) {
322
- 2 = > i += 2 ,
323
- 3 = > {
324
- const c2 = remaining [i + 2 ];
325
- if (c2 < lo_cb or hi_cb < c2 ) {
326
- return false ;
327
- }
328
- i += 3 ;
329
- },
330
- 4 = > {
331
- const c2 = remaining [i + 2 ];
332
- if (c2 < lo_cb or hi_cb < c2 ) {
333
- return false ;
334
- }
335
- const c3 = remaining [i + 3 ];
336
- if (c3 < lo_cb or hi_cb < c3 ) {
337
- return false ;
338
- }
339
- i += 4 ;
340
- },
341
- else = > unreachable ,
342
- }
343
385
}
344
-
345
- return true ;
386
+ for (0.. remaining .len ) | i | {
387
+ const byte = remaining [remaining .len - 1 - i ];
388
+ state = DFA .shift_table [byte ] >> @truncate (state );
389
+ }
390
+ return @as (u5 , @truncate (state )) == @intFromEnum (DFA .State .ok );
346
391
}
347
392
348
393
/// Utf8View iterates the code points of a utf-8 encoded string.
0 commit comments