@@ -231,11 +231,157 @@ pub fn utf8ValidateSlice(input: []const u8) bool {
231
231
}
232
232
233
233
fn utf8ValidateSliceImpl (input : []const u8 , comptime surrogates : Surrogates ) bool {
234
// Shift-table DFA that validates UTF-8 one byte at a time.
//
// The surrounding function feeds the input back to front (last byte first),
// so the continuation bytes of a sequence are seen before its lead byte.
// Every state is identified by a bit offset into a `u32` row of
// `shift_table`. One step is `shift_table[byte] >> current_offset`; after
// the shift, the low five bits of the result are the offset of the next
// state.
const DFA = struct {
    // Partition of the 256 byte values into the groups the automaton has to
    // tell apart. The exact ranges are listed in `byte_class`.
    const ByteClass = enum {
        // base ASCII (0x00...0x7f)
        ascii,
        // continuation bytes, split by sub-range because some lead bytes
        // restrict the continuation byte that directly follows them
        cont1, // 0x80...0x8f
        cont2, // 0x90...0x9f
        cont3, // 0xa0...0xbf
        // starting bytes of 2-byte codepoint
        two1, // 0xc0...0xc1, never accepted by `step`
        two2, // 0xc2...0xdf
        // starting bytes of 3-byte codepoint
        three1, // 0xe0
        three2, // 0xe1...0xec and 0xee...0xef
        three3, // 0xed, accepted after a cont3 byte only when surrogates are allowed
        // starting bytes of 4-byte codepoint
        four1, // 0xf0
        four2, // 0xf1...0xf3
        four3, // 0xf4
        four4, // 0xf5...0xff, never accepted by `step`
    };

    // Maps a raw byte to its `ByteClass`.
    pub fn byte_class(byte: u8) ByteClass {
        return switch (byte) {
            0x00...0x7f => .ascii,
            0x80...0x8f => .cont1,
            0x90...0x9f => .cont2,
            0xa0...0xbf => .cont3,
            0xc0...0xc1 => .two1,
            0xc2...0xdf => .two2,
            0xe0...0xe0 => .three1,
            0xe1...0xec => .three2,
            0xed...0xed => .three3,
            0xee...0xef => .three2,
            0xf0...0xf0 => .four1,
            0xf1...0xf3 => .four2,
            0xf4...0xf4 => .four3,
            0xf5...0xff => .four4,
        };
    }

    // Automaton states. Because the scan runs backwards, a state records how
    // many continuation bytes are still waiting for their lead byte, plus the
    // sub-range of the most recently read one where that matters.
    const State = enum {
        // No pending continuation bytes.
        ok,
        // One continuation byte pending.
        one,
        // Two continuation bytes pending; the latest was cont1 or cont2.
        two1,
        // Two continuation bytes pending; the latest was cont3.
        two2,
        // Three continuation bytes pending; the latest was cont1.
        three1,
        // Three continuation bytes pending; the latest was cont2 or cont3.
        three2,
        // Invalid input was seen; this state is never left.
        fail,
    };

    // Bit offset inside a `shift_table` row at which the successor of `state`
    // is stored. `.fail` is 0, so a row's all-zero bits decode as `.fail`.
    fn offset_from_state(state: State) u5 {
        return switch (state) {
            .ok => 8,
            .one => 13,
            .two1 => 23,
            .two2 => 18,
            .three1 => 3,
            .three2 => 28,
            .fail => 0,
        };
    }

    // Inverse of `offset_from_state`. Any other offset is a programming error.
    fn state_from_offset(offset: u5) State {
        return switch (offset) {
            8 => .ok,
            13 => .one,
            23 => .two1,
            18 => .two2,
            3 => .three1,
            28 => .three2,
            0 => .fail,
            else => unreachable,
        };
    }

    // State before any byte has been consumed.
    const start: State = .ok;
    // States in which the input consumed so far is valid.
    // A slice must point at an array, hence `&.{...}` and `[]const`.
    const accept: []const State = &.{.ok};
    // The absorbing error state.
    const fail: ?State = .fail;

    // Reference transition function: the state reached from `state` after
    // consuming `byte`. Only used at comptime to build `shift_table`.
    fn step(byte: u8, state: State) State {
        const class = byte_class(byte);
        return switch (state) {
            .ok => switch (class) {
                .ascii => .ok,
                .cont1 => .one,
                .cont2 => .one,
                .cont3 => .one,
                // A lead byte with no continuation bytes after it.
                else => .fail,
            },
            .one => switch (class) {
                .two2 => .ok,
                .cont1 => .two1,
                .cont2 => .two1,
                .cont3 => .two2,
                else => .fail,
            },
            .two1 => switch (class) {
                // 0xe0 (three1) is rejected here: it must be followed by cont3.
                .three2 => .ok,
                .three3 => .ok,
                .cont1 => .three1,
                .cont2 => .three2,
                .cont3 => .three2,
                else => .fail,
            },
            .two2 => switch (class) {
                .three1 => .ok,
                .three2 => .ok,
                // 0xed followed by 0xa0...0xbf encodes a surrogate half.
                .three3 => switch (surrogates) {
                    .cannot_encode_surrogate_half => .fail,
                    .can_encode_surrogate_half => .ok,
                },
                .cont1 => .three1,
                .cont2 => .three2,
                .cont3 => .three2,
                else => .fail,
            },
            .three1 => switch (class) {
                // 0xf0 (four1) is rejected here: it must be followed by cont2 or cont3.
                .four2 => .ok,
                .four3 => .ok,
                else => .fail,
            },
            .three2 => switch (class) {
                // 0xf4 (four3) is rejected here: it must be followed by cont1.
                .four1 => .ok,
                .four2 => .ok,
                else => .fail,
            },
            .fail => .fail,
        };
    }

    // One `u32` row per byte value. For every state `s`, the five bits
    // starting at `offset_from_state(s)` hold the offset of `step(byte, s)`.
    const shift_table = blk: {
        @setEvalBranchQuota(30000);
        var t = [_]u32{0} ** 256;
        for (&t, 0..) |*r, c| {
            for (std.enums.values(State)) |s| {
                r.* |= @truncate(@as(u32, offset_from_state(step(c, s))) << offset_from_state(s));
            }
            // The five-bit fields of different states share bits (offsets 0
            // and 3 overlap, and offset 28 runs past bit 31), so verify that
            // every transition can still be read back unchanged.
            for (std.enums.values(State)) |s| {
                const stored: u5 = @truncate(r.* >> offset_from_state(s));
                if (stored != offset_from_state(step(c, s))) {
                    @compileError("DFA state offsets overlap and corrupt each other in shift_table");
                }
            }
        }
        break :blk t;
    };
};
380
+
234
381
var remaining = input ;
235
382
236
383
if (std .simd .suggestVectorLength (u8 )) | chunk_len | {
237
384
const Chunk = @Vector (chunk_len , u8 );
238
-
239
385
// Fast path. Check for and skip ASCII characters at the start of the input.
240
386
while (remaining .len >= chunk_len ) {
241
387
const chunk : Chunk = remaining [0.. chunk_len ].* ;
@@ -248,101 +394,24 @@ fn utf8ValidateSliceImpl(input: []const u8, comptime surrogates: Surrogates) boo
248
394
}
249
395
}
250
396
251
- // default lowest and highest continuation byte
252
- const lo_cb = 0b10000000 ;
253
- const hi_cb = 0b10111111 ;
254
-
255
- const min_non_ascii_codepoint = 0x80 ;
256
-
257
- // The first nibble is used to identify the continuation byte range to
258
- // accept. The second nibble is the size.
259
- const xx = 0xF1 ; // invalid: size 1
260
- const as = 0xF0 ; // ASCII: size 1
261
- const s1 = 0x02 ; // accept 0, size 2
262
- const s2 = switch (surrogates ) {
263
- .cannot_encode_surrogate_half = > 0x13 , // accept 1, size 3
264
- .can_encode_surrogate_half = > 0x03 , // accept 0, size 3
265
- };
266
- const s3 = 0x03 ; // accept 0, size 3
267
- const s4 = switch (surrogates ) {
268
- .cannot_encode_surrogate_half = > 0x23 , // accept 2, size 3
269
- .can_encode_surrogate_half = > 0x03 , // accept 0, size 3
270
- };
271
- const s5 = 0x34 ; // accept 3, size 4
272
- const s6 = 0x04 ; // accept 0, size 4
273
- const s7 = 0x44 ; // accept 4, size 4
274
-
275
- // Information about the first byte in a UTF-8 sequence.
276
- const first = comptime ([_ ]u8 {as } ** 128 ) ++ ([_ ]u8 {xx } ** 64 ) ++ [_ ]u8 {
277
- xx , xx , s1 , s1 , s1 , s1 , s1 , s1 , s1 , s1 , s1 , s1 , s1 , s1 , s1 , s1 ,
278
- s1 , s1 , s1 , s1 , s1 , s1 , s1 , s1 , s1 , s1 , s1 , s1 , s1 , s1 , s1 , s1 ,
279
- s2 , s3 , s3 , s3 , s3 , s3 , s3 , s3 , s3 , s3 , s3 , s3 , s3 , s4 , s3 , s3 ,
280
- s5 , s6 , s6 , s6 , s7 , xx , xx , xx , xx , xx , xx , xx , xx , xx , xx , xx ,
281
- };
282
-
283
- const n = remaining .len ;
284
- var i : usize = 0 ;
285
- while (i < n ) {
286
- const first_byte = remaining [i ];
287
- if (first_byte < min_non_ascii_codepoint ) {
288
- i += 1 ;
289
- continue ;
290
- }
291
-
292
- const info = first [first_byte ];
293
- if (info == xx ) {
294
- return false ; // Illegal starter byte.
295
- }
296
-
297
- const size = info & 7 ;
298
- if (i + size > n ) {
299
- return false ; // Short or invalid.
397
+ var state : u32 = DFA .offset_from_state (DFA .State .ok );
398
+ // Manually unrolled to insert early return.
399
+ const UNROLL = 8 ;
400
+ while (remaining .len > UNROLL ) {
401
+ for (0.. UNROLL ) | i | {
402
+ const byte = remaining [remaining .len - 1 - i ];
403
+ state = DFA .shift_table [byte ] >> @truncate (state );
300
404
}
301
-
302
- // Figure out the acceptable low and high continuation bytes, starting
303
- // with our defaults.
304
- var accept_lo : u8 = lo_cb ;
305
- var accept_hi : u8 = hi_cb ;
306
-
307
- switch (info >> 4 ) {
308
- 0 = > {},
309
- 1 = > accept_lo = 0xA0 ,
310
- 2 = > accept_hi = 0x9F ,
311
- 3 = > accept_lo = 0x90 ,
312
- 4 = > accept_hi = 0x8F ,
313
- else = > unreachable ,
314
- }
315
-
316
- const c1 = remaining [i + 1 ];
317
- if (c1 < accept_lo or accept_hi < c1 ) {
405
+ remaining = remaining [0.. remaining .len - UNROLL ];
406
+ if (@as (u5 , @truncate (state )) == DFA .offset_from_state (DFA .State .fail )) {
318
407
return false ;
319
408
}
320
-
321
- switch (size ) {
322
- 2 = > i += 2 ,
323
- 3 = > {
324
- const c2 = remaining [i + 2 ];
325
- if (c2 < lo_cb or hi_cb < c2 ) {
326
- return false ;
327
- }
328
- i += 3 ;
329
- },
330
- 4 = > {
331
- const c2 = remaining [i + 2 ];
332
- if (c2 < lo_cb or hi_cb < c2 ) {
333
- return false ;
334
- }
335
- const c3 = remaining [i + 3 ];
336
- if (c3 < lo_cb or hi_cb < c3 ) {
337
- return false ;
338
- }
339
- i += 4 ;
340
- },
341
- else = > unreachable ,
342
- }
343
409
}
344
-
345
- return true ;
410
+ for (0.. remaining .len ) | i | {
411
+ const byte = remaining [remaining .len - 1 - i ];
412
+ state = DFA .shift_table [byte ] >> @truncate (state );
413
+ }
414
+ return @as (u5 , @truncate (state )) == DFA .offset_from_state (DFA .State .ok );
346
415
}
347
416
348
417
/// Utf8View iterates the code points of a utf-8 encoded string.
0 commit comments