# benchmarks/definitions/test/unicode/word.toml

analysis = '''
These benchmarks test the functionality of `\w`. A `\w` is generally either
equivalent to `[0-9A-Za-z_]`, or it's equivalent to the definition offered
in [UTS#18][uts18]. UTS#18 defines it as the union of the `Alphabetic`,
`Join_Control`, `gc=Mark`, `gc=Decimal_Number` and `gc=Connector_Punctuation`
properties.

Where regex engines differ is the extent to which this can be toggled. Some
regex engines support both definitions. Some support *only* the ASCII variant
and some support *only* the Unicode variant.

These benchmarks aren't really intended to be used for collecting timing
measurements. These are just about verifying functionality.

[uts18]: https://unicode.org/reports/tr18/
'''

# ASCII flavour of `\w`: the haystack is a single non-ASCII letter, so an
# engine whose `\w` is restricted to `[0-9A-Za-z_]` must find nothing.
# NOTE(review): unlike the sibling benchmarks, no `unicode` key is set here;
# presumably the harness treats that as Unicode mode off -- confirm.
[[bench]]
name = "ascii-only"
model = "count"
regex = '\w'
# U+03B4 GREEK SMALL LETTER DELTA.
haystack = 'δ'
count = [
  # \w in .NET is always the Unicode variant; it cannot be switched off.
  { engine = 'dotnet.*', count = 1 },
  # Likewise for ICU: its \w cannot be restricted to ASCII.
  { engine = 'icu', count = 1 },
  # Catch-all: every remaining engine must not match.
  { engine = '.*', count = 0 },
]
engines = [
  'dotnet',
  'dotnet/compiled',
  'dotnet/nobacktrack',
  'go/regexp',
  'hyperscan',
  'icu',
  'java/hotspot',
  'javascript/v8',
  'pcre2',
  'pcre2/jit',
  'perl',
  'python/re',
  'python/regex',
  're2',
  'regress',
  'rust/regex',
  'rust/regex/lite',
]

# Unicode flavour of `\w`, `Alphabetic` component: with `unicode = true`, a
# non-ASCII letter is expected to count as a word character.
[[bench]]
name = "unicode-alphabetic"
model = "count"
unicode = true
regex = '\w'
# U+03B4 GREEK SMALL LETTER DELTA.
haystack = 'δ'
count = [
  # These engines offer no Unicode-aware \w at all, so nothing matches.
  { engine = 'go/regexp|javascript.*|re2|regress|rust/regex/lite', count = 0 },
  # Catch-all: every remaining engine must report one match.
  { engine = '.*', count = 1 },
]
engines = [
  'dotnet',
  'dotnet/compiled',
  'dotnet/nobacktrack',
  'go/regexp',
  'hyperscan',
  'icu',
  'java/hotspot',
  'javascript/v8',
  'pcre2',
  'pcre2/jit',
  'perl',
  'python/re',
  'python/regex',
  're2',
  'regress',
  'rust/regex',
  'rust/regex/lite',
]

# Unicode flavour of `\w`, `Join_Control` component: checks whether a
# join-control character is treated as a word character.
[[bench]]
name = "unicode-join-control"
model = "count"
unicode = true
regex = '\w'
# U+200D ZERO WIDTH JOINER, written as an escape because it is invisible.
haystack = "\u200D"
count = [
  # These engines offer no Unicode-aware \w at all, so nothing matches.
  { engine = 'go/regexp|javascript.*|re2|regress|rust/regex/lite', count = 0 },
  # These engines leave \p{Join_Control} out of their definition of \w.
  { engine = 'dotnet.*|hyperscan|pcre2.*|python/re', count = 0 },
  # Catch-all: every remaining engine must report one match.
  { engine = '.*', count = 1 },
]
engines = [
  'dotnet',
  'dotnet/compiled',
  'dotnet/nobacktrack',
  'go/regexp',
  'hyperscan',
  'icu',
  'java/hotspot',
  'javascript/v8',
  'pcre2',
  'pcre2/jit',
  'perl',
  'python/re',
  'python/regex',
  're2',
  'regress',
  'rust/regex',
  'rust/regex/lite',
]

# Unicode flavour of `\w`, `gc=Mark` component: checks whether a combining
# mark on its own is treated as a word character.
[[bench]]
name = "unicode-mark"
model = "count"
unicode = true
regex = '\w'
# U+0322 COMBINING RETROFLEX HOOK BELOW, written as an escape so that it
# does not visually attach to the surrounding quote characters.
haystack = "\u0322"
count = [
  # These engines offer no Unicode-aware \w at all, so nothing matches.
  { engine = 'go/regexp|javascript.*|re2|regress|rust/regex/lite', count = 0 },
  # These engines leave \p{gc=Mark} out of their definition of \w.
  { engine = 'hyperscan|pcre2.*|python/re', count = 0 },
  # Catch-all: every remaining engine must report one match.
  { engine = '.*', count = 1 },
]
engines = [
  'dotnet',
  'dotnet/compiled',
  'dotnet/nobacktrack',
  'go/regexp',
  'hyperscan',
  'icu',
  'java/hotspot',
  'javascript/v8',
  'pcre2',
  'pcre2/jit',
  'perl',
  'python/re',
  'python/regex',
  're2',
  'regress',
  'rust/regex',
  'rust/regex/lite',
]

# Unicode flavour of `\w`, `gc=Decimal_Number` component: checks whether a
# non-ASCII decimal digit is treated as a word character.
[[bench]]
name = "unicode-decimal-number"
model = "count"
unicode = true
regex = '\w'
# A digit from the Mongolian block, i.e. a decimal digit outside ASCII.
haystack = '᠕'
count = [
  # These engines offer no Unicode-aware \w at all, so nothing matches.
  { engine = 'go/regexp|javascript.*|re2|regress|rust/regex/lite', count = 0 },
  # Catch-all: every remaining engine must report one match.
  { engine = '.*', count = 1 },
]
engines = [
  'dotnet',
  'dotnet/compiled',
  'dotnet/nobacktrack',
  'go/regexp',
  'hyperscan',
  'icu',
  'java/hotspot',
  'javascript/v8',
  'pcre2',
  'pcre2/jit',
  'perl',
  'python/re',
  'python/regex',
  're2',
  'regress',
  'rust/regex',
  'rust/regex/lite',
]

# Unicode flavour of `\w`, `gc=Connector_Punctuation` component: checks
# whether a connector other than `_` is treated as a word character.
[[bench]]
name = "unicode-connector-punctuation"
model = "count"
unicode = true
regex = '\w'
# U+2040 CHARACTER TIE, a connector punctuation character outside ASCII.
haystack = '⁀'
count = [
  # These engines offer no Unicode-aware \w at all, so nothing matches.
  { engine = 'go/regexp|javascript.*|re2|regress|rust/regex/lite', count = 0 },
  # These engines leave \p{gc=Connector_Punctuation} out of their definition
  # of \w. They do all accept `_`, which is itself a member of
  # `Connector_Punctuation`, but not the rest of the category.
  { engine = 'hyperscan|pcre2.*|python/re', count = 0 },
  # Catch-all: every remaining engine must report one match.
  { engine = '.*', count = 1 },
]
engines = [
  'dotnet',
  'dotnet/compiled',
  'dotnet/nobacktrack',
  'go/regexp',
  'hyperscan',
  'icu',
  'java/hotspot',
  'javascript/v8',
  'pcre2',
  'pcre2/jit',
  'perl',
  'python/re',
  'python/regex',
  're2',
  'regress',
  'rust/regex',
  'rust/regex/lite',
]