Skip to content

Commit 75cbe88

Browse files
committed
cli: add --no-unicode, deprecate --no-pcre2-unicode
This adds a universal --no-unicode flag (along with its --unicode counterpart) that is intended to work for all supported regex engines. There is no longer any point in retaining the separate --no-pcre2-unicode and --pcre2-unicode flags, so we make them aliases for the new flags and deprecate them.
1 parent 711426a commit 75cbe88

File tree

5 files changed

+90
-34
lines changed

5 files changed

+90
-34
lines changed

CHANGELOG.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,14 @@ TBD
22
===
33
TODO
44

5+
Deprecations:
6+
7+
* The `--no-pcre2-unicode` flag is deprecated. Instead, use the `--no-unicode`
8+
flag, which applies to both the default regex engine and PCRE2. For now,
9+
`--no-pcre2-unicode` and `--pcre2-unicode` are aliases for `--no-unicode`
10+
and `--unicode`, respectively. The `--[no-]pcre2-unicode` flags may be
11+
removed in a future release.
12+
513
Performance improvements:
614

715
* [PERF #1381](https://github.com/BurntSushi/ripgrep/pull/1381):
@@ -27,6 +35,8 @@ Feature enhancements:
2735
Add `--no-require-git` flag to allow ripgrep to respect gitignores anywhere.
2836
* [FEATURE #1420](https://github.com/BurntSushi/ripgrep/pull/1420):
2937
Add `--no-ignore-exclude` to disregard rules in `.git/info/exclude` files.
38+
* FEATURE:
39+
Add `--no-unicode` flag. This works on all supported regex engines.
3040

3141
Bug fixes:
3242

complete/_rg

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,8 @@ _rg() {
144144
+ '(ignore-vcs)' # VCS ignore-file options
145145
"--no-ignore-vcs[don't respect version control ignore files]"
146146
$no'--ignore-vcs[respect version control ignore files]'
147+
148+
+ '(require-git)' # git specific settings
147149
"--no-require-git[don't require git repository to respect gitignore rules]"
148150
$no'--require-git[require git repository to respect gitignore rules]'
149151

@@ -270,6 +272,10 @@ _rg() {
270272
{-w,--word-regexp}'[only show matches surrounded by word boundaries]'
271273
{-x,--line-regexp}'[only show matches surrounded by line boundaries]'
272274

275+
+ '(unicode)' # Unicode options
276+
$no'--unicode[enable Unicode mode]'
277+
'--no-unicode[disable Unicode mode]'
278+
273279
+ '(zip)' # Compression options
274280
'(--pre)'{-z,--search-zip}'[search in compressed files]'
275281
$no"--no-search-zip[don't search in compressed files]"

src/app.rs

Lines changed: 57 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -603,6 +603,7 @@ pub fn all_args_and_flags() -> Vec<RGArg> {
603603
flag_no_messages(&mut args);
604604
flag_no_pcre2_unicode(&mut args);
605605
flag_no_require_git(&mut args);
606+
flag_no_unicode(&mut args);
606607
flag_null(&mut args);
607608
flag_null_data(&mut args);
608609
flag_one_file_system(&mut args);
@@ -1890,42 +1891,21 @@ This flag can be disabled with the --messages flag.
18901891
fn flag_no_pcre2_unicode(args: &mut Vec<RGArg>) {
18911892
const SHORT: &str = "Disable Unicode mode for PCRE2 matching.";
18921893
const LONG: &str = long!("\
1893-
When PCRE2 matching is enabled, this flag will disable Unicode mode, which is
1894-
otherwise enabled by default. If PCRE2 matching is not enabled, then this flag
1895-
has no effect.
1896-
1897-
When PCRE2's Unicode mode is enabled, several different types of patterns
1898-
become Unicode aware. This includes '\\b', '\\B', '\\w', '\\W', '\\d', '\\D',
1899-
'\\s' and '\\S'. Similarly, the '.' meta character will match any Unicode
1900-
codepoint instead of any byte. Caseless matching will also use Unicode simple
1901-
case folding instead of ASCII-only case insensitivity.
1902-
1903-
Unicode mode in PCRE2 represents a critical trade off in the user experience
1904-
of ripgrep. In particular, unlike the default regex engine, PCRE2 does not
1905-
support the ability to search possibly invalid UTF-8 with Unicode features
1906-
enabled. Instead, PCRE2 *requires* that everything it searches when Unicode
1907-
mode is enabled is valid UTF-8. (Or valid UTF-16/UTF-32, but for the purposes
1908-
of ripgrep, we only discuss UTF-8.) This means that if you have PCRE2's Unicode
1909-
mode enabled and you attempt to search invalid UTF-8, then the search for that
1910-
file will halt and print an error. For this reason, when PCRE2's Unicode mode
1911-
is enabled, ripgrep will automatically \"fix\" invalid UTF-8 sequences by
1912-
replacing them with the Unicode replacement codepoint.
1894+
DEPRECATED. Use --no-unicode instead.
19131895
1914-
If you would rather see the encoding errors surfaced by PCRE2 when Unicode mode
1915-
is enabled, then pass the --no-encoding flag to disable all transcoding.
1916-
1917-
Related flags: --pcre2
1918-
1919-
This flag can be disabled with --pcre2-unicode.
1896+
This flag is now an alias for --no-unicode. And --pcre2-unicode is an alias
1897+
for --unicode.
19201898
");
19211899
let arg = RGArg::switch("no-pcre2-unicode")
19221900
.help(SHORT).long_help(LONG)
1923-
.overrides("pcre2-unicode");
1901+
.overrides("pcre2-unicode")
1902+
.overrides("unicode");
19241903
args.push(arg);
19251904

19261905
let arg = RGArg::switch("pcre2-unicode")
19271906
.hidden()
1928-
.overrides("no-pcre2-unicode");
1907+
.overrides("no-pcre2-unicode")
1908+
.overrides("no-unicode");
19291909
args.push(arg);
19301910
}
19311911

@@ -1951,6 +1931,55 @@ This flag can be disabled with --require-git.
19511931
args.push(arg);
19521932
}
19531933

1934+
fn flag_no_unicode(args: &mut Vec<RGArg>) {
1935+
const SHORT: &str = "Disable Unicode mode.";
1936+
const LONG: &str = long!("\
1937+
By default, ripgrep will enable \"Unicode mode\" in all of its regexes. This
1938+
has a number of consequences:
1939+
1940+
* '.' will only match valid UTF-8 encoded scalar values.
1941+
* Classes like '\\w', '\\s', '\\d' are all Unicode aware and much bigger
1942+
than their ASCII only versions.
1943+
* Case insensitive matching will use Unicode case folding.
1944+
* A large array of classes like '\\p{Emoji}' are available.
1945+
* Word boundaries ('\\b' and '\\B') use the Unicode definition of a word
1946+
character.
1947+
1948+
In some cases it can be desirable to turn these things off. The --no-unicode
1949+
flag will do exactly that.
1950+
1951+
For PCRE2 specifically, Unicode mode represents a critical trade off in the
1952+
user experience of ripgrep. In particular, unlike the default regex engine,
1953+
PCRE2 does not support the ability to search possibly invalid UTF-8 with
1954+
Unicode features enabled. Instead, PCRE2 *requires* that everything it searches
1955+
when Unicode mode is enabled is valid UTF-8. (Or valid UTF-16/UTF-32, but for
1956+
the purposes of ripgrep, we only discuss UTF-8.) This means that if you have
1957+
PCRE2's Unicode mode enabled and you attempt to search invalid UTF-8, then
1958+
the search for that file will halt and print an error. For this reason, when
1959+
PCRE2's Unicode mode is enabled, ripgrep will automatically \"fix\" invalid
1960+
UTF-8 sequences by replacing them with the Unicode replacement codepoint. This
1961+
penalty does not occur when using the default regex engine.
1962+
1963+
If you would rather see the encoding errors surfaced by PCRE2 when Unicode mode
1964+
is enabled, then pass the --no-encoding flag to disable all transcoding.
1965+
1966+
The --no-unicode flag can be disabled with --unicode. Note that
1967+
--no-pcre2-unicode and --pcre2-unicode are aliases for --no-unicode and
1968+
--unicode, respectively.
1969+
");
1970+
let arg = RGArg::switch("no-unicode")
1971+
.help(SHORT).long_help(LONG)
1972+
.overrides("unicode")
1973+
.overrides("pcre2-unicode");
1974+
args.push(arg);
1975+
1976+
let arg = RGArg::switch("unicode")
1977+
.hidden()
1978+
.overrides("no-unicode")
1979+
.overrides("no-pcre2-unicode");
1980+
args.push(arg);
1981+
}
1982+
19541983
fn flag_null(args: &mut Vec<RGArg>) {
19551984
const SHORT: &str = "Print a NUL byte after file paths.";
19561985
const LONG: &str = long!("\

src/args.rs

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -654,7 +654,7 @@ impl ArgMatches {
654654
.case_smart(self.case_smart())
655655
.case_insensitive(self.case_insensitive())
656656
.multi_line(true)
657-
.unicode(true)
657+
.unicode(self.unicode())
658658
.octal(false)
659659
.word(self.is_present("word-regexp"));
660660
if self.is_present("multiline") {
@@ -720,7 +720,7 @@ impl ArgMatches {
720720
// 10MB.
721721
.max_jit_stack_size(Some(10 * (1<<20)));
722722
}
723-
if self.pcre2_unicode() {
723+
if self.unicode() {
724724
builder.utf(true).ucp(true);
725725
if self.encoding()?.has_explicit_encoding() {
726726
// SAFETY: If an encoding was specified, then we're guaranteed
@@ -1602,11 +1602,17 @@ impl ArgMatches {
16021602
self.occurrences_of("unrestricted")
16031603
}
16041604

1605-
/// Returns true if and only if PCRE2's Unicode mode should be enabled.
1605+
/// Returns true if and only if Unicode mode should be enabled.
1606+
fn unicode(&self) -> bool {
1607+
// Unicode mode is enabled by default, so only disable it when
1608+
// --no-unicode is given explicitly.
1609+
!(self.is_present("no-unicode") || self.is_present("no-pcre2-unicode"))
1610+
}
1611+
1612+
/// Returns true if and only if PCRE2 is enabled and its Unicode mode is
1613+
/// enabled.
16061614
fn pcre2_unicode(&self) -> bool {
1607-
// PCRE2 Unicode is enabled by default, so only disable it when told
1608-
// to do so explicitly.
1609-
self.is_present("pcre2") && !self.is_present("no-pcre2-unicode")
1615+
self.is_present("pcre2") && self.unicode()
16101616
}
16111617

16121618
/// Returns true if and only if file names containing each match should

tests/feature.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -834,3 +834,8 @@ rgtest!(context_sep_empty, |dir: Dir, mut cmd: TestCommand| {
834834
]);
835835
eqnice!("foo\nctx\n\nfoo\nctx\n", cmd.stdout());
836836
});
837+
838+
rgtest!(no_unicode, |dir: Dir, mut cmd: TestCommand| {
839+
dir.create("test", "δ");
840+
cmd.arg("-i").arg("--no-unicode").arg("Δ").assert_err();
841+
});

0 commit comments

Comments
 (0)