cli: add --no-unicode, deprecate --no-pcre2-unicode

BurntSushi · BurntSushi · commit 75cbe88fa299 · 2020-02-17T17:16:28.000-05:00
This adds a universal --no-unicode flag that is intended to work for all
supported regex engines. There is no point in retaining separate
--no-pcre2-unicode and --pcre2-unicode flags, so we make them aliases to
the new flags and deprecate them.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,14 @@ TBD
 ===
 TODO
 
+Deprecations:
+
+* The `--no-pcre2-unicode` flag is deprecated. Instead, use the `--no-unicode`
+  flag, which applies to both the default regex engine and PCRE2. For now,
+  `--no-pcre2-unicode` and `--pcre2-unicode` are aliases to `--no-unicode`
+  and `--unicode`, respectively. The `--[no-]pcre2-unicode` flags may be
+  removed in a future release.
+
 Performance improvements:
 
 * [PERF #1381](https://github.com/BurntSushi/ripgrep/pull/1381):
@@ -27,6 +35,8 @@ Feature enhancements:
   Add `--no-require-git` flag to allow ripgrep to respect gitignores anywhere.
 * [FEATURE #1420](https://github.com/BurntSushi/ripgrep/pull/1420):
   Add `--no-ignore-exclude` to disregard rules in `.git/info/exclude` files.
+* FEATURE:
+  Add `--no-unicode` flag. This works on all supported regex engines.
 
 Bug fixes:
 
diff --git a/complete/_rg b/complete/_rg
@@ -144,6 +144,8 @@ _rg() {
     + '(ignore-vcs)' # VCS ignore-file options
     "--no-ignore-vcs[don't respect version control ignore files]"
     $no'--ignore-vcs[respect version control ignore files]'
+
+    + '(require-git)' # git specific settings
     "--no-require-git[don't require git repository to respect gitignore rules]"
     $no'--require-git[require git repository to respect gitignore rules]'
 
@@ -270,6 +272,10 @@ _rg() {
     {-w,--word-regexp}'[only show matches surrounded by word boundaries]'
     {-x,--line-regexp}'[only show matches surrounded by line boundaries]'
 
+    + '(unicode)' # Unicode options
+    $no'--unicode[enable Unicode mode]'
+    '--no-unicode[disable Unicode mode]'
+
     + '(zip)' # Compression options
     '(--pre)'{-z,--search-zip}'[search in compressed files]'
     $no"--no-search-zip[don't search in compressed files]"
diff --git a/src/app.rs b/src/app.rs
@@ -603,6 +603,7 @@ pub fn all_args_and_flags() -> Vec<RGArg> {
     flag_no_messages(&mut args);
     flag_no_pcre2_unicode(&mut args);
     flag_no_require_git(&mut args);
+    flag_no_unicode(&mut args);
     flag_null(&mut args);
     flag_null_data(&mut args);
     flag_one_file_system(&mut args);
@@ -1890,42 +1891,21 @@ This flag can be disabled with the --messages flag.
 fn flag_no_pcre2_unicode(args: &mut Vec<RGArg>) {
     const SHORT: &str = "Disable Unicode mode for PCRE2 matching.";
     const LONG: &str = long!("\
-When PCRE2 matching is enabled, this flag will disable Unicode mode, which is
-otherwise enabled by default. If PCRE2 matching is not enabled, then this flag
-has no effect.
-
-When PCRE2's Unicode mode is enabled, several different types of patterns
-become Unicode aware. This includes '\\b', '\\B', '\\w', '\\W', '\\d', '\\D',
-'\\s' and '\\S'. Similarly, the '.' meta character will match any Unicode
-codepoint instead of any byte. Caseless matching will also use Unicode simple
-case folding instead of ASCII-only case insensitivity.
-
-Unicode mode in PCRE2 represents a critical trade off in the user experience
-of ripgrep. In particular, unlike the default regex engine, PCRE2 does not
-support the ability to search possibly invalid UTF-8 with Unicode features
-enabled. Instead, PCRE2 *requires* that everything it searches when Unicode
-mode is enabled is valid UTF-8. (Or valid UTF-16/UTF-32, but for the purposes
-of ripgrep, we only discuss UTF-8.) This means that if you have PCRE2's Unicode
-mode enabled and you attempt to search invalid UTF-8, then the search for that
-file will halt and print an error. For this reason, when PCRE2's Unicode mode
-is enabled, ripgrep will automatically \"fix\" invalid UTF-8 sequences by
-replacing them with the Unicode replacement codepoint.
+DEPRECATED. Use --no-unicode instead.
 
-If you would rather see the encoding errors surfaced by PCRE2 when Unicode mode
-is enabled, then pass the --no-encoding flag to disable all transcoding.
-
-Related flags: --pcre2
-
-This flag can be disabled with --pcre2-unicode.
+This flag is now an alias for --no-unicode. And --pcre2-unicode is an alias
+for --unicode.
 ");
     let arg = RGArg::switch("no-pcre2-unicode")
         .help(SHORT).long_help(LONG)
-        .overrides("pcre2-unicode");
+        .overrides("pcre2-unicode")
+        .overrides("unicode");
     args.push(arg);
 
     let arg = RGArg::switch("pcre2-unicode")
         .hidden()
-        .overrides("no-pcre2-unicode");
+        .overrides("no-pcre2-unicode")
+        .overrides("no-unicode");
     args.push(arg);
 }
 
@@ -1951,6 +1931,55 @@ This flag can be disabled with --require-git.
     args.push(arg);
 }
 
+fn flag_no_unicode(args: &mut Vec<RGArg>) {
+    const SHORT: &str = "Disable Unicode mode.";
+    const LONG: &str = long!("\
+By default, ripgrep will enable \"Unicode mode\" in all of its regexes. This
+has a number of consequences:
+
+* '.' will only match valid UTF-8 encoded scalar values.
+* Classes like '\\w', '\\s', '\\d' are all Unicode aware and much bigger
+  than their ASCII only versions.
+* Case insensitive matching will use Unicode case folding.
+* A large array of classes like '\\p{Emoji}' are available.
+* Word boundaries ('\\b' and '\\B') use the Unicode definition of a word
+  character.
+
+In some cases it can be desirable to turn these things off. The --no-unicode
+flag will do exactly that.
+
+For PCRE2 specifically, Unicode mode represents a critical trade off in the
+user experience of ripgrep. In particular, unlike the default regex engine,
+PCRE2 does not support the ability to search possibly invalid UTF-8 with
+Unicode features enabled. Instead, PCRE2 *requires* that everything it searches
+when Unicode mode is enabled is valid UTF-8. (Or valid UTF-16/UTF-32, but for
+the purposes of ripgrep, we only discuss UTF-8.) This means that if you have
+PCRE2's Unicode mode enabled and you attempt to search invalid UTF-8, then
+the search for that file will halt and print an error. For this reason, when
+PCRE2's Unicode mode is enabled, ripgrep will automatically \"fix\" invalid
+UTF-8 sequences by replacing them with the Unicode replacement codepoint. This
+penalty does not occur when using the default regex engine.
+
+If you would rather see the encoding errors surfaced by PCRE2 when Unicode mode
+is enabled, then pass the --no-encoding flag to disable all transcoding.
+
+The --no-unicode flag can be disabled with --unicode. Note that
+--no-pcre2-unicode and --pcre2-unicode are aliases for --no-unicode and
+--unicode, respectively.
+");
+    let arg = RGArg::switch("no-unicode")
+        .help(SHORT).long_help(LONG)
+        .overrides("unicode")
+        .overrides("pcre2-unicode");
+    args.push(arg);
+
+    let arg = RGArg::switch("unicode")
+        .hidden()
+        .overrides("no-unicode")
+        .overrides("no-pcre2-unicode");
+    args.push(arg);
+}
+
 fn flag_null(args: &mut Vec<RGArg>) {
     const SHORT: &str = "Print a NUL byte after file paths.";
     const LONG: &str = long!("\
diff --git a/src/args.rs b/src/args.rs
@@ -654,7 +654,7 @@ impl ArgMatches {
             .case_smart(self.case_smart())
             .case_insensitive(self.case_insensitive())
             .multi_line(true)
-            .unicode(true)
+            .unicode(self.unicode())
             .octal(false)
             .word(self.is_present("word-regexp"));
         if self.is_present("multiline") {
@@ -720,7 +720,7 @@ impl ArgMatches {
                 // 10MB.
                 .max_jit_stack_size(Some(10 * (1<<20)));
         }
-        if self.pcre2_unicode() {
+        if self.unicode() {
             builder.utf(true).ucp(true);
             if self.encoding()?.has_explicit_encoding() {
                 // SAFETY: If an encoding was specified, then we're guaranteed
@@ -1602,11 +1602,17 @@ impl ArgMatches {
         self.occurrences_of("unrestricted")
     }
 
-    /// Returns true if and only if PCRE2's Unicode mode should be enabled.
+    /// Returns true if and only if Unicode mode should be enabled.
+    fn unicode(&self) -> bool {
+        // Unicode mode is enabled by default, so only disable it when
+        // --no-unicode (or its deprecated alias --no-pcre2-unicode) is given.
+        !(self.is_present("no-unicode") || self.is_present("no-pcre2-unicode"))
+    }
+
+    /// Returns true if and only if PCRE2 is enabled and its Unicode mode is
+    /// enabled.
     fn pcre2_unicode(&self) -> bool {
-        // PCRE2 Unicode is enabled by default, so only disable it when told
-        // to do so explicitly.
-        self.is_present("pcre2") && !self.is_present("no-pcre2-unicode")
+        self.is_present("pcre2") && self.unicode()
     }
 
     /// Returns true if and only if file names containing each match should
diff --git a/tests/feature.rs b/tests/feature.rs
@@ -834,3 +834,8 @@ rgtest!(context_sep_empty, |dir: Dir, mut cmd: TestCommand| {
     ]);
     eqnice!("foo\nctx\n\nfoo\nctx\n", cmd.stdout());
 });
+
+rgtest!(no_unicode, |dir: Dir, mut cmd: TestCommand| {
+    dir.create("test", "δ");
+    cmd.arg("-i").arg("--no-unicode").arg("Δ").assert_err();
+});