@@ -603,6 +603,7 @@ pub fn all_args_and_flags() -> Vec<RGArg> {
603
603
flag_no_messages ( & mut args) ;
604
604
flag_no_pcre2_unicode ( & mut args) ;
605
605
flag_no_require_git ( & mut args) ;
606
+ flag_no_unicode ( & mut args) ;
606
607
flag_null ( & mut args) ;
607
608
flag_null_data ( & mut args) ;
608
609
flag_one_file_system ( & mut args) ;
@@ -1890,42 +1891,21 @@ This flag can be disabled with the --messages flag.
1890
1891
/// Registers the `--no-pcre2-unicode` switch together with its hidden
/// `--pcre2-unicode` counterpart.
///
/// The help text marks `--no-pcre2-unicode` as deprecated in favor of
/// `--no-unicode`. Each switch here is declared to override both its direct
/// opposite and the correspondingly named `--unicode`/`--no-unicode` switch.
///
/// `args` is the flag list that both definitions are appended to.
fn flag_no_pcre2_unicode(args: &mut Vec<RGArg>) {
    const SHORT: &str = "Disable Unicode mode for PCRE2 matching.";
    const LONG: &str = long!("\
DEPRECATED. Use --no-unicode instead.

This flag is now an alias for --no-unicode. And --pcre2-unicode is an alias
for --unicode.
");
    // Visible, documented switch. It overrides both spellings of the
    // "enable Unicode" switch.
    let arg = RGArg::switch("no-pcre2-unicode")
        .help(SHORT)
        .long_help(LONG)
        .overrides("pcre2-unicode")
        .overrides("unicode");
    args.push(arg);

    // Hidden inverse. It overrides both spellings of the "disable Unicode"
    // switch.
    let arg = RGArg::switch("pcre2-unicode")
        .hidden()
        .overrides("no-pcre2-unicode")
        .overrides("no-unicode");
    args.push(arg);
}
1931
1911
@@ -1951,6 +1931,55 @@ This flag can be disabled with --require-git.
1951
1931
args. push ( arg) ;
1952
1932
}
1953
1933
1934
+ fn flag_no_unicode ( args : & mut Vec < RGArg > ) {
1935
+ const SHORT : & str = "Disable Unicode mode." ;
1936
+ const LONG : & str = long ! ( "\
1937
+ By default, ripgrep will enable \" Unicode mode\" in all of its regexes. This
1938
+ has a number of consequences:
1939
+
1940
+ * '.' will only match valid UTF-8 encoded scalar values.
1941
+ * Classes like '\\ w', '\\ s', '\\ d' are all Unicode aware and much bigger
1942
+ than their ASCII only versions.
1943
+ * Case insensitive matching will use Unicode case folding.
1944
+ * A large array of classes like '\\ p{Emoji}' are available.
1945
+ * Word boundaries ('\\ b' and '\\ B') use the Unicode definition of a word
1946
+ character.
1947
+
1948
+ In some cases it can be desirable to turn these things off. The --no-unicode
1949
+ flag will do exactly that.
1950
+
1951
+ For PCRE2 specifically, Unicode mode represents a critical trade off in the
1952
+ user experience of ripgrep. In particular, unlike the default regex engine,
1953
+ PCRE2 does not support the ability to search possibly invalid UTF-8 with
1954
+ Unicode features enabled. Instead, PCRE2 *requires* that everything it searches
1955
+ when Unicode mode is enabled is valid UTF-8. (Or valid UTF-16/UTF-32, but for
1956
+ the purposes of ripgrep, we only discuss UTF-8.) This means that if you have
1957
+ PCRE2's Unicode mode enabled and you attempt to search invalid UTF-8, then
1958
+ the search for that file will halt and print an error. For this reason, when
1959
+ PCRE2's Unicode mode is enabled, ripgrep will automatically \" fix\" invalid
1960
+ UTF-8 sequences by replacing them with the Unicode replacement codepoint. This
1961
+ penalty does not occur when using the default regex engine.
1962
+
1963
+ If you would rather see the encoding errors surfaced by PCRE2 when Unicode mode
1964
+ is enabled, then pass the --no-encoding flag to disable all transcoding.
1965
+
1966
+ The --no-unicode flag can be disabled with --unicode. Note that
1967
+ --no-pcre2-unicode and --pcre2-unicode are aliases for --no-unicode and
1968
+ --unicode, respectively.
1969
+ " ) ;
1970
+ let arg = RGArg :: switch ( "no-unicode" )
1971
+ . help ( SHORT ) . long_help ( LONG )
1972
+ . overrides ( "unicode" )
1973
+ . overrides ( "pcre2-unicode" ) ;
1974
+ args. push ( arg) ;
1975
+
1976
+ let arg = RGArg :: switch ( "unicode" )
1977
+ . hidden ( )
1978
+ . overrides ( "no-unicode" )
1979
+ . overrides ( "no-pcre2-unicode" ) ;
1980
+ args. push ( arg) ;
1981
+ }
1982
+
1954
1983
fn flag_null ( args : & mut Vec < RGArg > ) {
1955
1984
const SHORT : & str = "Print a NUL byte after file paths." ;
1956
1985
const LONG : & str = long ! ( "\
0 commit comments