From 8060f5ca3fa2831d9887b2dbd3034a15640a2224 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Tue, 18 Jun 2019 21:28:10 -0400 Subject: [PATCH] api: switch to extension traits This commit effectively rewrites the entire API of this crate to use extension traits on `[u8]` and `Vec<u8>`. While the `BStr` and `BString` types are still present, they are now just dumb wrappers that deref to `[u8]` and `Vec<u8>`, respectively. Their primary purpose is for convenient debug impls. The motivation for this design is laid out in #5. Closes #5 --- README.md | 45 +- bench/Cargo.lock | 266 +-- bench/src/bench.rs | 26 +- bench/src/search.rs | 8 +- examples/graphemes-std.rs | 2 +- examples/graphemes.rs | 6 +- examples/lines-std.rs | 2 +- examples/lines.rs | 8 +- examples/uppercase-std.rs | 2 +- examples/uppercase.rs | 9 +- examples/words-std.rs | 2 +- examples/words.rs | 4 +- src/bstr.rs | 3954 +------------------------------------ src/bstring.rs | 1559 +-------------- src/cow.rs | 59 +- src/ext_slice.rs | 3471 ++++++++++++++++++++++++++++++++ src/ext_vec.rs | 1070 ++++++++++ src/freqs.rs | 258 --- src/impls.rs | 88 +- src/io.rs | 54 +- src/lib.rs | 177 +- src/search/prefilter.rs | 12 +- src/search/tests.rs | 44 +- src/search/twoway.rs | 92 +- src/slice_index.rs | 292 --- src/unicode/grapheme.rs | 64 +- src/unicode/sentence.rs | 54 +- src/unicode/word.rs | 94 +- src/utf8.rs | 81 +- 29 files changed, 5210 insertions(+), 6593 deletions(-) create mode 100644 src/ext_slice.rs create mode 100644 src/ext_vec.rs delete mode 100644 src/freqs.rs delete mode 100644 src/slice_index.rs diff --git a/README.md b/README.md index 28be1f3..0c467d4 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,9 @@ bstr ==== -This crate provides a `BString` and `BStr` types that are conventionally UTF-8 -for Rust. They differ from the standard library's `String` and `str` types in -that they are not required to be valid UTF-8, but may be fully or partially -valid UTF-8. 
+This crate provides extension traits for `&[u8]` and `Vec<u8>` that enable +their use as byte strings, where byte strings are _conventionally_ UTF-8. This +differs from the standard library's `String` and `str` types in that they are +not required to be valid UTF-8, but may be fully or partially valid UTF-8. [![Linux build status](https://api.travis-ci.org/BurntSushi/bstr.svg)](https://travis-ci.org/BurntSushi/bstr) [![Windows build status](https://ci.appveyor.com/api/projects/status/github/BurntSushi/bstr?svg=true)](https://ci.appveyor.com/project/BurntSushi/bstr) @@ -18,7 +18,7 @@ https://docs.rs/bstr ### When should I use byte strings? See this part of the documentation for more details: -https://docs.rs/bstr/0.1.0/bstr/#when-should-i-use-byte-strings. +https://docs.rs/bstr/0.2.0/bstr/#when-should-i-use-byte-strings. The short story is that byte strings are useful when it is inconvenient or incorrect to require valid UTF-8. @@ -30,7 +30,7 @@ Add this to your `Cargo.toml`: ```toml [dependencies] -bstr = "0.1" +bstr = "0.2" ``` @@ -46,15 +46,15 @@ stdin, and print out lines containing a particular substring: use std::error::Error; use std::io::{self, Write}; -use bstr::io::BufReadExt; +use bstr::{ByteSlice, io::BufReadExt}; -fn main() -> Result<(), Box<Error>> { +fn main() -> Result<(), Box<dyn Error>> { let stdin = io::stdin(); let mut stdout = io::BufWriter::new(io::stdout()); stdin.lock().for_byte_line_with_terminator(|line| { - if line.contains("Dimension") { - stdout.write_all(line.as_bytes())?; + if line.contains_str("Dimension") { + stdout.write_all(line)?; } Ok(true) })?; @@ -69,9 +69,9 @@ line-by-line: use std::error::Error; use std::io; -use bstr::io::BufReadExt; +use bstr::{ByteSlice, io::BufReadExt}; -fn main() -> Result<(), Box<Error>> { +fn main() -> Result<(), Box<dyn Error>> { let stdin = io::stdin(); let mut words = 0; stdin.lock().for_byte_line_with_terminator(|line| { @@ -92,18 +92,17 @@ library APIs. (N.B. Any invalid UTF-8 bytes are passed through unchanged.) 
use std::error::Error; use std::io::{self, Write}; -use bstr::BString; -use bstr::io::BufReadExt; +use bstr::{ByteSlice, io::BufReadExt}; -fn main() -> Result<(), Box<Error>> { +fn main() -> Result<(), Box<dyn Error>> { let stdin = io::stdin(); let mut stdout = io::BufWriter::new(io::stdout()); - let mut upper = BString::new(); + let mut upper = vec![]; stdin.lock().for_byte_line_with_terminator(|line| { upper.clear(); line.to_uppercase_into(&mut upper); - stdout.write_all(upper.as_bytes())?; + stdout.write_all(&upper)?; Ok(true) })?; Ok(()) @@ -118,9 +117,9 @@ as a single character and are passed through correctly: use std::error::Error; use std::io::{self, Write}; -use bstr::io::BufReadExt; +use bstr::{ByteSlice, io::BufReadExt}; -fn main() -> Result<(), Box<Error>> { +fn main() -> Result<(), Box<dyn Error>> { let stdin = io::stdin(); let mut stdout = io::BufWriter::new(io::stdout()); @@ -131,7 +130,7 @@ fn main() -> Result<(), Box<Error>> { .take(10) .last() .unwrap_or(line.len()); - stdout.write_all(line[..end].trim_end().as_bytes())?; + stdout.write_all(line[..end].trim_end())?; stdout.write_all(b"\n")?; Ok(true) })?; @@ -146,7 +145,7 @@ This crates comes with a few features that control standard library, serde and Unicode support. * `std` - **Enabled** by default. This provides APIs that require the standard - library, such as `BString`. + library, such as `Vec<u8>`. * `unicode` - **Enabled** by default. This provides APIs that require sizable Unicode data compiled into the binary. This includes, but is not limited to, grapheme/word/sentence segmenters. When this is disabled, basic support such @@ -206,8 +205,8 @@ except for `memchr`, is optional. ### High level motivation Strictly speaking, the `bstr` crate provides very little that can't already be -achieved with a `Vec<u8>`/`&[u8]` and the ecosystem of library crates. For -example: +achieved with the standard library `Vec<u8>`/`&[u8]` APIs and the ecosystem of +library crates. 
For example: * The standard library's [`Utf8Error`](https://doc.rust-lang.org/std/str/struct.Utf8Error.html) diff --git a/bench/Cargo.lock b/bench/Cargo.lock index 84bcf0b..9f04ef7 100644 --- a/bench/Cargo.lock +++ b/bench/Cargo.lock @@ -13,37 +13,42 @@ name = "atty" version = "0.2.11" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "libc 0.2.50 (registry+https://github.com/rust-lang/crates.io-index)", - "termion 1.5.1 (registry+https://github.com/rust-lang/crates.io-index)", - "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.58 (registry+https://github.com/rust-lang/crates.io-index)", + "termion 1.5.3 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "autocfg" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "bitflags" -version = "1.0.4" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] name = "bstr" -version = "0.1.0" +version = "0.1.4" dependencies = [ "lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)", "memchr 2.2.0 (registry+https://github.com/rust-lang/crates.io-index)", - "regex-automata 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)", + "regex-automata 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] name = "bstr-bench" version = "0.0.1" dependencies = [ - "bstr 0.1.0", - "criterion 0.2.10 (registry+https://github.com/rust-lang/crates.io-index)", - "unicode-segmentation 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)", + "bstr 0.1.4", + "criterion 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)", + "unicode-segmentation 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] name = "byteorder" -version = "1.3.1" +version = "1.3.2" source = 
"registry+https://github.com/rust-lang/crates.io-index" [[package]] @@ -53,16 +58,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] name = "cfg-if" -version = "0.1.7" +version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] name = "clap" -version = "2.32.0" +version = "2.33.0" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", - "textwrap 0.10.0 (registry+https://github.com/rust-lang/crates.io-index)", + "bitflags 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", + "textwrap 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)", "unicode-width 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", ] @@ -71,81 +76,93 @@ name = "cloudabi" version = "0.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", + "bitflags 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] name = "criterion" -version = "0.2.10" +version = "0.2.11" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "atty 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)", "cast 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", - "clap 2.32.0 (registry+https://github.com/rust-lang/crates.io-index)", - "criterion-plot 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)", - "csv 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)", + "clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)", + "criterion-plot 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", + "csv 1.0.7 (registry+https://github.com/rust-lang/crates.io-index)", "itertools 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)", "lazy_static 1.3.0 
(registry+https://github.com/rust-lang/crates.io-index)", - "num-traits 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.58 (registry+https://github.com/rust-lang/crates.io-index)", + "num-traits 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)", "rand_core 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", "rand_os 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", "rand_xoshiro 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", - "rayon 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)", - "serde 1.0.89 (registry+https://github.com/rust-lang/crates.io-index)", - "serde_derive 1.0.89 (registry+https://github.com/rust-lang/crates.io-index)", + "rayon 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", + "rayon-core 1.5.0 (registry+https://github.com/rust-lang/crates.io-index)", + "serde 1.0.93 (registry+https://github.com/rust-lang/crates.io-index)", + "serde_derive 1.0.93 (registry+https://github.com/rust-lang/crates.io-index)", "serde_json 1.0.39 (registry+https://github.com/rust-lang/crates.io-index)", - "tinytemplate 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)", - "walkdir 2.2.7 (registry+https://github.com/rust-lang/crates.io-index)", + "tinytemplate 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", + "walkdir 2.2.8 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] name = "criterion-plot" -version = "0.3.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "byteorder 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)", + "byteorder 1.3.2 (registry+https://github.com/rust-lang/crates.io-index)", "cast 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", "itertools 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] name = "crossbeam-deque" -version = "0.2.0" +version = "0.6.3" source = 
"registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "crossbeam-epoch 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", - "crossbeam-utils 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-epoch 0.7.1 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-utils 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] name = "crossbeam-epoch" -version = "0.3.1" +version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "arrayvec 0.4.10 (registry+https://github.com/rust-lang/crates.io-index)", - "cfg-if 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)", - "crossbeam-utils 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", + "cfg-if 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-utils 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)", "lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)", "memoffset 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", - "nodrop 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)", "scopeguard 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "crossbeam-queue" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "crossbeam-utils 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "crossbeam-utils" -version = "0.2.2" +version = "0.6.5" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "cfg-if 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)", + "cfg-if 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)", + "lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] name = "csv" -version = "1.0.5" +version = "1.0.7" source = 
"registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "csv-core 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", - "serde 1.0.89 (registry+https://github.com/rust-lang/crates.io-index)", + "itoa 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)", + "ryu 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)", + "serde 1.0.93 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -158,7 +175,7 @@ dependencies = [ [[package]] name = "either" -version = "1.5.1" +version = "1.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] @@ -171,12 +188,12 @@ name = "itertools" version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "either 1.5.1 (registry+https://github.com/rust-lang/crates.io-index)", + "either 1.5.2 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] name = "itoa" -version = "0.4.3" +version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] @@ -186,7 +203,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] name = "libc" -version = "0.2.50" +version = "0.2.58" source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] @@ -194,7 +211,7 @@ name = "memchr" version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "libc 0.2.50 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.58 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -209,20 +226,28 @@ source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] name = "num-traits" -version = "0.2.6" +version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "autocfg 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)", +] [[package]] name = "num_cpus" -version = "1.10.0" +version = "1.10.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "libc 0.2.50 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.58 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "numtoa" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "proc-macro2" -version = "0.4.27" +version = "0.4.30" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", @@ -230,10 +255,10 @@ dependencies = [ [[package]] name = "quote" -version = "0.6.11" +version = "0.6.12" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "proc-macro2 0.4.27 (registry+https://github.com/rust-lang/crates.io-index)", + "proc-macro2 0.4.30 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -256,10 +281,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)", "fuchsia-cprng 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", - "libc 0.2.50 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.58 (registry+https://github.com/rust-lang/crates.io-index)", "rand_core 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", "rdrand 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", - "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -267,29 +292,30 @@ name = "rand_xoshiro" version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "byteorder 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)", + "byteorder 1.3.2 (registry+https://github.com/rust-lang/crates.io-index)", "rand_core 0.3.1 
(registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] name = "rayon" -version = "1.0.3" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "crossbeam-deque 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)", - "either 1.5.1 (registry+https://github.com/rust-lang/crates.io-index)", - "rayon-core 1.4.1 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-deque 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)", + "either 1.5.2 (registry+https://github.com/rust-lang/crates.io-index)", + "rayon-core 1.5.0 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] name = "rayon-core" -version = "1.4.1" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "crossbeam-deque 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-deque 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-queue 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-utils 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)", "lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)", - "libc 0.2.50 (registry+https://github.com/rust-lang/crates.io-index)", - "num_cpus 1.10.0 (registry+https://github.com/rust-lang/crates.io-index)", + "num_cpus 1.10.1 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -302,7 +328,7 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.1.51" +version = "0.1.54" source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] @@ -310,20 +336,20 @@ name = "redox_termios" version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "redox_syscall 0.1.51 (registry+https://github.com/rust-lang/crates.io-index)", + "redox_syscall 0.1.54 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] name = 
"regex-automata" -version = "0.1.6" +version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "byteorder 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)", + "byteorder 1.3.2 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] name = "ryu" -version = "0.2.7" +version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] @@ -341,17 +367,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] name = "serde" -version = "1.0.89" +version = "1.0.93" source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] name = "serde_derive" -version = "1.0.89" +version = "1.0.93" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "proc-macro2 0.4.27 (registry+https://github.com/rust-lang/crates.io-index)", - "quote 0.6.11 (registry+https://github.com/rust-lang/crates.io-index)", - "syn 0.15.29 (registry+https://github.com/rust-lang/crates.io-index)", + "proc-macro2 0.4.30 (registry+https://github.com/rust-lang/crates.io-index)", + "quote 0.6.12 (registry+https://github.com/rust-lang/crates.io-index)", + "syn 0.15.38 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -359,34 +385,35 @@ name = "serde_json" version = "1.0.39" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "itoa 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)", - "ryu 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)", - "serde 1.0.89 (registry+https://github.com/rust-lang/crates.io-index)", + "itoa 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)", + "ryu 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)", + "serde 1.0.93 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] name = "syn" -version = "0.15.29" +version = "0.15.38" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies 
= [ - "proc-macro2 0.4.27 (registry+https://github.com/rust-lang/crates.io-index)", - "quote 0.6.11 (registry+https://github.com/rust-lang/crates.io-index)", + "proc-macro2 0.4.30 (registry+https://github.com/rust-lang/crates.io-index)", + "quote 0.6.12 (registry+https://github.com/rust-lang/crates.io-index)", "unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] name = "termion" -version = "1.5.1" +version = "1.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "libc 0.2.50 (registry+https://github.com/rust-lang/crates.io-index)", - "redox_syscall 0.1.51 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.58 (registry+https://github.com/rust-lang/crates.io-index)", + "numtoa 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", + "redox_syscall 0.1.54 (registry+https://github.com/rust-lang/crates.io-index)", "redox_termios 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] name = "textwrap" -version = "0.10.0" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "unicode-width 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", @@ -394,16 +421,16 @@ dependencies = [ [[package]] name = "tinytemplate" -version = "1.0.1" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "serde 1.0.89 (registry+https://github.com/rust-lang/crates.io-index)", + "serde 1.0.93 (registry+https://github.com/rust-lang/crates.io-index)", "serde_json 1.0.39 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] name = "unicode-segmentation" -version = "1.2.1" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] @@ -418,17 +445,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] name = "walkdir" -version = "2.2.7" +version = "2.2.8" source = 
"registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "same-file 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", - "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)", "winapi-util 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] name = "winapi" -version = "0.3.6" +version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", @@ -445,7 +472,7 @@ name = "winapi-util" version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -456,57 +483,60 @@ source = "registry+https://github.com/rust-lang/crates.io-index" [metadata] "checksum arrayvec 0.4.10 (registry+https://github.com/rust-lang/crates.io-index)" = "92c7fb76bc8826a8b33b4ee5bb07a247a81e76764ab4d55e8f73e3a4d8808c71" "checksum atty 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)" = "9a7d5b8723950951411ee34d271d99dddcc2035a16ab25310ea2c8cfd4369652" -"checksum bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)" = "228047a76f468627ca71776ecdebd732a3423081fcf5125585bcd7c49886ce12" -"checksum byteorder 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "a019b10a2a7cdeb292db131fc8113e57ea2a908f6e7894b0c3c671893b65dbeb" +"checksum autocfg 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "0e49efa51329a5fd37e7c79db4621af617cd4e3e5bc224939808d076077077bf" +"checksum bitflags 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "3d155346769a6855b86399e9bc3814ab343cd3d62c7e985113d46a0ec3c281fd" +"checksum byteorder 1.3.2 
(registry+https://github.com/rust-lang/crates.io-index)" = "a7c3dd8985a7111efc5c80b44e23ecdd8c007de8ade3b96595387e812b957cf5" "checksum cast 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "926013f2860c46252efceabb19f4a6b308197505082c609025aa6706c011d427" -"checksum cfg-if 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)" = "11d43355396e872eefb45ce6342e4374ed7bc2b3a502d1b28e36d6e23c05d1f4" -"checksum clap 2.32.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b957d88f4b6a63b9d70d5f454ac8011819c6efa7727858f458ab71c756ce2d3e" +"checksum cfg-if 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)" = "b486ce3ccf7ffd79fdeb678eac06a9e6c09fc88d33836340becb8fffe87c5e33" +"checksum clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)" = "5067f5bb2d80ef5d68b4c87db81601f0b75bca627bc2ef76b141d7b846a3c6d9" "checksum cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f" -"checksum criterion 0.2.10 (registry+https://github.com/rust-lang/crates.io-index)" = "1c6e5ee5b9652d4f851418c448af105642e1f99e9a2741a8ff45c0d2c911b1e0" -"checksum criterion-plot 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "4107e4a5abb94267e0149922b8ff49dc70a87cc202820fdbfc0d3e1edbdc4b16" -"checksum crossbeam-deque 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "f739f8c5363aca78cfb059edf753d8f0d36908c348f3d8d1503f03d8b75d9cf3" -"checksum crossbeam-epoch 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "927121f5407de9956180ff5e936fe3cf4324279280001cd56b669d28ee7e9150" -"checksum crossbeam-utils 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "2760899e32a1d58d5abb31129f8fae5de75220bc2176e77ff7c627ae45c918d9" -"checksum csv 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)" = "9fd1c44c58078cfbeaf11fbb3eac9ae5534c23004ed770cc4bfb48e658ae4f04" +"checksum criterion 0.2.11 
(registry+https://github.com/rust-lang/crates.io-index)" = "0363053954f3e679645fc443321ca128b7b950a6fe288cf5f9335cc22ee58394" +"checksum criterion-plot 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "76f9212ddf2f4a9eb2d401635190600656a1f88a932ef53d06e7fa4c7e02fb8e" +"checksum crossbeam-deque 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)" = "05e44b8cf3e1a625844d1750e1f7820da46044ff6d28f4d43e455ba3e5bb2c13" +"checksum crossbeam-epoch 0.7.1 (registry+https://github.com/rust-lang/crates.io-index)" = "04c9e3102cc2d69cd681412141b390abd55a362afc1540965dad0ad4d34280b4" +"checksum crossbeam-queue 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7c979cd6cfe72335896575c6b5688da489e420d36a27a0b9eb0c73db574b4a4b" +"checksum crossbeam-utils 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)" = "f8306fcef4a7b563b76b7dd949ca48f52bc1141aa067d2ea09565f3e2652aa5c" +"checksum csv 1.0.7 (registry+https://github.com/rust-lang/crates.io-index)" = "9044e25afb0924b5a5fc5511689b0918629e85d68ea591e5e87fbf1e85ea1b3b" "checksum csv-core 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "fa5cdef62f37e6ffe7d1f07a381bc0db32b7a3ff1cac0de56cb0d81e71f53d65" -"checksum either 1.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "c67353c641dc847124ea1902d69bd753dee9bb3beff9aa3662ecf86c971d1fac" +"checksum either 1.5.2 (registry+https://github.com/rust-lang/crates.io-index)" = "5527cfe0d098f36e3f8839852688e63c8fff1c90b2b405aef730615f9a7bcf7b" "checksum fuchsia-cprng 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba" "checksum itertools 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)" = "5b8467d9c1cebe26feb08c640139247fac215782d35371ade9a2136ed6085358" -"checksum itoa 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)" = "1306f3464951f30e30d12373d31c79fbd52d236e5e896fd92f96ec7babbbe60b" +"checksum itoa 0.4.4 
(registry+https://github.com/rust-lang/crates.io-index)" = "501266b7edd0174f8530248f87f99c88fbe60ca4ef3dd486835b8d8d53136f7f" "checksum lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bc5729f27f159ddd61f4df6228e827e86643d4d3e7c32183cb30a1c08f604a14" -"checksum libc 0.2.50 (registry+https://github.com/rust-lang/crates.io-index)" = "aab692d7759f5cd8c859e169db98ae5b52c924add2af5fbbca11d12fefb567c1" +"checksum libc 0.2.58 (registry+https://github.com/rust-lang/crates.io-index)" = "6281b86796ba5e4366000be6e9e18bf35580adf9e63fbe2294aadb587613a319" "checksum memchr 2.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "2efc7bc57c883d4a4d6e3246905283d8dae951bb3bd32f49d6ef297f546e1c39" "checksum memoffset 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "0f9dc261e2b62d7a622bf416ea3c5245cdd5d9a7fcc428c0d06804dfce1775b3" "checksum nodrop 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)" = "2f9667ddcc6cc8a43afc9b7917599d7216aa09c463919ea32c59ed6cac8bc945" -"checksum num-traits 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)" = "0b3a5d7cc97d6d30d8b9bc8fa19bf45349ffe46241e8816f50f62f6d6aaabee1" -"checksum num_cpus 1.10.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1a23f0ed30a54abaa0c7e83b1d2d87ada7c3c23078d1d87815af3e3b6385fbba" -"checksum proc-macro2 0.4.27 (registry+https://github.com/rust-lang/crates.io-index)" = "4d317f9caece796be1980837fd5cb3dfec5613ebdb04ad0956deea83ce168915" -"checksum quote 0.6.11 (registry+https://github.com/rust-lang/crates.io-index)" = "cdd8e04bd9c52e0342b406469d494fcb033be4bdbe5c606016defbb1681411e1" +"checksum num-traits 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)" = "6ba9a427cfca2be13aa6f6403b0b7e7368fe982bfa16fccc450ce74c46cd9b32" +"checksum num_cpus 1.10.1 (registry+https://github.com/rust-lang/crates.io-index)" = "bcef43580c035376c0705c42792c294b66974abbfd2789b511784023f71f3273" +"checksum numtoa 0.1.0 
(registry+https://github.com/rust-lang/crates.io-index)" = "b8f8bdf33df195859076e54ab11ee78a1b208382d3a26ec40d142ffc1ecc49ef" +"checksum proc-macro2 0.4.30 (registry+https://github.com/rust-lang/crates.io-index)" = "cf3d2011ab5c909338f7887f4fc896d35932e29146c12c8d01da6b22a80ba759" +"checksum quote 0.6.12 (registry+https://github.com/rust-lang/crates.io-index)" = "faf4799c5d274f3868a4aae320a0a182cbd2baee377b378f080e16a23e9d80db" "checksum rand_core 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "7a6fdeb83b075e8266dcc8762c22776f6877a63111121f5f8c7411e5be7eed4b" "checksum rand_core 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "d0e7a549d590831370895ab7ba4ea0c1b6b011d106b5ff2da6eee112615e6dc0" "checksum rand_os 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "7b75f676a1e053fc562eafbb47838d67c84801e38fc1ba459e8f180deabd5071" "checksum rand_xoshiro 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "03b418169fb9c46533f326efd6eed2576699c44ca92d3052a066214a8d828929" -"checksum rayon 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "373814f27745b2686b350dd261bfd24576a6fb0e2c5919b3a2b6005f820b0473" -"checksum rayon-core 1.4.1 (registry+https://github.com/rust-lang/crates.io-index)" = "b055d1e92aba6877574d8fe604a63c8b5df60f60e5982bf7ccbb1338ea527356" +"checksum rayon 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a4b0186e22767d5b9738a05eab7c6ac90b15db17e5b5f9bd87976dd7d89a10a4" +"checksum rayon-core 1.5.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ebbe0df8435ac0c397d467b6cad6d25543d06e8a019ef3f6af3c384597515bd2" "checksum rdrand 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "678054eb77286b51581ba43620cc911abf02758c91f93f479767aed0f90458b2" -"checksum redox_syscall 0.1.51 (registry+https://github.com/rust-lang/crates.io-index)" = "423e376fffca3dfa06c9e9790a9ccd282fafb3cc6e6397d01dbf64f9bacc6b85" +"checksum redox_syscall 0.1.54 
(registry+https://github.com/rust-lang/crates.io-index)" = "12229c14a0f65c4f1cb046a3b52047cdd9da1f4b30f8a39c5063c8bae515e252" "checksum redox_termios 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "7e891cfe48e9100a70a3b6eb652fef28920c117d366339687bd5576160db0f76" -"checksum regex-automata 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "a25a7daa2eea48550e9946133d6cc9621020d29cc7069089617234bf8b6a8693" -"checksum ryu 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "eb9e9b8cde282a9fe6a42dd4681319bfb63f121b8a8ee9439c6f4107e58a46f7" +"checksum regex-automata 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)" = "3ed09217220c272b29ef237a974ad58515bde75f194e3ffa7e6d0bf0f3b01f86" +"checksum ryu 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)" = "b96a9549dc8d48f2c283938303c4b5a77aa29bfbc5b54b084fb1630408899a8f" "checksum same-file 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)" = "8f20c4be53a8a1ff4c1f1b2bd14570d2f634628709752f0702ecdd2b3f9a5267" "checksum scopeguard 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "94258f53601af11e6a49f722422f6e3425c52b06245a5cf9bc09908b174f5e27" -"checksum serde 1.0.89 (registry+https://github.com/rust-lang/crates.io-index)" = "92514fb95f900c9b5126e32d020f5c6d40564c27a5ea6d1d7d9f157a96623560" -"checksum serde_derive 1.0.89 (registry+https://github.com/rust-lang/crates.io-index)" = "bb6eabf4b5914e88e24eea240bb7c9f9a2cbc1bbbe8d961d381975ec3c6b806c" +"checksum serde 1.0.93 (registry+https://github.com/rust-lang/crates.io-index)" = "960e29cf7004b3b6e65fc5002981400eb3ccc017a08a2406940823e58e7179a9" +"checksum serde_derive 1.0.93 (registry+https://github.com/rust-lang/crates.io-index)" = "c4cce6663696bd38272e90bf34a0267e1226156c33f52d3f3915a2dd5d802085" "checksum serde_json 1.0.39 (registry+https://github.com/rust-lang/crates.io-index)" = "5a23aa71d4a4d43fdbfaac00eff68ba8a06a51759a89ac3304323e800c4dd40d" -"checksum syn 0.15.29 
(registry+https://github.com/rust-lang/crates.io-index)" = "1825685f977249735d510a242a6727b46efe914bb67e38d30c071b1b72b1d5c2" -"checksum termion 1.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "689a3bdfaab439fd92bc87df5c4c78417d3cbe537487274e9b0b2dce76e92096" -"checksum textwrap 0.10.0 (registry+https://github.com/rust-lang/crates.io-index)" = "307686869c93e71f94da64286f9a9524c0f308a9e1c87a583de8e9c9039ad3f6" -"checksum tinytemplate 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "7655088894274afb52b807bd3c87072daa1fedd155068b8705cabfd628956115" -"checksum unicode-segmentation 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "aa6024fc12ddfd1c6dbc14a80fa2324d4568849869b779f6bd37e5e4c03344d1" +"checksum syn 0.15.38 (registry+https://github.com/rust-lang/crates.io-index)" = "37ea458a750f59ab679b47fef9b6722c586c5742f4cfe18a120bbc807e5e01fd" +"checksum termion 1.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "6a8fb22f7cde82c8220e5aeacb3258ed7ce996142c77cba193f203515e26c330" +"checksum textwrap 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)" = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" +"checksum tinytemplate 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "4574b75faccaacddb9b284faecdf0b544b80b6b294f3d062d325c5726a209c20" +"checksum unicode-segmentation 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1967f4cdfc355b37fd76d2a954fb2ed3871034eb4f26d60537d88795cfc332a9" "checksum unicode-width 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "882386231c45df4700b275c7ff55b6f3698780a650026380e72dabe76fa46526" "checksum unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc" -"checksum walkdir 2.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "9d9d7ed3431229a144296213105a390676cc49c9b6a72bd19f3176c98e129fa1" -"checksum 
winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "92c1eb33641e276cfa214a0522acad57be5c56b10cb348b3c5117db75f3ac4b0" +"checksum walkdir 2.2.8 (registry+https://github.com/rust-lang/crates.io-index)" = "c7904a7e2bb3cdf0cf5e783f44204a85a37a93151738fa349f06680f59a98b45" +"checksum winapi 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)" = "f10e386af2b13e47c89e7236a7a14a086791a2b88ebad6df9bf42040195cf770" "checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" "checksum winapi-util 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7168bab6e1daee33b4557efd0e95d5ca70a03706d39fa5f3fe7a236f584b03c9" "checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" diff --git a/bench/src/bench.rs b/bench/src/bench.rs index fe374b0..ec7b1b6 100644 --- a/bench/src/bench.rs +++ b/bench/src/bench.rs @@ -5,7 +5,7 @@ extern crate criterion; extern crate bstr; extern crate unicode_segmentation; -use bstr::BStr; +use bstr::{B, ByteSlice}; use criterion::{Bencher, Benchmark, Criterion, Throughput}; use inputs::*; @@ -48,35 +48,35 @@ fn is_ascii(c: &mut Criterion) { let corpus = SHERLOCK_HUGE; define(c, "is_ascii", "huge-ascii", corpus, move |b| { b.iter(|| { - assert!(BStr::from_bytes(corpus).is_ascii()); + assert!(corpus.is_ascii()); }); }); let corpus = SHERLOCK_SMALL; define(c, "is_ascii", "small-ascii", corpus, move |b| { b.iter(|| { - assert!(BStr::from_bytes(corpus).is_ascii()); + assert!(corpus.is_ascii()); }); }); let corpus = SHERLOCK_TINY; define(c, "is_ascii", "tiny-ascii", corpus, move |b| { b.iter(|| { - assert!(BStr::from_bytes(corpus).is_ascii()); + assert!(corpus.is_ascii()); }); }); let corpus = EMPTY; define(c, "is_ascii", "empty-ascii", corpus, move |b| { b.iter(|| { - 
assert!(BStr::from_bytes(corpus).is_ascii()); + assert!(corpus.is_ascii()); }); }); let corpus = "abcdefghijklm☃abcdefghijklmnopqrstuvwxyz".as_bytes(); define(c, "is_ascii", "tiny-non-ascii", corpus, move |b| { b.iter(|| { - assert!(!BStr::from_bytes(corpus).is_ascii()); + assert!(!corpus.is_ascii()); }); }); } @@ -86,7 +86,7 @@ fn to_str(c: &mut Criterion) { for &(name, corpus) in CORPORA_HUGE { define(c, "bstr/to_str", name, corpus, move |b| { b.iter(|| { - assert!(BStr::from_bytes(corpus).to_str().is_ok()); + assert!(corpus.to_str().is_ok()); }); }); } @@ -107,7 +107,7 @@ fn to_str_lossy_valid(c: &mut Criterion) { for &(name, corpus) in CORPORA_HUGE { define(c, "bstr/to_str_lossy_valid", name, corpus, move |b| { b.iter(|| { - assert!(BStr::from_bytes(corpus).to_str_lossy().len() > 0); + assert!(corpus.to_str_lossy().len() > 0); }); }); } @@ -127,7 +127,7 @@ fn trim(c: &mut Criterion) { // benchmark our impl define(c, "bstr/trim", "tiny", corpus.as_bytes(), move |b| { b.iter(|| { - assert_eq!("foo\tbar", BStr::new(corpus).trim()); + assert_eq!("foo\tbar".as_bytes(), B(corpus).trim()); }); }); @@ -145,7 +145,7 @@ fn chars(c: &mut Criterion) { define(c, "bstr/chars", name, corpus, move |b| { b.iter(|| { let mut count = 0; - for ch in BStr::from_bytes(corpus).chars() { + for ch in corpus.chars() { count += ch.len_utf8(); } assert!(count > 0); @@ -175,7 +175,7 @@ fn graphemes(c: &mut Criterion) { define(c, "bstr/graphemes", name, corpus, move |b| { b.iter(|| { let mut count = 0; - for g in BStr::from_bytes(corpus).graphemes() { + for g in corpus.graphemes() { count += g.len(); } assert!(count > 0); @@ -206,7 +206,7 @@ fn words(c: &mut Criterion) { define(c, "bstr/words", name, corpus, move |b| { b.iter(|| { let mut count = 0; - for g in BStr::from_bytes(corpus).words() { + for g in corpus.words() { count += g.len(); } assert!(count > 0); @@ -237,7 +237,7 @@ fn sentences(c: &mut Criterion) { define(c, "bstr/sentences", name, corpus, move |b| { b.iter(|| { let mut 
count = 0; - for g in BStr::from_bytes(corpus).sentences() { + for g in corpus.sentences() { count += g.len(); } assert!(count > 0); diff --git a/bench/src/search.rs b/bench/src/search.rs index 2374a65..eaa1bf6 100644 --- a/bench/src/search.rs +++ b/bench/src/search.rs @@ -1,6 +1,6 @@ use std::str; -use bstr::BStr; +use bstr::ByteSlice; use criterion::Criterion; use inputs::*; @@ -108,7 +108,7 @@ pub fn rfind_iter(c: &mut Criterion) { pub fn find_char(c: &mut Criterion) { let corpus = str::from_utf8(SUBTITLE_EN_HUGE).unwrap(); define(c, "bstr/find_char", "en-huge-ascii", corpus.as_bytes(), move |b| { - let corpus = BStr::new(corpus); + let corpus = corpus.as_bytes(); b.iter(|| { assert_eq!(None, corpus.find_char('γ')); }); @@ -133,7 +133,7 @@ fn define_find_iter( let name = format!("bstr/{}", group_name); define(c, &name, bench_name, corpus.as_bytes(), move |b| { - let corpus = BStr::new(corpus); + let corpus = corpus.as_bytes(); b.iter(|| { assert_eq!(expected, corpus.find_iter(needle).count()); }); @@ -159,7 +159,7 @@ fn define_rfind_iter( let name = format!("bstr/{}", group_name); define(c, &name, bench_name, corpus.as_bytes(), move |b| { - let corpus = BStr::new(corpus); + let corpus = corpus.as_bytes(); b.iter(|| { assert_eq!(expected, corpus.rfind_iter(needle).count()); }); diff --git a/examples/graphemes-std.rs b/examples/graphemes-std.rs index da3faf3..3522736 100644 --- a/examples/graphemes-std.rs +++ b/examples/graphemes-std.rs @@ -5,7 +5,7 @@ use std::io::{self, BufRead, Write}; use unicode_segmentation::UnicodeSegmentation; -fn main() -> Result<(), Box> { +fn main() -> Result<(), Box> { let stdin = io::stdin(); let mut stdin = stdin.lock(); let mut stdout = io::BufWriter::new(io::stdout()); diff --git a/examples/graphemes.rs b/examples/graphemes.rs index b680e02..309dbf6 100644 --- a/examples/graphemes.rs +++ b/examples/graphemes.rs @@ -3,9 +3,9 @@ extern crate bstr; use std::error::Error; use std::io::{self, Write}; -use bstr::io::BufReadExt; +use 
bstr::{ByteSlice, io::BufReadExt}; -fn main() -> Result<(), Box> { +fn main() -> Result<(), Box> { let stdin = io::stdin(); let mut stdout = io::BufWriter::new(io::stdout()); @@ -16,7 +16,7 @@ fn main() -> Result<(), Box> { .take(10) .last() .unwrap_or(line.len()); - stdout.write_all(line[..end].trim_end().as_bytes())?; + stdout.write_all(line[..end].trim_end())?; stdout.write_all(b"\n")?; Ok(true) })?; diff --git a/examples/lines-std.rs b/examples/lines-std.rs index 51da217..69fc6a5 100644 --- a/examples/lines-std.rs +++ b/examples/lines-std.rs @@ -1,7 +1,7 @@ use std::error::Error; use std::io::{self, BufRead, Write}; -fn main() -> Result<(), Box> { +fn main() -> Result<(), Box> { let stdin = io::stdin(); let mut stdin = stdin.lock(); let mut stdout = io::BufWriter::new(io::stdout()); diff --git a/examples/lines.rs b/examples/lines.rs index 8429aee..746b502 100644 --- a/examples/lines.rs +++ b/examples/lines.rs @@ -3,15 +3,15 @@ extern crate bstr; use std::error::Error; use std::io::{self, Write}; -use bstr::io::BufReadExt; +use bstr::{ByteSlice, io::BufReadExt}; -fn main() -> Result<(), Box> { +fn main() -> Result<(), Box> { let stdin = io::stdin(); let mut stdout = io::BufWriter::new(io::stdout()); stdin.lock().for_byte_line_with_terminator(|line| { - if line.contains("Dimension") { - stdout.write_all(line.as_bytes())?; + if line.contains_str("Dimension") { + stdout.write_all(line)?; } Ok(true) })?; diff --git a/examples/uppercase-std.rs b/examples/uppercase-std.rs index 3029d76..672bd71 100644 --- a/examples/uppercase-std.rs +++ b/examples/uppercase-std.rs @@ -1,7 +1,7 @@ use std::error::Error; use std::io::{self, BufRead, Write}; -fn main() -> Result<(), Box> { +fn main() -> Result<(), Box> { let stdin = io::stdin(); let mut stdin = stdin.lock(); let mut stdout = io::BufWriter::new(io::stdout()); diff --git a/examples/uppercase.rs b/examples/uppercase.rs index 08bf40a..168f878 100644 --- a/examples/uppercase.rs +++ b/examples/uppercase.rs @@ -3,18 +3,17 @@ 
extern crate bstr; use std::error::Error; use std::io::{self, Write}; -use bstr::BString; -use bstr::io::BufReadExt; +use bstr::{ByteSlice, io::BufReadExt}; -fn main() -> Result<(), Box> { +fn main() -> Result<(), Box> { let stdin = io::stdin(); let mut stdout = io::BufWriter::new(io::stdout()); - let mut upper = BString::new(); + let mut upper = vec![]; stdin.lock().for_byte_line_with_terminator(|line| { upper.clear(); line.to_uppercase_into(&mut upper); - stdout.write_all(upper.as_bytes())?; + stdout.write_all(&upper)?; Ok(true) })?; Ok(()) diff --git a/examples/words-std.rs b/examples/words-std.rs index 39c82e3..7eae116 100644 --- a/examples/words-std.rs +++ b/examples/words-std.rs @@ -5,7 +5,7 @@ use std::io::{self, BufRead}; use unicode_segmentation::UnicodeSegmentation; -fn main() -> Result<(), Box> { +fn main() -> Result<(), Box> { let stdin = io::stdin(); let mut stdin = stdin.lock(); diff --git a/examples/words.rs b/examples/words.rs index 64fb19d..8d8d344 100644 --- a/examples/words.rs +++ b/examples/words.rs @@ -3,9 +3,9 @@ extern crate bstr; use std::error::Error; use std::io; -use bstr::io::BufReadExt; +use bstr::{ByteSlice, io::BufReadExt}; -fn main() -> Result<(), Box> { +fn main() -> Result<(), Box> { let stdin = io::stdin(); let mut words = 0; stdin.lock().for_byte_line_with_terminator(|line| { diff --git a/src/bstr.rs b/src/bstr.rs index b1bc2f9..fd8f8b3 100644 --- a/src/bstr.rs +++ b/src/bstr.rs @@ -1,114 +1,13 @@ -#[cfg(feature = "std")] -use std::borrow::Cow; -#[cfg(feature = "std")] -use std::ffi::OsStr; -#[cfg(feature = "std")] -use std::iter; -#[cfg(feature = "std")] -use std::path::Path; - -use core::cmp; use core::mem; -use core::ops; -use core::ptr; -use core::slice; -use core::str; - -use memchr::{memchr, memrchr}; - -use ascii; -#[cfg(feature = "std")] -use bstring::BString; -use search::{PrefilterState, TwoWay}; -use slice_index::SliceIndex; -#[cfg(feature = "unicode")] -use unicode::{ - Graphemes, GraphemeIndices, - Sentences, 
SentenceIndices, - Words, WordIndices, WordsWithBreaks, WordsWithBreakIndices, - whitespace_len_fwd, whitespace_len_rev, -}; -use utf8::{self, Chars, CharIndices, Utf8Error}; - -/// A short-hand constructor for building a `&BStr`. -/// -/// This idiosyncratic constructor is useful for concisely building byte string -/// slices. Its primary utility is in conveniently writing byte string literals -/// in a uniform way. For example, consider this code that does not compile: -/// -/// ```ignore -/// let strs = vec![b"a", b"xy"]; -/// ``` -/// -/// The above code doesn't compile because the type of the byte string literal -/// `b"a"` is `&'static [u8; 1]`, and the type of `b"xy"` is -/// `&'static [u8; 2]`. Since their types aren't the same, they can't be stored -/// in the same `Vec`. (This is dissimilar from normal Unicode string slices, -/// where both `"a"` and `"xy"` have the same type of `&'static str`.) -/// -/// One way of getting the above code to compile is to convert byte strings to -/// slices. You might try this: -/// -/// ```ignore -/// let strs = vec![&b"a", &b"xy"]; -/// ``` -/// -/// But this just creates values with type `& &'static [u8; 1]` and -/// `& &'static [u8; 2]`. Instead, you need to force the issue like so: -/// -/// ``` -/// let strs = vec![&b"a"[..], &b"xy"[..]]; -/// // or -/// let strs = vec![b"a".as_ref(), b"xy".as_ref()]; -/// ``` -/// -/// But neither of these are particularly convenient to type, especially when -/// it's something as common as a string literal. Thus, this constructor -/// permits writing the following instead: -/// -/// ``` -/// use bstr::B; -/// -/// let strs = vec![B("a"), B(b"xy")]; -/// ``` -/// -/// Notice that this also lets you mix and match both string literals and byte -/// string literals. This can be quite convenient! -#[allow(non_snake_case)] -#[inline] -pub fn B<'a, B: ?Sized + AsRef<[u8]>>(bytes: &'a B) -> &'a BStr { - BStr::new(bytes.as_ref()) -} -/// A byte string slice that is conventionally UTF-8. 
-/// -/// A byte string slice is the core string type in this library, and is usually -/// seen in its borrowed form, `&BStr`. The principle difference between a -/// `&BStr` and a `&str` (Rust's standard Unicode string slice) is that a -/// `&BStr` is only *conventionally* UTF-8, where as a `&str` is guaranteed to -/// always be valid UTF-8. +/// A wrapper for `&[u8]` that provides convenient string oriented trait impls. /// /// If you need ownership or a growable byte string buffer, then use /// [`BString`](struct.BString.html). /// -/// # Literals -/// -/// A byte string literal has type `&'static BStr`. The most convenient way to -/// write a byte string literal is by using the short-hand [`B`](fn.B.html) -/// constructor function: -/// -/// ``` -/// use bstr::{B, BStr}; -/// -/// // A byte string literal can be constructed from a normal Unicode string. -/// let s = B("a byte string literal"); -/// // A byte string literal can also be constructed from a Rust byte string. -/// let s = B(b"another byte string literal"); -/// -/// // BStr::new can also be used: -/// let s = BStr::new("a byte string literal"); -/// let s = BStr::new(b"another byte string literal"); -/// ``` +/// Using a `&BStr` is just like using a `&[u8]`, since `BStr` +/// implements `Deref` to `[u8]`. So all methods available on `[u8]` +/// are also available on `BStr`. /// /// # Representation /// @@ -127,3865 +26,36 @@ pub fn B<'a, B: ?Sized + AsRef<[u8]>>(bytes: &'a B) -> &'a BStr { /// The `Display` implementation behaves as if `BStr` were first lossily /// converted to a `str`. Invalid UTF-8 bytes are substituted with the Unicode /// replacement codepoint, which looks like this: �. -/// -/// # Indexing and slicing -/// -/// A `BStr` implements indexing and slicing using `[..]` notation. Unlike -/// the standard `str` type, the `BStr` type permits callers to index -/// individual bytes. 
For example: -/// -/// ``` -/// use bstr::B; -/// -/// let s = B("foo☃bar"); -/// assert_eq!(&s[0..3], "foo"); -/// assert_eq!(s[2], b'o'); -/// assert_eq!(&s[3..6], "☃"); -/// -/// // Nothing stops you from indexing or slicing invalid UTF-8. -/// assert_eq!(s[3], b'\xE2'); -/// assert_eq!(&s[3..5], B(b"\xE2\x98")); -/// ``` #[derive(Hash)] pub struct BStr { - bytes: [u8], + pub(crate) bytes: [u8], } impl BStr { - /// Create a byte string slice from anything that can be borrowed as a - /// sequence of bytes. This includes, but is not limited to, `&Vec`, - /// `&[u8]`, `&String` and `&str`. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::BStr; - /// - /// assert_eq!("abc", BStr::new("abc")); - /// assert_eq!("abc", BStr::new(b"abc")); - /// ``` #[inline] - pub fn new>(bytes: &B) -> &BStr { + pub(crate) fn new>(bytes: &B) -> &BStr { BStr::from_bytes(bytes.as_ref()) } - /// Create a mutable byte string slice from anything that can be borrowed - /// as a sequence of bytes. This includes, but is not limited to, `&mut - /// Vec` and `&mut [u8]`. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::BStr; - /// - /// assert_eq!("abc", BStr::new("abc")); - /// assert_eq!("abc", BStr::new(b"abc")); - /// ``` #[inline] - pub fn new_mut>(bytes: &mut B) -> &mut BStr { + pub(crate) fn new_mut>( + bytes: &mut B, + ) -> &mut BStr { BStr::from_bytes_mut(bytes.as_mut()) } - /// Create an immutable byte string slice from an immutable byte slice. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::BStr; - /// - /// let bytes = &[b'a']; - /// let bs = BStr::from_bytes(bytes); - /// assert_eq!("a", bs); - /// ``` #[inline] - pub fn from_bytes(slice: &[u8]) -> &BStr { + pub(crate) fn from_bytes(slice: &[u8]) -> &BStr { unsafe { mem::transmute(slice) } } - /// Create a mutable byte string slice from a mutable byte slice. 
- /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::BStr; - /// - /// let bytes = &mut [b'a']; - /// { - /// let bs = BStr::from_bytes_mut(bytes); - /// bs[0] = b'b'; - /// } - /// assert_eq!(b"b", bytes); - /// ``` #[inline] - pub fn from_bytes_mut(slice: &mut [u8]) -> &mut BStr { + pub(crate) fn from_bytes_mut(slice: &mut [u8]) -> &mut BStr { unsafe { mem::transmute(slice) } } - /// Create a byte string from its constituent pointer and length, where - /// the length is the number of bytes in the byte string. - /// - /// # Safety - /// - /// This function is unsafe as there is no guarantee that the given pointer - /// is valid for `len` elements, nor whether the lifetime inferred is a - /// suitable lifetime for the returned slice. - /// - /// `data` must be a non-null pointer, even for a zero length slice. A - /// pointer that is usable for zero-length slices can be obtaining from - /// the standard library's `NonNull::dangling()` constructor. - /// - /// The total size of the given slice must be no larger than `isize::MAX` - /// bytes in memory. - /// - /// # Caveat - /// - /// The lifetime for the returned slice is inferred from its usage. To - /// prevent accidental misuse, it's suggested to tie the lifetime to - /// whichever source lifetime is safe in the context, such as by providing - /// a helper function taking the lifetime of a host value for the slice, or - /// by explicit annotation. 
- /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::BStr; - /// - /// // manifest a byte string from a single byte - /// let x = b'Z'; - /// let ptr = &x as *const u8; - /// let s = unsafe { BStr::from_raw_parts(ptr, 1) }; - /// assert_eq!(s, "Z"); - /// ``` - pub unsafe fn from_raw_parts<'a>(data: *const u8, len: usize) -> &'a BStr { - BStr::new(slice::from_raw_parts(data, len)) - } - - /// Create a mutable byte string from its constituent pointer and length, - /// where the length is the number of bytes in the byte string. - /// - /// # Safety - /// - /// This function is unsafe as there is no guarantee that the given pointer - /// is valid for `len` elements, nor whether the lifetime inferred is a - /// suitable lifetime for the returned slice. - /// - /// `data` must be a non-null pointer, even for a zero length slice. A - /// pointer that is usable for zero-length slices can be obtaining from - /// the standard library's `NonNull::dangling()` constructor. - /// - /// The total size of the given slice must be no larger than `isize::MAX` - /// bytes in memory. - /// - /// The above reasons are the same as for - /// [`from_raw_parts`](#method.from_raw_parts). In addition, for this - /// constructor, callers must guarantee that the mutable slice returned - /// is not aliased with any other reference. - /// - /// # Caveat - /// - /// The lifetime for the returned slice is inferred from its usage. To - /// prevent accidental misuse, it's suggested to tie the lifetime to - /// whichever source lifetime is safe in the context, such as by providing - /// a helper function taking the lifetime of a host value for the slice, or - /// by explicit annotation. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use std::mem; - /// use bstr::{BStr, BString}; - /// - /// // For demonstration purposes, get a mutable pointer to a byte string. 
- /// let mut buf = BString::from("bar"); - /// let ptr = buf.as_mut_ptr(); - /// // Drop buf without deallocating, to avoid &mut aliasing. - /// mem::forget(buf); - /// - /// // Now convert it to a mutable byte string from the raw pointer. - /// let mut s = unsafe { BStr::from_raw_parts_mut(ptr, 3) }; - /// s.make_ascii_uppercase(); - /// assert_eq!(s, "BAR"); - /// ``` - pub unsafe fn from_raw_parts_mut<'a>( - data: *mut u8, - len: usize, - ) -> &'a mut BStr { - BStr::new_mut(slice::from_raw_parts_mut(data, len)) - } - - /// Create an immutable byte string from an OS string slice. - /// - /// On Unix, this always succeeds and is zero cost. On non-Unix systems, - /// this returns `None` if the given OS string is not valid UTF-8. (For - /// example, on Windows, file paths are allowed to be a sequence of - /// arbitrary 16-bit integers. Not all such sequences can be transcoded to - /// valid UTF-8.) - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use std::ffi::OsStr; - /// - /// use bstr::BStr; - /// - /// let os_str = OsStr::new("foo"); - /// let bs = BStr::from_os_str(os_str).expect("should be valid UTF-8"); - /// assert_eq!(bs, "foo"); - /// ``` - #[cfg(feature = "std")] - #[inline] - pub fn from_os_str(os_str: &OsStr) -> Option<&BStr> { - BStr::from_os_str_imp(os_str) - } - - #[cfg(feature = "std")] - #[cfg(unix)] - #[inline] - fn from_os_str_imp(os_str: &OsStr) -> Option<&BStr> { - use std::os::unix::ffi::OsStrExt; - - Some(BStr::new(os_str.as_bytes())) - } - - #[cfg(feature = "std")] - #[cfg(not(unix))] - #[inline] - fn from_os_str_imp(os_str: &OsStr) -> Option<&BStr> { - os_str.to_str().map(BStr::new) - } - - /// Create an immutable byte string from a file path. - /// - /// On Unix, this always succeeds and is zero cost. On non-Unix systems, - /// this returns `None` if the given path is not valid UTF-8. (For example, - /// on Windows, file paths are allowed to be a sequence of arbitrary 16-bit - /// integers. 
Not all such sequences can be transcoded to valid UTF-8.) - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use std::path::Path; - /// - /// use bstr::BStr; - /// - /// let path = Path::new("foo"); - /// let bs = BStr::from_path(path).expect("should be valid UTF-8"); - /// assert_eq!(bs, "foo"); - /// ``` - #[cfg(feature = "std")] - #[inline] - pub fn from_path(path: &Path) -> Option<&BStr> { - BStr::from_os_str(path.as_os_str()) - } - - /// Returns the length, in bytes, of this byte string. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::BStr; - /// - /// assert_eq!(0, BStr::new("").len()); - /// assert_eq!(3, BStr::new("abc").len()); - /// assert_eq!(8, BStr::new("☃βツ").len()); - /// ``` #[inline] - pub fn len(&self) -> usize { - self.bytes.len() - } - - /// Returns true if and only if the length of this byte string is zero. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::BStr; - /// - /// assert!(BStr::new("").is_empty()); - /// assert!(!BStr::new("abc").is_empty()); - /// ``` - #[inline] - pub fn is_empty(&self) -> bool { - self.bytes.is_empty() - } - - /// Returns an immutable byte slice of this `BStr`'s contents. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::B; - /// - /// let s = B("hello"); - /// - /// assert_eq!(&[104, 101, 108, 108, 111], s.as_bytes()); - /// ``` - #[inline] - pub fn as_bytes(&self) -> &[u8] { + pub(crate) fn as_bytes(&self) -> &[u8] { &self.bytes } - - /// Returns a mutable byte slice of this `BStr`'s contents. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::BString; - /// - /// let mut s = BString::from("hello"); - /// s.as_bytes_mut()[1] = b'a'; - /// - /// assert_eq!(&[104, 97, 108, 108, 111], s.as_bytes()); - /// ``` - #[inline] - pub fn as_bytes_mut(&mut self) -> &mut [u8] { - &mut self.bytes - } - - /// Create a new owned byte string from this byte string slice. 
- /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::BStr; - /// - /// let s = BStr::new("abc"); - /// let mut owned = s.to_bstring(); - /// owned.push_char('d'); - /// assert_eq!("abcd", owned); - /// ``` - #[cfg(feature = "std")] - #[inline] - pub fn to_bstring(&self) -> BString { - BString::from_vec(self.as_bytes().to_vec()) - } - - /// Safely convert this byte string into a `&str` if it's valid UTF-8. - /// - /// If this byte string is not valid UTF-8, then an error is returned. The - /// error returned indicates the first invalid byte found and the length - /// of the error. - /// - /// In cases where a lossy conversion to `&str` is acceptable, then use one - /// of the [`to_str_lossy`](#method.to_str_lossy) - /// or [`to_str_lossy_into`](#method.to_str_lossy_into) methods. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::{B, BString}; - /// - /// # fn example() -> Result<(), bstr::Utf8Error> { - /// let s = B("☃βツ").to_str()?; - /// assert_eq!("☃βツ", s); - /// - /// let mut bstring = BString::from("☃βツ"); - /// bstring.push_byte(b'\xFF'); - /// let err = bstring.to_str().unwrap_err(); - /// assert_eq!(8, err.valid_up_to()); - /// # Ok(()) }; example().unwrap() - /// ``` - #[inline] - pub fn to_str(&self) -> Result<&str, Utf8Error> { - utf8::validate(self.as_bytes()).map(|_| { - // SAFETY: This is safe because of the guarantees provided by - // utf8::validate. - unsafe { - str::from_utf8_unchecked(self.as_bytes()) - } - }) - } - - /// Unsafely convert this byte string into a `&str`, without checking for - /// valid UTF-8. - /// - /// # Safety - /// - /// Callers *must* ensure that this byte string is valid UTF-8 before - /// calling this method. Converting a byte string into a `&str` that is - /// not valid UTF-8 is considered undefined behavior. 
- /// - /// This routine is useful in performance sensitive contexts where the - /// UTF-8 validity of the byte string is already known and it is - /// undesirable to pay the cost of an additional UTF-8 validation check - /// that [`to_str`](#method.to_str) performs. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::{B, BString}; - /// - /// // SAFETY: This is safe because string literals are guaranteed to be - /// // valid UTF-8 by the Rust compiler. - /// let s = unsafe { B("☃βツ").to_str_unchecked() }; - /// assert_eq!("☃βツ", s); - /// ``` - pub unsafe fn to_str_unchecked(&self) -> &str { - str::from_utf8_unchecked(self.as_bytes()) - } - - /// Convert this byte string to a valid UTF-8 string by replacing invalid - /// UTF-8 bytes with the Unicode replacement codepoint (`U+FFFD`). - /// - /// If the byte string is already valid UTF-8, then no copying or - /// allocation is performed and a borrrowed string slice is returned. If - /// the byte string is not valid UTF-8, then an owned string buffer is - /// returned with invalid bytes replaced by the replacement codepoint. - /// - /// This method uses the "substitution of maximal subparts" (Unicode - /// Standard, Chapter 3, Section 9) strategy for inserting the replacement - /// codepoint. Specifically, a replacement codepoint is inserted whenever a - /// byte is found that cannot possibly lead to a valid code unit sequence. - /// If there were previous bytes that represented a prefix of a well-formed - /// code unit sequence, then all of those bytes are substituted with a - /// single replacement codepoint. The "substitution of maximal subparts" - /// strategy is the same strategy used by - /// [W3C's Encoding standard](https://www.w3.org/TR/encoding/). - /// For a more precise description of the maximal subpart strategy, see - /// the Unicode Standard, Chapter 3, Section 9. See also - /// [Public Review Issue #121](http://www.unicode.org/review/pr-121.html). - /// - /// N.B. 
Rust's standard library also appears to use the same strategy, - /// but it does not appear to be an API guarantee. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use std::borrow::Cow; - /// use bstr::BString; - /// - /// let mut bstring = BString::from("☃βツ"); - /// assert_eq!(Cow::Borrowed("☃βツ"), bstring.to_str_lossy()); - /// - /// // Add a byte that makes the sequence invalid. - /// bstring.push_byte(b'\xFF'); - /// assert_eq!(Cow::Borrowed("☃βツ\u{FFFD}"), bstring.to_str_lossy()); - /// ``` - /// - /// This demonstrates the "maximal subpart" substitution logic. - /// - /// ``` - /// use bstr::B; - /// - /// // \x61 is the ASCII codepoint for 'a'. - /// // \xF1\x80\x80 is a valid 3-byte code unit prefix. - /// // \xE1\x80 is a valid 2-byte code unit prefix. - /// // \xC2 is a valid 1-byte code unit prefix. - /// // \x62 is the ASCII codepoint for 'b'. - /// // - /// // In sum, each of the prefixes is replaced by a single replacement - /// // codepoint since none of the prefixes are properly completed. This - /// // is in contrast to other strategies that might insert a replacement - /// // codepoint for every single byte. - /// let bs = B(b"\x61\xF1\x80\x80\xE1\x80\xC2\x62"); - /// assert_eq!("a\u{FFFD}\u{FFFD}\u{FFFD}b", bs.to_str_lossy()); - /// ``` - #[cfg(feature = "std")] - #[inline] - pub fn to_str_lossy(&self) -> Cow { - match utf8::validate(self.as_bytes()) { - Ok(()) => { - // SAFETY: This is safe because of the guarantees provided by - // utf8::validate. - unsafe { - Cow::Borrowed(str::from_utf8_unchecked(self.as_bytes())) - } - } - Err(err) => { - let mut lossy = String::with_capacity(self.len()); - let (valid, after) = self - .as_bytes() - .split_at(err.valid_up_to()); - // SAFETY: This is safe because utf8::validate guarantees - // that all of `valid` is valid UTF-8. 
- lossy.push_str(unsafe { str::from_utf8_unchecked(valid) }); - lossy.push_str("\u{FFFD}"); - if let Some(len) = err.error_len() { - B(&after[len..]).to_str_lossy_into(&mut lossy); - } - Cow::Owned(lossy) - } - } - } - - /// Copy the contents of this byte string into the given owned string - /// buffer, while replacing invalid UTF-8 code unit sequences with the - /// Unicode replacement codepoint (`U+FFFD`). - /// - /// This method uses the same "substitution of maximal subparts" strategy - /// for inserting the replacement codepoint as the - /// [`to_str_lossy`](#method.to_str_lossy) method. - /// - /// This routine is useful for amortizing allocation. However, unlike - /// `to_str_lossy`, this routine will _always_ copy the contents of this - /// byte string into the destination buffer, even if this byte string is - /// valid UTF-8. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use std::borrow::Cow; - /// use bstr::BString; - /// - /// let mut bstring = BString::from("☃βツ"); - /// // Add a byte that makes the sequence invalid. - /// bstring.push_byte(b'\xFF'); - /// - /// let mut dest = String::new(); - /// bstring.to_str_lossy_into(&mut dest); - /// assert_eq!("☃βツ\u{FFFD}", dest); - /// ``` - #[cfg(feature = "std")] - #[inline] - pub fn to_str_lossy_into(&self, dest: &mut String) { - dest.reserve(self.len()); - let mut bytes = self.as_bytes(); - loop { - match utf8::validate(bytes) { - Ok(()) => { - // SAFETY: This is safe because utf8::validate guarantees - // that all of `bytes` is valid UTF-8. - dest.push_str(unsafe { str::from_utf8_unchecked(bytes) }); - break; - } - Err(err) => { - let (valid, after) = bytes.split_at(err.valid_up_to()); - // SAFETY: This is safe because utf8::validate guarantees - // that all of `valid` is valid UTF-8. 
- dest.push_str(unsafe { str::from_utf8_unchecked(valid) }); - dest.push_str("\u{FFFD}"); - match err.error_len() { - None => break, - Some(len) => bytes = &after[len..], - } - } - } - } - } - - /// Create an OS string slice from this byte string. - /// - /// On Unix, this always succeeds and is zero cost. On non-Unix systems, - /// this returns a UTF-8 decoding error if this byte string is not valid - /// UTF-8. (For example, on Windows, file paths are allowed to be a - /// sequence of arbitrary 16-bit integers. There is no obvious mapping from - /// an arbitrary sequence of 8-bit integers to an arbitrary sequence of - /// 16-bit integers.) - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::B; - /// - /// let bs = B("foo"); - /// let os_str = bs.to_os_str().expect("should be valid UTF-8"); - /// assert_eq!(os_str, "foo"); - /// ``` - #[cfg(feature = "std")] - #[inline] - pub fn to_os_str(&self) -> Result<&OsStr, Utf8Error> { - self.to_os_str_imp() - } - - #[cfg(feature = "std")] - #[cfg(unix)] - #[inline] - fn to_os_str_imp(&self) -> Result<&OsStr, Utf8Error> { - use std::os::unix::ffi::OsStrExt; - - Ok(OsStr::from_bytes(self.as_bytes())) - } - - #[cfg(feature = "std")] - #[cfg(not(unix))] - #[inline] - fn to_os_str_imp(&self) -> Result<&OsStr, Utf8Error> { - self.to_str().map(OsStr::new) - } - - /// Lossily create an OS string slice from this byte string. - /// - /// On Unix, this always succeeds and is zero cost. On non-Unix systems, - /// this will perform a UTF-8 check and lossily convert this byte string - /// into valid UTF-8 using the Unicode replacement codepoint. - /// - /// Note that this can prevent the correct roundtripping of file paths on - /// non-Unix systems such as Windows, where file paths are an arbitrary - /// sequence of 16-bit integers. 
- /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::B; - /// - /// let bs = B(b"foo\xFFbar"); - /// let os_str = bs.to_os_str_lossy(); - /// assert_eq!(os_str.to_string_lossy(), "foo\u{FFFD}bar"); - /// ``` - #[cfg(feature = "std")] - #[inline] - pub fn to_os_str_lossy(&self) -> Cow { - self.to_os_str_lossy_imp() - } - - #[cfg(feature = "std")] - #[cfg(unix)] - #[inline] - fn to_os_str_lossy_imp(&self) -> Cow { - use std::os::unix::ffi::OsStrExt; - - Cow::Borrowed(OsStr::from_bytes(self.as_bytes())) - } - - #[cfg(feature = "std")] - #[cfg(not(unix))] - #[inline] - fn to_os_str_lossy_imp(&self) -> Cow { - use std::ffi::OsString; - - match self.to_str_lossy() { - Cow::Borrowed(x) => Cow::Borrowed(OsStr::new(x)), - Cow::Owned(x) => Cow::Owned(OsString::from(x)), - } - } - - /// Create a path slice from this byte string. - /// - /// On Unix, this always succeeds and is zero cost. On non-Unix systems, - /// this returns a UTF-8 decoding error if this byte string is not valid - /// UTF-8. (For example, on Windows, file paths are allowed to be a - /// sequence of arbitrary 16-bit integers. There is no obvious mapping from - /// an arbitrary sequence of 8-bit integers to an arbitrary sequence of - /// 16-bit integers.) - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::B; - /// - /// let bs = B("foo"); - /// let path = bs.to_path().expect("should be valid UTF-8"); - /// assert_eq!(path.as_os_str(), "foo"); - /// ``` - #[cfg(feature = "std")] - #[inline] - pub fn to_path(&self) -> Result<&Path, Utf8Error> { - self.to_os_str().map(Path::new) - } - - /// Lossily create a path slice from this byte string. - /// - /// On Unix, this always succeeds and is zero cost. On non-Unix systems, - /// this will perform a UTF-8 check and lossily convert this byte string - /// into valid UTF-8 using the Unicode replacement codepoint. 
- /// - /// Note that this can prevent the correct roundtripping of file paths on - /// non-Unix systems such as Windows, where file paths are an arbitrary - /// sequence of 16-bit integers. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::B; - /// - /// let bs = B(b"foo\xFFbar"); - /// let path = bs.to_path_lossy(); - /// assert_eq!(path.to_string_lossy(), "foo\u{FFFD}bar"); - /// ``` - #[cfg(feature = "std")] - #[inline] - pub fn to_path_lossy(&self) -> Cow { - use std::path::PathBuf; - - match self.to_os_str_lossy() { - Cow::Borrowed(x) => Cow::Borrowed(Path::new(x)), - Cow::Owned(x) => Cow::Owned(PathBuf::from(x)), - } - } - - /// Create a new `BString` by repeating this byte string `n` times. - /// - /// # Panics - /// - /// This function panics if the capacity of the new `BString` would - /// overflow. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::B; - /// - /// assert_eq!("foofoofoofoo", B("foo").repeat(4)); - /// assert_eq!("", B("foo").repeat(0)); - /// ``` - #[cfg(feature = "std")] - #[inline] - pub fn repeat(&self, n: usize) -> BString { - iter::repeat(self).take(n).collect() - } - - /// Returns true if and only if this byte string contains the given needle. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::B; - /// - /// assert!(B("foo bar").contains("foo")); - /// assert!(B("foo bar").contains("bar")); - /// assert!(!B("foo").contains("foobar")); - /// ``` - #[inline] - pub fn contains>(&self, needle: B) -> bool { - self.find(needle).is_some() - } - - /// Returns true if and only if this byte string has the given prefix. 
- /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::B; - /// - /// assert!(B("foo bar").starts_with("foo")); - /// assert!(!B("foo bar").starts_with("bar")); - /// assert!(!B("foo").starts_with("foobar")); - /// ``` - #[inline] - pub fn starts_with>(&self, prefix: B) -> bool { - let prefix = prefix.as_ref(); - self.get(..prefix.len()).map_or(false, |x| x == prefix) - } - - /// Returns true if and only if this byte string has the given suffix. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::B; - /// - /// assert!(B("foo bar").ends_with("bar")); - /// assert!(!B("foo bar").ends_with("foo")); - /// assert!(!B("bar").ends_with("foobar")); - /// ``` - #[inline] - pub fn ends_with>(&self, suffix: B) -> bool { - let suffix = suffix.as_ref(); - self.len() - .checked_sub(suffix.len()) - .map_or(false, |s| &self[s..] == suffix) - } - - /// Returns the index of the first occurrence of the given needle. - /// - /// The needle may be any type that can be cheaply converted into a - /// `&[u8]`. This includes, but is not limited to, `&str`, `&BStr`, and of - /// course, `&[u8]` itself. - /// - /// Note that if you're are searching for the same needle in many - /// different small haystacks, it may be faster to initialize a - /// [`Finder`](struct.Finder.html) once, and reuse it for each search. - /// - /// # Complexity - /// - /// This routine is guaranteed to have worst case linear time complexity - /// with respect to both the needle and the haystack. That is, this runs - /// in `O(needle.len() + haystack.len())` time. - /// - /// This routine is also guaranteed to have worst case constant space - /// complexity. 
- /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::B; - /// - /// let s = B("foo bar baz"); - /// assert_eq!(Some(0), s.find("foo")); - /// assert_eq!(Some(4), s.find("bar")); - /// assert_eq!(None, s.find("quux")); - /// ``` - #[inline] - pub fn find>(&self, needle: B) -> Option { - Finder::new(needle.as_ref()).find(self) - } - - /// Returns the index of the last occurrence of the given needle. - /// - /// The needle may be any type that can be cheaply converted into a - /// `&[u8]`. This includes, but is not limited to, `&str`, `&BStr`, and of - /// course, `&[u8]` itself. - /// - /// Note that if you're are searching for the same needle in many - /// different small haystacks, it may be faster to initialize a - /// [`FinderReverse`](struct.FinderReverse.html) once, and reuse it for - /// each search. - /// - /// # Complexity - /// - /// This routine is guaranteed to have worst case linear time complexity - /// with respect to both the needle and the haystack. That is, this runs - /// in `O(needle.len() + haystack.len())` time. - /// - /// This routine is also guaranteed to have worst case constant space - /// complexity. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::B; - /// - /// let s = B("foo bar baz"); - /// assert_eq!(Some(0), s.rfind("foo")); - /// assert_eq!(Some(4), s.rfind("bar")); - /// assert_eq!(Some(8), s.rfind("ba")); - /// assert_eq!(None, s.rfind("quux")); - /// ``` - #[inline] - pub fn rfind>(&self, needle: B) -> Option { - FinderReverse::new(needle.as_ref()).rfind(self) - } - - /// Returns an iterator of the non-overlapping occurrences of the given - /// needle. The iterator yields byte offset positions indicating the start - /// of each match. - /// - /// # Complexity - /// - /// This routine is guaranteed to have worst case linear time complexity - /// with respect to both the needle and the haystack. That is, this runs - /// in `O(needle.len() + haystack.len())` time. 
- /// - /// This routine is also guaranteed to have worst case constant space - /// complexity. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::B; - /// - /// let s = B("foo bar foo foo quux foo"); - /// let matches: Vec = s.find_iter("foo").collect(); - /// assert_eq!(matches, vec![0, 8, 12, 21]); - /// ``` - /// - /// An empty string matches at every position, including the position - /// immediately following the last byte: - /// - /// ``` - /// use bstr::B; - /// - /// let matches: Vec = B("foo").find_iter("").collect(); - /// assert_eq!(matches, vec![0, 1, 2, 3]); - /// - /// let matches: Vec = B("").find_iter("").collect(); - /// assert_eq!(matches, vec![0]); - /// ``` - #[inline] - pub fn find_iter<'a, B: ?Sized + AsRef<[u8]>>( - &'a self, - needle: &'a B, - ) -> Find<'a> { - Find::new(self, BStr::new(needle.as_ref())) - } - - /// Returns an iterator of the non-overlapping occurrences of the given - /// needle in reverse. The iterator yields byte offset positions indicating - /// the start of each match. - /// - /// # Complexity - /// - /// This routine is guaranteed to have worst case linear time complexity - /// with respect to both the needle and the haystack. That is, this runs - /// in `O(needle.len() + haystack.len())` time. - /// - /// This routine is also guaranteed to have worst case constant space - /// complexity. 
- /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::B; - /// - /// let s = B("foo bar foo foo quux foo"); - /// let matches: Vec = s.rfind_iter("foo").collect(); - /// assert_eq!(matches, vec![21, 12, 8, 0]); - /// ``` - /// - /// An empty string matches at every position, including the position - /// immediately following the last byte: - /// - /// ``` - /// use bstr::B; - /// - /// let matches: Vec = B("foo").rfind_iter("").collect(); - /// assert_eq!(matches, vec![3, 2, 1, 0]); - /// - /// let matches: Vec = B("").rfind_iter("").collect(); - /// assert_eq!(matches, vec![0]); - /// ``` - #[inline] - pub fn rfind_iter<'a, B: ?Sized + AsRef<[u8]>>( - &'a self, - needle: &'a B, - ) -> FindReverse<'a> { - FindReverse::new(self, BStr::new(needle.as_ref())) - } - - /// Returns the index of the first occurrence of the given byte. If the - /// byte does not occur in this byte string, then `None` is returned. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::B; - /// - /// assert_eq!(Some(10), B("foo bar baz").find_byte(b'z')); - /// assert_eq!(None, B("foo bar baz").find_byte(b'y')); - /// ``` - #[inline] - pub fn find_byte(&self, byte: u8) -> Option { - memchr(byte, self.as_bytes()) - } - - /// Returns the index of the last occurrence of the given byte. If the - /// byte does not occur in this byte string, then `None` is returned. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::B; - /// - /// assert_eq!(Some(10), B("foo bar baz").rfind_byte(b'z')); - /// assert_eq!(None, B("foo bar baz").rfind_byte(b'y')); - /// ``` - #[inline] - pub fn rfind_byte(&self, byte: u8) -> Option { - memrchr(byte, self.as_bytes()) - } - - /// Returns the index of the first occurrence of the given codepoint. - /// If the codepoint does not occur in this byte string, then `None` is - /// returned. 
- /// - /// Note that if one searches for the replacement codepoint, `\u{FFFD}`, - /// then only explicit occurrences of that encoding will be found. Invalid - /// UTF-8 sequences will not be matched. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::B; - /// - /// assert_eq!(Some(10), B("foo bar baz").find_char('z')); - /// assert_eq!(Some(4), B("αβγγδ").find_char('γ')); - /// assert_eq!(None, B("foo bar baz").find_char('y')); - /// ``` - #[inline] - pub fn find_char(&self, ch: char) -> Option { - self.find(ch.encode_utf8(&mut [0; 4])) - } - - /// Returns the index of the last occurrence of the given codepoint. - /// If the codepoint does not occur in this byte string, then `None` is - /// returned. - /// - /// Note that if one searches for the replacement codepoint, `\u{FFFD}`, - /// then only explicit occurrences of that encoding will be found. Invalid - /// UTF-8 sequences will not be matched. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::B; - /// - /// assert_eq!(Some(10), B("foo bar baz").rfind_char('z')); - /// assert_eq!(Some(6), B("αβγγδ").rfind_char('γ')); - /// assert_eq!(None, B("foo bar baz").rfind_char('y')); - /// ``` - #[inline] - pub fn rfind_char(&self, ch: char) -> Option { - self.rfind(ch.encode_utf8(&mut [0; 4])) - } - - /// Returns an iterator over the fields in a byte string, separated by - /// contiguous whitespace. 
- /// - /// # Example - /// - /// Basic usage: - /// - /// ``` - /// use bstr::{B, BStr}; - /// - /// let s = B(" foo\tbar\t\u{2003}\nquux \n"); - /// let fields: Vec<&BStr> = s.fields().collect(); - /// assert_eq!(fields, vec!["foo", "bar", "quux"]); - /// ``` - /// - /// A byte string consisting of just whitespace yields no elements: - /// - /// ``` - /// use bstr::B; - /// - /// assert_eq!(0, B(" \n\t\u{2003}\n \t").fields().count()); - /// ``` - #[inline] - pub fn fields(&self) -> Fields { - Fields::new(self) - } - - /// Returns an iterator over the fields in a byte string, separated by - /// contiguous codepoints satisfying the given predicate. - /// - /// If this byte - /// - /// # Example - /// - /// Basic usage: - /// - /// ``` - /// use bstr::{B, BStr}; - /// - /// let s = B("123foo999999bar1quux123456"); - /// let fields: Vec<&BStr> = s.fields_with(|c| c.is_numeric()).collect(); - /// assert_eq!(fields, vec!["foo", "bar", "quux"]); - /// ``` - /// - /// A byte string consisting of all codepoints satisfying the predicate - /// yields no elements: - /// - /// ``` - /// use bstr::B; - /// - /// assert_eq!(0, B("1911354563").fields_with(|c| c.is_numeric()).count()); - /// ``` - #[inline] - pub fn fields_with bool>(&self, f: F) -> FieldsWith { - FieldsWith::new(self, f) - } - - /// Returns an iterator over substrings of this byte string, separated - /// by the given byte string. Each element yielded is guaranteed not to - /// include the splitter substring. - /// - /// The splitter may be any type that can be cheaply converted into a - /// `&[u8]`. This includes, but is not limited to, `&str`, `&BStr`, and of - /// course, `&[u8]` itself. 
- /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::{B, BStr}; - /// - /// let x: Vec<&BStr> = B("Mary had a little lamb").split(" ").collect(); - /// assert_eq!(x, vec!["Mary", "had", "a", "little", "lamb"]); - /// - /// let x: Vec<&BStr> = B("").split("X").collect(); - /// assert_eq!(x, vec![""]); - /// - /// let x: Vec<&BStr> = B("lionXXtigerXleopard").split("X").collect(); - /// assert_eq!(x, vec!["lion", "", "tiger", "leopard"]); - /// - /// let x: Vec<&BStr> = B("lion::tiger::leopard").split("::").collect(); - /// assert_eq!(x, vec!["lion", "tiger", "leopard"]); - /// ``` - /// - /// If a string contains multiple contiguous separators, you will end up - /// with empty strings yielded by the iterator: - /// - /// ``` - /// use bstr::{B, BStr}; - /// - /// let x: Vec<&BStr> = B("||||a||b|c").split("|").collect(); - /// assert_eq!(x, vec!["", "", "", "", "a", "", "b", "c"]); - /// - /// let x: Vec<&BStr> = B("(///)").split("/").collect(); - /// assert_eq!(x, vec!["(", "", "", ")"]); - /// ``` - /// - /// Separators at the start or end of a string are neighbored by empty - /// strings. - /// - /// ``` - /// use bstr::{B, BStr}; - /// - /// let x: Vec<&BStr> = B("010").split("0").collect(); - /// assert_eq!(x, vec!["", "1", ""]); - /// ``` - /// - /// When the empty string is used as a separator, it splits every **byte** - /// in the byte string, along with the beginning and end of the byte - /// string. - /// - /// ``` - /// use bstr::{B, BStr}; - /// - /// let x: Vec<&BStr> = B("rust").split("").collect(); - /// assert_eq!(x, vec!["", "r", "u", "s", "t", ""]); - /// - /// // Splitting by an empty string is not UTF-8 aware. Elements yielded - /// // may not be valid UTF-8! - /// let x: Vec<&BStr> = B("☃").split("").collect(); - /// assert_eq!(x, vec![B(""), B(b"\xE2"), B(b"\x98"), B(b"\x83"), B("")]); - /// ``` - /// - /// Contiguous separators, especially whitespace, can lead to possibly - /// surprising behavior. 
For example, this code is correct: - /// - /// ``` - /// use bstr::{B, BStr}; - /// - /// let x: Vec<&BStr> = B(" a b c").split(" ").collect(); - /// assert_eq!(x, vec!["", "", "", "", "a", "", "b", "c"]); - /// ``` - /// - /// It does *not* give you `["a", "b", "c"]`. For that behavior, use - /// [`fields`](#method.fields) instead. - #[inline] - pub fn split<'a, B: ?Sized + AsRef<[u8]>>( - &'a self, - splitter: &'a B, - ) -> Split<'a> { - Split::new(self, BStr::new(splitter.as_ref())) - } - - /// Returns an iterator over substrings of this byte string, separated by - /// the given byte string, in reverse. Each element yielded is guaranteed - /// not to include the splitter substring. - /// - /// The splitter may be any type that can be cheaply converted into a - /// `&[u8]`. This includes, but is not limited to, `&str`, `&BStr`, and of - /// course, `&[u8]` itself. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::{B, BStr}; - /// - /// let x: Vec<&BStr> = B("Mary had a little lamb").rsplit(" ").collect(); - /// assert_eq!(x, vec!["lamb", "little", "a", "had", "Mary"]); - /// - /// let x: Vec<&BStr> = B("").rsplit("X").collect(); - /// assert_eq!(x, vec![""]); - /// - /// let x: Vec<&BStr> = B("lionXXtigerXleopard").rsplit("X").collect(); - /// assert_eq!(x, vec!["leopard", "tiger", "", "lion"]); - /// - /// let x: Vec<&BStr> = B("lion::tiger::leopard").rsplit("::").collect(); - /// assert_eq!(x, vec!["leopard", "tiger", "lion"]); - /// ``` - /// - /// If a string contains multiple contiguous separators, you will end up - /// with empty strings yielded by the iterator: - /// - /// ``` - /// use bstr::{B, BStr}; - /// - /// let x: Vec<&BStr> = B("||||a||b|c").rsplit("|").collect(); - /// assert_eq!(x, vec!["c", "b", "", "a", "", "", "", ""]); - /// - /// let x: Vec<&BStr> = B("(///)").rsplit("/").collect(); - /// assert_eq!(x, vec![")", "", "", "("]); - /// ``` - /// - /// Separators at the start or end of a string are neighbored by 
empty - /// strings. - /// - /// ``` - /// use bstr::{B, BStr}; - /// - /// let x: Vec<&BStr> = B("010").rsplit("0").collect(); - /// assert_eq!(x, vec!["", "1", ""]); - /// ``` - /// - /// When the empty string is used as a separator, it splits every **byte** - /// in the byte string, along with the beginning and end of the byte - /// string. - /// - /// ``` - /// use bstr::{B, BStr}; - /// - /// let x: Vec<&BStr> = B("rust").rsplit("").collect(); - /// assert_eq!(x, vec!["", "t", "s", "u", "r", ""]); - /// - /// // Splitting by an empty string is not UTF-8 aware. Elements yielded - /// // may not be valid UTF-8! - /// let x: Vec<&BStr> = B("☃").rsplit("").collect(); - /// assert_eq!(x, vec![B(""), B(b"\x83"), B(b"\x98"), B(b"\xE2"), B("")]); - /// ``` - /// - /// Contiguous separators, especially whitespace, can lead to possibly - /// surprising behavior. For example, this code is correct: - /// - /// ``` - /// use bstr::{B, BStr}; - /// - /// let x: Vec<&BStr> = B(" a b c").rsplit(" ").collect(); - /// assert_eq!(x, vec!["c", "b", "", "a", "", "", "", ""]); - /// ``` - /// - /// It does *not* give you `["a", "b", "c"]`. - #[inline] - pub fn rsplit<'a, B: ?Sized + AsRef<[u8]>>( - &'a self, - splitter: &'a B, - ) -> SplitReverse<'a> { - SplitReverse::new(self, BStr::new(splitter.as_ref())) - } - - /// Returns an iterator of at most `limit` substrings of this byte string, - /// separated by the given byte string. If `limit` substrings are yielded, - /// then the last substring will contain the remainder of this byte string. - /// - /// The splitter may be any type that can be cheaply converted into a - /// `&[u8]`. This includes, but is not limited to, `&str`, `&BStr`, and of - /// course, `&[u8]` itself. 
- /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::{B, BStr}; - /// - /// let x: Vec<_> = B("Mary had a little lamb").splitn(3, " ").collect(); - /// assert_eq!(x, vec!["Mary", "had", "a little lamb"]); - /// - /// let x: Vec<_> = B("").splitn(3, "X").collect(); - /// assert_eq!(x, vec![""]); - /// - /// let x: Vec<_> = B("lionXXtigerXleopard").splitn(3, "X").collect(); - /// assert_eq!(x, vec!["lion", "", "tigerXleopard"]); - /// - /// let x: Vec<_> = B("lion::tiger::leopard").splitn(2, "::").collect(); - /// assert_eq!(x, vec!["lion", "tiger::leopard"]); - /// - /// let x: Vec<_> = B("abcXdef").splitn(1, "X").collect(); - /// assert_eq!(x, vec!["abcXdef"]); - /// - /// let x: Vec<_> = B("abcXdef").splitn(0, "X").collect(); - /// assert!(x.is_empty()); - /// ``` - #[inline] - pub fn splitn<'a, B: ?Sized + AsRef<[u8]>>( - &'a self, - limit: usize, - splitter: &'a B, - ) -> SplitN<'a> { - SplitN::new(self, BStr::new(splitter.as_ref()), limit) - } - - /// Returns an iterator of at most `limit` substrings of this byte string, - /// separated by the given byte string, in reverse. If `limit` substrings - /// are yielded, then the last substring will contain the remainder of this - /// byte string. - /// - /// The splitter may be any type that can be cheaply converted into a - /// `&[u8]`. This includes, but is not limited to, `&str`, `&BStr`, and of - /// course, `&[u8]` itself. 
- /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::{B, BStr}; - /// - /// let x: Vec<_> = B("Mary had a little lamb").rsplitn(3, " ").collect(); - /// assert_eq!(x, vec!["lamb", "little", "Mary had a"]); - /// - /// let x: Vec<_> = B("").rsplitn(3, "X").collect(); - /// assert_eq!(x, vec![""]); - /// - /// let x: Vec<_> = B("lionXXtigerXleopard").rsplitn(3, "X").collect(); - /// assert_eq!(x, vec!["leopard", "tiger", "lionX"]); - /// - /// let x: Vec<_> = B("lion::tiger::leopard").rsplitn(2, "::").collect(); - /// assert_eq!(x, vec!["leopard", "lion::tiger"]); - /// - /// let x: Vec<_> = B("abcXdef").rsplitn(1, "X").collect(); - /// assert_eq!(x, vec!["abcXdef"]); - /// - /// let x: Vec<_> = B("abcXdef").rsplitn(0, "X").collect(); - /// assert!(x.is_empty()); - /// ``` - #[inline] - pub fn rsplitn<'a, B: ?Sized + AsRef<[u8]>>( - &'a self, - limit: usize, - splitter: &'a B, - ) -> SplitNReverse<'a> { - SplitNReverse::new(self, BStr::new(splitter.as_ref()), limit) - } - - /// Replace all matches of the given needle with the given replacement, and - /// the result as a new `BString`. - /// - /// This routine is useful as a convenience. If you need to reuse an - /// allocation, use [`replace_into`](#method.replace_into) instead. 
- /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::B; - /// - /// let s = B("this is old").replace("old", "new"); - /// assert_eq!(s, "this is new"); - /// ``` - /// - /// When the pattern doesn't match: - /// - /// ``` - /// use bstr::B; - /// - /// let s = B("this is old").replace("nada nada", "limonada"); - /// assert_eq!(s, "this is old"); - /// ``` - /// - /// When the needle is an empty string: - /// - /// ``` - /// use bstr::B; - /// - /// let s = B("foo").replace("", "Z"); - /// assert_eq!(s, "ZfZoZoZ"); - /// ``` - #[cfg(feature = "std")] - #[inline] - pub fn replace, R: AsRef<[u8]>>( - &self, - needle: N, - replacement: R, - ) -> BString { - let mut dest = BString::with_capacity(self.len()); - self.replace_into(needle, replacement, &mut dest); - dest - } - - /// Replace up to `limit` matches of the given needle with the given - /// replacement, and the result as a new `BString`. - /// - /// This routine is useful as a convenience. If you need to reuse an - /// allocation, use [`replacen_into`](#method.replacen_into) instead. 
- /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::B; - /// - /// let s = B("foofoo").replacen("o", "z", 2); - /// assert_eq!(s, "fzzfoo"); - /// ``` - /// - /// When the pattern doesn't match: - /// - /// ``` - /// use bstr::B; - /// - /// let s = B("foofoo").replacen("a", "z", 2); - /// assert_eq!(s, "foofoo"); - /// ``` - /// - /// When the needle is an empty string: - /// - /// ``` - /// use bstr::B; - /// - /// let s = B("foo").replacen("", "Z", 2); - /// assert_eq!(s, "ZfZoo"); - /// ``` - #[cfg(feature = "std")] - #[inline] - pub fn replacen, R: AsRef<[u8]>>( - &self, - needle: N, - replacement: R, - limit: usize, - ) -> BString { - let mut dest = BString::with_capacity(self.len()); - self.replacen_into(needle, replacement, limit, &mut dest); - dest - } - - /// Replace all matches of the given needle with the given replacement, - /// and write the result into the provided `BString`. - /// - /// This does **not** clear `dest` before writing to it. - /// - /// This routine is useful for reusing allocation. For a more convenient - /// API, use [`replace`](#method.replace) instead. 
- /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::{B, BString}; - /// - /// let s = B("this is old"); - /// - /// let mut dest = BString::new(); - /// s.replace_into("old", "new", &mut dest); - /// assert_eq!(dest, "this is new"); - /// ``` - /// - /// When the pattern doesn't match: - /// - /// ``` - /// use bstr::{B, BString}; - /// - /// let s = B("this is old"); - /// - /// let mut dest = BString::new(); - /// s.replace_into("nada nada", "limonada", &mut dest); - /// assert_eq!(dest, "this is old"); - /// ``` - /// - /// When the needle is an empty string: - /// - /// ``` - /// use bstr::{B, BString}; - /// - /// let s = B("foo"); - /// - /// let mut dest = BString::new(); - /// s.replace_into("", "Z", &mut dest); - /// assert_eq!(dest, "ZfZoZoZ"); - /// ``` - #[cfg(feature = "std")] - #[inline] - pub fn replace_into, R: AsRef<[u8]>>( - &self, - needle: N, - replacement: R, - dest: &mut BString, - ) { - let (needle, replacement) = (needle.as_ref(), replacement.as_ref()); - - let mut last = 0; - for start in self.find_iter(needle) { - dest.push(&self[last..start]); - dest.push(replacement); - last = start + needle.len(); - } - dest.push(&self[last..]); - } - - /// Replace up to `limit` matches of the given needle with the given - /// replacement, and write the result into the provided `BString`. - /// - /// This does **not** clear `dest` before writing to it. - /// - /// This routine is useful for reusing allocation. For a more convenient - /// API, use [`replace`](#method.replacen) instead. 
- /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::{B, BString}; - /// - /// let s = B("foofoo"); - /// - /// let mut dest = BString::new(); - /// s.replacen_into("o", "z", 2, &mut dest); - /// assert_eq!(dest, "fzzfoo"); - /// ``` - /// - /// When the pattern doesn't match: - /// - /// ``` - /// use bstr::{B, BString}; - /// - /// let s = B("foofoo"); - /// - /// let mut dest = BString::new(); - /// s.replacen_into("a", "z", 2, &mut dest); - /// assert_eq!(dest, "foofoo"); - /// ``` - /// - /// When the needle is an empty string: - /// - /// ``` - /// use bstr::{B, BString}; - /// - /// let s = B("foo"); - /// - /// let mut dest = BString::new(); - /// s.replacen_into("", "Z", 2, &mut dest); - /// assert_eq!(dest, "ZfZoo"); - /// ``` - #[cfg(feature = "std")] - #[inline] - pub fn replacen_into, R: AsRef<[u8]>>( - &self, - needle: N, - replacement: R, - limit: usize, - dest: &mut BString, - ) { - let (needle, replacement) = (needle.as_ref(), replacement.as_ref()); - - let mut last = 0; - for start in self.find_iter(needle).take(limit) { - dest.push(&self[last..start]); - dest.push(replacement); - last = start + needle.len(); - } - dest.push(&self[last..]); - } - - /// Returns an iterator over the bytes in this byte string. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::B; - /// - /// let bs = B("foobar"); - /// let bytes: Vec = bs.bytes().collect(); - /// assert_eq!(bytes, bs); - /// ``` - #[inline] - pub fn bytes(&self) -> Bytes { - Bytes { it: self.as_bytes().iter() } - } - - /// Returns an iterator over the Unicode scalar values in this byte string. - /// If invalid UTF-8 is encountered, then the Unicode replacement codepoint - /// is yielded instead. 
- /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::B; - /// - /// let bs = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61"); - /// let chars: Vec = bs.chars().collect(); - /// assert_eq!(vec!['☃', '\u{FFFD}', '𝞃', '\u{FFFD}', 'a'], chars); - /// ``` - /// - /// Codepoints can also be iterated over in reverse: - /// - /// ``` - /// use bstr::B; - /// - /// let bs = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61"); - /// let chars: Vec = bs.chars().rev().collect(); - /// assert_eq!(vec!['a', '\u{FFFD}', '𝞃', '\u{FFFD}', '☃'], chars); - /// ``` - #[inline] - pub fn chars(&self) -> Chars { - Chars::new(self) - } - - /// Returns an iterator over the Unicode scalar values in this byte string - /// along with their starting and ending byte index positions. If invalid - /// UTF-8 is encountered, then the Unicode replacement codepoint is yielded - /// instead. - /// - /// Note that this is slightly different from the `CharIndices` iterator - /// provided by the standard library. Aside from working on possibly - /// invalid UTF-8, this iterator provides both the corresponding starting - /// and ending byte indices of each codepoint yielded. The ending position - /// is necessary to slice the original byte string when invalid UTF-8 bytes - /// are converted into a Unicode replacement codepoint, since a single - /// replacement codepoint can substitute anywhere from 1 to 3 invalid bytes - /// (inclusive). 
- /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::B; - /// - /// let bs = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61"); - /// let chars: Vec<(usize, usize, char)> = bs.char_indices().collect(); - /// assert_eq!(chars, vec![ - /// (0, 3, '☃'), - /// (3, 4, '\u{FFFD}'), - /// (4, 8, '𝞃'), - /// (8, 10, '\u{FFFD}'), - /// (10, 11, 'a'), - /// ]); - /// ``` - /// - /// Codepoints can also be iterated over in reverse: - /// - /// ``` - /// use bstr::B; - /// - /// let bs = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61"); - /// let chars: Vec<(usize, usize, char)> = bs - /// .char_indices() - /// .rev() - /// .collect(); - /// assert_eq!(chars, vec![ - /// (10, 11, 'a'), - /// (8, 10, '\u{FFFD}'), - /// (4, 8, '𝞃'), - /// (3, 4, '\u{FFFD}'), - /// (0, 3, '☃'), - /// ]); - /// ``` - #[inline] - pub fn char_indices(&self) -> CharIndices { - CharIndices::new(self) - } - - /// Returns an iterator over the grapheme clusters in this byte string. - /// If invalid UTF-8 is encountered, then the Unicode replacement codepoint - /// is yielded instead. - /// - /// # Examples - /// - /// This example shows how multiple codepoints can combine to form a - /// single grapheme cluster: - /// - /// ``` - /// use bstr::B; - /// - /// let bs = B("a\u{0300}\u{0316}\u{1F1FA}\u{1F1F8}"); - /// let graphemes: Vec<&str> = bs.graphemes().collect(); - /// assert_eq!(vec!["à̖", "🇺🇸"], graphemes); - /// ``` - /// - /// This shows that graphemes can be iterated over in reverse: - /// - /// ``` - /// use bstr::B; - /// - /// let bs = B("a\u{0300}\u{0316}\u{1F1FA}\u{1F1F8}"); - /// let graphemes: Vec<&str> = bs.graphemes().rev().collect(); - /// assert_eq!(vec!["🇺🇸", "à̖"], graphemes); - /// ``` - #[cfg(feature = "unicode")] - #[inline] - pub fn graphemes(&self) -> Graphemes { - Graphemes::new(self) - } - - /// Returns an iterator over the grapheme clusters in this byte string - /// along with their starting and ending byte index positions. 
If invalid - /// UTF-8 is encountered, then the Unicode replacement codepoint is yielded - /// instead. - /// - /// # Examples - /// - /// This example shows how to get the byte offsets of each individual - /// grapheme cluster: - /// - /// ``` - /// use bstr::B; - /// - /// let bs = B("a\u{0300}\u{0316}\u{1F1FA}\u{1F1F8}"); - /// let graphemes: Vec<(usize, usize, &str)> = - /// bs.grapheme_indices().collect(); - /// assert_eq!(vec![(0, 5, "à̖"), (5, 13, "🇺🇸")], graphemes); - /// ``` - /// - /// This example shows what happens when invalid UTF-8 is enountered. Note - /// that the offsets are valid indices into the original string, and do - /// not necessarily correspond to the length of the `&str` returned! - /// - /// ``` - /// use bstr::BString; - /// - /// let mut bytes = BString::new(); - /// bytes.push("a\u{0300}\u{0316}"); - /// bytes.push_byte(b'\xFF'); - /// bytes.push("\u{1F1FA}\u{1F1F8}"); - /// - /// let graphemes: Vec<(usize, usize, &str)> = - /// bytes.grapheme_indices().collect(); - /// assert_eq!( - /// graphemes, - /// vec![(0, 5, "à̖"), (5, 6, "\u{FFFD}"), (6, 14, "🇺🇸")] - /// ); - /// ``` - #[cfg(feature = "unicode")] - #[inline] - pub fn grapheme_indices(&self) -> GraphemeIndices { - GraphemeIndices::new(self) - } - - /// Returns an iterator over the words in this byte string. If invalid - /// UTF-8 is encountered, then the Unicode replacement codepoint is yielded - /// instead. - /// - /// This is similar to - /// [`words_with_breaks`](struct.BStr.html#method.words_with_breaks), - /// except it only returns elements that contain a "word" character. A word - /// character is defined by UTS #18 (Annex C) to be the combination of the - /// `Alphabetic` and `Join_Control` properties, along with the - /// `Decimal_Number`, `Mark` and `Connector_Punctuation` general - /// categories. - /// - /// Since words are made up of one or more codepoints, this iterator - /// yields `&str` elements. 
When invalid UTF-8 is encountered, replacement - /// codepoints are [substituted](index.html#handling-of-invalid-utf-8). - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::B; - /// - /// let bs = B(r#"The quick ("brown") fox can't jump 32.3 feet, right?"#); - /// let words: Vec<&str> = bs.words().collect(); - /// assert_eq!(words, vec![ - /// "The", "quick", "brown", "fox", "can't", - /// "jump", "32.3", "feet", "right", - /// ]); - /// ``` - #[cfg(feature = "unicode")] - #[inline] - pub fn words(&self) -> Words { - Words::new(self) - } - - /// Returns an iterator over the words in this byte string along with - /// their starting and ending byte index positions. - /// - /// This is similar to - /// [`words_with_break_indices`](struct.BStr.html#method.words_with_break_indices), - /// except it only returns elements that contain a "word" character. A word - /// character is defined by UTS #18 (Annex C) to be the combination of the - /// `Alphabetic` and `Join_Control` properties, along with the - /// `Decimal_Number`, `Mark` and `Connector_Punctuation` general - /// categories. - /// - /// Since words are made up of one or more codepoints, this iterator - /// yields `&str` elements. When invalid UTF-8 is encountered, replacement - /// codepoints are [substituted](index.html#handling-of-invalid-utf-8). - /// - /// # Examples - /// - /// This example shows how to get the byte offsets of each individual - /// word: - /// - /// ``` - /// use bstr::B; - /// - /// let bs = B("can't jump 32.3 feet"); - /// let words: Vec<(usize, usize, &str)> = bs.word_indices().collect(); - /// assert_eq!(words, vec![ - /// (0, 5, "can't"), - /// (6, 10, "jump"), - /// (11, 15, "32.3"), - /// (16, 20, "feet"), - /// ]); - /// ``` - #[cfg(feature = "unicode")] - #[inline] - pub fn word_indices(&self) -> WordIndices { - WordIndices::new(self) - } - - /// Returns an iterator over the words in this byte string, along with - /// all breaks between the words. 
Concatenating all elements yielded by - /// the iterator results in the original string (modulo Unicode replacement - /// codepoint substitutions if invalid UTF-8 is encountered). - /// - /// Since words are made up of one or more codepoints, this iterator - /// yields `&str` elements. When invalid UTF-8 is encountered, replacement - /// codepoints are [substituted](index.html#handling-of-invalid-utf-8). - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::B; - /// - /// let bs = B(r#"The quick ("brown") fox can't jump 32.3 feet, right?"#); - /// let words: Vec<&str> = bs.words_with_breaks().collect(); - /// assert_eq!(words, vec![ - /// "The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", - /// " ", "fox", " ", "can't", " ", "jump", " ", "32.3", " ", "feet", - /// ",", " ", "right", "?", - /// ]); - /// ``` - #[cfg(feature = "unicode")] - #[inline] - pub fn words_with_breaks(&self) -> WordsWithBreaks { - WordsWithBreaks::new(self) - } - - /// Returns an iterator over the words and their byte offsets in this - /// byte string, along with all breaks between the words. Concatenating - /// all elements yielded by the iterator results in the original string - /// (modulo Unicode replacement codepoint substitutions if invalid UTF-8 is - /// encountered). - /// - /// Since words are made up of one or more codepoints, this iterator - /// yields `&str` elements. When invalid UTF-8 is encountered, replacement - /// codepoints are [substituted](index.html#handling-of-invalid-utf-8). 
- /// - /// # Examples - /// - /// This example shows how to get the byte offsets of each individual - /// word: - /// - /// ``` - /// use bstr::B; - /// - /// let bs = B("can't jump 32.3 feet"); - /// let words: Vec<(usize, usize, &str)> = - /// bs.words_with_break_indices().collect(); - /// assert_eq!(words, vec![ - /// (0, 5, "can't"), - /// (5, 6, " "), - /// (6, 10, "jump"), - /// (10, 11, " "), - /// (11, 15, "32.3"), - /// (15, 16, " "), - /// (16, 20, "feet"), - /// ]); - /// ``` - #[cfg(feature = "unicode")] - #[inline] - pub fn words_with_break_indices(&self) -> WordsWithBreakIndices { - WordsWithBreakIndices::new(self) - } - - /// Returns an iterator over the sentences in this byte string. - /// - /// Typically, a sentence will include its trailing punctuation and - /// whitespace. Concatenating all elements yielded by the iterator - /// results in the original string (modulo Unicode replacement codepoint - /// substitutions if invalid UTF-8 is encountered). - /// - /// Since sentences are made up of one or more codepoints, this iterator - /// yields `&str` elements. When invalid UTF-8 is encountered, replacement - /// codepoints are [substituted](index.html#handling-of-invalid-utf-8). - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::B; - /// - /// let bs = B("I want this. Not that. Right now."); - /// let sentences: Vec<&str> = bs.sentences().collect(); - /// assert_eq!(sentences, vec![ - /// "I want this. ", - /// "Not that. ", - /// "Right now.", - /// ]); - /// ``` - #[cfg(feature = "unicode")] - #[inline] - pub fn sentences(&self) -> Sentences { - Sentences::new(self) - } - - /// Returns an iterator over the sentences in this byte string along with - /// their starting and ending byte index positions. - /// - /// Typically, a sentence will include its trailing punctuation and - /// whitespace. 
Concatenating all elements yielded by the iterator - /// results in the original string (modulo Unicode replacement codepoint - /// substitutions if invalid UTF-8 is encountered). - /// - /// Since sentences are made up of one or more codepoints, this iterator - /// yields `&str` elements. When invalid UTF-8 is encountered, replacement - /// codepoints are [substituted](index.html#handling-of-invalid-utf-8). - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::B; - /// - /// let bs = B("I want this. Not that. Right now."); - /// let sentences: Vec<(usize, usize, &str)> = - /// bs.sentence_indices().collect(); - /// assert_eq!(sentences, vec![ - /// (0, 13, "I want this. "), - /// (13, 23, "Not that. "), - /// (23, 33, "Right now."), - /// ]); - /// ``` - #[cfg(feature = "unicode")] - #[inline] - pub fn sentence_indices(&self) -> SentenceIndices { - SentenceIndices::new(self) - } - - /// An iterator over all lines in a byte string, without their - /// terminators. - /// - /// For this iterator, the only line terminators recognized are `\r\n` and - /// `\n`. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::{B, BStr}; - /// - /// let s = B("\ - /// foo - /// - /// bar\r - /// baz - /// - /// - /// quux"); - /// let lines: Vec<&BStr> = s.lines().collect(); - /// assert_eq!(lines, vec![ - /// "foo", "", "bar", "baz", "", "", "quux", - /// ]); - /// ``` - #[inline] - pub fn lines(&self) -> Lines { - Lines::new(self) - } - - /// An iterator over all lines in a byte string, including their - /// terminators. - /// - /// For this iterator, the only line terminator recognized is `\n`. (Since - /// line terminators are included, this also handles `\r\n` line endings.) - /// - /// Line terminators are only included if they are present in the original - /// byte string. For example, the last line in a byte string may not end - /// with a line terminator. 
- /// - /// Concatenating all elements yielded by this iterator is guaranteed to - /// yield the original byte string. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::{B, BStr}; - /// - /// let s = B("\ - /// foo - /// - /// bar\r - /// baz - /// - /// - /// quux"); - /// let lines: Vec<&BStr> = s.lines_with_terminator().collect(); - /// assert_eq!(lines, vec![ - /// "foo\n", "\n", "bar\r\n", "baz\n", "\n", "\n", "quux", - /// ]); - /// ``` - #[inline] - pub fn lines_with_terminator(&self) -> LinesWithTerminator { - LinesWithTerminator::new(self) - } - - /// Return a byte string slice with leading and trailing whitespace - /// removed. - /// - /// Whitespace is defined according to the terms of the `White_Space` - /// Unicode property. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::B; - /// - /// let s = B(" foo\tbar\t\u{2003}\n"); - /// assert_eq!(s.trim(), "foo\tbar"); - /// ``` - #[cfg(feature = "unicode")] - #[inline] - pub fn trim(&self) -> &BStr { - self.trim_start().trim_end() - } - - /// Return a byte string slice with leading whitespace removed. - /// - /// Whitespace is defined according to the terms of the `White_Space` - /// Unicode property. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::B; - /// - /// let s = B(" foo\tbar\t\u{2003}\n"); - /// assert_eq!(s.trim_start(), "foo\tbar\t\u{2003}\n"); - /// ``` - #[inline] - pub fn trim_start(&self) -> &BStr { - self.trim_start_imp() - } - - #[cfg(feature = "unicode")] - #[inline] - fn trim_start_imp(&self) -> &BStr { - let start = whitespace_len_fwd(self.as_bytes()); - &self[start..] - } - - #[cfg(not(feature = "unicode"))] - #[inline] - fn trim_start_imp(&self) -> &BStr { - self.trim_start_with(|c| c.is_whitespace()) - } - - /// Return a byte string slice with trailing whitespace removed. - /// - /// Whitespace is defined according to the terms of the `White_Space` - /// Unicode property. 
- /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::B; - /// - /// let s = B(" foo\tbar\t\u{2003}\n"); - /// assert_eq!(s.trim_end(), " foo\tbar"); - /// ``` - #[inline] - pub fn trim_end(&self) -> &BStr { - self.trim_end_imp() - } - - #[cfg(feature = "unicode")] - #[inline] - fn trim_end_imp(&self) -> &BStr { - let end = whitespace_len_rev(self.as_bytes()); - &self[..end] - } - - #[cfg(not(feature = "unicode"))] - #[inline] - fn trim_end_imp(&self) -> &BStr { - self.trim_end_with(|c| c.is_whitespace()) - } - - /// Return a byte string slice with leading and trailing characters - /// satisfying the given predicate removed. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::B; - /// - /// let s = B("123foo5bar789"); - /// assert_eq!(s.trim_with(|c| c.is_numeric()), "foo5bar"); - /// ``` - #[inline] - pub fn trim_with bool>(&self, mut trim: F) -> &BStr { - self.trim_start_with(&mut trim).trim_end_with(&mut trim) - } - - /// Return a byte string slice with leading characters satisfying the given - /// predicate removed. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::B; - /// - /// let s = B("123foo5bar789"); - /// assert_eq!(s.trim_start_with(|c| c.is_numeric()), "foo5bar789"); - /// ``` - #[inline] - pub fn trim_start_with bool>( - &self, - mut trim: F, - ) -> &BStr { - for (s, _, ch) in self.char_indices() { - if !trim(ch) { - return &self[s..]; - } - } - B("") - } - - /// Return a byte string slice with trailing characters satisfying the - /// given predicate removed. 
- /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::B; - /// - /// let s = B("123foo5bar"); - /// assert_eq!(s.trim_end_with(|c| c.is_numeric()), "123foo5bar"); - /// ``` - #[inline] - pub fn trim_end_with bool>( - &self, - mut trim: F, - ) -> &BStr { - for (_, e, ch) in self.char_indices().rev() { - if !trim(ch) { - return &self[..e]; - } - } - B("") - } - - /// Returns a new `BString` containing the lowercase equivalent of this - /// byte string. - /// - /// In this case, lowercase is defined according to the `Lowercase` Unicode - /// property. - /// - /// If invalid UTF-8 is seen, or if a character has no lowercase variant, - /// then it is written to the given buffer unchanged. - /// - /// Note that some characters in this byte string may expand into multiple - /// characters when changing the case, so the number of bytes written to - /// the given byte string may not be equivalent to the number of bytes in - /// this byte string. - /// - /// If you'd like to reuse an allocation for performance reasons, then use - /// [`to_lowercase_into`](#method.to_lowercase_into) instead. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::{B, BString}; - /// - /// let s = B("HELLO Β"); - /// assert_eq!("hello β", s.to_lowercase()); - /// ``` - /// - /// Scripts without case are not changed: - /// - /// ``` - /// use bstr::{B, BString}; - /// - /// let s = B("农历新年"); - /// assert_eq!("农历新年", s.to_lowercase()); - /// ``` - /// - /// Invalid UTF-8 remains as is: - /// - /// ``` - /// use bstr::{B, BString}; - /// - /// let s = B(b"FOO\xFFBAR\xE2\x98BAZ"); - /// assert_eq!(B(b"foo\xFFbar\xE2\x98baz"), s.to_lowercase()); - /// ``` - #[cfg(all(feature = "std", feature = "unicode"))] - #[inline] - pub fn to_lowercase(&self) -> BString { - let mut buf = BString::new(); - self.to_lowercase_into(&mut buf); - buf - } - - /// Writes the lowercase equivalent of this byte string into the given - /// buffer. 
The buffer is not cleared before written to. - /// - /// In this case, lowercase is defined according to the `Lowercase` - /// Unicode property. - /// - /// If invalid UTF-8 is seen, or if a character has no lowercase variant, - /// then it is written to the given buffer unchanged. - /// - /// Note that some characters in this byte string may expand into multiple - /// characters when changing the case, so the number of bytes written to - /// the given byte string may not be equivalent to the number of bytes in - /// this byte string. - /// - /// If you don't need to amortize allocation and instead prefer - /// convenience, then use [`to_lowercase`](#method.to_lowercase) instead. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::{B, BString}; - /// - /// let s = B("HELLO Β"); - /// - /// let mut buf = BString::new(); - /// s.to_lowercase_into(&mut buf); - /// assert_eq!("hello β", buf); - /// ``` - /// - /// Scripts without case are not changed: - /// - /// ``` - /// use bstr::{B, BString}; - /// - /// let s = B("农历新年"); - /// - /// let mut buf = BString::new(); - /// s.to_lowercase_into(&mut buf); - /// assert_eq!("农历新年", buf); - /// ``` - /// - /// Invalid UTF-8 remains as is: - /// - /// ``` - /// use bstr::{B, BString}; - /// - /// let s = B(b"FOO\xFFBAR\xE2\x98BAZ"); - /// - /// let mut buf = BString::new(); - /// s.to_lowercase_into(&mut buf); - /// assert_eq!(B(b"foo\xFFbar\xE2\x98baz"), buf); - /// ``` - #[cfg(all(feature = "std", feature = "unicode"))] - #[inline] - pub fn to_lowercase_into(&self, buf: &mut BString) { - // TODO: This is the best we can do given what std exposes I think. - // If we roll our own case handling, then we might be able to do this - // a bit faster. We shouldn't roll our own case handling unless we - // need to, e.g., for doing caseless matching or case folding. - - // TODO(BUG): This doesn't handle any special casing rules. 
- - buf.reserve(self.len()); - for (s, e, ch) in self.char_indices() { - if ch == '\u{FFFD}' { - buf.push(&self[s..e]); - } else { - for upper in ch.to_lowercase() { - buf.push_char(upper); - } - } - } - } - - /// Returns a new `BString` containing the ASCII lowercase equivalent of - /// this byte string. - /// - /// In this case, lowercase is only defined in ASCII letters. Namely, the - /// letters `A-Z` are converted to `a-z`. All other bytes remain unchanged. - /// In particular, the length of the byte string returned is always - /// equivalent to the length of this byte string. - /// - /// If you'd like to reuse an allocation for performance reasons, then use - /// [`make_ascii_lowercase`](#method.make_ascii_lowercase) to perform - /// the conversion in place. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::{B, BString}; - /// - /// let s = B("HELLO Β"); - /// assert_eq!("hello Β", s.to_ascii_lowercase()); - /// ``` - /// - /// Invalid UTF-8 remains as is: - /// - /// ``` - /// use bstr::{B, BString}; - /// - /// let s = B(b"FOO\xFFBAR\xE2\x98BAZ"); - /// assert_eq!(B(b"foo\xFFbar\xE2\x98baz"), s.to_ascii_lowercase()); - /// ``` - #[cfg(feature = "std")] - #[inline] - pub fn to_ascii_lowercase(&self) -> BString { - BString::from(self.as_bytes().to_ascii_lowercase()) - } - - /// Convert this byte string to its lowercase ASCII equivalent in place. - /// - /// In this case, lowercase is only defined in ASCII letters. Namely, the - /// letters `A-Z` are converted to `a-z`. All other bytes remain unchanged. - /// - /// If you don't need to do the conversion in - /// place and instead prefer convenience, then use - /// [`to_ascii_lowercase`](#method.to_ascii_lowercase) instead. 
- /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::BString; - /// - /// let mut s = BString::from("HELLO Β"); - /// s.make_ascii_lowercase(); - /// assert_eq!("hello Β", s); - /// ``` - /// - /// Invalid UTF-8 remains as is: - /// - /// ``` - /// use bstr::{B, BString}; - /// - /// let mut s = BString::from_slice(b"FOO\xFFBAR\xE2\x98BAZ"); - /// s.make_ascii_lowercase(); - /// assert_eq!(B(b"foo\xFFbar\xE2\x98baz"), s); - /// ``` - #[inline] - pub fn make_ascii_lowercase(&mut self) { - self.as_bytes_mut().make_ascii_lowercase(); - } - - /// Returns a new `BString` containing the uppercase equivalent of this - /// byte string. - /// - /// In this case, uppercase is defined according to the `Uppercase` - /// Unicode property. - /// - /// If invalid UTF-8 is seen, or if a character has no uppercase variant, - /// then it is written to the given buffer unchanged. - /// - /// Note that some characters in this byte string may expand into multiple - /// characters when changing the case, so the number of bytes written to - /// the given byte string may not be equivalent to the number of bytes in - /// this byte string. - /// - /// If you'd like to reuse an allocation for performance reasons, then use - /// [`to_uppercase_into`](#method.to_uppercase_into) instead. 
- /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::{B, BString}; - /// - /// let s = B("hello β"); - /// assert_eq!("HELLO Β", s.to_uppercase()); - /// ``` - /// - /// Scripts without case are not changed: - /// - /// ``` - /// use bstr::{B, BString}; - /// - /// let s = B("农历新年"); - /// assert_eq!("农历新年", s.to_uppercase()); - /// ``` - /// - /// Invalid UTF-8 remains as is: - /// - /// ``` - /// use bstr::{B, BString}; - /// - /// let s = B(b"foo\xFFbar\xE2\x98baz"); - /// assert_eq!(B(b"FOO\xFFBAR\xE2\x98BAZ"), s.to_uppercase()); - /// ``` - #[cfg(all(feature = "std", feature = "unicode"))] - #[inline] - pub fn to_uppercase(&self) -> BString { - let mut buf = BString::new(); - self.to_uppercase_into(&mut buf); - buf - } - - /// Writes the uppercase equivalent of this byte string into the given - /// buffer. The buffer is not cleared before written to. - /// - /// In this case, uppercase is defined according to the `Uppercase` - /// Unicode property. - /// - /// If invalid UTF-8 is seen, or if a character has no uppercase variant, - /// then it is written to the given buffer unchanged. - /// - /// Note that some characters in this byte string may expand into multiple - /// characters when changing the case, so the number of bytes written to - /// the given byte string may not be equivalent to the number of bytes in - /// this byte string. - /// - /// If you don't need to amortize allocation and instead prefer - /// convenience, then use [`to_uppercase`](#method.to_uppercase) instead. 
- /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::{B, BString}; - /// - /// let s = B("hello β"); - /// - /// let mut buf = BString::new(); - /// s.to_uppercase_into(&mut buf); - /// assert_eq!("HELLO Β", buf); - /// ``` - /// - /// Scripts without case are not changed: - /// - /// ``` - /// use bstr::{B, BString}; - /// - /// let s = B("农历新年"); - /// - /// let mut buf = BString::new(); - /// s.to_uppercase_into(&mut buf); - /// assert_eq!("农历新年", buf); - /// ``` - /// - /// Invalid UTF-8 remains as is: - /// - /// ``` - /// use bstr::{B, BString}; - /// - /// let s = B(b"foo\xFFbar\xE2\x98baz"); - /// - /// let mut buf = BString::new(); - /// s.to_uppercase_into(&mut buf); - /// assert_eq!(B(b"FOO\xFFBAR\xE2\x98BAZ"), buf); - /// ``` - #[cfg(all(feature = "std", feature = "unicode"))] - #[inline] - pub fn to_uppercase_into(&self, buf: &mut BString) { - // TODO: This is the best we can do given what std exposes I think. - // If we roll our own case handling, then we might be able to do this - // a bit faster. We shouldn't roll our own case handling unless we - // need to, e.g., for doing caseless matching or case folding. - buf.reserve(self.len()); - for (s, e, ch) in self.char_indices() { - if ch == '\u{FFFD}' { - buf.push(&self[s..e]); - } else if ch.is_ascii() { - buf.push_char(ch.to_ascii_uppercase()); - } else { - for upper in ch.to_uppercase() { - buf.push_char(upper); - } - } - } - } - - /// Returns a new `BString` containing the ASCII uppercase equivalent of - /// this byte string. - /// - /// In this case, uppercase is only defined in ASCII letters. Namely, the - /// letters `a-z` are converted to `A-Z`. All other bytes remain unchanged. - /// In particular, the length of the byte string returned is always - /// equivalent to the length of this byte string. 
- /// - /// If you'd like to reuse an allocation for performance reasons, then use - /// [`make_ascii_uppercase`](#method.make_ascii_uppercase) to perform - /// the conversion in place. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::{B, BString}; - /// - /// let s = B("hello β"); - /// assert_eq!("HELLO β", s.to_ascii_uppercase()); - /// ``` - /// - /// Invalid UTF-8 remains as is: - /// - /// ``` - /// use bstr::{B, BString}; - /// - /// let s = B(b"foo\xFFbar\xE2\x98baz"); - /// assert_eq!(B(b"FOO\xFFBAR\xE2\x98BAZ"), s.to_ascii_uppercase()); - /// ``` - #[cfg(feature = "std")] - #[inline] - pub fn to_ascii_uppercase(&self) -> BString { - BString::from(self.as_bytes().to_ascii_uppercase()) - } - - /// Convert this byte string to its uppercase ASCII equivalent in place. - /// - /// In this case, uppercase is only defined in ASCII letters. Namely, the - /// letters `a-z` are converted to `A-Z`. All other bytes remain unchanged. - /// - /// If you don't need to do the conversion in - /// place and instead prefer convenience, then use - /// [`to_ascii_uppercase`](#method.to_ascii_uppercase) instead. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::BString; - /// - /// let mut s = BString::from("hello β"); - /// s.make_ascii_uppercase(); - /// assert_eq!("HELLO β", s); - /// ``` - /// - /// Invalid UTF-8 remains as is: - /// - /// ``` - /// use bstr::{B, BString}; - /// - /// let mut s = BString::from_slice(b"foo\xFFbar\xE2\x98baz"); - /// s.make_ascii_uppercase(); - /// assert_eq!(B(b"FOO\xFFBAR\xE2\x98BAZ"), s); - /// ``` - #[inline] - pub fn make_ascii_uppercase(&mut self) { - self.as_bytes_mut().make_ascii_uppercase(); - } - - /// Reverse the bytes in this string, in place. - /// - /// Note that this is not necessarily a well formed operation. 
For example, - /// if this byte string contains valid UTF-8 that isn't ASCII, then - /// reversing the string will likely result in invalid UTF-8 and otherwise - /// non-sensical content. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::BString; - /// - /// let mut s = BString::from("hello"); - /// s.reverse_bytes(); - /// assert_eq!(s, "olleh"); - /// ``` - #[inline] - pub fn reverse_bytes(&mut self) { - self.as_bytes_mut().reverse(); - } - - /// Reverse the codepoints in this string, in place. - /// - /// If this byte string is valid UTF-8, then its reversal by codepoint - /// is also guaranteed to be valid UTF-8. - /// - /// This operation is equivalent to the following, but without allocating: - /// - /// ``` - /// use bstr::BString; - /// - /// let mut s = BString::from("foo☃bar"); - /// - /// let mut chars: Vec = s.chars().collect(); - /// chars.reverse(); - /// - /// let reversed: String = chars.into_iter().collect(); - /// assert_eq!(reversed, "rab☃oof"); - /// ``` - /// - /// Note that this is not necessarily a well formed operation. For example, - /// if this byte string contains grapheme clusters with more than one - /// codepoint, then those grapheme clusters will not necessarily be - /// preserved. If you'd like to preserve grapheme clusters, then use - /// [`reverse_graphemes`](#method.reverse_graphemes) instead. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::BString; - /// - /// let mut s = BString::from("foo☃bar"); - /// s.reverse_chars(); - /// assert_eq!(s, "rab☃oof"); - /// ``` - /// - /// This example shows that not all reversals lead to a well formed string. - /// For example, in this case, combining marks are used to put accents over - /// some letters, and those accent marks must appear after the codepoints - /// they modify. 
- /// - /// ``` - /// use bstr::{B, BString}; - /// - /// let mut s = BString::from("résumé"); - /// s.reverse_chars(); - /// assert_eq!(s, B(b"\xCC\x81emus\xCC\x81er")); - /// ``` - /// - /// A word of warning: the above example relies on the fact that - /// `résumé` is in decomposed normal form, which means there are separate - /// codepoints for the accents above `e`. If it is instead in composed - /// normal form, then the example works: - /// - /// ``` - /// use bstr::{B, BString}; - /// - /// let mut s = BString::from("résumé"); - /// s.reverse_chars(); - /// assert_eq!(s, "émusér"); - /// ``` - /// - /// The point here is to be cautious and not assume that just because - /// `reverse_chars` works in one case, that it therefore works in all - /// cases. - #[inline] - pub fn reverse_chars(&mut self) { - let mut i = 0; - loop { - let (_, size) = utf8::decode(self[i..].as_bytes()); - if size == 0 { - break; - } - if size > 1 { - self[i..i + size].reverse_bytes(); - } - i += size; - } - self.reverse_bytes(); - } - - /// Reverse the graphemes in this string, in place. - /// - /// If this byte string is valid UTF-8, then its reversal by grapheme - /// is also guaranteed to be valid UTF-8. - /// - /// This operation is equivalent to the following, but without allocating: - /// - /// ``` - /// use bstr::BString; - /// - /// let mut s = BString::from("foo☃bar"); - /// - /// let mut graphemes: Vec<&str> = s.graphemes().collect(); - /// graphemes.reverse(); - /// - /// let reversed = graphemes.concat(); - /// assert_eq!(reversed, "rab☃oof"); - /// ``` - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::BString; - /// - /// let mut s = BString::from("foo☃bar"); - /// s.reverse_graphemes(); - /// assert_eq!(s, "rab☃oof"); - /// ``` - /// - /// This example shows how this correctly handles grapheme clusters, - /// unlike `reverse_chars`. 
- /// - /// ``` - /// use bstr::BString; - /// - /// let mut s = BString::from("résumé"); - /// s.reverse_graphemes(); - /// assert_eq!(s, "émusér"); - /// ``` - #[cfg(feature = "unicode")] - #[inline] - pub fn reverse_graphemes(&mut self) { - use unicode::decode_grapheme; - - let mut i = 0; - loop { - let (_, size) = decode_grapheme(&self[i..]); - if size == 0 { - break; - } - if size > 1 { - self[i..i + size].reverse_bytes(); - } - i += size; - } - self.reverse_bytes(); - } - - /// Returns true if and only if every byte in this byte string is ASCII. - /// - /// ASCII is an encoding that defines 128 codepoints. A byte corresponds to - /// an ASCII codepoint if and only if it is in the inclusive range - /// `[0, 127]`. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::B; - /// - /// assert!(B("abc").is_ascii()); - /// assert!(!B("☃βツ").is_ascii()); - /// assert!(!B(b"\xFF").is_ascii()); - /// ``` - #[inline] - pub fn is_ascii(&self) -> bool { - ascii::first_non_ascii_byte(&self.bytes) == self.len() - } - - /// Returns true if and only if the entire byte string is valid UTF-8. - /// - /// If you need location information about where a byte string's first - /// invalid UTF-8 byte is, then use the [`to_str`](#method.to_str) method. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::B; - /// - /// assert!(B("abc").is_utf8()); - /// assert!(B("☃βツ").is_utf8()); - /// // invalid bytes - /// assert!(!B(b"abc\xFF").is_utf8()); - /// // surrogate encoding - /// assert!(!B(b"\xED\xA0\x80").is_utf8()); - /// // incomplete sequence - /// assert!(!B(b"\xF0\x9D\x9Ca").is_utf8()); - /// // overlong sequence - /// assert!(!B(b"\xF0\x82\x82\xAC").is_utf8()); - /// ``` - #[inline] - pub fn is_utf8(&self) -> bool { - utf8::validate(self.as_bytes()).is_ok() - } - - /// Divides this byte string into two at an index. 
- /// - /// The first byte string will contain all bytes at indices `[0, at)`, and - /// the second byte string will contain all bytes at indices `[at, len)`. - /// - /// # Panics - /// - /// Panics if `at > len`. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::B; - /// - /// assert_eq!(B("foobar").split_at(3), (B("foo"), B("bar"))); - /// assert_eq!(B("foobar").split_at(0), (B(""), B("foobar"))); - /// assert_eq!(B("foobar").split_at(6), (B("foobar"), B(""))); - /// ``` - #[inline] - pub fn split_at(&self, at: usize) -> (&BStr, &BStr) { - let (left, right) = self.as_bytes().split_at(at); - (BStr::new(left), BStr::new(right)) - } - - /// Divides this mutable byte string into two at an index. - /// - /// The first byte string will contain all bytes at indices `[0, at)`, and - /// the second byte string will contain all bytes at indices `[at, len)`. - /// - /// # Panics - /// - /// Panics if `at > len`. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::{B, BString}; - /// - /// let mut b = BString::from("foobar"); - /// { - /// let (left, right) = b.split_at_mut(3); - /// left[2] = b'z'; - /// right[2] = b'z'; - /// } - /// assert_eq!(b, B("fozbaz")); - /// ``` - #[inline] - pub fn split_at_mut(&mut self, at: usize) -> (&mut BStr, &mut BStr) { - let (left, right) = self.as_bytes_mut().split_at_mut(at); - (BStr::new_mut(left), BStr::new_mut(right)) - } - - /// Retrieve a reference to a byte or a subslice, depending on the type of - /// the index given. - /// - /// If given a position, this returns a reference to the byte at that - /// position, if it exists. - /// - /// If given a range, this returns the slice of bytes corresponding to that - /// range in this byte string. - /// - /// In the case of invalid indices, this returns `None`. 
- /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::B; - /// - /// let s = B("baz"); - /// assert_eq!(s.get(1), Some(&b'a')); - /// assert_eq!(s.get(0..2), Some(B("ba"))); - /// assert_eq!(s.get(2..), Some(B("z"))); - /// assert_eq!(s.get(1..=2), Some(B("az"))); - /// ``` - #[inline] - pub fn get(&self, at: I) -> Option<&I::Output> { - at.get(self) - } - - /// Retrieve a mutable reference to a byte or a subslice, depending on the - /// type of the index given. - /// - /// If given a position, this returns a reference to the byte at that - /// position, if it exists. - /// - /// If given a range, this returns the slice of bytes corresponding to that - /// range in this byte string. - /// - /// In the case of invalid indices, this returns `None`. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::BString; - /// - /// let mut s = BString::from("baz"); - /// if let Some(mut slice) = s.get_mut(1..) { - /// slice[0] = b'o'; - /// slice[1] = b'p'; - /// } - /// assert_eq!(s, "bop"); - /// ``` - #[inline] - pub fn get_mut(&mut self, at: I) -> Option<&mut I::Output> { - at.get_mut(self) - } - - /// Retrieve a reference to a byte or a subslice, depending on the type of - /// the index given, while explicitly eliding bounds checks. - /// - /// If given a position, this returns a reference to the byte at that - /// position, if it exists. - /// - /// If given a range, this returns the slice of bytes corresponding to that - /// range in this byte string. - /// - /// In the case of invalid indices, this returns `None`. - /// - /// # Safety - /// - /// Callers must ensure that the supplied bounds are correct. If they - /// are out of bounds, then this results in undefined behavior. For a - /// safe alternative, use [`get`](#method.get). 
- /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::B; - /// - /// let s = B("baz"); - /// unsafe { - /// assert_eq!(s.get_unchecked(1), &b'a'); - /// assert_eq!(s.get_unchecked(0..2), "ba"); - /// assert_eq!(s.get_unchecked(2..), "z"); - /// assert_eq!(s.get_unchecked(1..=2), "az"); - /// } - /// ``` - pub unsafe fn get_unchecked(&self, at: I) -> &I::Output { - at.get_unchecked(self) - } - - /// Retrieve a mutable reference to a byte or a subslice, depending on the - /// type of the index given, while explicitly eliding bounds checks. - /// - /// If given a position, this returns a reference to the byte at that - /// position, if it exists. - /// - /// If given a range, this returns the slice of bytes corresponding to that - /// range in this byte string. - /// - /// In the case of invalid indices, this returns `None`. - /// - /// # Safety - /// - /// Callers must ensure that the supplied bounds are correct. If they - /// are out of bounds, then this results in undefined behavior. For a - /// safe alternative, use [`get_mut`](#method.get_mut). - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::BString; - /// - /// let mut s = BString::from("baz"); - /// { - /// let mut slice = unsafe { s.get_unchecked_mut(1..) }; - /// slice[0] = b'o'; - /// slice[1] = b'p'; - /// } - /// assert_eq!(s, "bop"); - /// ``` - pub unsafe fn get_unchecked_mut( - &mut self, - at: I, - ) -> &mut I::Output { - at.get_unchecked_mut(self) - } - - /// Returns the last byte in this byte string, if it's non-empty. If this - /// byte string is empty, this returns `None`. 
- /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::B; - /// - /// assert_eq!(Some(b'z'), B("baz").last()); - /// assert_eq!(None, B("").last()); - /// ``` - #[inline] - pub fn last(&self) -> Option { - self.get(self.len().saturating_sub(1)).map(|&b| b) - } - - /// Copies elements from one part of the slice to another part of itself, - /// where the parts may be overlapping. - /// - /// `src` is the range within this byte string to copy from, while `dest` - /// is the starting index of the range within this byte string to copy to. - /// The length indicated by `src` must be less than or equal to the number - /// of bytes from `dest` to the end of the byte string. - /// - /// # Panics - /// - /// Panics if either range is out of bounds, or if `src` is too big to fit - /// into `dest`, or if the end of `src` is before the start. - /// - /// # Examples - /// - /// Copying four bytes within a byte string: - /// - /// ``` - /// use bstr::BStr; - /// - /// let mut buf = *b"Hello, World!"; - /// let s = BStr::new_mut(&mut buf); - /// s.copy_within(1..5, 8); - /// assert_eq!(s, "Hello, Wello!"); - /// ``` - #[inline] - pub fn copy_within( - &mut self, - src: R, - dest: usize, - ) where R: ops::RangeBounds - { - let src_start = match src.start_bound() { - ops::Bound::Included(&n) => n, - ops::Bound::Excluded(&n) => { - n.checked_add(1).expect("attempted to index slice beyond max") - } - ops::Bound::Unbounded => 0, - }; - let src_end = match src.end_bound() { - ops::Bound::Included(&n) => { - n.checked_add(1).expect("attempted to index slice beyond max") - } - ops::Bound::Excluded(&n) => n, - ops::Bound::Unbounded => self.len(), - }; - assert!(src_start <= src_end, "src end is before src start"); - assert!(src_end <= self.len(), "src is out of bounds"); - let count = src_end - src_start; - assert!(dest <= self.len() - count, "dest is out of bounds"); - - // SAFETY: This is safe because we use ptr::copy to handle overlapping - // copies, and is 
also safe because we've checked all the bounds above. - // Finally, we are only dealing with u8 data, which is Copy, which - // means we can copy without worrying about ownership/destructors. - unsafe { - ptr::copy( - self.get_unchecked(src_start), - self.get_unchecked_mut(dest), - count, - ); - } - } - - /// Returns a raw pointer to this byte string's underlying bytes. - /// - /// # Safety - /// - /// The caller must ensure that the byte string outlives the pointer this - /// function returns, or else it will end up pointing to garbage. - /// - /// Modifying the container (like a `BString`) referenced by this byte - /// string may cause its buffer to be reallocated, which would also make - /// any pointers to it invalid. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::B; - /// - /// let s = B("hello"); - /// let p = s.as_ptr(); - /// - /// unsafe { - /// assert_eq!(*p.add(2), b'l'); - /// } - /// ``` - #[inline] - pub fn as_ptr(&self) -> *const u8 { - self.as_bytes().as_ptr() - } - - /// Returns a raw mutable pointer to this byte string's underlying bytes. - /// - /// # Safety - /// - /// The caller must ensure that the byte string outlives the pointer this - /// function returns, or else it will end up pointing to garbage. - /// - /// Modifying the container (like a `BString`) referenced by this byte - /// string may cause its buffer to be reallocated, which would also make - /// any pointers to it invalid. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::BStr; - /// - /// let mut buf = &mut [b'h', b'e', b'l', b'l', b'o']; - /// let mut s = BStr::new_mut(buf); - /// let p = s.as_mut_ptr(); - /// - /// unsafe { - /// *p.add(2) = b'Z'; - /// } - /// assert_eq!("heZlo", s); - /// ``` - #[inline] - pub fn as_mut_ptr(&mut self) -> *mut u8 { - self.as_bytes_mut().as_mut_ptr() - } -} - -/// A single substring searcher fixed to a particular needle. 
-/// -/// The purpose of this type is to permit callers to construct a substring -/// searcher that can be used to search haystacks without the overhead of -/// constructing the searcher in the first place. This is a somewhat niche -/// concern when it's necessary to re-use the same needle to search multiple -/// different haystacks with as little overhead as possible. In general, using -/// [`BStr::find`](struct.BStr.html#method.find) -/// or -/// [`BStr::find_iter`](struct.BStr.html#method.find_iter) -/// is good enough, but `Finder` is useful when you can meaningfully observe -/// searcher construction time in a profile. -/// -/// When the `std` feature is enabled, then this type has an `into_owned` -/// version which permits building a `Finder` that is not connected to the -/// lifetime of its needle. -#[derive(Clone, Debug)] -pub struct Finder<'a> { - searcher: TwoWay<'a>, -} - -impl<'a> Finder<'a> { - /// Create a new finder for the given needle. - #[inline] - pub fn new>(needle: &'a B) -> Finder<'a> { - Finder { searcher: TwoWay::forward(BStr::new(needle)) } - } - - /// Convert this finder into its owned variant, such that it no longer - /// borrows the needle. - /// - /// If this is already an owned finder, then this is a no-op. Otherwise, - /// this copies the needle. - /// - /// This is only available when the `std` feature is enabled. - #[cfg(feature = "std")] - #[inline] - pub fn into_owned(self) -> Finder<'static> { - Finder { searcher: self.searcher.into_owned() } - } - - /// Returns the needle that this finder searches for. - /// - /// Note that the lifetime of the needle returned is tied to the lifetime - /// of the finder, and may be shorter than the `'a` lifetime. Namely, a - /// finder's needle can be either borrowed or owned, so the lifetime of the - /// needle returned must necessarily be the shorter of the two. 
- #[inline] - pub fn needle(&self) -> &BStr { - self.searcher.needle() - } - - /// Returns the index of the first occurrence of this needle in the given - /// haystack. - /// - /// The haystack may be any type that can be cheaply converted into a - /// `&[u8]`. This includes, but is not limited to, `&str`, `&BStr`, and of - /// course, `&[u8]` itself. - /// - /// # Complexity - /// - /// This routine is guaranteed to have worst case linear time complexity - /// with respect to both the needle and the haystack. That is, this runs - /// in `O(needle.len() + haystack.len())` time. - /// - /// This routine is also guaranteed to have worst case constant space - /// complexity. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::Finder; - /// - /// let haystack = "foo bar baz"; - /// assert_eq!(Some(0), Finder::new("foo").find(haystack)); - /// assert_eq!(Some(4), Finder::new("bar").find(haystack)); - /// assert_eq!(None, Finder::new("quux").find(haystack)); - /// ``` - #[inline] - pub fn find>(&self, haystack: B) -> Option { - self.searcher.find(BStr::new(haystack.as_ref())) - } -} - -/// A single substring reverse searcher fixed to a particular needle. -/// -/// The purpose of this type is to permit callers to construct a substring -/// searcher that can be used to search haystacks without the overhead of -/// constructing the searcher in the first place. This is a somewhat niche -/// concern when it's necessary to re-use the same needle to search multiple -/// different haystacks with as little overhead as possible. In general, using -/// [`BStr::rfind`](struct.BStr.html#method.rfind) -/// or -/// [`BStr::rfind_iter`](struct.BStr.html#method.rfind_iter) -/// is good enough, but `FinderReverse` is useful when you can meaningfully -/// observe searcher construction time in a profile. 
-/// -/// When the `std` feature is enabled, then this type has an `into_owned` -/// version which permits building a `FinderReverse` that is not connected to -/// the lifetime of its needle. -#[derive(Clone, Debug)] -pub struct FinderReverse<'a> { - searcher: TwoWay<'a>, -} - -impl<'a> FinderReverse<'a> { - /// Create a new reverse finder for the given needle. - #[inline] - pub fn new>(needle: &'a B) -> FinderReverse<'a> { - FinderReverse { searcher: TwoWay::reverse(BStr::new(needle)) } - } - - /// Convert this finder into its owned variant, such that it no longer - /// borrows the needle. - /// - /// If this is already an owned finder, then this is a no-op. Otherwise, - /// this copies the needle. - /// - /// This is only available when the `std` feature is enabled. - #[cfg(feature = "std")] - #[inline] - pub fn into_owned(self) -> FinderReverse<'static> { - FinderReverse { searcher: self.searcher.into_owned() } - } - - /// Returns the needle that this finder searches for. - /// - /// Note that the lifetime of the needle returned is tied to the lifetime - /// of this finder, and may be shorter than the `'a` lifetime. Namely, - /// a finder's needle can be either borrowed or owned, so the lifetime of - /// the needle returned must necessarily be the shorter of the two. - #[inline] - pub fn needle(&self) -> &BStr { - self.searcher.needle() - } - - /// Returns the index of the last occurrence of this needle in the given - /// haystack. - /// - /// The haystack may be any type that can be cheaply converted into a - /// `&[u8]`. This includes, but is not limited to, `&str`, `&BStr`, and of - /// course, `&[u8]` itself. - /// - /// # Complexity - /// - /// This routine is guaranteed to have worst case linear time complexity - /// with respect to both the needle and the haystack. That is, this runs - /// in `O(needle.len() + haystack.len())` time. - /// - /// This routine is also guaranteed to have worst case constant space - /// complexity. 
- /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::FinderReverse; - /// - /// let haystack = "foo bar baz"; - /// assert_eq!(Some(0), FinderReverse::new("foo").rfind(haystack)); - /// assert_eq!(Some(4), FinderReverse::new("bar").rfind(haystack)); - /// assert_eq!(None, FinderReverse::new("quux").rfind(haystack)); - /// ``` - #[inline] - pub fn rfind>(&self, haystack: B) -> Option { - self.searcher.rfind(BStr::new(haystack.as_ref())) - } -} - -/// An iterator over non-overlapping substring matches. -/// -/// Matches are reported by the byte offset at which they begin. -/// -/// `'a` is the shorter of two lifetimes: the byte string being searched or the -/// byte string being looked for. -#[derive(Debug)] -pub struct Find<'a> { - haystack: &'a BStr, - prestate: PrefilterState, - searcher: TwoWay<'a>, - pos: usize, -} - -impl<'a> Find<'a> { - fn new(haystack: &'a BStr, needle: &'a BStr) -> Find<'a> { - let searcher = TwoWay::forward(needle); - let prestate = searcher.prefilter_state(); - Find { haystack, prestate, searcher, pos: 0 } - } -} - -impl<'a> Iterator for Find<'a> { - type Item = usize; - - #[inline] - fn next(&mut self) -> Option { - if self.pos > self.haystack.len() { - return None; - } - let result = self.searcher.find_with( - &mut self.prestate, - &self.haystack[self.pos..], - ); - match result { - None => None, - Some(i) => { - let pos = self.pos + i; - self.pos = pos + cmp::max(1, self.searcher.needle().len()); - Some(pos) - } - } - } -} - -/// An iterator over non-overlapping substring matches in reverse. -/// -/// Matches are reported by the byte offset at which they begin. -/// -/// `'a` is the shorter of two lifetimes: the byte string being searched or the -/// byte string being looked for. 
-#[derive(Debug)] -pub struct FindReverse<'a> { - haystack: &'a BStr, - prestate: PrefilterState, - searcher: TwoWay<'a>, - /// When searching with an empty needle, this gets set to `None` after - /// we've yielded the last element at `0`. - pos: Option, -} - -impl<'a> FindReverse<'a> { - fn new(haystack: &'a BStr, needle: &'a BStr) -> FindReverse<'a> { - let searcher = TwoWay::reverse(needle); - let prestate = searcher.prefilter_state(); - let pos = Some(haystack.len()); - FindReverse { haystack, prestate, searcher, pos } - } - - fn haystack(&self) -> &'a BStr { - self.haystack - } - - fn needle(&self) -> &BStr { - self.searcher.needle() - } -} - -impl<'a> Iterator for FindReverse<'a> { - type Item = usize; - - #[inline] - fn next(&mut self) -> Option { - let pos = match self.pos { - None => return None, - Some(pos) => pos, - }; - let result = self.searcher.rfind_with( - &mut self.prestate, - &self.haystack[..pos], - ); - match result { - None => None, - Some(i) => { - if pos == i { - self.pos = pos.checked_sub(1); - } else { - self.pos = Some(i); - } - Some(i) - } - } - } -} - -/// An iterator over the bytes in a byte string. -/// -/// `'a` is the lifetime of the byte string being traversed. -#[derive(Clone, Debug)] -pub struct Bytes<'a> { - it: slice::Iter<'a, u8>, -} - -impl<'a> Iterator for Bytes<'a> { - type Item = u8; - - #[inline] - fn next(&mut self) -> Option { - self.it.next().map(|&b| b) - } -} - -impl<'a> DoubleEndedIterator for Bytes<'a> { - #[inline] - fn next_back(&mut self) -> Option { - self.it.next_back().map(|&b| b) - } -} - -impl<'a> ExactSizeIterator for Bytes<'a> { - #[inline] - fn len(&self) -> usize { - self.it.len() - } -} - -/// An iterator over the fields in a byte string, separated by whitespace. -/// -/// This iterator splits on contiguous runs of whitespace, such that the fields -/// in `foo\t\t\n \nbar` are `foo` and `bar`. -/// -/// `'a` is the lifetime of the byte string being split. 
-#[derive(Debug)] -pub struct Fields<'a> { - it: FieldsWith<'a, fn(char) -> bool>, -} - -impl<'a> Fields<'a> { - fn new(bytes: &'a BStr) -> Fields<'a> { - Fields { it: bytes.fields_with(|ch| ch.is_whitespace()) } - } -} - -impl<'a> Iterator for Fields<'a> { - type Item = &'a BStr; - - #[inline] - fn next(&mut self) -> Option<&'a BStr> { - self.it.next() - } -} - -/// An iterator over fields in the byte string, separated by a predicate over -/// codepoints. -/// -/// This iterator splits a byte string based on its predicate function such -/// that the elements returned are separated by contiguous runs of codepoints -/// for which the predicate returns true. -/// -/// `'a` is the lifetime of the byte string being split, while `F` is the type -/// of the predicate, i.e., `FnMut(char) -> bool`. -#[derive(Debug)] -pub struct FieldsWith<'a, F> { - f: F, - bytes: &'a BStr, - chars: CharIndices<'a>, -} - -impl<'a, F: FnMut(char) -> bool> FieldsWith<'a, F> { - fn new(bytes: &'a BStr, f: F) -> FieldsWith<'a, F> { - FieldsWith { - f: f, - bytes: bytes, - chars: bytes.char_indices(), - } - } -} - -impl<'a, F: FnMut(char) -> bool> Iterator for FieldsWith<'a, F> { - type Item = &'a BStr; - - #[inline] - fn next(&mut self) -> Option<&'a BStr> { - let (start, mut end); - loop { - match self.chars.next() { - None => return None, - Some((s, e, ch)) => { - if !(self.f)(ch) { - start = s; - end = e; - break; - } - } - } - } - while let Some((_, e, ch)) = self.chars.next() { - if (self.f)(ch) { - break; - } - end = e; - } - Some(&self.bytes[start..end]) - } -} - -/// An iterator over substrings in a byte string, split by a separator. -/// -/// `'a` is the lifetime of the byte string being split, while `F` is the type -/// of the predicate, i.e., `FnMut(char) -> bool`. -#[derive(Debug)] -pub struct Split<'a> { - finder: Find<'a>, - /// The end position of the previous match of our splitter. 
The element - /// we yield corresponds to the substring starting at `last` up to the - /// beginning of the next match of the splitter. - last: usize, - /// Only set when iteration is complete. A corner case here is when a - /// splitter is matched at the end of the haystack. At that point, we still - /// need to yield an empty string following it. - done: bool, -} - -impl<'a> Split<'a> { - fn new(haystack: &'a BStr, splitter: &'a BStr) -> Split<'a> { - let finder = haystack.find_iter(splitter); - Split { finder, last: 0, done: false } - } -} - -impl<'a> Iterator for Split<'a> { - type Item = &'a BStr; - - #[inline] - fn next(&mut self) -> Option<&'a BStr> { - let haystack = self.finder.haystack; - match self.finder.next() { - Some(start) => { - let next = &haystack[self.last..start]; - self.last = start + self.finder.searcher.needle().len(); - Some(next) - } - None => { - if self.last >= haystack.len() { - if !self.done { - self.done = true; - Some(B("")) - } else { - None - } - } else { - let s = &haystack[self.last..]; - self.last = haystack.len(); - self.done = true; - Some(s) - } - } - } - } -} - -/// An iterator over substrings in a byte string, split by a separator, in -/// reverse. -/// -/// `'a` is the lifetime of the byte string being split, while `F` is the type -/// of the predicate, i.e., `FnMut(char) -> bool`. -#[derive(Debug)] -pub struct SplitReverse<'a> { - finder: FindReverse<'a>, - /// The end position of the previous match of our splitter. The element - /// we yield corresponds to the substring starting at `last` up to the - /// beginning of the next match of the splitter. - last: usize, - /// Only set when iteration is complete. A corner case here is when a - /// splitter is matched at the end of the haystack. At that point, we still - /// need to yield an empty string following it. 
- done: bool, -} - -impl<'a> SplitReverse<'a> { - fn new(haystack: &'a BStr, splitter: &'a BStr) -> SplitReverse<'a> { - let finder = haystack.rfind_iter(splitter); - SplitReverse { finder, last: haystack.len(), done: false } - } -} - -impl<'a> Iterator for SplitReverse<'a> { - type Item = &'a BStr; - - #[inline] - fn next(&mut self) -> Option<&'a BStr> { - let haystack = self.finder.haystack(); - match self.finder.next() { - Some(start) => { - let nlen = self.finder.needle().len(); - let next = &haystack[start + nlen..self.last]; - self.last = start; - Some(next) - } - None => { - if self.last == 0 { - if !self.done { - self.done = true; - Some(B("")) - } else { - None - } - } else { - let s = &haystack[..self.last]; - self.last = 0; - self.done = true; - Some(s) - } - } - } - } -} - -/// An iterator over at most `n` substrings in a byte string, split by a -/// separator. -/// -/// `'a` is the lifetime of the byte string being split, while `F` is the type -/// of the predicate, i.e., `FnMut(char) -> bool`. -#[derive(Debug)] -pub struct SplitN<'a> { - split: Split<'a>, - limit: usize, - count: usize, -} - -impl<'a> SplitN<'a> { - fn new( - haystack: &'a BStr, - splitter: &'a BStr, - limit: usize, - ) -> SplitN<'a> { - let split = haystack.split(splitter); - SplitN { split, limit, count: 0 } - } -} - -impl<'a> Iterator for SplitN<'a> { - type Item = &'a BStr; - - #[inline] - fn next(&mut self) -> Option<&'a BStr> { - self.count += 1; - if self.count > self.limit { - None - } else if self.count == self.limit { - Some(&self.split.finder.haystack[self.split.last..]) - } else { - self.split.next() - } - } -} - - -/// An iterator over at most `n` substrings in a byte string, split by a -/// separator, in reverse. -/// -/// `'a` is the lifetime of the byte string being split, while `F` is the type -/// of the predicate, i.e., `FnMut(char) -> bool`. 
-#[derive(Debug)] -pub struct SplitNReverse<'a> { - split: SplitReverse<'a>, - limit: usize, - count: usize, -} - -impl<'a> SplitNReverse<'a> { - fn new( - haystack: &'a BStr, - splitter: &'a BStr, - limit: usize, - ) -> SplitNReverse<'a> { - let split = haystack.rsplit(splitter); - SplitNReverse { split, limit, count: 0 } - } -} - -impl<'a> Iterator for SplitNReverse<'a> { - type Item = &'a BStr; - - #[inline] - fn next(&mut self) -> Option<&'a BStr> { - self.count += 1; - if self.count > self.limit { - None - } else if self.count == self.limit { - Some(&self.split.finder.haystack()[..self.split.last]) - } else { - self.split.next() - } - } -} - -/// An iterator over all lines in a byte string, without their terminators. -/// -/// For this iterator, the only line terminators recognized are `\r\n` and -/// `\n`. -/// -/// `'a` is the lifetime of the byte string being iterated over. -pub struct Lines<'a> { - it: LinesWithTerminator<'a>, -} - -impl<'a> Lines<'a> { - fn new(bytes: &'a BStr) -> Lines<'a> { - Lines { it: LinesWithTerminator::new(bytes) } - } -} - -impl<'a> Iterator for Lines<'a> { - type Item = &'a BStr; - - #[inline] - fn next(&mut self) -> Option<&'a BStr> { - let mut line = self.it.next()?; - if line.last() == Some(b'\n') { - line = &line[..line.len() - 1]; - if line.last() == Some(b'\r') { - line = &line[..line.len() - 1]; - } - } - Some(line) - } -} - -/// An iterator over all lines in a byte string, including their terminators. -/// -/// For this iterator, the only line terminator recognized is `\n`. (Since -/// line terminators are included, this also handles `\r\n` line endings.) -/// -/// Line terminators are only included if they are present in the original -/// byte string. For example, the last line in a byte string may not end with -/// a line terminator. -/// -/// Concatenating all elements yielded by this iterator is guaranteed to yield -/// the original byte string. -/// -/// `'a` is the lifetime of the byte string being iterated over. 
-pub struct LinesWithTerminator<'a> { - bytes: &'a BStr, -} - -impl<'a> LinesWithTerminator<'a> { - fn new(bytes: &'a BStr) -> LinesWithTerminator<'a> { - LinesWithTerminator { bytes } - } -} - -impl<'a> Iterator for LinesWithTerminator<'a> { - type Item = &'a BStr; - - #[inline] - fn next(&mut self) -> Option<&'a BStr> { - match self.bytes.find_byte(b'\n') { - None if self.bytes.is_empty() => None, - None => { - let line = self.bytes; - self.bytes = B(""); - Some(line) - } - Some(end) => { - let line = &self.bytes[..end + 1]; - self.bytes = &self.bytes[end + 1..]; - Some(line) - } - } - } -} - -#[cfg(test)] -mod tests { - use tests::LOSSY_TESTS; - use super::*; - - #[test] - fn to_str_lossy() { - for (i, &(expected, input)) in LOSSY_TESTS.iter().enumerate() { - let got = B(input).to_str_lossy(); - assert_eq!( - expected.as_bytes(), - got.as_bytes(), - "to_str_lossy(ith: {:?}, given: {:?})", - i, input, - ); - - let mut got = String::new(); - B(input).to_str_lossy_into(&mut got); - assert_eq!( - expected.as_bytes(), got.as_bytes(), "to_str_lossy_into", - ); - - let got = String::from_utf8_lossy(input); - assert_eq!(expected.as_bytes(), got.as_bytes(), "std"); - } - } - - #[test] - #[should_panic] - fn copy_within_fail1() { - let mut buf = *b"foobar"; - let s = BStr::new_mut(&mut buf); - s.copy_within(0..2, 5); - } - - #[test] - #[should_panic] - fn copy_within_fail2() { - let mut buf = *b"foobar"; - let s = BStr::new_mut(&mut buf); - s.copy_within(3..2, 0); - } - - #[test] - #[should_panic] - fn copy_within_fail3() { - let mut buf = *b"foobar"; - let s = BStr::new_mut(&mut buf); - s.copy_within(5..7, 0); - } - - #[test] - #[should_panic] - fn copy_within_fail4() { - let mut buf = *b"foobar"; - let s = BStr::new_mut(&mut buf); - s.copy_within(0..1, 6); - } } diff --git a/src/bstring.rs b/src/bstring.rs index 016ccba..f04c651 100644 --- a/src/bstring.rs +++ b/src/bstring.rs @@ -1,98 +1,19 @@ -use std::borrow::Cow; -use std::error; -use std::ffi::{OsStr, OsString}; 
-use std::fmt; -use std::iter; -use std::ops; -use std::path::{Path, PathBuf}; -use std::ptr; -use std::str; -use std::vec; - use bstr::BStr; -use utf8::{self, Utf8Error}; - -/// Concatenate the elements given by the iterator together into a single -/// `BString`. -/// -/// The elements may be any type that can be cheaply converted into an `&[u8]`. -/// This includes, but is not limited to, `&str`, `&BStr` and `&[u8]` itself. -/// -/// # Examples -/// -/// Basic usage: -/// -/// ``` -/// use bstr; -/// -/// let s = bstr::concat(&["foo", "bar", "baz"]); -/// assert_eq!(s, "foobarbaz"); -/// ``` -#[inline] -pub fn concat( - elements: I, -) -> BString -where T: AsRef<[u8]>, - I: IntoIterator -{ - let mut dest = BString::new(); - for element in elements { - dest.push(element); - } - dest -} -/// Join the elements given by the iterator with the given separator into a -/// single `BString`. -/// -/// Both the separator and the elements may be any type that can be cheaply -/// converted into an `&[u8]`. This includes, but is not limited to, -/// `&str`, `&BStr` and `&[u8]` itself. -/// -/// # Examples -/// -/// Basic usage: -/// -/// ``` -/// use bstr; -/// -/// let s = bstr::join(",", &["foo", "bar", "baz"]); -/// assert_eq!(s, "foo,bar,baz"); -/// ``` -#[inline] -pub fn join( - separator: B, - elements: I, -) -> BString -where B: AsRef<[u8]>, - T: AsRef<[u8]>, - I: IntoIterator -{ - let mut it = elements.into_iter(); - let mut dest = BString::new(); - match it.next() { - None => return dest, - Some(first) => { - dest.push(first); - } - } - for element in it { - dest.push(&separator); - dest.push(element); - } - dest -} - -/// A growable byte string that is conventionally UTF-8. +/// A wrapper for `Vec` that provides convenient string oriented trait +/// impls. /// /// A `BString` has ownership over its contents and corresponds to /// a growable or shrinkable buffer. Its borrowed counterpart is a /// [`BStr`](struct.BStr.html), called a byte string slice. 
/// +/// Using a `BString` is just like using a `Vec`, since `BString` +/// implements `Deref` to `Vec`. So all methods available on `Vec` +/// are also available on `BString`. +/// /// # Examples /// -/// You can create a new `BString` from a literal Unicode string or a literal -/// byte string with `BString::from`: +/// You can create a new `BString` from a `Vec` via a `From` impl: /// /// ``` /// use bstr::BString; @@ -100,92 +21,11 @@ where B: AsRef<[u8]>, /// let s = BString::from("Hello, world!"); /// ``` /// -/// You can append bytes, characters or other strings to a `BString`: -/// -/// ``` -/// use bstr::BString; -/// -/// let mut s = BString::from("Hello, "); -/// s.push_byte(b'w'); -/// s.push_char('o'); -/// s.push("rl"); -/// s.push(b"d!"); -/// assert_eq!(s, "Hello, world!"); -/// ``` -/// -/// If you have a `String` or a `Vec`, then you can create a `BString` -/// from it with zero cost: -/// -/// ``` -/// use bstr::BString; -/// -/// let s = BString::from(vec![b'f', b'o', b'o']); -/// let s = BString::from("foo".to_string()); -/// ``` -/// -/// A `BString` can be freely converted back to a `Vec`: -/// -/// ``` -/// use bstr::BString; -/// -/// let s = BString::from("foo"); -/// let vector = s.into_vec(); -/// assert_eq!(vector, vec![b'f', b'o', b'o']); -/// ``` -/// -/// However, converting from a `BString` to a `String` requires UTF-8 -/// validation: -/// -/// ``` -/// use bstr::BString; -/// -/// # fn example() -> Result<(), ::bstr::FromUtf8Error> { -/// let bytes = BString::from("hello"); -/// let string = bytes.into_string()?; -/// -/// assert_eq!("hello", string); -/// # Ok(()) }; example().unwrap() -/// ``` -/// -/// # UTF-8 -/// -/// Like byte string slices (`BStr`), a `BString` is only conventionally -/// UTF-8. This is in constrast to the standard library's `String` type, which -/// is guaranteed to be valid UTF-8. 
-/// -/// Because of this relaxation, types such as `Vec`, `&[u8]`, `String` and -/// `&str` can all be converted to a `BString` (or `BStr`) at zero cost without -/// any validation step. -/// -/// Moreover, this relaxation implies that many of the restrictions around -/// mutating a `String` do not apply to `BString`. Namely, if your `BString` -/// is valid UTF-8, then the various methods that mutate the `BString` do not -/// necessarily prevent you from causing the bytes to become invalid UTF-8. -/// For example: -/// -/// ``` -/// use bstr::{B, BString}; -/// -/// let mut s = BString::from("hello"); -/// s[1] = b'\xFF'; -/// // `s` was valid UTF-8, but now it's now. -/// assert_eq!(s, B(b"h\xFFllo")); -/// ``` -/// /// # Deref /// /// The `BString` type implements `Deref` and `DerefMut`, where the target -/// types are `&BStr` and `&mut BStr`, respectively. `Deref` permits all of the -/// methods defined on `BStr` to be implicitly callable on any `BString`. -/// For example, the `contains` method is defined on `BStr` and not `BString`, -/// but values of type `BString` can still use it directly: -/// -/// ``` -/// use bstr::BString; -/// -/// let s = BString::from("foobarbaz"); -/// assert!(s.contains("bar")); -/// ``` +/// types are `&Vec` and `&mut Vec`, respectively. `Deref` permits all of the +/// methods defined on `Vec` to be implicitly callable on any `BString`. /// /// For more information about how deref works, see the documentation for the /// [`std::ops::Deref`](https://doc.rust-lang.org/std/ops/trait.Deref.html) @@ -198,1391 +38,22 @@ where B: AsRef<[u8]>, /// region of memory containing the bytes, a length and a capacity. #[derive(Clone, Hash)] pub struct BString { - bytes: Vec, + pub(crate) bytes: Vec, } impl BString { - /// Creates a new empty `BString`. - /// - /// Given that the `BString` is empty, this will not allocate any initial - /// buffer. 
While that means that this initial operation is very - /// inexpensive, it may cause excessive allocation later when you add - /// data. If you have an idea of how much data the `String` will hold, - /// consider the [`with_capacity`] method to prevent excessive - /// re-allocation. - /// - /// [`with_capacity`]: #method.with_capacity - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::BString; - /// - /// let s = BString::new(); - /// ``` - #[inline] - pub fn new() -> BString { - BString { bytes: vec![] } - } - - /// Creates a new empty `BString` with a particular capacity. - /// - /// `BString`s have an internal buffer to hold their data. The capacity is - /// the length of that buffer, and can be queried with the [`capacity`] - /// method. This method creates an empty `BString`, but one with an initial - /// buffer that can hold `capacity` bytes. This is useful when you may be - /// appending a bunch of data to the `BString`, reducing the number of - /// reallocations it needs to do. - /// - /// [`capacity`]: #method.capacity - /// - /// If the given capacity is `0`, no allocation will occur, and this method - /// is identical to the [`new`] method. - /// - /// [`new`]: #method.new - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::BString; - /// - /// let mut s = BString::with_capacity(10); - /// - /// // The String contains no chars, even though it has capacity for more - /// assert_eq!(s.len(), 0); - /// - /// // These are all done without reallocating... - /// let cap = s.capacity(); - /// for i in 0..10 { - /// s.push_char('a'); - /// } - /// - /// assert_eq!(s.capacity(), cap); - /// - /// // ...but this may make the vector reallocate - /// s.push_char('a'); - /// ``` #[inline] - pub fn with_capacity(capacity: usize) -> BString { - BString { bytes: Vec::with_capacity(capacity) } - } - - /// Create a new byte string from the given bytes. 
- /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::BString; - /// - /// let bytes = vec![b'a', b'b', b'c']; - /// let s = BString::from_vec(bytes); - /// assert_eq!("abc", s); - /// ``` - #[inline] - pub fn from_vec(bytes: Vec) -> BString { - BString { bytes } - } - - /// Create a new byte string by copying the given slice. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::BString; - /// - /// let s = BString::from_slice(b"abc"); - /// assert_eq!("abc", s); - /// ``` - #[inline] - pub fn from_slice>(slice: B) -> BString { - BString::from_vec(slice.as_ref().to_vec()) - } - - /// Create a new byte string from an owned OS string. - /// - /// On Unix, this always succeeds and is zero cost. On non-Unix systems, - /// this returns the original OS string if it is not valid UTF-8. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use std::ffi::OsString; - /// - /// use bstr::BString; - /// - /// let os_str = OsString::from("foo"); - /// let bs = BString::from_os_string(os_str).expect("must be valid UTF-8"); - /// assert_eq!(bs, "foo"); - /// ``` - #[inline] - pub fn from_os_string(os_str: OsString) -> Result { - BString::from_os_string_imp(os_str) - } - - #[cfg(unix)] - #[inline] - fn from_os_string_imp(os_str: OsString) -> Result { - use std::os::unix::ffi::OsStringExt; - - Ok(BString::from(os_str.into_vec())) - } - - #[cfg(not(unix))] - #[inline] - fn from_os_string_imp(os_str: OsString) -> Result { - os_str.into_string().map(BString::from) - } - - /// Lossily create a new byte string from an OS string slice. - /// - /// On Unix, this always succeeds, is zero cost and always returns a slice. - /// On non-Unix systems, this does a UTF-8 check. If the given OS string - /// slice is not valid UTF-8, then it is lossily decoded into valid UTF-8 - /// (with invalid bytes replaced by the Unicode replacement codepoint). 
- /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use std::ffi::OsStr; - /// - /// use bstr::{B, BString}; - /// - /// let os_str = OsStr::new("foo"); - /// let bs = BString::from_os_str_lossy(os_str); - /// assert_eq!(bs, B("foo")); - /// ``` - #[inline] - pub fn from_os_str_lossy<'a>(os_str: &'a OsStr) -> Cow<'a, BStr> { - BString::from_os_str_lossy_imp(os_str) - } - - #[cfg(unix)] - #[inline] - fn from_os_str_lossy_imp<'a>(os_str: &'a OsStr) -> Cow<'a, BStr> { - use std::os::unix::ffi::OsStrExt; - - Cow::Borrowed(BStr::new(os_str.as_bytes())) - } - - #[cfg(not(unix))] - #[inline] - fn from_os_str_lossy_imp<'a>(os_str: &'a OsStr) -> Cow<'a, BStr> { - match os_str.to_string_lossy() { - Cow::Borrowed(x) => Cow::Borrowed(BStr::new(x)), - Cow::Owned(x) => Cow::Owned(BString::from(x)), - } - } - - /// Create a new byte string from an owned file path. - /// - /// On Unix, this always succeeds and is zero cost. On non-Unix systems, - /// this returns the original path if it is not valid UTF-8. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use std::path::PathBuf; - /// - /// use bstr::BString; - /// - /// let path = PathBuf::from("foo"); - /// let bs = BString::from_path_buf(path).expect("must be valid UTF-8"); - /// assert_eq!(bs, "foo"); - /// ``` - #[inline] - pub fn from_path_buf(path: PathBuf) -> Result { - BString::from_os_string(path.into_os_string()) - .map_err(PathBuf::from) - } - - /// Lossily create a new byte string from a file path. - /// - /// On Unix, this always succeeds, is zero cost and always returns a slice. - /// On non-Unix systems, this does a UTF-8 check. If the given path is not - /// valid UTF-8, then it is lossily decoded into valid UTF-8 (with invalid - /// bytes replaced by the Unicode replacement codepoint). 
- /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use std::path::Path; - /// - /// use bstr::{B, BString}; - /// - /// let path = Path::new("foo"); - /// let bs = BString::from_path_lossy(path); - /// assert_eq!(bs, B("foo")); - /// ``` - #[inline] - pub fn from_path_lossy<'a>(path: &'a Path) -> Cow<'a, BStr> { - BString::from_os_str_lossy(path.as_os_str()) - } - - /// Appends the given byte to the end of this byte string. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::BString; - /// - /// let mut s = BString::from("abc"); - /// s.push_byte(b'\xE2'); - /// s.push_byte(b'\x98'); - /// s.push_byte(b'\x83'); - /// assert_eq!("abc☃", s); - /// ``` - #[inline] - pub fn push_byte(&mut self, byte: u8) { - self.bytes.push(byte); - } - - /// Appends the given `char` to the end of this byte string. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::BString; - /// - /// let mut s = BString::from("abc"); - /// s.push_char('1'); - /// s.push_char('2'); - /// s.push_char('3'); - /// assert_eq!("abc123", s); - /// ``` - #[inline] - pub fn push_char(&mut self, ch: char) { - if ch.len_utf8() == 1 { - self.bytes.push(ch as u8); - return; - } - self.bytes.extend_from_slice(ch.encode_utf8(&mut [0; 4]).as_bytes()); - } - - /// Appends the given slice to the end of this byte string. This accepts - /// any type that be converted to a `&[u8]`. This includes, but is not - /// limited to, `&str`, `&BStr`, and of course, `&[u8]` itself. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::BString; - /// - /// let mut s = BString::from("abc"); - /// s.push(b"123"); - /// assert_eq!("abc123", s); - /// ``` - #[inline] - pub fn push>(&mut self, bytes: B) { - self.bytes.extend_from_slice(bytes.as_ref()); - } - - /// Extracts a byte string slice containing the entire `BString`. 
- /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::{BStr, BString}; - /// - /// let s = BString::from("foo"); - /// - /// assert_eq!(BStr::new("foo"), s.as_bstr()); - /// ``` - #[inline] - pub fn as_bstr(&self) -> &BStr { - BStr::from_bytes(&self.bytes) - } - - /// Returns this `BString` as a borrowed byte vector. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::BString; - /// - /// let bs = BString::from("ab"); - /// assert!(bs.as_vec().capacity() >= 2); - /// ``` - #[inline] - pub fn as_vec(&self) -> &Vec { + pub(crate) fn as_bytes(&self) -> &[u8] { &self.bytes } - /// Converts a `BString` into a mutable string slice. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::BString; - /// - /// let mut s = BString::from("foobar"); - /// let s_mut_str = s.as_mut_bstr(); - /// - /// s_mut_str[0] = b'F'; - /// - /// assert_eq!("Foobar", s_mut_str); - /// ``` - #[inline] - pub fn as_mut_bstr(&mut self) -> &mut BStr { - BStr::from_bytes_mut(&mut self.bytes) - } - - /// Returns this `BString` as a mutable byte vector. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::BString; - /// - /// let mut bs = BString::from("ab"); - /// bs.as_mut_vec().push(b'c'); - /// assert_eq!("abc", bs); - /// ``` - #[inline] - pub fn as_mut_vec(&mut self) -> &mut Vec { - &mut self.bytes - } - - /// Converts a `BString` into a byte vector. - /// - /// This consumes the `BString`, and thus the contents are not copied. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::BString; - /// - /// let s = BString::from("hello"); - /// let bytes = s.into_vec(); - /// - /// assert_eq!(vec![104, 101, 108, 108, 111], &bytes[..]); - /// ``` - #[inline] - pub fn into_vec(self) -> Vec { - self.bytes - } - - /// Converts a `BString` into a `String` if and only if this byte string is - /// valid UTF-8. 
- /// - /// If it is not valid UTF-8, then the error `std::string::FromUtf8Error` - /// is returned. (This error can be used to examine why UTF-8 validation - /// failed, or to regain the original byte string.) - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::BString; - /// - /// # fn example() -> Result<(), ::bstr::FromUtf8Error> { - /// let bytes = BString::from("hello"); - /// let string = bytes.into_string()?; - /// - /// assert_eq!("hello", string); - /// # Ok(()) }; example().unwrap() - /// ``` - /// - /// If this byte string is not valid UTF-8, then an error will be returned. - /// That error can then be used to inspect the location at which invalid - /// UTF-8 was found, or to regain the original byte string: - /// - /// ``` - /// use bstr::{B, BString}; - /// - /// let bytes = BString::from_slice(b"foo\xFFbar"); - /// let err = bytes.into_string().unwrap_err(); - /// - /// assert_eq!(err.utf8_error().valid_up_to(), 3); - /// assert_eq!(err.utf8_error().error_len(), Some(1)); - /// - /// // At no point in this example is an allocation performed. - /// let bytes = BString::from(err.into_bstring()); - /// assert_eq!(bytes, B(b"foo\xFFbar")); - /// ``` - #[inline] - pub fn into_string(self) -> Result { - match utf8::validate(self.as_bytes()) { - Err(err) => { - Err(FromUtf8Error { original: self, err: err }) - } - Ok(()) => { - // SAFETY: This is safe because of the guarantees provided by - // utf8::validate. - unsafe { Ok(self.into_string_unchecked()) } - } - } - } - - /// Lossily converts a `BString` into a `String`. If this byte string - /// contains invalid UTF-8, then the invalid bytes are replaced with the - /// Unicode replacement codepoint. 
- /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::BString; - /// - /// let bytes = BString::from_slice(b"foo\xFFbar"); - /// let string = bytes.into_string_lossy(); - /// assert_eq!(string, "foo\u{FFFD}bar"); - /// ``` - #[inline] - pub fn into_string_lossy(self) -> String { - self.to_string() - } - - /// Unsafely convert this byte string into a `String`, without checking for - /// valid UTF-8. - /// - /// # Safety - /// - /// Callers *must* ensure that this byte string is valid UTF-8 before - /// calling this method. Converting a byte string into a `String` that is - /// not valid UTF-8 is considered undefined behavior. - /// - /// This routine is useful in performance sensitive contexts where the - /// UTF-8 validity of the byte string is already known and it is - /// undesirable to pay the cost of an additional UTF-8 validation check - /// that [`into_string`](#method.into_string) performs. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::BString; - /// - /// // SAFETY: This is safe because string literals are guaranteed to be - /// // valid UTF-8 by the Rust compiler. - /// let s = unsafe { BString::from("☃βツ").into_string_unchecked() }; - /// assert_eq!("☃βツ", s); - /// ``` - pub unsafe fn into_string_unchecked(self) -> String { - String::from_utf8_unchecked(self.into_vec()) - } - - /// Converts this byte string into an OS string, in place. - /// - /// On Unix, this always succeeds and is zero cost. On non-Unix systems, - /// this returns the original byte string if it is not valid UTF-8. 
- /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use std::ffi::OsStr; - /// - /// use bstr::BString; - /// - /// let bs = BString::from("foo"); - /// let os_str = bs.into_os_string().expect("should be valid UTF-8"); - /// assert_eq!(os_str, OsStr::new("foo")); - /// ``` - #[inline] - pub fn into_os_string(self) -> Result { - self.into_os_string_imp() - } - - #[cfg(unix)] - #[inline] - fn into_os_string_imp(self) -> Result { - use std::os::unix::ffi::OsStringExt; - - Ok(OsString::from_vec(self.into_vec())) - } - - #[cfg(not(unix))] - #[inline] - fn into_os_string_imp(self) -> Result { - match self.into_string() { - Ok(s) => Ok(OsString::from(s)), - Err(err) => Err(err.into_bstring()), - } - } - - /// Lossily converts this byte string into an OS string, in place. - /// - /// On Unix, this always succeeds and is zero cost. On non-Unix systems, - /// this will perform a UTF-8 check and lossily convert this byte string - /// into valid UTF-8 using the Unicode replacement codepoint. - /// - /// Note that this can prevent the correct roundtripping of file paths on - /// non-Unix systems such as Windows, where file paths are an arbitrary - /// sequence of 16-bit integers. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::BString; - /// - /// let bs = BString::from_slice(b"foo\xFFbar"); - /// let os_str = bs.into_os_string_lossy(); - /// assert_eq!(os_str.to_string_lossy(), "foo\u{FFFD}bar"); - /// ``` - #[inline] - pub fn into_os_string_lossy(self) -> OsString { - self.into_os_string_lossy_imp() - } - - #[cfg(unix)] - #[inline] - fn into_os_string_lossy_imp(self) -> OsString { - use std::os::unix::ffi::OsStringExt; - - OsString::from_vec(self.into_vec()) - } - - #[cfg(not(unix))] - #[inline] - fn into_os_string_lossy_imp(self) -> OsString { - OsString::from(self.into_string_lossy()) - } - - /// Converts this byte string into an owned file path, in place. - /// - /// On Unix, this always succeeds and is zero cost. 
On non-Unix systems, - /// this returns the original byte string if it is not valid UTF-8. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::BString; - /// - /// let bs = BString::from("foo"); - /// let path = bs.into_path_buf().expect("should be valid UTF-8"); - /// assert_eq!(path.as_os_str(), "foo"); - /// ``` - #[inline] - pub fn into_path_buf(self) -> Result { - self.into_os_string().map(PathBuf::from) - } - - /// Lossily converts this byte string into an owned file path, in place. - /// - /// On Unix, this always succeeds and is zero cost. On non-Unix systems, - /// this will perform a UTF-8 check and lossily convert this byte string - /// into valid UTF-8 using the Unicode replacement codepoint. - /// - /// Note that this can prevent the correct roundtripping of file paths on - /// non-Unix systems such as Windows, where file paths are an arbitrary - /// sequence of 16-bit integers. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::BString; - /// - /// let bs = BString::from_slice(b"foo\xFFbar"); - /// let path = bs.into_path_buf_lossy(); - /// assert_eq!(path.to_string_lossy(), "foo\u{FFFD}bar"); - /// ``` - #[inline] - pub fn into_path_buf_lossy(self) -> PathBuf { - PathBuf::from(self.into_os_string_lossy()) - } - - /// Converts this `BString` into a `Box`. - /// - /// This will drop any excess capacity. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::BString; - /// - /// let s = BString::from("foobar"); - /// let b = s.into_boxed_bstr(); - /// assert_eq!(6, b.len()); - /// ``` - #[inline] - pub fn into_boxed_bstr(self) -> Box { - unsafe { - let slice = self.bytes.into_boxed_slice(); - Box::from_raw(Box::into_raw(slice) as *mut BStr) - } - } - - /// Returns this byte string's capacity, in bytes. 
- /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::BString; - /// - /// let s = BString::with_capacity(10); - /// assert_eq!(10, s.capacity()); - /// ``` - #[inline] - pub fn capacity(&self) -> usize { - self.bytes.capacity() - } - - /// Truncates this byte string, removing all contents. - /// - /// The resulting byte string will always have length `0`, but its capacity - /// remains unchanged. - #[inline] - pub fn clear(&mut self) { - self.bytes.clear(); - } - - /// Ensures that this `BString`'s capacity is at least `additional` - /// bytes larger than its length. - /// - /// The capacity may be increased by more than `additional` bytes if it - /// chooses, to prevent frequent reallocations. - /// - /// If you do not want this "at least" behavior, use the - /// [`reserve_exact`](#method.reserve_exact) method instead. - /// - /// # Panics - /// - /// Panics if the new capacity overflows `usize`. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::BString; - /// - /// let mut s = BString::new(); - /// s.reserve(10); - /// assert!(s.capacity() >= 10); - /// ``` - #[inline] - pub fn reserve(&mut self, additional: usize) { - self.bytes.reserve(additional); - } - - /// Ensures that this `BString`'s capacity is exactly `additional` - /// bytes larger than its length. - /// - /// Consider using the [`reserve`](#method.reserve) method unless you - /// absolutely know better than the allocator. - /// - /// # Panics - /// - /// Panics if the new capacity overflows `usize`. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::BString; - /// - /// let mut s = BString::new(); - /// s.reserve_exact(10); - /// assert!(s.capacity() >= 10); - /// ``` - #[inline] - pub fn reserve_exact(&mut self, additional: usize) { - self.bytes.reserve_exact(additional); - } - - /// Shrinks the capacity of this `BString` to match its length. 
- /// - /// Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::BString; - /// - /// let mut s = BString::from("foo"); - /// s.reserve(10); - /// assert!(s.capacity() >= 10); - /// s.shrink_to_fit(); - /// assert_eq!(3, s.capacity()); - /// ``` - #[inline] - pub fn shrink_to_fit(&mut self) { - self.bytes.shrink_to_fit(); - } - - /// Shortens this `BString` to the specified length, in bytes. - /// - /// If `new_len` is greater than or equal to this byte string's current - /// length, then this has no effect. - /// - /// Note that this does _not_ panic if the result is not on a valid - /// `char` boundary. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::BString; - /// - /// let mut s = BString::from("foobar"); - /// s.truncate(3); - /// assert_eq!("foo", s); - /// ``` #[inline] - pub fn truncate(&mut self, new_len: usize) { - if new_len < self.len() { - self.bytes.truncate(new_len); - } + pub(crate) fn as_bstr(&self) -> &BStr { + BStr::new(&self.bytes) } - /// Resizes this byte string in place so that the length of this byte - /// string is equivalent to `new_len`. - /// - /// If `new_len` is greater than the length of this byte string, then - /// the byte string is extended by the difference, which each additional - /// byte filled with the given value. If `new_len` is less than the length - /// of this byte string, then it is simply truncated. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::BString; - /// - /// let mut s = BString::from("f"); - /// s.resize(3, b'o'); - /// assert_eq!(s, "foo"); - /// s.resize(1, b'o'); - /// assert_eq!(s, "f"); - /// ``` #[inline] - pub fn resize(&mut self, new_len: usize, value: u8) { - self.bytes.resize(new_len, value); - } - - /// Removes the last codepoint from this `BString` and returns it. - /// - /// If this byte string is empty, then `None` is returned. 
If the last - /// bytes of this byte string do not correspond to a valid UTF-8 code unit - /// sequence, then the Unicode replacement codepoint is yielded instead in - /// accordance with the - /// [replacement codepoint substitution policy](index.html#handling-of-invalid-utf8-8). - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::BString; - /// - /// let mut s = BString::from("foo"); - /// assert_eq!(s.pop_char(), Some('o')); - /// assert_eq!(s.pop_char(), Some('o')); - /// assert_eq!(s.pop_char(), Some('f')); - /// assert_eq!(s.pop_char(), None); - /// ``` - /// - /// This shows the replacement codepoint substitution policy. Note that - /// the first pop yields a replacement codepoint but actually removes two - /// bytes. This is in contrast with subsequent pops when encountering - /// `\xFF` since `\xFF` is never a valid prefix for any valid UTF-8 - /// code unit sequence. - /// - /// ``` - /// use bstr::BString; - /// - /// let mut s = BString::from_slice(b"f\xFF\xFF\xFFoo\xE2\x98"); - /// assert_eq!(s.pop_char(), Some('\u{FFFD}')); - /// assert_eq!(s.pop_char(), Some('o')); - /// assert_eq!(s.pop_char(), Some('o')); - /// assert_eq!(s.pop_char(), Some('\u{FFFD}')); - /// assert_eq!(s.pop_char(), Some('\u{FFFD}')); - /// assert_eq!(s.pop_char(), Some('\u{FFFD}')); - /// assert_eq!(s.pop_char(), Some('f')); - /// assert_eq!(s.pop_char(), None); - /// ``` - #[inline] - pub fn pop_char(&mut self) -> Option { - let (ch, size) = utf8::decode_last_lossy(self.as_bytes()); - if size == 0 { - return None; - } - let new_len = self.len() - size; - self.truncate(new_len); - Some(ch) - } - - /// Removes the last byte from this `BString` and returns it. - /// - /// If this byte string is empty, then `None` is returned. - /// - /// Note that if the last codepoint in this byte string is not ASCII, then - /// removing the last byte could make this byte string contain invalid - /// UTF-8. 
- /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::BString; - /// - /// let mut s = BString::from("foo"); - /// assert_eq!(s.pop_byte(), Some(b'o')); - /// assert_eq!(s.pop_byte(), Some(b'o')); - /// assert_eq!(s.pop_byte(), Some(b'f')); - /// assert_eq!(s.pop_byte(), None); - /// ``` - #[inline] - pub fn pop_byte(&mut self) -> Option { - self.bytes.pop() - } - - /// **DEPRECATED**: Use - /// [`pop_char`](struct.BString.html#method.pop_char) - /// or - /// [`pop_byte`](struct.BString.html#method.pop_byte) - /// instead. - /// - /// Removes the last codepoint from this `BString` and returns it. - /// - /// If this byte string is empty, then `None` is returned. If the last - /// bytes of this byte string do not correspond to a valid UTF-8 code unit - /// sequence, then the Unicode replacement codepoint is yielded instead in - /// accordance with the - /// [replacement codepoint substitution policy](index.html#handling-of-invalid-utf8-8). - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::BString; - /// - /// let mut s = BString::from("foo"); - /// assert_eq!(s.pop(), Some('o')); - /// assert_eq!(s.pop(), Some('o')); - /// assert_eq!(s.pop(), Some('f')); - /// assert_eq!(s.pop(), None); - /// ``` - /// - /// This shows the replacement codepoint substitution policy. Note that - /// the first pop yields a replacement codepoint but actually removes two - /// bytes. This is in contrast with subsequent pops when encountering - /// `\xFF` since `\xFF` is never a valid prefix for any valid UTF-8 - /// code unit sequence. 
- /// - /// ``` - /// use bstr::BString; - /// - /// let mut s = BString::from_slice(b"f\xFF\xFF\xFFoo\xE2\x98"); - /// assert_eq!(s.pop(), Some('\u{FFFD}')); - /// assert_eq!(s.pop(), Some('o')); - /// assert_eq!(s.pop(), Some('o')); - /// assert_eq!(s.pop(), Some('\u{FFFD}')); - /// assert_eq!(s.pop(), Some('\u{FFFD}')); - /// assert_eq!(s.pop(), Some('\u{FFFD}')); - /// assert_eq!(s.pop(), Some('f')); - /// assert_eq!(s.pop(), None); - /// ``` - #[deprecated(since = "0.1.1", note = "use pop_char or pop_byte instead")] - #[inline] - pub fn pop(&mut self) -> Option { - self.pop_char() - } - - /// Removes a `char` from this `BString` at the given byte position and - /// returns it. - /// - /// If the bytes at the given position do not lead to a valid UTF-8 code - /// unit sequence, then a - /// [replacement codepoint is returned instead](index.html#handling-of-invalid-utf8-8). - /// - /// # Panics - /// - /// Panics if `at` is larger than or equal to this byte string's length. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::BString; - /// - /// let mut s = BString::from("foo☃bar"); - /// assert_eq!('☃', s.remove(3)); - /// assert_eq!("foobar", s); - /// ``` - /// - /// This example shows how the Unicode replacement codepoint policy is - /// used: - /// - /// ``` - /// use bstr::BString; - /// - /// let mut s = BString::from_slice(b"foo\xFFbar"); - /// assert_eq!('\u{FFFD}', s.remove(3)); - /// assert_eq!("foobar", s); - /// ``` - #[inline] - pub fn remove(&mut self, at: usize) -> char { - let (ch, size) = utf8::decode_lossy(self[at..].as_bytes()); - assert!(size > 0, "expected {} to be less than {}", at, self.len()); - self.bytes.drain(at..at + size); - ch - } - - /// Inserts the given codepoint into this `BString` at a particular byte - /// position. - /// - /// This is an `O(n)` operation as it may copy a number of elements in this - /// byte string proportional to its length. 
- /// - /// # Panics - /// - /// Panics if `at` is larger than the byte string's length. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::BString; - /// - /// let mut s = BString::from("foobar"); - /// s.insert_char(3, '☃'); - /// assert_eq!("foo☃bar", s); - /// ``` - #[inline] - pub fn insert_char(&mut self, at: usize, ch: char) { - self.insert(at, ch.encode_utf8(&mut [0; 4]).as_bytes()); - } - - /// Inserts the given byte string into this byte string at a particular - /// byte position. - /// - /// This is an `O(n)` operation as it may copy a number of elements in this - /// byte string proportional to its length. - /// - /// Note that the type parameter `B` on this method means that it can - /// accept anything that can be cheaply converted to a `&[u8]`. This - /// includes, but is not limited to, `&str`, `&BStr` and `&[u8]` itself. - /// - /// # Panics - /// - /// Panics if `at` is larger than the byte string's length. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::BString; - /// - /// let mut s = BString::from("foobar"); - /// s.insert(3, "☃☃☃"); - /// assert_eq!("foo☃☃☃bar", s); - /// ``` - #[inline] - pub fn insert>(&mut self, at: usize, bytes: B) { - assert!(at <= self.len(), "expected {} to be <= {}", at, self.len()); - - let bytes = bytes.as_ref(); - let len = self.len(); - - // SAFETY: We'd like to efficiently splice in the given bytes into - // this byte string. Since we are only working with `u8` elements here, - // we only need to consider whether our bounds are correct and whether - // our byte string has enough space. - self.reserve(bytes.len()); - unsafe { - // Shift bytes after `at` over by the length of `bytes` to make - // room for it. This requires referencing two regions of memory - // that may overlap, so we use ptr::copy. 
- ptr::copy( - self.bytes.as_ptr().add(at), - self.bytes.as_mut_ptr().add(at + bytes.len()), - len - at, - ); - // Now copy the bytes given into the room we made above. In this - // case, we know that the given bytes cannot possibly overlap - // with this byte string since we have a mutable borrow of the - // latter. Thus, we can use a nonoverlapping copy. - ptr::copy_nonoverlapping( - bytes.as_ptr(), - self.bytes.as_mut_ptr().add(at), - bytes.len(), - ); - self.bytes.set_len(len + bytes.len()); - } - } - - /// Splits this `BString` into two separate byte strings at the given - /// index. - /// - /// This returns a newly allocated `BString`, while `self` retans bytes - /// `[0, at)` and the returned `BString` contains bytes `[at, len)`. - /// - /// The capacity of `self` does not change. - /// - /// # Panics - /// - /// Panics if `at` is beyond the end of this byte string. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::BString; - /// - /// let mut s = BString::from("foobar"); - /// let bar = s.split_off(3); - /// assert_eq!(s, "foo"); - /// assert_eq!(bar, "bar"); - /// ``` - #[inline] - pub fn split_off(&mut self, at: usize) -> BString { - BString::from(self.bytes.split_off(at)) - } - - /// Removes the specified range in this byte string and replaces it with - /// the given bytes. The given bytes do not need to have the same length - /// as the range provided. - /// - /// # Panics - /// - /// Panics if the given range is invalid. 
- /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::BString; - /// - /// let mut s = BString::from("foobar"); - /// s.replace_range(2..4, "xxxxx"); - /// assert_eq!(s, "foxxxxxar"); - /// ``` - #[inline] - pub fn replace_range( - &mut self, - range: R, - replace_with: B, - ) where R: ops::RangeBounds, - B: AsRef<[u8]> - { - self.bytes.splice(range, replace_with.as_ref().iter().cloned()); - } - - /// Creates a draining iterator that removes the specified range in this - /// `BString` and yields each of the removed bytes. - /// - /// Note that the elements specified by the given range are removed - /// regardless of whether the returned iterator is fully exhausted. - /// - /// Also note that is is unspecified how many bytes are removed from the - /// `BString` if the `DrainBytes` iterator is leaked. - /// - /// # Panics - /// - /// Panics if the given range is not valid. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::BString; - /// - /// let mut s = BString::from("foobar"); - /// { - /// let mut drainer = s.drain_bytes(2..4); - /// assert_eq!(drainer.next(), Some(b'o')); - /// assert_eq!(drainer.next(), Some(b'b')); - /// assert_eq!(drainer.next(), None); - /// } - /// assert_eq!(s, "foar"); - /// ``` - #[inline] - pub fn drain_bytes( - &mut self, - range: R, - ) -> DrainBytes - where R: ops::RangeBounds - { - DrainBytes { it: self.bytes.drain(range) } - } -} - -/// A draining byte oriented iterator for `BString`. -/// -/// This iterator is created by -/// [`BString::drain`](struct.BString.html#method.drain). 
-/// -/// # Examples -/// -/// Basic usage: -/// -/// ``` -/// use bstr::BString; -/// -/// let mut s = BString::from("foobar"); -/// { -/// let mut drainer = s.drain_bytes(2..4); -/// assert_eq!(drainer.next(), Some(b'o')); -/// assert_eq!(drainer.next(), Some(b'b')); -/// assert_eq!(drainer.next(), None); -/// } -/// assert_eq!(s, "foar"); -/// ``` -#[derive(Debug)] -pub struct DrainBytes<'a> { - it: vec::Drain<'a, u8>, -} - -impl<'a> iter::FusedIterator for DrainBytes<'a> {} - -impl<'a> Iterator for DrainBytes<'a> { - type Item = u8; - - #[inline] - fn next(&mut self) -> Option { - self.it.next() - } -} - -impl<'a> DoubleEndedIterator for DrainBytes<'a> { - #[inline] - fn next_back(&mut self) -> Option { - self.it.next_back() - } -} - -impl<'a> ExactSizeIterator for DrainBytes<'a> { - #[inline] - fn len(&self) -> usize { - self.it.len() - } -} - -/// An error that may occur when converting a `BString` to a `String`. -/// -/// This error includes the original `BString` that failed to convert to a -/// `String`. This permits callers to recover the allocation used even if it -/// it not valid UTF-8. -/// -/// # Examples -/// -/// Basic usage: -/// -/// ``` -/// use bstr::{B, BString}; -/// -/// let bytes = BString::from_slice(b"foo\xFFbar"); -/// let err = bytes.into_string().unwrap_err(); -/// -/// assert_eq!(err.utf8_error().valid_up_to(), 3); -/// assert_eq!(err.utf8_error().error_len(), Some(1)); -/// -/// // At no point in this example is an allocation performed. -/// let bytes = BString::from(err.into_bstring()); -/// assert_eq!(bytes, B(b"foo\xFFbar")); -/// ``` -#[derive(Debug, Eq, PartialEq)] -pub struct FromUtf8Error { - original: BString, - err: Utf8Error, -} - -impl FromUtf8Error { - /// Return the original bytes as a slice that failed to convert to a - /// `String`. 
- /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::{B, BString}; - /// - /// let bytes = BString::from_slice(b"foo\xFFbar"); - /// let err = bytes.into_string().unwrap_err(); - /// - /// // At no point in this example is an allocation performed. - /// assert_eq!(err.as_bstr(), B(b"foo\xFFbar")); - /// ``` - #[inline] - pub fn as_bstr(&self) -> &BStr { - &self.original - } - - /// Consume this error and return the original byte string that failed to - /// convert to a `String`. - /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::{B, BString}; - /// - /// let bytes = BString::from_slice(b"foo\xFFbar"); - /// let err = bytes.into_string().unwrap_err(); - /// let original = err.into_bstring(); - /// - /// // At no point in this example is an allocation performed. - /// assert_eq!(original, B(b"foo\xFFbar")); - /// ``` - #[inline] - pub fn into_bstring(self) -> BString { - self.original - } - - /// Return the underlying UTF-8 error that occurred. This error provides - /// information on the nature and location of the invalid UTF-8 detected. 
- /// - /// # Examples - /// - /// Basic usage: - /// - /// ``` - /// use bstr::{B, BString}; - /// - /// let bytes = BString::from_slice(b"foo\xFFbar"); - /// let err = bytes.into_string().unwrap_err(); - /// - /// assert_eq!(err.utf8_error().valid_up_to(), 3); - /// assert_eq!(err.utf8_error().error_len(), Some(1)); - /// ``` - #[inline] - pub fn utf8_error(&self) -> &Utf8Error { - &self.err - } -} - -impl error::Error for FromUtf8Error { - #[inline] - fn description(&self) -> &str { "invalid UTF-8 vector" } -} - -impl fmt::Display for FromUtf8Error { - #[inline] - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{}", self.err) - } -} - -#[cfg(test)] -mod tests { - use bstr::B; - use super::*; - - #[test] - fn insert() { - let mut s = BString::new(); - s.insert(0, "foo"); - assert_eq!("foo", s); - - let mut s = BString::from("a"); - s.insert(0, "foo"); - assert_eq!("fooa", s); - - let mut s = BString::from("a"); - s.insert(1, "foo"); - assert_eq!("afoo", s); - - let mut s = BString::from("foobar"); - s.insert(3, "quux"); - assert_eq!("fooquuxbar", s); - - let mut s = BString::from("foobar"); - s.insert(3, "x"); - assert_eq!("fooxbar", s); - - let mut s = BString::from("foobar"); - s.insert(0, "x"); - assert_eq!("xfoobar", s); - - let mut s = BString::from("foobar"); - s.insert(6, "x"); - assert_eq!("foobarx", s); - - let mut s = BString::from("foobar"); - s.insert(3, "quuxbazquux"); - assert_eq!("fooquuxbazquuxbar", s); - } - - #[test] - #[should_panic] - fn insert_fail1() { - let mut s = BString::new(); - s.insert(1, "foo"); - } - - #[test] - #[should_panic] - fn insert_fail2() { - let mut s = BString::from("a"); - s.insert(2, "foo"); - } - - #[test] - #[should_panic] - fn insert_fail3() { - let mut s = BString::from("foobar"); - s.insert(7, "foo"); - } - - #[test] - fn collect() { - let s: BString = vec!['a', 'b', 'c'].into_iter().collect(); - assert_eq!(s, "abc"); - - let s: BString = vec!["a", "b", "c"].into_iter().collect(); - 
assert_eq!(s, "abc"); - - let s: BString = vec![B("a"), B("b"), B("c")].into_iter().collect(); - assert_eq!(s, "abc"); + pub(crate) fn as_mut_bstr(&mut self) -> &mut BStr { + BStr::new_mut(&mut self.bytes) } } diff --git a/src/cow.rs b/src/cow.rs index 9e6efa9..971490c 100644 --- a/src/cow.rs +++ b/src/cow.rs @@ -2,51 +2,47 @@ use std::borrow::Cow; use core::ops; -use bstr::BStr; -#[cfg(feature = "std")] -use bstring::BString; - -/// A specialized copy-on-write BStr. +/// A specialized copy-on-write byte string. /// -/// The purpose of this type is to permit usage of a "borrowed or owned byte -/// string" in a way that keeps std/no-std compatibility. That is, in no-std -/// mode, this type devolves into a simple &BStr with no owned variant +/// The purpose of this type is to permit usage of a "borrowed or owned +/// byte string" in a way that keeps std/no-std compatibility. That is, in +/// no-std mode, this type devolves into a simple &[u8] with no owned variant /// availble. #[derive(Clone, Debug)] -pub struct CowBStr<'a>(Imp<'a>); +pub struct CowBytes<'a>(Imp<'a>); #[cfg(feature = "std")] #[derive(Clone, Debug)] -struct Imp<'a>(Cow<'a, BStr>); +struct Imp<'a>(Cow<'a, [u8]>); #[cfg(not(feature = "std"))] #[derive(Clone, Debug)] -struct Imp<'a>(&'a BStr); +struct Imp<'a>(&'a [u8]); -impl<'a> ops::Deref for CowBStr<'a> { - type Target = BStr; +impl<'a> ops::Deref for CowBytes<'a> { + type Target = [u8]; - fn deref(&self) -> &BStr { - self.as_bstr() + fn deref(&self) -> &[u8] { + self.as_slice() } } -impl<'a> CowBStr<'a> { - /// Create a new borrowed CowBStr. - pub fn new>(bytes: &'a B) -> CowBStr<'a> { - CowBStr(Imp::new(BStr::new(bytes))) +impl<'a> CowBytes<'a> { + /// Create a new borrowed CowBytes. + pub fn new>(bytes: &'a B) -> CowBytes<'a> { + CowBytes(Imp::new(bytes.as_ref())) } - /// Create a new owned CowBStr. + /// Create a new owned CowBytes. 
#[cfg(feature = "std")] - pub fn new_owned(bytes: BString) -> CowBStr<'static> { - CowBStr(Imp(Cow::Owned(bytes))) + pub fn new_owned(bytes: Vec) -> CowBytes<'static> { + CowBytes(Imp(Cow::Owned(bytes))) } /// Return a borrowed byte string, regardless of whether this is an owned /// or borrowed byte string internally. - pub fn as_bstr(&self) -> &BStr { - self.0.as_bstr() + pub fn as_slice(&self) -> &[u8] { + self.0.as_slice() } /// Return an owned version of this copy-on-write byte string. @@ -54,28 +50,27 @@ impl<'a> CowBStr<'a> { /// If this is already an owned byte string internally, then this is a /// no-op. Otherwise, the internal byte string is copied. #[cfg(feature = "std")] - pub fn into_owned(self) -> CowBStr<'static> { + pub fn into_owned(self) -> CowBytes<'static> { match (self.0).0 { - Cow::Borrowed(b) => CowBStr::new_owned(b.to_bstring()), - Cow::Owned(b) => CowBStr::new_owned(b), + Cow::Borrowed(b) => CowBytes::new_owned(b.to_vec()), + Cow::Owned(b) => CowBytes::new_owned(b), } } } impl<'a> Imp<'a> { #[cfg(feature = "std")] - pub fn new(bytes: &'a BStr) -> Imp<'a> { + pub fn new(bytes: &'a [u8]) -> Imp<'a> { Imp(Cow::Borrowed(bytes)) } #[cfg(not(feature = "std"))] - pub fn new(bytes: &'a BStr) -> Imp<'a> { + pub fn new(bytes: &'a [u8]) -> Imp<'a> { Imp(bytes) } #[cfg(feature = "std")] - pub fn as_bstr(&self) -> &BStr { - // &*self.0 + pub fn as_slice(&self) -> &[u8] { match self.0 { Cow::Owned(ref x) => x, Cow::Borrowed(x) => x, @@ -83,7 +78,7 @@ impl<'a> Imp<'a> { } #[cfg(not(feature = "std"))] - pub fn as_bstr(&self) -> &BStr { + pub fn as_slice(&self) -> &[u8] { self.0 } } diff --git a/src/ext_slice.rs b/src/ext_slice.rs new file mode 100644 index 0000000..0c7474e --- /dev/null +++ b/src/ext_slice.rs @@ -0,0 +1,3471 @@ +#[cfg(feature = "std")] +use std::borrow::Cow; +#[cfg(feature = "std")] +use std::ffi::OsStr; +#[cfg(feature = "std")] +use std::path::Path; + +use core::cmp; +use core::ops; +use core::ptr; +use core::slice; +use core::str; + +use 
memchr::{memchr, memrchr}; + +use ascii; +use bstr::BStr; +#[cfg(feature = "std")] +use ext_vec::ByteVec; +use search::{PrefilterState, TwoWay}; +#[cfg(feature = "unicode")] +use unicode::{ + Graphemes, GraphemeIndices, + Sentences, SentenceIndices, + Words, WordIndices, WordsWithBreaks, WordsWithBreakIndices, + whitespace_len_fwd, whitespace_len_rev, +}; +use utf8::{self, Chars, CharIndices, Utf8Error}; + +/// A short-hand constructor for building a `&[u8]`. +/// +/// This idiosyncratic constructor is useful for concisely building byte string +/// slices. Its primary utility is in conveniently writing byte string literals +/// in a uniform way. For example, consider this code that does not compile: +/// +/// ```ignore +/// let strs = vec![b"a", b"xy"]; +/// ``` +/// +/// The above code doesn't compile because the type of the byte string literal +/// `b"a"` is `&'static [u8; 1]`, and the type of `b"xy"` is +/// `&'static [u8; 2]`. Since their types aren't the same, they can't be stored +/// in the same `Vec`. (This is dissimilar from normal Unicode string slices, +/// where both `"a"` and `"xy"` have the same type of `&'static str`.) +/// +/// One way of getting the above code to compile is to convert byte strings to +/// slices. You might try this: +/// +/// ```ignore +/// let strs = vec![&b"a", &b"xy"]; +/// ``` +/// +/// But this just creates values with type `& &'static [u8; 1]` and +/// `& &'static [u8; 2]`. Instead, you need to force the issue like so: +/// +/// ``` +/// let strs = vec![&b"a"[..], &b"xy"[..]]; +/// // or +/// let strs = vec![b"a".as_ref(), b"xy".as_ref()]; +/// ``` +/// +/// But neither of these are particularly convenient to type, especially when +/// it's something as common as a string literal. 
Thus, this constructor +/// permits writing the following instead: +/// +/// ``` +/// use bstr::B; +/// +/// let strs = vec![B("a"), B(b"xy")]; +/// ``` +/// +/// Notice that this also lets you mix and match both string literals and byte +/// string literals. This can be quite convenient! +#[allow(non_snake_case)] +#[inline] +pub fn B<'a, B: ?Sized + AsRef<[u8]>>(bytes: &'a B) -> &'a [u8] { + bytes.as_ref() +} + +impl ByteSlice for [u8] { + fn as_bytes(&self) -> &[u8] { self } + fn as_bytes_mut(&mut self) -> &mut [u8] { self } +} + +/// Ensure that callers cannot implement `ByteSlice` by making an +/// unimplementable trait its super trait. +pub trait Sealed {} +impl Sealed for [u8] {} + +/// A trait that extends a slice of bytes with string oriented methods. +pub trait ByteSlice: Sealed { + /// A method for accessing the raw bytes of this type. This is always a + /// no-op and callers shouldn't care about it. This only exists for making + /// the extension trait work. + #[doc(hidden)] + fn as_bytes(&self) -> &[u8]; + + /// A method for accessing the raw bytes of this type, mutably. This is + /// always a no-op and callers shouldn't care about it. This only exists + /// for making the extension trait work. + #[doc(hidden)] + fn as_bytes_mut(&mut self) -> &mut [u8]; + + /// Return this byte slice as a `&BStr`. + /// + /// Using `&BStr` is useful because of its `fmt::Debug` representation + /// and various other trait implementations (such as `PartialEq` and + /// `PartialOrd`). In particular, the `Debug` implementation for `BStr` + /// shows its bytes as a normal string. For invalid UTF-8, hex escape + /// sequences are used. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::ByteSlice; + /// + /// println!("{:?}", b"foo\xFFbar".as_bstr()); + /// ``` + fn as_bstr(&self) -> &BStr { + BStr::new(self.as_bytes()) + } + + /// Return this byte slice as a `&mut BStr`. 
+ /// + /// Using `&mut BStr` is useful because of its `fmt::Debug` representation + /// and various other trait implementations (such as `PartialEq` and + /// `PartialOrd`). In particular, the `Debug` implementation for `BStr` + /// shows its bytes as a normal string. For invalid UTF-8, hex escape + /// sequences are used. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::ByteSlice; + /// + /// let mut bytes = *b"foo\xFFbar"; + /// println!("{:?}", &mut bytes.as_bstr_mut()); + /// ``` + fn as_bstr_mut(&mut self) -> &mut BStr { + BStr::new_mut(self.as_bytes_mut()) + } + + /// Create an immutable byte string from an OS string slice. + /// + /// On Unix, this always succeeds and is zero cost. On non-Unix systems, + /// this returns `None` if the given OS string is not valid UTF-8. (For + /// example, on Windows, file paths are allowed to be a sequence of + /// arbitrary 16-bit integers. Not all such sequences can be transcoded to + /// valid UTF-8.) + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use std::ffi::OsStr; + /// + /// use bstr::{B, ByteSlice}; + /// + /// let os_str = OsStr::new("foo"); + /// let bs = <[u8]>::from_os_str(os_str).expect("should be valid UTF-8"); + /// assert_eq!(bs, B("foo")); + /// ``` + #[cfg(feature = "std")] + #[inline] + fn from_os_str(os_str: &OsStr) -> Option<&[u8]> { + #[cfg(unix)] + #[inline] + fn imp(os_str: &OsStr) -> Option<&[u8]> { + use std::os::unix::ffi::OsStrExt; + + Some(os_str.as_bytes()) + } + + #[cfg(not(unix))] + #[inline] + fn imp(os_str: &OsStr) -> Option<&[u8]> { + os_str.to_str().map(|s| s.as_bytes()) + } + + imp(os_str) + } + + /// Create an immutable byte string from a file path. + /// + /// On Unix, this always succeeds and is zero cost. On non-Unix systems, + /// this returns `None` if the given path is not valid UTF-8. (For example, + /// on Windows, file paths are allowed to be a sequence of arbitrary 16-bit + /// integers. 
Not all such sequences can be transcoded to valid UTF-8.) + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use std::path::Path; + /// + /// use bstr::{B, ByteSlice}; + /// + /// let path = Path::new("foo"); + /// let bs = <[u8]>::from_path(path).expect("should be valid UTF-8"); + /// assert_eq!(bs, B("foo")); + /// ``` + #[cfg(feature = "std")] + #[inline] + fn from_path(path: &Path) -> Option<&[u8]> { + Self::from_os_str(path.as_os_str()) + } + + /// Safely convert this byte string into a `&str` if it's valid UTF-8. + /// + /// If this byte string is not valid UTF-8, then an error is returned. The + /// error returned indicates the first invalid byte found and the length + /// of the error. + /// + /// In cases where a lossy conversion to `&str` is acceptable, then use one + /// of the [`to_str_lossy`](trait.ByteSlice.html#method.to_str_lossy) or + /// [`to_str_lossy_into`](trait.ByteSlice.html#method.to_str_lossy_into) + /// methods. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::{B, ByteSlice, ByteVec}; + /// + /// # fn example() -> Result<(), bstr::Utf8Error> { + /// let s = B("☃βツ").to_str()?; + /// assert_eq!("☃βツ", s); + /// + /// let mut bstring = <Vec<u8>>::from("☃βツ"); + /// bstring.push(b'\xFF'); + /// let err = bstring.to_str().unwrap_err(); + /// assert_eq!(8, err.valid_up_to()); + /// # Ok(()) }; example().unwrap() + /// ``` + #[inline] + fn to_str(&self) -> Result<&str, Utf8Error> { + utf8::validate(self.as_bytes()).map(|_| { + // SAFETY: This is safe because of the guarantees provided by + // utf8::validate. + unsafe { + str::from_utf8_unchecked(self.as_bytes()) + } + }) + } + + /// Unsafely convert this byte string into a `&str`, without checking for + /// valid UTF-8. + /// + /// # Safety + /// + /// Callers *must* ensure that this byte string is valid UTF-8 before + /// calling this method. Converting a byte string into a `&str` that is + /// not valid UTF-8 is considered undefined behavior. 
+ /// + /// This routine is useful in performance sensitive contexts where the + /// UTF-8 validity of the byte string is already known and it is + /// undesirable to pay the cost of an additional UTF-8 validation check + /// that [`to_str`](trait.ByteSlice.html#method.to_str) performs. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::{B, ByteSlice}; + /// + /// // SAFETY: This is safe because string literals are guaranteed to be + /// // valid UTF-8 by the Rust compiler. + /// let s = unsafe { B("☃βツ").to_str_unchecked() }; + /// assert_eq!("☃βツ", s); + /// ``` + unsafe fn to_str_unchecked(&self) -> &str { + str::from_utf8_unchecked(self.as_bytes()) + } + + /// Convert this byte string to a valid UTF-8 string by replacing invalid + /// UTF-8 bytes with the Unicode replacement codepoint (`U+FFFD`). + /// + /// If the byte string is already valid UTF-8, then no copying or + /// allocation is performed and a borrowed string slice is returned. If + /// the byte string is not valid UTF-8, then an owned string buffer is + /// returned with invalid bytes replaced by the replacement codepoint. + /// + /// This method uses the "substitution of maximal subparts" (Unicode + /// Standard, Chapter 3, Section 9) strategy for inserting the replacement + /// codepoint. Specifically, a replacement codepoint is inserted whenever a + /// byte is found that cannot possibly lead to a valid code unit sequence. + /// If there were previous bytes that represented a prefix of a well-formed + /// code unit sequence, then all of those bytes are substituted with a + /// single replacement codepoint. The "substitution of maximal subparts" + /// strategy is the same strategy used by + /// [W3C's Encoding standard](https://www.w3.org/TR/encoding/). + /// For a more precise description of the maximal subpart strategy, see + /// the Unicode Standard, Chapter 3, Section 9. See also + /// [Public Review Issue #121](http://www.unicode.org/review/pr-121.html). 
+ /// + /// N.B. Rust's standard library also appears to use the same strategy, + /// but it does not appear to be an API guarantee. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use std::borrow::Cow; + /// + /// use bstr::ByteSlice; + /// + /// let mut bstring = <Vec<u8>>::from("☃βツ"); + /// assert_eq!(Cow::Borrowed("☃βツ"), bstring.to_str_lossy()); + /// + /// // Add a byte that makes the sequence invalid. + /// bstring.push(b'\xFF'); + /// assert_eq!(Cow::Borrowed("☃βツ\u{FFFD}"), bstring.to_str_lossy()); + /// ``` + /// + /// This demonstrates the "maximal subpart" substitution logic. + /// + /// ``` + /// use bstr::{B, ByteSlice}; + /// + /// // \x61 is the ASCII codepoint for 'a'. + /// // \xF1\x80\x80 is a valid 3-byte code unit prefix. + /// // \xE1\x80 is a valid 2-byte code unit prefix. + /// // \xC2 is a valid 1-byte code unit prefix. + /// // \x62 is the ASCII codepoint for 'b'. + /// // + /// // In sum, each of the prefixes is replaced by a single replacement + /// // codepoint since none of the prefixes are properly completed. This + /// // is in contrast to other strategies that might insert a replacement + /// // codepoint for every single byte. + /// let bs = B(b"\x61\xF1\x80\x80\xE1\x80\xC2\x62"); + /// assert_eq!("a\u{FFFD}\u{FFFD}\u{FFFD}b", bs.to_str_lossy()); + /// ``` + #[cfg(feature = "std")] + #[inline] + fn to_str_lossy(&self) -> Cow<str> { + match utf8::validate(self.as_bytes()) { + Ok(()) => { + // SAFETY: This is safe because of the guarantees provided by + // utf8::validate. + unsafe { + Cow::Borrowed(str::from_utf8_unchecked(self.as_bytes())) + } + } + Err(err) => { + let mut lossy = String::with_capacity(self.as_bytes().len()); + let (valid, after) = self + .as_bytes() + .split_at(err.valid_up_to()); + // SAFETY: This is safe because utf8::validate guarantees + // that all of `valid` is valid UTF-8. 
+ lossy.push_str(unsafe { str::from_utf8_unchecked(valid) }); + lossy.push_str("\u{FFFD}"); + if let Some(len) = err.error_len() { + after[len..].to_str_lossy_into(&mut lossy); + } + Cow::Owned(lossy) + } + } + } + + /// Copy the contents of this byte string into the given owned string + /// buffer, while replacing invalid UTF-8 code unit sequences with the + /// Unicode replacement codepoint (`U+FFFD`). + /// + /// This method uses the same "substitution of maximal subparts" strategy + /// for inserting the replacement codepoint as the + /// [`to_str_lossy`](trait.ByteSlice.html#method.to_str_lossy) method. + /// + /// This routine is useful for amortizing allocation. However, unlike + /// `to_str_lossy`, this routine will _always_ copy the contents of this + /// byte string into the destination buffer, even if this byte string is + /// valid UTF-8. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use std::borrow::Cow; + /// + /// use bstr::ByteSlice; + /// + /// let mut bstring = <Vec<u8>>::from("☃βツ"); + /// // Add a byte that makes the sequence invalid. + /// bstring.push(b'\xFF'); + /// + /// let mut dest = String::new(); + /// bstring.to_str_lossy_into(&mut dest); + /// assert_eq!("☃βツ\u{FFFD}", dest); + /// ``` + #[cfg(feature = "std")] + #[inline] + fn to_str_lossy_into(&self, dest: &mut String) { + let mut bytes = self.as_bytes(); + dest.reserve(bytes.len()); + loop { + match utf8::validate(bytes) { + Ok(()) => { + // SAFETY: This is safe because utf8::validate guarantees + // that all of `bytes` is valid UTF-8. + dest.push_str(unsafe { str::from_utf8_unchecked(bytes) }); + break; + } + Err(err) => { + let (valid, after) = bytes.split_at(err.valid_up_to()); + // SAFETY: This is safe because utf8::validate guarantees + // that all of `valid` is valid UTF-8. 
+ dest.push_str(unsafe { str::from_utf8_unchecked(valid) }); + dest.push_str("\u{FFFD}"); + match err.error_len() { + None => break, + Some(len) => bytes = &after[len..], + } + } + } + } + } + + /// Create an OS string slice from this byte string. + /// + /// On Unix, this always succeeds and is zero cost. On non-Unix systems, + /// this returns a UTF-8 decoding error if this byte string is not valid + /// UTF-8. (For example, on Windows, file paths are allowed to be a + /// sequence of arbitrary 16-bit integers. There is no obvious mapping from + /// an arbitrary sequence of 8-bit integers to an arbitrary sequence of + /// 16-bit integers.) + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::{B, ByteSlice}; + /// + /// let os_str = b"foo".to_os_str().expect("should be valid UTF-8"); + /// assert_eq!(os_str, "foo"); + /// ``` + #[cfg(feature = "std")] + #[inline] + fn to_os_str(&self) -> Result<&OsStr, Utf8Error> { + #[cfg(unix)] + #[inline] + fn imp(bytes: &[u8]) -> Result<&OsStr, Utf8Error> { + use std::os::unix::ffi::OsStrExt; + + Ok(OsStr::from_bytes(bytes)) + } + + #[cfg(not(unix))] + #[inline] + fn imp(bytes: &[u8]) -> Result<&OsStr, Utf8Error> { + bytes.to_str().map(OsStr::new) + } + + imp(self.as_bytes()) + } + + /// Lossily create an OS string slice from this byte string. + /// + /// On Unix, this always succeeds and is zero cost. On non-Unix systems, + /// this will perform a UTF-8 check and lossily convert this byte string + /// into valid UTF-8 using the Unicode replacement codepoint. + /// + /// Note that this can prevent the correct roundtripping of file paths on + /// non-Unix systems such as Windows, where file paths are an arbitrary + /// sequence of 16-bit integers. 
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::ByteSlice; + /// + /// let os_str = b"foo\xFFbar".to_os_str_lossy(); + /// assert_eq!(os_str.to_string_lossy(), "foo\u{FFFD}bar"); + /// ``` + #[cfg(feature = "std")] + #[inline] + fn to_os_str_lossy(&self) -> Cow<OsStr> { + #[cfg(unix)] + #[inline] + fn imp(bytes: &[u8]) -> Cow<OsStr> { + use std::os::unix::ffi::OsStrExt; + + Cow::Borrowed(OsStr::from_bytes(bytes)) + } + + #[cfg(not(unix))] + #[inline] + fn imp(bytes: &[u8]) -> Cow<OsStr> { + use std::ffi::OsString; + + match bytes.to_str_lossy() { + Cow::Borrowed(x) => Cow::Borrowed(OsStr::new(x)), + Cow::Owned(x) => Cow::Owned(OsString::from(x)), + } + } + + imp(self.as_bytes()) + } + + /// Create a path slice from this byte string. + /// + /// On Unix, this always succeeds and is zero cost. On non-Unix systems, + /// this returns a UTF-8 decoding error if this byte string is not valid + /// UTF-8. (For example, on Windows, file paths are allowed to be a + /// sequence of arbitrary 16-bit integers. There is no obvious mapping from + /// an arbitrary sequence of 8-bit integers to an arbitrary sequence of + /// 16-bit integers.) + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::ByteSlice; + /// + /// let path = b"foo".to_path().expect("should be valid UTF-8"); + /// assert_eq!(path.as_os_str(), "foo"); + /// ``` + #[cfg(feature = "std")] + #[inline] + fn to_path(&self) -> Result<&Path, Utf8Error> { + self.to_os_str().map(Path::new) + } + + /// Lossily create a path slice from this byte string. + /// + /// On Unix, this always succeeds and is zero cost. On non-Unix systems, + /// this will perform a UTF-8 check and lossily convert this byte string + /// into valid UTF-8 using the Unicode replacement codepoint. + /// + /// Note that this can prevent the correct roundtripping of file paths on + /// non-Unix systems such as Windows, where file paths are an arbitrary + /// sequence of 16-bit integers. 
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::ByteSlice; + /// + /// let bs = b"foo\xFFbar"; + /// let path = bs.to_path_lossy(); + /// assert_eq!(path.to_string_lossy(), "foo\u{FFFD}bar"); + /// ``` + #[cfg(feature = "std")] + #[inline] + fn to_path_lossy(&self) -> Cow<Path> { + use std::path::PathBuf; + + match self.to_os_str_lossy() { + Cow::Borrowed(x) => Cow::Borrowed(Path::new(x)), + Cow::Owned(x) => Cow::Owned(PathBuf::from(x)), + } + } + + /// Create a new byte string by repeating this byte string `n` times. + /// + /// # Panics + /// + /// This function panics if the capacity of the new byte string would + /// overflow. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::{B, ByteSlice}; + /// + /// assert_eq!(b"foo".repeatn(4), B("foofoofoofoo")); + /// assert_eq!(b"foo".repeatn(0), B("")); + /// ``` + #[cfg(feature = "std")] + #[inline] + fn repeatn(&self, n: usize) -> Vec<u8> { + let bs = self.as_bytes(); + let mut dst = vec![0; bs.len() * n]; + for i in 0..n { + dst[i * bs.len()..(i+1) * bs.len()].copy_from_slice(bs); + } + dst + } + + /// Returns true if and only if this byte string contains the given needle. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::ByteSlice; + /// + /// assert!(b"foo bar".contains_str("foo")); + /// assert!(b"foo bar".contains_str("bar")); + /// assert!(!b"foo".contains_str("foobar")); + /// ``` + #[inline] + fn contains_str<B: AsRef<[u8]>>(&self, needle: B) -> bool { + self.find(needle).is_some() + } + + /// Returns true if and only if this byte string has the given prefix. 
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::ByteSlice; + /// + /// assert!(b"foo bar".starts_with_str("foo")); + /// assert!(!b"foo bar".starts_with_str("bar")); + /// assert!(!b"foo".starts_with_str("foobar")); + /// ``` + #[inline] + fn starts_with_str<B: AsRef<[u8]>>(&self, prefix: B) -> bool { + self.as_bytes().starts_with(prefix.as_ref()) + } + + /// Returns true if and only if this byte string has the given suffix. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::ByteSlice; + /// + /// assert!(b"foo bar".ends_with_str("bar")); + /// assert!(!b"foo bar".ends_with_str("foo")); + /// assert!(!b"bar".ends_with_str("foobar")); + /// ``` + #[inline] + fn ends_with_str<B: AsRef<[u8]>>(&self, suffix: B) -> bool { + self.as_bytes().ends_with(suffix.as_ref()) + } + + /// Returns the index of the first occurrence of the given needle. + /// + /// The needle may be any type that can be cheaply converted into a + /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`. + /// + /// Note that if you are searching for the same needle in many + /// different small haystacks, it may be faster to initialize a + /// [`Finder`](struct.Finder.html) once, and reuse it for each search. + /// + /// # Complexity + /// + /// This routine is guaranteed to have worst case linear time complexity + /// with respect to both the needle and the haystack. That is, this runs + /// in `O(needle.len() + haystack.len())` time. + /// + /// This routine is also guaranteed to have worst case constant space + /// complexity. 
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::ByteSlice; + /// + /// let s = b"foo bar baz"; + /// assert_eq!(Some(0), s.find("foo")); + /// assert_eq!(Some(4), s.find("bar")); + /// assert_eq!(None, s.find("quux")); + /// ``` + #[inline] + fn find<B: AsRef<[u8]>>(&self, needle: B) -> Option<usize> { + Finder::new(needle.as_ref()).find(self.as_bytes()) + } + + /// Returns the index of the last occurrence of the given needle. + /// + /// The needle may be any type that can be cheaply converted into a + /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`. + /// + /// Note that if you are searching for the same needle in many + /// different small haystacks, it may be faster to initialize a + /// [`FinderReverse`](struct.FinderReverse.html) once, and reuse it for + /// each search. + /// + /// # Complexity + /// + /// This routine is guaranteed to have worst case linear time complexity + /// with respect to both the needle and the haystack. That is, this runs + /// in `O(needle.len() + haystack.len())` time. + /// + /// This routine is also guaranteed to have worst case constant space + /// complexity. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::ByteSlice; + /// + /// let s = b"foo bar baz"; + /// assert_eq!(Some(0), s.rfind("foo")); + /// assert_eq!(Some(4), s.rfind("bar")); + /// assert_eq!(Some(8), s.rfind("ba")); + /// assert_eq!(None, s.rfind("quux")); + /// ``` + #[inline] + fn rfind<B: AsRef<[u8]>>(&self, needle: B) -> Option<usize> { + FinderReverse::new(needle.as_ref()).rfind(self.as_bytes()) + } + + /// Returns an iterator of the non-overlapping occurrences of the given + /// needle. The iterator yields byte offset positions indicating the start + /// of each match. + /// + /// # Complexity + /// + /// This routine is guaranteed to have worst case linear time complexity + /// with respect to both the needle and the haystack. That is, this runs + /// in `O(needle.len() + haystack.len())` time. 
+ /// + /// This routine is also guaranteed to have worst case constant space + /// complexity. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::ByteSlice; + /// + /// let s = b"foo bar foo foo quux foo"; + /// let matches: Vec<usize> = s.find_iter("foo").collect(); + /// assert_eq!(matches, vec![0, 8, 12, 21]); + /// ``` + /// + /// An empty string matches at every position, including the position + /// immediately following the last byte: + /// + /// ``` + /// use bstr::ByteSlice; + /// + /// let matches: Vec<usize> = b"foo".find_iter("").collect(); + /// assert_eq!(matches, vec![0, 1, 2, 3]); + /// + /// let matches: Vec<usize> = b"".find_iter("").collect(); + /// assert_eq!(matches, vec![0]); + /// ``` + #[inline] + fn find_iter<'a, B: ?Sized + AsRef<[u8]>>( + &'a self, + needle: &'a B, + ) -> Find<'a> { + Find::new(self.as_bytes(), needle.as_ref()) + } + + /// Returns an iterator of the non-overlapping occurrences of the given + /// needle in reverse. The iterator yields byte offset positions indicating + /// the start of each match. + /// + /// # Complexity + /// + /// This routine is guaranteed to have worst case linear time complexity + /// with respect to both the needle and the haystack. That is, this runs + /// in `O(needle.len() + haystack.len())` time. + /// + /// This routine is also guaranteed to have worst case constant space + /// complexity.
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::ByteSlice; + /// + /// let s = b"foo bar foo foo quux foo"; + /// let matches: Vec<usize> = s.rfind_iter("foo").collect(); + /// assert_eq!(matches, vec![21, 12, 8, 0]); + /// ``` + /// + /// An empty string matches at every position, including the position + /// immediately following the last byte: + /// + /// ``` + /// use bstr::ByteSlice; + /// + /// let matches: Vec<usize> = b"foo".rfind_iter("").collect(); + /// assert_eq!(matches, vec![3, 2, 1, 0]); + /// + /// let matches: Vec<usize> = b"".rfind_iter("").collect(); + /// assert_eq!(matches, vec![0]); + /// ``` + #[inline] + fn rfind_iter<'a, B: ?Sized + AsRef<[u8]>>( + &'a self, + needle: &'a B, + ) -> FindReverse<'a> { + FindReverse::new(self.as_bytes(), needle.as_ref()) + } + + /// Returns the index of the first occurrence of the given byte. If the + /// byte does not occur in this byte string, then `None` is returned. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::ByteSlice; + /// + /// assert_eq!(Some(10), b"foo bar baz".find_byte(b'z')); + /// assert_eq!(None, b"foo bar baz".find_byte(b'y')); + /// ``` + #[inline] + fn find_byte(&self, byte: u8) -> Option<usize> { + memchr(byte, self.as_bytes()) + } + + /// Returns the index of the last occurrence of the given byte. If the + /// byte does not occur in this byte string, then `None` is returned. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::ByteSlice; + /// + /// assert_eq!(Some(10), b"foo bar baz".rfind_byte(b'z')); + /// assert_eq!(None, b"foo bar baz".rfind_byte(b'y')); + /// ``` + #[inline] + fn rfind_byte(&self, byte: u8) -> Option<usize> { + memrchr(byte, self.as_bytes()) + } + + /// Returns the index of the first occurrence of the given codepoint. + /// If the codepoint does not occur in this byte string, then `None` is + /// returned.
+ /// + /// Note that if one searches for the replacement codepoint, `\u{FFFD}`, + /// then only explicit occurrences of that encoding will be found. Invalid + /// UTF-8 sequences will not be matched. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::{B, ByteSlice}; + /// + /// assert_eq!(Some(10), b"foo bar baz".find_char('z')); + /// assert_eq!(Some(4), B("αβγγδ").find_char('γ')); + /// assert_eq!(None, b"foo bar baz".find_char('y')); + /// ``` + #[inline] + fn find_char(&self, ch: char) -> Option<usize> { + self.find(ch.encode_utf8(&mut [0; 4])) + } + + /// Returns the index of the last occurrence of the given codepoint. + /// If the codepoint does not occur in this byte string, then `None` is + /// returned. + /// + /// Note that if one searches for the replacement codepoint, `\u{FFFD}`, + /// then only explicit occurrences of that encoding will be found. Invalid + /// UTF-8 sequences will not be matched. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::{B, ByteSlice}; + /// + /// assert_eq!(Some(10), b"foo bar baz".rfind_char('z')); + /// assert_eq!(Some(6), B("αβγγδ").rfind_char('γ')); + /// assert_eq!(None, b"foo bar baz".rfind_char('y')); + /// ``` + #[inline] + fn rfind_char(&self, ch: char) -> Option<usize> { + self.rfind(ch.encode_utf8(&mut [0; 4])) + } + + /// Returns an iterator over the fields in a byte string, separated by + /// contiguous whitespace.
+ /// + /// # Example + /// + /// Basic usage: + /// + /// ``` + /// use bstr::{B, ByteSlice}; + /// + /// let s = B(" foo\tbar\t\u{2003}\nquux \n"); + /// let fields: Vec<&[u8]> = s.fields().collect(); + /// assert_eq!(fields, vec![B("foo"), B("bar"), B("quux")]); + /// ``` + /// + /// A byte string consisting of just whitespace yields no elements: + /// + /// ``` + /// use bstr::{B, ByteSlice}; + /// + /// assert_eq!(0, B(" \n\t\u{2003}\n \t").fields().count()); + /// ``` + #[inline] + fn fields(&self) -> Fields { + Fields::new(self.as_bytes()) + } + + /// Returns an iterator over the fields in a byte string, separated by + /// contiguous codepoints satisfying the given predicate. + /// + /// If this byte string is not valid UTF-8, then the given closure will + /// be called with a Unicode replacement codepoint when invalid UTF-8 + /// bytes are seen. + /// + /// # Example + /// + /// Basic usage: + /// + /// ``` + /// use bstr::{B, ByteSlice}; + /// + /// let s = b"123foo999999bar1quux123456"; + /// let fields: Vec<&[u8]> = s.fields_with(|c| c.is_numeric()).collect(); + /// assert_eq!(fields, vec![B("foo"), B("bar"), B("quux")]); + /// ``` + /// + /// A byte string consisting of all codepoints satisfying the predicate + /// yields no elements: + /// + /// ``` + /// use bstr::ByteSlice; + /// + /// assert_eq!(0, b"1911354563".fields_with(|c| c.is_numeric()).count()); + /// ``` + #[inline] + fn fields_with<F: FnMut(char) -> bool>(&self, f: F) -> FieldsWith<F> { + FieldsWith::new(self.as_bytes(), f) + } + + /// Returns an iterator over substrings of this byte string, separated + /// by the given byte string. Each element yielded is guaranteed not to + /// include the splitter substring. + /// + /// The splitter may be any type that can be cheaply converted into a + /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::{B, ByteSlice}; + /// + /// let x: Vec<&[u8]> = b"Mary had a little lamb".split_str(" ").collect(); + /// assert_eq!(x, vec![ + /// B("Mary"), B("had"), B("a"), B("little"), B("lamb"), + /// ]); + /// + /// let x: Vec<&[u8]> = b"".split_str("X").collect(); + /// assert_eq!(x, vec![b""]); + /// + /// let x: Vec<&[u8]> = b"lionXXtigerXleopard".split_str("X").collect(); + /// assert_eq!(x, vec![B("lion"), B(""), B("tiger"), B("leopard")]); + /// + /// let x: Vec<&[u8]> = b"lion::tiger::leopard".split_str("::").collect(); + /// assert_eq!(x, vec![B("lion"), B("tiger"), B("leopard")]); + /// ``` + /// + /// If a string contains multiple contiguous separators, you will end up + /// with empty strings yielded by the iterator: + /// + /// ``` + /// use bstr::{B, ByteSlice}; + /// + /// let x: Vec<&[u8]> = b"||||a||b|c".split_str("|").collect(); + /// assert_eq!(x, vec![ + /// B(""), B(""), B(""), B(""), B("a"), B(""), B("b"), B("c"), + /// ]); + /// + /// let x: Vec<&[u8]> = b"(///)".split_str("/").collect(); + /// assert_eq!(x, vec![B("("), B(""), B(""), B(")")]); + /// ``` + /// + /// Separators at the start or end of a string are neighbored by empty + /// strings. + /// + /// ``` + /// use bstr::{B, ByteSlice}; + /// + /// let x: Vec<&[u8]> = b"010".split_str("0").collect(); + /// assert_eq!(x, vec![B(""), B("1"), B("")]); + /// ``` + /// + /// When the empty string is used as a separator, it splits every **byte** + /// in the byte string, along with the beginning and end of the byte + /// string. + /// + /// ``` + /// use bstr::{B, ByteSlice}; + /// + /// let x: Vec<&[u8]> = b"rust".split_str("").collect(); + /// assert_eq!(x, vec![ + /// B(""), B("r"), B("u"), B("s"), B("t"), B(""), + /// ]); + /// + /// // Splitting by an empty string is not UTF-8 aware. Elements yielded + /// // may not be valid UTF-8! 
+ /// let x: Vec<&[u8]> = B("☃").split_str("").collect(); + /// assert_eq!(x, vec![ + /// B(""), B(b"\xE2"), B(b"\x98"), B(b"\x83"), B(""), + /// ]); + /// ``` + /// + /// Contiguous separators, especially whitespace, can lead to possibly + /// surprising behavior. For example, this code is correct: + /// + /// ``` + /// use bstr::{B, ByteSlice}; + /// + /// let x: Vec<&[u8]> = b"    a  b c".split_str(" ").collect(); + /// assert_eq!(x, vec![ + /// B(""), B(""), B(""), B(""), B("a"), B(""), B("b"), B("c"), + /// ]); + /// ``` + /// + /// It does *not* give you `["a", "b", "c"]`. For that behavior, use + /// [`fields`](#method.fields) instead. + #[inline] + fn split_str<'a, B: ?Sized + AsRef<[u8]>>( + &'a self, + splitter: &'a B, + ) -> Split<'a> { + Split::new(self.as_bytes(), splitter.as_ref()) + } + + /// Returns an iterator over substrings of this byte string, separated by + /// the given byte string, in reverse. Each element yielded is guaranteed + /// not to include the splitter substring. + /// + /// The splitter may be any type that can be cheaply converted into a + /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::{B, ByteSlice}; + /// + /// let x: Vec<&[u8]> = + /// b"Mary had a little lamb".rsplit_str(" ").collect(); + /// assert_eq!(x, vec![ + /// B("lamb"), B("little"), B("a"), B("had"), B("Mary"), + /// ]); + /// + /// let x: Vec<&[u8]> = b"".rsplit_str("X").collect(); + /// assert_eq!(x, vec![b""]); + /// + /// let x: Vec<&[u8]> = b"lionXXtigerXleopard".rsplit_str("X").collect(); + /// assert_eq!(x, vec![B("leopard"), B("tiger"), B(""), B("lion")]); + /// + /// let x: Vec<&[u8]> = b"lion::tiger::leopard".rsplit_str("::").collect(); + /// assert_eq!(x, vec![B("leopard"), B("tiger"), B("lion")]); + /// ``` + /// + /// If a string contains multiple contiguous separators, you will end up + /// with empty strings yielded by the iterator: + /// + /// ``` + /// use bstr::{B, ByteSlice}; + /// + /// let x: Vec<&[u8]> = b"||||a||b|c".rsplit_str("|").collect(); + /// assert_eq!(x, vec![ + /// B("c"), B("b"), B(""), B("a"), B(""), B(""), B(""), B(""), + /// ]); + /// + /// let x: Vec<&[u8]> = b"(///)".rsplit_str("/").collect(); + /// assert_eq!(x, vec![B(")"), B(""), B(""), B("(")]); + /// ``` + /// + /// Separators at the start or end of a string are neighbored by empty + /// strings. + /// + /// ``` + /// use bstr::{B, ByteSlice}; + /// + /// let x: Vec<&[u8]> = b"010".rsplit_str("0").collect(); + /// assert_eq!(x, vec![B(""), B("1"), B("")]); + /// ``` + /// + /// When the empty string is used as a separator, it splits every **byte** + /// in the byte string, along with the beginning and end of the byte + /// string. + /// + /// ``` + /// use bstr::{B, ByteSlice}; + /// + /// let x: Vec<&[u8]> = b"rust".rsplit_str("").collect(); + /// assert_eq!(x, vec![ + /// B(""), B("t"), B("s"), B("u"), B("r"), B(""), + /// ]); + /// + /// // Splitting by an empty string is not UTF-8 aware. Elements yielded + /// // may not be valid UTF-8! 
+ /// let x: Vec<&[u8]> = B("☃").rsplit_str("").collect(); + /// assert_eq!(x, vec![B(""), B(b"\x83"), B(b"\x98"), B(b"\xE2"), B("")]); + /// ``` + /// + /// Contiguous separators, especially whitespace, can lead to possibly + /// surprising behavior. For example, this code is correct: + /// + /// ``` + /// use bstr::{B, ByteSlice}; + /// + /// let x: Vec<&[u8]> = b"    a  b c".rsplit_str(" ").collect(); + /// assert_eq!(x, vec![ + /// B("c"), B("b"), B(""), B("a"), B(""), B(""), B(""), B(""), + /// ]); + /// ``` + /// + /// It does *not* give you `["a", "b", "c"]`. + #[inline] + fn rsplit_str<'a, B: ?Sized + AsRef<[u8]>>( + &'a self, + splitter: &'a B, + ) -> SplitReverse<'a> { + SplitReverse::new(self.as_bytes(), splitter.as_ref()) + } + + /// Returns an iterator of at most `limit` substrings of this byte string, + /// separated by the given byte string. If `limit` substrings are yielded, + /// then the last substring will contain the remainder of this byte string. + /// + /// The needle may be any type that can be cheaply converted into a + /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::{B, ByteSlice}; + /// + /// let x: Vec<_> = b"Mary had a little lamb".splitn_str(3, " ").collect(); + /// assert_eq!(x, vec![B("Mary"), B("had"), B("a little lamb")]); + /// + /// let x: Vec<_> = b"".splitn_str(3, "X").collect(); + /// assert_eq!(x, vec![b""]); + /// + /// let x: Vec<_> = b"lionXXtigerXleopard".splitn_str(3, "X").collect(); + /// assert_eq!(x, vec![B("lion"), B(""), B("tigerXleopard")]); + /// + /// let x: Vec<_> = b"lion::tiger::leopard".splitn_str(2, "::").collect(); + /// assert_eq!(x, vec![B("lion"), B("tiger::leopard")]); + /// + /// let x: Vec<_> = b"abcXdef".splitn_str(1, "X").collect(); + /// assert_eq!(x, vec![B("abcXdef")]); + /// + /// let x: Vec<_> = b"abcXdef".splitn_str(0, "X").collect(); + /// assert!(x.is_empty()); + /// ``` + #[inline] + fn splitn_str<'a, B: ?Sized + AsRef<[u8]>>( + &'a self, + limit: usize, + splitter: &'a B, + ) -> SplitN<'a> { + SplitN::new(self.as_bytes(), splitter.as_ref(), limit) + } + + /// Returns an iterator of at most `limit` substrings of this byte string, + /// separated by the given byte string, in reverse. If `limit` substrings + /// are yielded, then the last substring will contain the remainder of this + /// byte string. + /// + /// The needle may be any type that can be cheaply converted into a + /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`. 
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::{B, ByteSlice}; + /// + /// let x: Vec<_> = + /// b"Mary had a little lamb".rsplitn_str(3, " ").collect(); + /// assert_eq!(x, vec![B("lamb"), B("little"), B("Mary had a")]); + /// + /// let x: Vec<_> = b"".rsplitn_str(3, "X").collect(); + /// assert_eq!(x, vec![b""]); + /// + /// let x: Vec<_> = b"lionXXtigerXleopard".rsplitn_str(3, "X").collect(); + /// assert_eq!(x, vec![B("leopard"), B("tiger"), B("lionX")]); + /// + /// let x: Vec<_> = b"lion::tiger::leopard".rsplitn_str(2, "::").collect(); + /// assert_eq!(x, vec![B("leopard"), B("lion::tiger")]); + /// + /// let x: Vec<_> = b"abcXdef".rsplitn_str(1, "X").collect(); + /// assert_eq!(x, vec![B("abcXdef")]); + /// + /// let x: Vec<_> = b"abcXdef".rsplitn_str(0, "X").collect(); + /// assert!(x.is_empty()); + /// ``` + #[inline] + fn rsplitn_str<'a, B: ?Sized + AsRef<[u8]>>( + &'a self, + limit: usize, + splitter: &'a B, + ) -> SplitNReverse<'a> { + SplitNReverse::new(self.as_bytes(), splitter.as_ref(), limit) + } + + /// Replace all matches of the given needle with the given replacement, and + /// return the result as a new `Vec<u8>`. + /// + /// This routine is useful as a convenience. If you need to reuse an + /// allocation, use [`replace_into`](#method.replace_into) instead.
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::ByteSlice; + /// + /// let s = b"this is old".replace("old", "new"); + /// assert_eq!(s, "this is new".as_bytes()); + /// ``` + /// + /// When the pattern doesn't match: + /// + /// ``` + /// use bstr::ByteSlice; + /// + /// let s = b"this is old".replace("nada nada", "limonada"); + /// assert_eq!(s, "this is old".as_bytes()); + /// ``` + /// + /// When the needle is an empty string: + /// + /// ``` + /// use bstr::ByteSlice; + /// + /// let s = b"foo".replace("", "Z"); + /// assert_eq!(s, "ZfZoZoZ".as_bytes()); + /// ``` + #[cfg(feature = "std")] + #[inline] + fn replace<N: AsRef<[u8]>, R: AsRef<[u8]>>( + &self, + needle: N, + replacement: R, + ) -> Vec<u8> { + let mut dest = Vec::with_capacity(self.as_bytes().len()); + self.replace_into(needle, replacement, &mut dest); + dest + } + + /// Replace up to `limit` matches of the given needle with the given + /// replacement, and return the result as a new `Vec<u8>`. + /// + /// This routine is useful as a convenience. If you need to reuse an + /// allocation, use [`replacen_into`](#method.replacen_into) instead.
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::ByteSlice; + /// + /// let s = b"foofoo".replacen("o", "z", 2); + /// assert_eq!(s, "fzzfoo".as_bytes()); + /// ``` + /// + /// When the pattern doesn't match: + /// + /// ``` + /// use bstr::ByteSlice; + /// + /// let s = b"foofoo".replacen("a", "z", 2); + /// assert_eq!(s, "foofoo".as_bytes()); + /// ``` + /// + /// When the needle is an empty string: + /// + /// ``` + /// use bstr::ByteSlice; + /// + /// let s = b"foo".replacen("", "Z", 2); + /// assert_eq!(s, "ZfZoo".as_bytes()); + /// ``` + #[cfg(feature = "std")] + #[inline] + fn replacen<N: AsRef<[u8]>, R: AsRef<[u8]>>( + &self, + needle: N, + replacement: R, + limit: usize, + ) -> Vec<u8> { + let mut dest = Vec::with_capacity(self.as_bytes().len()); + self.replacen_into(needle, replacement, limit, &mut dest); + dest + } + + /// Replace all matches of the given needle with the given replacement, + /// and write the result into the provided `Vec<u8>`. + /// + /// This does **not** clear `dest` before writing to it. + /// + /// This routine is useful for reusing allocation. For a more convenient + /// API, use [`replace`](#method.replace) instead.
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::ByteSlice; + /// + /// let s = b"this is old"; + /// + /// let mut dest = vec![]; + /// s.replace_into("old", "new", &mut dest); + /// assert_eq!(dest, "this is new".as_bytes()); + /// ``` + /// + /// When the pattern doesn't match: + /// + /// ``` + /// use bstr::ByteSlice; + /// + /// let s = b"this is old"; + /// + /// let mut dest = vec![]; + /// s.replace_into("nada nada", "limonada", &mut dest); + /// assert_eq!(dest, "this is old".as_bytes()); + /// ``` + /// + /// When the needle is an empty string: + /// + /// ``` + /// use bstr::ByteSlice; + /// + /// let s = b"foo"; + /// + /// let mut dest = vec![]; + /// s.replace_into("", "Z", &mut dest); + /// assert_eq!(dest, "ZfZoZoZ".as_bytes()); + /// ``` + #[cfg(feature = "std")] + #[inline] + fn replace_into<N: AsRef<[u8]>, R: AsRef<[u8]>>( + &self, + needle: N, + replacement: R, + dest: &mut Vec<u8>, + ) { + let (needle, replacement) = (needle.as_ref(), replacement.as_ref()); + + let mut last = 0; + for start in self.find_iter(needle) { + dest.push_str(&self.as_bytes()[last..start]); + dest.push_str(replacement); + last = start + needle.len(); + } + dest.push_str(&self.as_bytes()[last..]); + } + + /// Replace up to `limit` matches of the given needle with the given + /// replacement, and write the result into the provided `Vec<u8>`. + /// + /// This does **not** clear `dest` before writing to it. + /// + /// This routine is useful for reusing allocation. For a more convenient + /// API, use [`replacen`](#method.replacen) instead.
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::ByteSlice; + /// + /// let s = b"foofoo"; + /// + /// let mut dest = vec![]; + /// s.replacen_into("o", "z", 2, &mut dest); + /// assert_eq!(dest, "fzzfoo".as_bytes()); + /// ``` + /// + /// When the pattern doesn't match: + /// + /// ``` + /// use bstr::ByteSlice; + /// + /// let s = b"foofoo"; + /// + /// let mut dest = vec![]; + /// s.replacen_into("a", "z", 2, &mut dest); + /// assert_eq!(dest, "foofoo".as_bytes()); + /// ``` + /// + /// When the needle is an empty string: + /// + /// ``` + /// use bstr::ByteSlice; + /// + /// let s = b"foo"; + /// + /// let mut dest = vec![]; + /// s.replacen_into("", "Z", 2, &mut dest); + /// assert_eq!(dest, "ZfZoo".as_bytes()); + /// ``` + #[cfg(feature = "std")] + #[inline] + fn replacen_into<N: AsRef<[u8]>, R: AsRef<[u8]>>( + &self, + needle: N, + replacement: R, + limit: usize, + dest: &mut Vec<u8>, + ) { + let (needle, replacement) = (needle.as_ref(), replacement.as_ref()); + + let mut last = 0; + for start in self.find_iter(needle).take(limit) { + dest.push_str(&self.as_bytes()[last..start]); + dest.push_str(replacement); + last = start + needle.len(); + } + dest.push_str(&self.as_bytes()[last..]); + } + + /// Returns an iterator over the bytes in this byte string. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::ByteSlice; + /// + /// let bs = b"foobar"; + /// let bytes: Vec<u8> = bs.bytes().collect(); + /// assert_eq!(bytes, bs); + /// ``` + #[inline] + fn bytes(&self) -> Bytes { + Bytes { it: self.as_bytes().iter() } + } + + /// Returns an iterator over the Unicode scalar values in this byte string. + /// If invalid UTF-8 is encountered, then the Unicode replacement codepoint + /// is yielded instead.
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::ByteSlice; + /// + /// let bs = b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61"; + /// let chars: Vec<char> = bs.chars().collect(); + /// assert_eq!(vec!['☃', '\u{FFFD}', '𝞃', '\u{FFFD}', 'a'], chars); + /// ``` + /// + /// Codepoints can also be iterated over in reverse: + /// + /// ``` + /// use bstr::ByteSlice; + /// + /// let bs = b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61"; + /// let chars: Vec<char> = bs.chars().rev().collect(); + /// assert_eq!(vec!['a', '\u{FFFD}', '𝞃', '\u{FFFD}', '☃'], chars); + /// ``` + #[inline] + fn chars(&self) -> Chars { + Chars::new(self.as_bytes()) + } + + /// Returns an iterator over the Unicode scalar values in this byte string + /// along with their starting and ending byte index positions. If invalid + /// UTF-8 is encountered, then the Unicode replacement codepoint is yielded + /// instead. + /// + /// Note that this is slightly different from the `CharIndices` iterator + /// provided by the standard library. Aside from working on possibly + /// invalid UTF-8, this iterator provides both the corresponding starting + /// and ending byte indices of each codepoint yielded. The ending position + /// is necessary to slice the original byte string when invalid UTF-8 bytes + /// are converted into a Unicode replacement codepoint, since a single + /// replacement codepoint can substitute anywhere from 1 to 3 invalid bytes + /// (inclusive).
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::ByteSlice; + /// + /// let bs = b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61"; + /// let chars: Vec<(usize, usize, char)> = bs.char_indices().collect(); + /// assert_eq!(chars, vec![ + /// (0, 3, '☃'), + /// (3, 4, '\u{FFFD}'), + /// (4, 8, '𝞃'), + /// (8, 10, '\u{FFFD}'), + /// (10, 11, 'a'), + /// ]); + /// ``` + /// + /// Codepoints can also be iterated over in reverse: + /// + /// ``` + /// use bstr::ByteSlice; + /// + /// let bs = b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61"; + /// let chars: Vec<(usize, usize, char)> = bs + /// .char_indices() + /// .rev() + /// .collect(); + /// assert_eq!(chars, vec![ + /// (10, 11, 'a'), + /// (8, 10, '\u{FFFD}'), + /// (4, 8, '𝞃'), + /// (3, 4, '\u{FFFD}'), + /// (0, 3, '☃'), + /// ]); + /// ``` + #[inline] + fn char_indices(&self) -> CharIndices { + CharIndices::new(self.as_bytes()) + } + + /// Returns an iterator over the grapheme clusters in this byte string. + /// If invalid UTF-8 is encountered, then the Unicode replacement codepoint + /// is yielded instead. 
+ /// + /// # Examples + /// + /// This example shows how multiple codepoints can combine to form a + /// single grapheme cluster: + /// + /// ``` + /// use bstr::ByteSlice; + /// + /// let bs = "a\u{0300}\u{0316}\u{1F1FA}\u{1F1F8}".as_bytes(); + /// let graphemes: Vec<&str> = bs.graphemes().collect(); + /// assert_eq!(vec!["à̖", "🇺🇸"], graphemes); + /// ``` + /// + /// This shows that graphemes can be iterated over in reverse: + /// + /// ``` + /// use bstr::ByteSlice; + /// + /// let bs = "a\u{0300}\u{0316}\u{1F1FA}\u{1F1F8}".as_bytes(); + /// let graphemes: Vec<&str> = bs.graphemes().rev().collect(); + /// assert_eq!(vec!["🇺🇸", "à̖"], graphemes); + /// ``` + #[cfg(feature = "unicode")] + #[inline] + fn graphemes(&self) -> Graphemes { + Graphemes::new(self.as_bytes()) + } + + /// Returns an iterator over the grapheme clusters in this byte string + /// along with their starting and ending byte index positions. If invalid + /// UTF-8 is encountered, then the Unicode replacement codepoint is yielded + /// instead. + /// + /// # Examples + /// + /// This example shows how to get the byte offsets of each individual + /// grapheme cluster: + /// + /// ``` + /// use bstr::ByteSlice; + /// + /// let bs = "a\u{0300}\u{0316}\u{1F1FA}\u{1F1F8}".as_bytes(); + /// let graphemes: Vec<(usize, usize, &str)> = + /// bs.grapheme_indices().collect(); + /// assert_eq!(vec![(0, 5, "à̖"), (5, 13, "🇺🇸")], graphemes); + /// ``` + /// + /// This example shows what happens when invalid UTF-8 is encountered. Note + /// that the offsets are valid indices into the original string, and do + /// not necessarily correspond to the length of the `&str` returned!
+ /// + /// ``` + /// use bstr::{ByteSlice, ByteVec}; + /// + /// let mut bytes = vec![]; + /// bytes.push_str("a\u{0300}\u{0316}"); + /// bytes.push(b'\xFF'); + /// bytes.push_str("\u{1F1FA}\u{1F1F8}"); + /// + /// let graphemes: Vec<(usize, usize, &str)> = + /// bytes.grapheme_indices().collect(); + /// assert_eq!( + /// graphemes, + /// vec![(0, 5, "à̖"), (5, 6, "\u{FFFD}"), (6, 14, "🇺🇸")] + /// ); + /// ``` + #[cfg(feature = "unicode")] + #[inline] + fn grapheme_indices(&self) -> GraphemeIndices { + GraphemeIndices::new(self.as_bytes()) + } + + /// Returns an iterator over the words in this byte string. If invalid + /// UTF-8 is encountered, then the Unicode replacement codepoint is yielded + /// instead. + /// + /// This is similar to + /// [`words_with_breaks`](trait.ByteSlice.html#method.words_with_breaks), + /// except it only returns elements that contain a "word" character. A word + /// character is defined by UTS #18 (Annex C) to be the combination of the + /// `Alphabetic` and `Join_Control` properties, along with the + /// `Decimal_Number`, `Mark` and `Connector_Punctuation` general + /// categories. + /// + /// Since words are made up of one or more codepoints, this iterator + /// yields `&str` elements. When invalid UTF-8 is encountered, replacement + /// codepoints are [substituted](index.html#handling-of-invalid-utf-8). + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::ByteSlice; + /// + /// let bs = br#"The quick ("brown") fox can't jump 32.3 feet, right?"#; + /// let words: Vec<&str> = bs.words().collect(); + /// assert_eq!(words, vec![ + /// "The", "quick", "brown", "fox", "can't", + /// "jump", "32.3", "feet", "right", + /// ]); + /// ``` + #[cfg(feature = "unicode")] + #[inline] + fn words(&self) -> Words { + Words::new(self.as_bytes()) + } + + /// Returns an iterator over the words in this byte string along with + /// their starting and ending byte index positions. 
+ /// + /// This is similar to + /// [`words_with_break_indices`](trait.ByteSlice.html#method.words_with_break_indices), + /// except it only returns elements that contain a "word" character. A word + /// character is defined by UTS #18 (Annex C) to be the combination of the + /// `Alphabetic` and `Join_Control` properties, along with the + /// `Decimal_Number`, `Mark` and `Connector_Punctuation` general + /// categories. + /// + /// Since words are made up of one or more codepoints, this iterator + /// yields `&str` elements. When invalid UTF-8 is encountered, replacement + /// codepoints are [substituted](index.html#handling-of-invalid-utf-8). + /// + /// # Examples + /// + /// This example shows how to get the byte offsets of each individual + /// word: + /// + /// ``` + /// use bstr::ByteSlice; + /// + /// let bs = b"can't jump 32.3 feet"; + /// let words: Vec<(usize, usize, &str)> = bs.word_indices().collect(); + /// assert_eq!(words, vec![ + /// (0, 5, "can't"), + /// (6, 10, "jump"), + /// (11, 15, "32.3"), + /// (16, 20, "feet"), + /// ]); + /// ``` + #[cfg(feature = "unicode")] + #[inline] + fn word_indices(&self) -> WordIndices { + WordIndices::new(self.as_bytes()) + } + + /// Returns an iterator over the words in this byte string, along with + /// all breaks between the words. Concatenating all elements yielded by + /// the iterator results in the original string (modulo Unicode replacement + /// codepoint substitutions if invalid UTF-8 is encountered). + /// + /// Since words are made up of one or more codepoints, this iterator + /// yields `&str` elements. When invalid UTF-8 is encountered, replacement + /// codepoints are [substituted](index.html#handling-of-invalid-utf-8). 
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::ByteSlice; + /// + /// let bs = br#"The quick ("brown") fox can't jump 32.3 feet, right?"#; + /// let words: Vec<&str> = bs.words_with_breaks().collect(); + /// assert_eq!(words, vec![ + /// "The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", + /// " ", "fox", " ", "can't", " ", "jump", " ", "32.3", " ", "feet", + /// ",", " ", "right", "?", + /// ]); + /// ``` + #[cfg(feature = "unicode")] + #[inline] + fn words_with_breaks(&self) -> WordsWithBreaks { + WordsWithBreaks::new(self.as_bytes()) + } + + /// Returns an iterator over the words and their byte offsets in this + /// byte string, along with all breaks between the words. Concatenating + /// all elements yielded by the iterator results in the original string + /// (modulo Unicode replacement codepoint substitutions if invalid UTF-8 is + /// encountered). + /// + /// Since words are made up of one or more codepoints, this iterator + /// yields `&str` elements. When invalid UTF-8 is encountered, replacement + /// codepoints are [substituted](index.html#handling-of-invalid-utf-8). + /// + /// # Examples + /// + /// This example shows how to get the byte offsets of each individual + /// word: + /// + /// ``` + /// use bstr::ByteSlice; + /// + /// let bs = b"can't jump 32.3 feet"; + /// let words: Vec<(usize, usize, &str)> = + /// bs.words_with_break_indices().collect(); + /// assert_eq!(words, vec![ + /// (0, 5, "can't"), + /// (5, 6, " "), + /// (6, 10, "jump"), + /// (10, 11, " "), + /// (11, 15, "32.3"), + /// (15, 16, " "), + /// (16, 20, "feet"), + /// ]); + /// ``` + #[cfg(feature = "unicode")] + #[inline] + fn words_with_break_indices(&self) -> WordsWithBreakIndices { + WordsWithBreakIndices::new(self.as_bytes()) + } + + /// Returns an iterator over the sentences in this byte string. + /// + /// Typically, a sentence will include its trailing punctuation and + /// whitespace. 
Concatenating all elements yielded by the iterator + /// results in the original string (modulo Unicode replacement codepoint + /// substitutions if invalid UTF-8 is encountered). + /// + /// Since sentences are made up of one or more codepoints, this iterator + /// yields `&str` elements. When invalid UTF-8 is encountered, replacement + /// codepoints are [substituted](index.html#handling-of-invalid-utf-8). + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::ByteSlice; + /// + /// let bs = b"I want this. Not that. Right now."; + /// let sentences: Vec<&str> = bs.sentences().collect(); + /// assert_eq!(sentences, vec![ + /// "I want this. ", + /// "Not that. ", + /// "Right now.", + /// ]); + /// ``` + #[cfg(feature = "unicode")] + #[inline] + fn sentences(&self) -> Sentences { + Sentences::new(self.as_bytes()) + } + + /// Returns an iterator over the sentences in this byte string along with + /// their starting and ending byte index positions. + /// + /// Typically, a sentence will include its trailing punctuation and + /// whitespace. Concatenating all elements yielded by the iterator + /// results in the original string (modulo Unicode replacement codepoint + /// substitutions if invalid UTF-8 is encountered). + /// + /// Since sentences are made up of one or more codepoints, this iterator + /// yields `&str` elements. When invalid UTF-8 is encountered, replacement + /// codepoints are [substituted](index.html#handling-of-invalid-utf-8). + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::ByteSlice; + /// + /// let bs = b"I want this. Not that. Right now."; + /// let sentences: Vec<(usize, usize, &str)> = + /// bs.sentence_indices().collect(); + /// assert_eq!(sentences, vec![ + /// (0, 13, "I want this. "), + /// (13, 23, "Not that. 
"), + /// (23, 33, "Right now."), + /// ]); + /// ``` + #[cfg(feature = "unicode")] + #[inline] + fn sentence_indices(&self) -> SentenceIndices { + SentenceIndices::new(self.as_bytes()) + } + + /// An iterator over all lines in a byte string, without their + /// terminators. + /// + /// For this iterator, the only line terminators recognized are `\r\n` and + /// `\n`. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::{B, ByteSlice}; + /// + /// let s = b"\ + /// foo + /// + /// bar\r + /// baz + /// + /// + /// quux"; + /// let lines: Vec<&[u8]> = s.lines().collect(); + /// assert_eq!(lines, vec![ + /// B("foo"), B(""), B("bar"), B("baz"), B(""), B(""), B("quux"), + /// ]); + /// ``` + #[inline] + fn lines(&self) -> Lines { + Lines::new(self.as_bytes()) + } + + /// An iterator over all lines in a byte string, including their + /// terminators. + /// + /// For this iterator, the only line terminator recognized is `\n`. (Since + /// line terminators are included, this also handles `\r\n` line endings.) + /// + /// Line terminators are only included if they are present in the original + /// byte string. For example, the last line in a byte string may not end + /// with a line terminator. + /// + /// Concatenating all elements yielded by this iterator is guaranteed to + /// yield the original byte string. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::{B, ByteSlice}; + /// + /// let s = b"\ + /// foo + /// + /// bar\r + /// baz + /// + /// + /// quux"; + /// let lines: Vec<&[u8]> = s.lines_with_terminator().collect(); + /// assert_eq!(lines, vec![ + /// B("foo\n"), + /// B("\n"), + /// B("bar\r\n"), + /// B("baz\n"), + /// B("\n"), + /// B("\n"), + /// B("quux"), + /// ]); + /// ``` + #[inline] + fn lines_with_terminator(&self) -> LinesWithTerminator { + LinesWithTerminator::new(self.as_bytes()) + } + + /// Return a byte string slice with leading and trailing whitespace + /// removed. 
+    ///
+    /// Whitespace is defined according to the terms of the `White_Space`
+    /// Unicode property.
+    ///
+    /// # Examples
+    ///
+    /// Basic usage:
+    ///
+    /// ```
+    /// use bstr::{B, ByteSlice};
+    ///
+    /// let s = B(" foo\tbar\t\u{2003}\n");
+    /// assert_eq!(s.trim(), B("foo\tbar"));
+    /// ```
+    #[cfg(feature = "unicode")]
+    #[inline]
+    fn trim(&self) -> &[u8] {
+        self.trim_start().trim_end()
+    }
+
+    /// Return a byte string slice with leading whitespace removed.
+    ///
+    /// Whitespace is defined according to the terms of the `White_Space`
+    /// Unicode property.
+    ///
+    /// # Examples
+    ///
+    /// Basic usage:
+    ///
+    /// ```
+    /// use bstr::{B, ByteSlice};
+    ///
+    /// let s = B(" foo\tbar\t\u{2003}\n");
+    /// assert_eq!(s.trim_start(), B("foo\tbar\t\u{2003}\n"));
+    /// ```
+    #[cfg(feature = "unicode")]
+    #[inline]
+    fn trim_start(&self) -> &[u8] {
+        // The helper reports how many leading bytes to skip.
+        let start = whitespace_len_fwd(self.as_bytes());
+        &self.as_bytes()[start..]
+    }
+
+    /// Return a byte string slice with trailing whitespace removed.
+    ///
+    /// Whitespace is defined according to the terms of the `White_Space`
+    /// Unicode property.
+    ///
+    /// # Examples
+    ///
+    /// Basic usage:
+    ///
+    /// ```
+    /// use bstr::{B, ByteSlice};
+    ///
+    /// let s = B(" foo\tbar\t\u{2003}\n");
+    /// assert_eq!(s.trim_end(), B(" foo\tbar"));
+    /// ```
+    #[cfg(feature = "unicode")]
+    #[inline]
+    fn trim_end(&self) -> &[u8] {
+        // The helper reports the offset at which trailing whitespace begins.
+        let end = whitespace_len_rev(self.as_bytes());
+        &self.as_bytes()[..end]
+    }
+
+    /// Return a byte string slice with leading and trailing characters
+    /// satisfying the given predicate removed.
+    ///
+    /// # Examples
+    ///
+    /// Basic usage:
+    ///
+    /// ```
+    /// use bstr::{B, ByteSlice};
+    ///
+    /// let s = b"123foo5bar789";
+    /// assert_eq!(s.trim_with(|c| c.is_numeric()), B("foo5bar"));
+    /// ```
+    #[inline]
+    fn trim_with<F: FnMut(char) -> bool>(&self, mut trim: F) -> &[u8] {
+        self.trim_start_with(&mut trim).trim_end_with(&mut trim)
+    }
+
+    /// Return a byte string slice with leading characters satisfying the given
+    /// predicate removed.
+    ///
+    /// If every character satisfies the predicate, an empty slice is
+    /// returned.
+    ///
+    /// # Examples
+    ///
+    /// Basic usage:
+    ///
+    /// ```
+    /// use bstr::{B, ByteSlice};
+    ///
+    /// let s = b"123foo5bar789";
+    /// assert_eq!(s.trim_start_with(|c| c.is_numeric()), B("foo5bar789"));
+    /// ```
+    #[inline]
+    fn trim_start_with<F: FnMut(char) -> bool>(
+        &self,
+        mut trim: F,
+    ) -> &[u8] {
+        // Keep everything from the start of the first character that the
+        // predicate rejects.
+        for (s, _, ch) in self.char_indices() {
+            if !trim(ch) {
+                return &self.as_bytes()[s..];
+            }
+        }
+        b""
+    }
+
+    /// Return a byte string slice with trailing characters satisfying the
+    /// given predicate removed.
+    ///
+    /// If every character satisfies the predicate, an empty slice is
+    /// returned.
+    ///
+    /// # Examples
+    ///
+    /// Basic usage:
+    ///
+    /// ```
+    /// use bstr::{B, ByteSlice};
+    ///
+    /// let s = b"123foo5bar789";
+    /// assert_eq!(s.trim_end_with(|c| c.is_numeric()), B("123foo5bar"));
+    /// ```
+    #[inline]
+    fn trim_end_with<F: FnMut(char) -> bool>(
+        &self,
+        mut trim: F,
+    ) -> &[u8] {
+        // Walk backwards and keep everything up to the end of the last
+        // character that the predicate rejects.
+        for (_, e, ch) in self.char_indices().rev() {
+            if !trim(ch) {
+                return &self.as_bytes()[..e];
+            }
+        }
+        b""
+    }
+
+    /// Returns a new `Vec<u8>` containing the lowercase equivalent of this
+    /// byte string.
+    ///
+    /// In this case, lowercase is defined according to the `Lowercase` Unicode
+    /// property.
+    ///
+    /// If invalid UTF-8 is seen, or if a character has no lowercase variant,
+    /// then it is written to the returned `Vec<u8>` unchanged.
+    ///
+    /// Note that some characters in this byte string may expand into multiple
+    /// characters when changing the case, so the number of bytes in the
+    /// returned `Vec<u8>` may not be equivalent to the number of bytes in
+    /// this byte string.
+    ///
+    /// If you'd like to reuse an allocation for performance reasons, then use
+    /// [`to_lowercase_into`](#method.to_lowercase_into) instead.
+    ///
+    /// # Examples
+    ///
+    /// Basic usage:
+    ///
+    /// ```
+    /// use bstr::{B, ByteSlice};
+    ///
+    /// let s = B("HELLO Β");
+    /// assert_eq!("hello β".as_bytes(), s.to_lowercase().as_bytes());
+    /// ```
+    ///
+    /// Scripts without case are not changed:
+    ///
+    /// ```
+    /// use bstr::{B, ByteSlice};
+    ///
+    /// let s = B("农历新年");
+    /// assert_eq!("农历新年".as_bytes(), s.to_lowercase().as_bytes());
+    /// ```
+    ///
+    /// Invalid UTF-8 remains as is:
+    ///
+    /// ```
+    /// use bstr::{B, ByteSlice};
+    ///
+    /// let s = B(b"FOO\xFFBAR\xE2\x98BAZ");
+    /// assert_eq!(B(b"foo\xFFbar\xE2\x98baz"), s.to_lowercase().as_bytes());
+    /// ```
+    #[cfg(all(feature = "std", feature = "unicode"))]
+    #[inline]
+    fn to_lowercase(&self) -> Vec<u8> {
+        let mut buf = vec![];
+        self.to_lowercase_into(&mut buf);
+        buf
+    }
+
+    /// Writes the lowercase equivalent of this byte string into the given
+    /// buffer. The buffer is not cleared before written to.
+    ///
+    /// In this case, lowercase is defined according to the `Lowercase`
+    /// Unicode property.
+    ///
+    /// If invalid UTF-8 is seen, or if a character has no lowercase variant,
+    /// then it is written to the given buffer unchanged.
+    ///
+    /// Note that some characters in this byte string may expand into multiple
+    /// characters when changing the case, so the number of bytes written to
+    /// the given byte string may not be equivalent to the number of bytes in
+    /// this byte string.
+    ///
+    /// If you don't need to amortize allocation and instead prefer
+    /// convenience, then use [`to_lowercase`](#method.to_lowercase) instead.
+    ///
+    /// # Examples
+    ///
+    /// Basic usage:
+    ///
+    /// ```
+    /// use bstr::{B, ByteSlice};
+    ///
+    /// let s = B("HELLO Β");
+    ///
+    /// let mut buf = vec![];
+    /// s.to_lowercase_into(&mut buf);
+    /// assert_eq!("hello β".as_bytes(), buf.as_bytes());
+    /// ```
+    ///
+    /// Scripts without case are not changed:
+    ///
+    /// ```
+    /// use bstr::{B, ByteSlice};
+    ///
+    /// let s = B("农历新年");
+    ///
+    /// let mut buf = vec![];
+    /// s.to_lowercase_into(&mut buf);
+    /// assert_eq!("农历新年".as_bytes(), buf.as_bytes());
+    /// ```
+    ///
+    /// Invalid UTF-8 remains as is:
+    ///
+    /// ```
+    /// use bstr::{B, ByteSlice};
+    ///
+    /// let s = B(b"FOO\xFFBAR\xE2\x98BAZ");
+    ///
+    /// let mut buf = vec![];
+    /// s.to_lowercase_into(&mut buf);
+    /// assert_eq!(B(b"foo\xFFbar\xE2\x98baz"), buf.as_bytes());
+    /// ```
+    #[cfg(all(feature = "std", feature = "unicode"))]
+    #[inline]
+    fn to_lowercase_into(&self, buf: &mut Vec<u8>) {
+        // TODO: This is the best we can do given what std exposes I think.
+        // If we roll our own case handling, then we might be able to do this
+        // a bit faster. We shouldn't roll our own case handling unless we
+        // need to, e.g., for doing caseless matching or case folding.
+
+        // TODO(BUG): This doesn't handle any special casing rules.
+
+        buf.reserve(self.as_bytes().len());
+        for (s, e, ch) in self.char_indices() {
+            if ch == '\u{FFFD}' {
+                // Copy the original bytes verbatim rather than the decoded
+                // replacement codepoint, so invalid UTF-8 passes through
+                // unchanged.
+                buf.push_str(&self.as_bytes()[s..e]);
+            } else if ch.is_ascii() {
+                buf.push_char(ch.to_ascii_lowercase());
+            } else {
+                for lower in ch.to_lowercase() {
+                    buf.push_char(lower);
+                }
+            }
+        }
+    }
+
+    /// Returns a new `Vec<u8>` containing the ASCII lowercase equivalent of
+    /// this byte string.
+    ///
+    /// In this case, lowercase is only defined in ASCII letters. Namely, the
+    /// letters `A-Z` are converted to `a-z`. All other bytes remain unchanged.
+    /// In particular, the length of the byte string returned is always
+    /// equivalent to the length of this byte string.
+    ///
+    /// If you'd like to reuse an allocation for performance reasons, then use
+    /// [`make_ascii_lowercase`](#method.make_ascii_lowercase) to perform
+    /// the conversion in place.
+    ///
+    /// # Examples
+    ///
+    /// Basic usage:
+    ///
+    /// ```
+    /// use bstr::{B, ByteSlice};
+    ///
+    /// let s = B("HELLO Β");
+    /// assert_eq!("hello Β".as_bytes(), s.to_ascii_lowercase().as_bytes());
+    /// ```
+    ///
+    /// Invalid UTF-8 remains as is:
+    ///
+    /// ```
+    /// use bstr::{B, ByteSlice};
+    ///
+    /// let s = B(b"FOO\xFFBAR\xE2\x98BAZ");
+    /// assert_eq!(s.to_ascii_lowercase(), B(b"foo\xFFbar\xE2\x98baz"));
+    /// ```
+    #[cfg(feature = "std")]
+    #[inline]
+    fn to_ascii_lowercase(&self) -> Vec<u8> {
+        self.as_bytes().to_ascii_lowercase()
+    }
+
+    /// Convert this byte string to its lowercase ASCII equivalent in place.
+    ///
+    /// In this case, lowercase is only defined in ASCII letters. Namely, the
+    /// letters `A-Z` are converted to `a-z`. All other bytes remain unchanged.
+    ///
+    /// If you don't need to do the conversion in
+    /// place and instead prefer convenience, then use
+    /// [`to_ascii_lowercase`](#method.to_ascii_lowercase) instead.
+    ///
+    /// # Examples
+    ///
+    /// Basic usage:
+    ///
+    /// ```
+    /// use bstr::ByteSlice;
+    ///
+    /// let mut s = <Vec<u8>>::from("HELLO Β");
+    /// s.make_ascii_lowercase();
+    /// assert_eq!(s, "hello Β".as_bytes());
+    /// ```
+    ///
+    /// Invalid UTF-8 remains as is:
+    ///
+    /// ```
+    /// use bstr::{B, ByteSlice, ByteVec};
+    ///
+    /// let mut s = <Vec<u8>>::from_slice(b"FOO\xFFBAR\xE2\x98BAZ");
+    /// s.make_ascii_lowercase();
+    /// assert_eq!(s, B(b"foo\xFFbar\xE2\x98baz"));
+    /// ```
+    #[inline]
+    fn make_ascii_lowercase(&mut self) {
+        self.as_bytes_mut().make_ascii_lowercase();
+    }
+
+    /// Returns a new `Vec<u8>` containing the uppercase equivalent of this
+    /// byte string.
+    ///
+    /// In this case, uppercase is defined according to the `Uppercase`
+    /// Unicode property.
+    ///
+    /// If invalid UTF-8 is seen, or if a character has no uppercase variant,
+    /// then it is written to the returned `Vec<u8>` unchanged.
+    ///
+    /// Note that some characters in this byte string may expand into multiple
+    /// characters when changing the case, so the number of bytes in the
+    /// returned `Vec<u8>` may not be equivalent to the number of bytes in
+    /// this byte string.
+    ///
+    /// If you'd like to reuse an allocation for performance reasons, then use
+    /// [`to_uppercase_into`](#method.to_uppercase_into) instead.
+    ///
+    /// # Examples
+    ///
+    /// Basic usage:
+    ///
+    /// ```
+    /// use bstr::{B, ByteSlice};
+    ///
+    /// let s = B("hello β");
+    /// assert_eq!(s.to_uppercase(), B("HELLO Β"));
+    /// ```
+    ///
+    /// Scripts without case are not changed:
+    ///
+    /// ```
+    /// use bstr::{B, ByteSlice};
+    ///
+    /// let s = B("农历新年");
+    /// assert_eq!(s.to_uppercase(), B("农历新年"));
+    /// ```
+    ///
+    /// Invalid UTF-8 remains as is:
+    ///
+    /// ```
+    /// use bstr::{B, ByteSlice};
+    ///
+    /// let s = B(b"foo\xFFbar\xE2\x98baz");
+    /// assert_eq!(s.to_uppercase(), B(b"FOO\xFFBAR\xE2\x98BAZ"));
+    /// ```
+    #[cfg(all(feature = "std", feature = "unicode"))]
+    #[inline]
+    fn to_uppercase(&self) -> Vec<u8> {
+        let mut buf = vec![];
+        self.to_uppercase_into(&mut buf);
+        buf
+    }
+
+    /// Writes the uppercase equivalent of this byte string into the given
+    /// buffer. The buffer is not cleared before written to.
+    ///
+    /// In this case, uppercase is defined according to the `Uppercase`
+    /// Unicode property.
+    ///
+    /// If invalid UTF-8 is seen, or if a character has no uppercase variant,
+    /// then it is written to the given buffer unchanged.
+    ///
+    /// Note that some characters in this byte string may expand into multiple
+    /// characters when changing the case, so the number of bytes written to
+    /// the given byte string may not be equivalent to the number of bytes in
+    /// this byte string.
+    ///
+    /// If you don't need to amortize allocation and instead prefer
+    /// convenience, then use [`to_uppercase`](#method.to_uppercase) instead.
+    ///
+    /// # Examples
+    ///
+    /// Basic usage:
+    ///
+    /// ```
+    /// use bstr::{B, ByteSlice};
+    ///
+    /// let s = B("hello β");
+    ///
+    /// let mut buf = vec![];
+    /// s.to_uppercase_into(&mut buf);
+    /// assert_eq!(buf, B("HELLO Β"));
+    /// ```
+    ///
+    /// Scripts without case are not changed:
+    ///
+    /// ```
+    /// use bstr::{B, ByteSlice};
+    ///
+    /// let s = B("农历新年");
+    ///
+    /// let mut buf = vec![];
+    /// s.to_uppercase_into(&mut buf);
+    /// assert_eq!(buf, B("农历新年"));
+    /// ```
+    ///
+    /// Invalid UTF-8 remains as is:
+    ///
+    /// ```
+    /// use bstr::{B, ByteSlice};
+    ///
+    /// let s = B(b"foo\xFFbar\xE2\x98baz");
+    ///
+    /// let mut buf = vec![];
+    /// s.to_uppercase_into(&mut buf);
+    /// assert_eq!(buf, B(b"FOO\xFFBAR\xE2\x98BAZ"));
+    /// ```
+    #[cfg(all(feature = "std", feature = "unicode"))]
+    #[inline]
+    fn to_uppercase_into(&self, buf: &mut Vec<u8>) {
+        // TODO: This is the best we can do given what std exposes I think.
+        // If we roll our own case handling, then we might be able to do this
+        // a bit faster. We shouldn't roll our own case handling unless we
+        // need to, e.g., for doing caseless matching or case folding.
+        buf.reserve(self.as_bytes().len());
+        for (s, e, ch) in self.char_indices() {
+            if ch == '\u{FFFD}' {
+                // Copy the original bytes verbatim rather than the decoded
+                // replacement codepoint, so invalid UTF-8 passes through
+                // unchanged.
+                buf.push_str(&self.as_bytes()[s..e]);
+            } else if ch.is_ascii() {
+                buf.push_char(ch.to_ascii_uppercase());
+            } else {
+                for upper in ch.to_uppercase() {
+                    buf.push_char(upper);
+                }
+            }
+        }
+    }
+
+    /// Returns a new `Vec<u8>` containing the ASCII uppercase equivalent of
+    /// this byte string.
+    ///
+    /// In this case, uppercase is only defined in ASCII letters. Namely, the
+    /// letters `a-z` are converted to `A-Z`. All other bytes remain unchanged.
+    /// In particular, the length of the byte string returned is always
+    /// equivalent to the length of this byte string.
+    ///
+    /// If you'd like to reuse an allocation for performance reasons, then use
+    /// [`make_ascii_uppercase`](#method.make_ascii_uppercase) to perform
+    /// the conversion in place.
+    ///
+    /// # Examples
+    ///
+    /// Basic usage:
+    ///
+    /// ```
+    /// use bstr::{B, ByteSlice};
+    ///
+    /// let s = B("hello β");
+    /// assert_eq!(s.to_ascii_uppercase(), B("HELLO β"));
+    /// ```
+    ///
+    /// Invalid UTF-8 remains as is:
+    ///
+    /// ```
+    /// use bstr::{B, ByteSlice};
+    ///
+    /// let s = B(b"foo\xFFbar\xE2\x98baz");
+    /// assert_eq!(s.to_ascii_uppercase(), B(b"FOO\xFFBAR\xE2\x98BAZ"));
+    /// ```
+    #[cfg(feature = "std")]
+    #[inline]
+    fn to_ascii_uppercase(&self) -> Vec<u8> {
+        self.as_bytes().to_ascii_uppercase()
+    }
+
+    /// Convert this byte string to its uppercase ASCII equivalent in place.
+    ///
+    /// In this case, uppercase is only defined in ASCII letters. Namely, the
+    /// letters `a-z` are converted to `A-Z`. All other bytes remain unchanged.
+    ///
+    /// If you don't need to do the conversion in
+    /// place and instead prefer convenience, then use
+    /// [`to_ascii_uppercase`](#method.to_ascii_uppercase) instead.
+    ///
+    /// # Examples
+    ///
+    /// Basic usage:
+    ///
+    /// ```
+    /// use bstr::{B, ByteSlice};
+    ///
+    /// let mut s = <Vec<u8>>::from("hello β");
+    /// s.make_ascii_uppercase();
+    /// assert_eq!(s, B("HELLO β"));
+    /// ```
+    ///
+    /// Invalid UTF-8 remains as is:
+    ///
+    /// ```
+    /// use bstr::{B, ByteSlice, ByteVec};
+    ///
+    /// let mut s = <Vec<u8>>::from_slice(b"foo\xFFbar\xE2\x98baz");
+    /// s.make_ascii_uppercase();
+    /// assert_eq!(s, B(b"FOO\xFFBAR\xE2\x98BAZ"));
+    /// ```
+    #[inline]
+    fn make_ascii_uppercase(&mut self) {
+        self.as_bytes_mut().make_ascii_uppercase();
+    }
+
+    /// Reverse the bytes in this string, in place.
+    ///
+    /// This is not necessarily a well formed operation!
For example, if this + /// byte string contains valid UTF-8 that isn't ASCII, then reversing the + /// string will likely result in invalid UTF-8 and otherwise non-sensical + /// content. + /// + /// Note that this is equivalent to the generic `[u8]::reverse` method. + /// This method is provided to permit callers to explicitly differentiate + /// between reversing bytes, codepoints and graphemes. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::ByteSlice; + /// + /// let mut s = >::from("hello"); + /// s.reverse_bytes(); + /// assert_eq!(s, "olleh".as_bytes()); + /// ``` + #[inline] + fn reverse_bytes(&mut self) { + self.as_bytes_mut().reverse(); + } + + /// Reverse the codepoints in this string, in place. + /// + /// If this byte string is valid UTF-8, then its reversal by codepoint + /// is also guaranteed to be valid UTF-8. + /// + /// This operation is equivalent to the following, but without allocating: + /// + /// ``` + /// use bstr::ByteSlice; + /// + /// let mut s = >::from("foo☃bar"); + /// + /// let mut chars: Vec = s.chars().collect(); + /// chars.reverse(); + /// + /// let reversed: String = chars.into_iter().collect(); + /// assert_eq!(reversed, "rab☃oof"); + /// ``` + /// + /// Note that this is not necessarily a well formed operation. For example, + /// if this byte string contains grapheme clusters with more than one + /// codepoint, then those grapheme clusters will not necessarily be + /// preserved. If you'd like to preserve grapheme clusters, then use + /// [`reverse_graphemes`](#method.reverse_graphemes) instead. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::ByteSlice; + /// + /// let mut s = >::from("foo☃bar"); + /// s.reverse_chars(); + /// assert_eq!(s, "rab☃oof".as_bytes()); + /// ``` + /// + /// This example shows that not all reversals lead to a well formed string. 
+    /// For example, in this case, combining marks are used to put accents over
+    /// some letters, and those accent marks must appear after the codepoints
+    /// they modify.
+    ///
+    /// ```
+    /// use bstr::{B, ByteSlice};
+    ///
+    /// // `résumé` in decomposed form: each `e` is followed by U+0301.
+    /// let mut s = <Vec<u8>>::from("re\u{301}sume\u{301}");
+    /// s.reverse_chars();
+    /// assert_eq!(s, B(b"\xCC\x81emus\xCC\x81er"));
+    /// ```
+    ///
+    /// A word of warning: the above example relies on the fact that
+    /// `résumé` is in decomposed normal form, which means there are separate
+    /// codepoints for the accents above `e`. If it is instead in composed
+    /// normal form, then the example works:
+    ///
+    /// ```
+    /// use bstr::{B, ByteSlice};
+    ///
+    /// // `résumé` in composed form: each `é` is the single codepoint U+00E9.
+    /// let mut s = <Vec<u8>>::from("r\u{E9}sum\u{E9}");
+    /// s.reverse_chars();
+    /// assert_eq!(s, B("\u{E9}mus\u{E9}r"));
+    /// ```
+    ///
+    /// The point here is to be cautious and not assume that just because
+    /// `reverse_chars` works in one case, that it therefore works in all
+    /// cases.
+    #[inline]
+    fn reverse_chars(&mut self) {
+        // First reverse the bytes of every multi-byte sequence in place.
+        // Reversing the whole string afterwards then puts each sequence's
+        // bytes back in their original order while reversing the order of the
+        // sequences themselves.
+        let mut i = 0;
+        loop {
+            let (_, size) = utf8::decode(&self.as_bytes()[i..]);
+            if size == 0 {
+                break;
+            }
+            if size > 1 {
+                self.as_bytes_mut()[i..i + size].reverse_bytes();
+            }
+            i += size;
+        }
+        self.reverse_bytes();
+    }
+
+    /// Reverse the graphemes in this string, in place.
+    ///
+    /// If this byte string is valid UTF-8, then its reversal by grapheme
+    /// is also guaranteed to be valid UTF-8.
+    ///
+    /// This operation is equivalent to the following, but without allocating:
+    ///
+    /// ```
+    /// use bstr::ByteSlice;
+    ///
+    /// let mut s = <Vec<u8>>::from("foo☃bar");
+    ///
+    /// let mut graphemes: Vec<&str> = s.graphemes().collect();
+    /// graphemes.reverse();
+    ///
+    /// let reversed = graphemes.concat();
+    /// assert_eq!(reversed, "rab☃oof");
+    /// ```
+    ///
+    /// # Examples
+    ///
+    /// Basic usage:
+    ///
+    /// ```
+    /// use bstr::ByteSlice;
+    ///
+    /// let mut s = <Vec<u8>>::from("foo☃bar");
+    /// s.reverse_graphemes();
+    /// assert_eq!(s, "rab☃oof".as_bytes());
+    /// ```
+    ///
+    /// This example shows how this correctly handles grapheme clusters,
+    /// unlike `reverse_chars`.
+    ///
+    /// ```
+    /// use bstr::ByteSlice;
+    ///
+    /// // `résumé` in decomposed form: each `e` is followed by U+0301.
+    /// let mut s = <Vec<u8>>::from("re\u{301}sume\u{301}");
+    /// s.reverse_graphemes();
+    /// assert_eq!(s, "e\u{301}muse\u{301}r".as_bytes());
+    /// ```
+    #[cfg(feature = "unicode")]
+    #[inline]
+    fn reverse_graphemes(&mut self) {
+        use unicode::decode_grapheme;
+
+        // First reverse the bytes of every multi-byte grapheme in place.
+        // Reversing the whole string afterwards then puts each grapheme's
+        // bytes back in their original order while reversing the order of the
+        // graphemes themselves.
+        let mut i = 0;
+        loop {
+            let (_, size) = decode_grapheme(&self.as_bytes()[i..]);
+            if size == 0 {
+                break;
+            }
+            if size > 1 {
+                self.as_bytes_mut()[i..i + size].reverse_bytes();
+            }
+            i += size;
+        }
+        self.reverse_bytes();
+    }
+
+    /// Returns true if and only if every byte in this byte string is ASCII.
+    ///
+    /// ASCII is an encoding that defines 128 codepoints. A byte corresponds to
+    /// an ASCII codepoint if and only if it is in the inclusive range
+    /// `[0, 127]`.
+    ///
+    /// # Examples
+    ///
+    /// Basic usage:
+    ///
+    /// ```
+    /// use bstr::{B, ByteSlice};
+    ///
+    /// assert!(B("abc").is_ascii());
+    /// assert!(!B("☃βツ").is_ascii());
+    /// assert!(!B(b"\xFF").is_ascii());
+    /// ```
+    #[inline]
+    fn is_ascii(&self) -> bool {
+        ascii::first_non_ascii_byte(self.as_bytes()) == self.as_bytes().len()
+    }
+
+    /// Returns true if and only if the entire byte string is valid UTF-8.
+    ///
+    /// If you need location information about where a byte string's first
+    /// invalid UTF-8 byte is, then use the [`to_str`](#method.to_str) method.
+    ///
+    /// # Examples
+    ///
+    /// Basic usage:
+    ///
+    /// ```
+    /// use bstr::{B, ByteSlice};
+    ///
+    /// assert!(B("abc").is_utf8());
+    /// assert!(B("☃βツ").is_utf8());
+    /// // invalid bytes
+    /// assert!(!B(b"abc\xFF").is_utf8());
+    /// // surrogate encoding
+    /// assert!(!B(b"\xED\xA0\x80").is_utf8());
+    /// // incomplete sequence
+    /// assert!(!B(b"\xF0\x9D\x9Ca").is_utf8());
+    /// // overlong sequence
+    /// assert!(!B(b"\xF0\x82\x82\xAC").is_utf8());
+    /// ```
+    #[inline]
+    fn is_utf8(&self) -> bool {
+        utf8::validate(self.as_bytes()).is_ok()
+    }
+
+    /// Returns the last byte in this byte string, if it's non-empty. If this
+    /// byte string is empty, this returns `None`.
+    ///
+    /// Note that this is like the generic `[u8]::last`, except this returns
+    /// the byte by value instead of a reference to the byte.
+    ///
+    /// # Examples
+    ///
+    /// Basic usage:
+    ///
+    /// ```
+    /// use bstr::ByteSlice;
+    ///
+    /// assert_eq!(Some(b'z'), b"baz".last_byte());
+    /// assert_eq!(None, b"".last_byte());
+    /// ```
+    #[inline]
+    fn last_byte(&self) -> Option<u8> {
+        self.as_bytes().last().map(|&b| b)
+    }
+
+    /// Copies elements from one part of the slice to another part of itself,
+    /// where the parts may be overlapping.
+    ///
+    /// `src` is the range within this byte string to copy from, while `dest`
+    /// is the starting index of the range within this byte string to copy to.
+    /// The length indicated by `src` must be less than or equal to the number
+    /// of bytes from `dest` to the end of the byte string.
+    ///
+    /// # Panics
+    ///
+    /// Panics if either range is out of bounds, or if `src` is too big to fit
+    /// into `dest`, or if the end of `src` is before the start.
+ /// + /// # Examples + /// + /// Copying four bytes within a byte string: + /// + /// ``` + /// use bstr::{B, ByteSlice}; + /// + /// let mut buf = *b"Hello, World!"; + /// let s = &mut buf; + /// s.copy_within_str(1..5, 8); + /// assert_eq!(s, B("Hello, Wello!")); + /// ``` + #[inline] + fn copy_within_str( + &mut self, + src: R, + dest: usize, + ) where R: ops::RangeBounds + { + // TODO: Deprecate this once slice::copy_within stabilizes. + let src_start = match src.start_bound() { + ops::Bound::Included(&n) => n, + ops::Bound::Excluded(&n) => { + n.checked_add(1).expect("attempted to index slice beyond max") + } + ops::Bound::Unbounded => 0, + }; + let src_end = match src.end_bound() { + ops::Bound::Included(&n) => { + n.checked_add(1).expect("attempted to index slice beyond max") + } + ops::Bound::Excluded(&n) => n, + ops::Bound::Unbounded => self.as_bytes().len(), + }; + assert!(src_start <= src_end, "src end is before src start"); + assert!(src_end <= self.as_bytes().len(), "src is out of bounds"); + let count = src_end - src_start; + assert!( + dest <= self.as_bytes().len() - count, + "dest is out of bounds", + ); + + // SAFETY: This is safe because we use ptr::copy to handle overlapping + // copies, and is also safe because we've checked all the bounds above. + // Finally, we are only dealing with u8 data, which is Copy, which + // means we can copy without worrying about ownership/destructors. + unsafe { + ptr::copy( + self.as_bytes().get_unchecked(src_start), + self.as_bytes_mut().get_unchecked_mut(dest), + count, + ); + } + } +} + +/// A single substring searcher fixed to a particular needle. +/// +/// The purpose of this type is to permit callers to construct a substring +/// searcher that can be used to search haystacks without the overhead of +/// constructing the searcher in the first place. 
This is a somewhat niche +/// concern when it's necessary to re-use the same needle to search multiple +/// different haystacks with as little overhead as possible. In general, using +/// [`ByteSlice::find`](trait.ByteSlice.html#method.find) +/// or +/// [`ByteSlice::find_iter`](trait.ByteSlice.html#method.find_iter) +/// is good enough, but `Finder` is useful when you can meaningfully observe +/// searcher construction time in a profile. +/// +/// When the `std` feature is enabled, then this type has an `into_owned` +/// version which permits building a `Finder` that is not connected to the +/// lifetime of its needle. +#[derive(Clone, Debug)] +pub struct Finder<'a> { + searcher: TwoWay<'a>, +} + +impl<'a> Finder<'a> { + /// Create a new finder for the given needle. + #[inline] + pub fn new>(needle: &'a B) -> Finder<'a> { + Finder { searcher: TwoWay::forward(needle.as_ref()) } + } + + /// Convert this finder into its owned variant, such that it no longer + /// borrows the needle. + /// + /// If this is already an owned finder, then this is a no-op. Otherwise, + /// this copies the needle. + /// + /// This is only available when the `std` feature is enabled. + #[cfg(feature = "std")] + #[inline] + pub fn into_owned(self) -> Finder<'static> { + Finder { searcher: self.searcher.into_owned() } + } + + /// Returns the needle that this finder searches for. + /// + /// Note that the lifetime of the needle returned is tied to the lifetime + /// of the finder, and may be shorter than the `'a` lifetime. Namely, a + /// finder's needle can be either borrowed or owned, so the lifetime of the + /// needle returned must necessarily be the shorter of the two. + #[inline] + pub fn needle(&self) -> &[u8] { + self.searcher.needle() + } + + /// Returns the index of the first occurrence of this needle in the given + /// haystack. + /// + /// The haystack may be any type that can be cheaply converted into a + /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`. 
+ /// + /// # Complexity + /// + /// This routine is guaranteed to have worst case linear time complexity + /// with respect to both the needle and the haystack. That is, this runs + /// in `O(needle.len() + haystack.len())` time. + /// + /// This routine is also guaranteed to have worst case constant space + /// complexity. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::Finder; + /// + /// let haystack = "foo bar baz"; + /// assert_eq!(Some(0), Finder::new("foo").find(haystack)); + /// assert_eq!(Some(4), Finder::new("bar").find(haystack)); + /// assert_eq!(None, Finder::new("quux").find(haystack)); + /// ``` + #[inline] + pub fn find>(&self, haystack: B) -> Option { + self.searcher.find(haystack.as_ref()) + } +} + +/// A single substring reverse searcher fixed to a particular needle. +/// +/// The purpose of this type is to permit callers to construct a substring +/// searcher that can be used to search haystacks without the overhead of +/// constructing the searcher in the first place. This is a somewhat niche +/// concern when it's necessary to re-use the same needle to search multiple +/// different haystacks with as little overhead as possible. In general, using +/// [`ByteSlice::rfind`](trait.ByteSlice.html#method.rfind) +/// or +/// [`ByteSlice::rfind_iter`](trait.ByteSlice.html#method.rfind_iter) +/// is good enough, but `FinderReverse` is useful when you can meaningfully +/// observe searcher construction time in a profile. +/// +/// When the `std` feature is enabled, then this type has an `into_owned` +/// version which permits building a `FinderReverse` that is not connected to +/// the lifetime of its needle. +#[derive(Clone, Debug)] +pub struct FinderReverse<'a> { + searcher: TwoWay<'a>, +} + +impl<'a> FinderReverse<'a> { + /// Create a new reverse finder for the given needle. 
+    #[inline]
+    pub fn new<B: ?Sized + AsRef<[u8]>>(needle: &'a B) -> FinderReverse<'a> {
+        FinderReverse { searcher: TwoWay::reverse(needle.as_ref()) }
+    }
+
+    /// Convert this finder into its owned variant, such that it no longer
+    /// borrows the needle.
+    ///
+    /// If this is already an owned finder, then this is a no-op. Otherwise,
+    /// this copies the needle.
+    ///
+    /// This is only available when the `std` feature is enabled.
+    #[cfg(feature = "std")]
+    #[inline]
+    pub fn into_owned(self) -> FinderReverse<'static> {
+        FinderReverse { searcher: self.searcher.into_owned() }
+    }
+
+    /// Returns the needle that this finder searches for.
+    ///
+    /// Note that the lifetime of the needle returned is tied to the lifetime
+    /// of this finder, and may be shorter than the `'a` lifetime. Namely,
+    /// a finder's needle can be either borrowed or owned, so the lifetime of
+    /// the needle returned must necessarily be the shorter of the two.
+    #[inline]
+    pub fn needle(&self) -> &[u8] {
+        self.searcher.needle()
+    }
+
+    /// Returns the index of the last occurrence of this needle in the given
+    /// haystack.
+    ///
+    /// The haystack may be any type that can be cheaply converted into a
+    /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
+    ///
+    /// # Complexity
+    ///
+    /// This routine is guaranteed to have worst case linear time complexity
+    /// with respect to both the needle and the haystack. That is, this runs
+    /// in `O(needle.len() + haystack.len())` time.
+    ///
+    /// This routine is also guaranteed to have worst case constant space
+    /// complexity.
+    ///
+    /// # Examples
+    ///
+    /// Basic usage:
+    ///
+    /// ```
+    /// use bstr::FinderReverse;
+    ///
+    /// let haystack = "foo bar baz";
+    /// assert_eq!(Some(0), FinderReverse::new("foo").rfind(haystack));
+    /// assert_eq!(Some(4), FinderReverse::new("bar").rfind(haystack));
+    /// assert_eq!(None, FinderReverse::new("quux").rfind(haystack));
+    /// ```
+    #[inline]
+    pub fn rfind<B: AsRef<[u8]>>(&self, haystack: B) -> Option<usize> {
+        self.searcher.rfind(haystack.as_ref())
+    }
+}
+
+/// An iterator over non-overlapping substring matches.
+///
+/// Matches are reported by the byte offset at which they begin.
+///
+/// `'a` is the shorter of two lifetimes: the byte string being searched or the
+/// byte string being looked for.
+#[derive(Debug)]
+pub struct Find<'a> {
+    haystack: &'a [u8],
+    prestate: PrefilterState,
+    searcher: TwoWay<'a>,
+    /// The offset in `haystack` at which the next search begins.
+    pos: usize,
+}
+
+impl<'a> Find<'a> {
+    fn new(haystack: &'a [u8], needle: &'a [u8]) -> Find<'a> {
+        let searcher = TwoWay::forward(needle);
+        let prestate = searcher.prefilter_state();
+        Find { haystack, prestate, searcher, pos: 0 }
+    }
+}
+
+impl<'a> Iterator for Find<'a> {
+    type Item = usize;
+
+    #[inline]
+    fn next(&mut self) -> Option<usize> {
+        // `pos == haystack.len()` is still searched (as an empty suffix);
+        // presumably this lets an empty needle match at the very end.
+        if self.pos > self.haystack.len() {
+            return None;
+        }
+        let result = self.searcher.find_with(
+            &mut self.prestate,
+            &self.haystack[self.pos..],
+        );
+        match result {
+            None => None,
+            Some(i) => {
+                let pos = self.pos + i;
+                // Always advance by at least one byte so that an empty
+                // needle cannot yield the same offset forever.
+                self.pos = pos + cmp::max(1, self.searcher.needle().len());
+                Some(pos)
+            }
+        }
+    }
+}
+
+/// An iterator over non-overlapping substring matches in reverse.
+///
+/// Matches are reported by the byte offset at which they begin.
+///
+/// `'a` is the shorter of two lifetimes: the byte string being searched or the
+/// byte string being looked for.
+#[derive(Debug)]
+pub struct FindReverse<'a> {
+    haystack: &'a [u8],
+    prestate: PrefilterState,
+    searcher: TwoWay<'a>,
+    /// When searching with an empty needle, this gets set to `None` after
+    /// we've yielded the last element at `0`.
+    pos: Option<usize>,
+}
+
+impl<'a> FindReverse<'a> {
+    fn new(haystack: &'a [u8], needle: &'a [u8]) -> FindReverse<'a> {
+        let searcher = TwoWay::reverse(needle);
+        let prestate = searcher.prefilter_state();
+        let pos = Some(haystack.len());
+        FindReverse { haystack, prestate, searcher, pos }
+    }
+
+    fn haystack(&self) -> &'a [u8] {
+        self.haystack
+    }
+
+    fn needle(&self) -> &[u8] {
+        self.searcher.needle()
+    }
+}
+
+impl<'a> Iterator for FindReverse<'a> {
+    type Item = usize;
+
+    #[inline]
+    fn next(&mut self) -> Option<usize> {
+        let pos = match self.pos {
+            None => return None,
+            Some(pos) => pos,
+        };
+        let result = self.searcher.rfind_with(
+            &mut self.prestate,
+            &self.haystack[..pos],
+        );
+        match result {
+            None => None,
+            Some(i) => {
+                if pos == i {
+                    // The match started at the end of the searched prefix,
+                    // so step back one byte to make progress. checked_sub
+                    // yields `None` once offset `0` has been reported, which
+                    // ends the iteration.
+                    self.pos = pos.checked_sub(1);
+                } else {
+                    self.pos = Some(i);
+                }
+                Some(i)
+            }
+        }
+    }
+}
+
+/// An iterator over the bytes in a byte string.
+///
+/// `'a` is the lifetime of the byte string being traversed.
+#[derive(Clone, Debug)]
+pub struct Bytes<'a> {
+    it: slice::Iter<'a, u8>,
+}
+
+impl<'a> Iterator for Bytes<'a> {
+    type Item = u8;
+
+    #[inline]
+    fn next(&mut self) -> Option<u8> {
+        self.it.next().map(|&b| b)
+    }
+
+    // ExactSizeIterator requires size_hint to report the exact remaining
+    // length, so forward it to the underlying slice iterator instead of
+    // relying on the default `(0, None)`.
+    #[inline]
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        self.it.size_hint()
+    }
+}
+
+impl<'a> DoubleEndedIterator for Bytes<'a> {
+    #[inline]
+    fn next_back(&mut self) -> Option<u8> {
+        self.it.next_back().map(|&b| b)
+    }
+}
+
+impl<'a> ExactSizeIterator for Bytes<'a> {
+    #[inline]
+    fn len(&self) -> usize {
+        self.it.len()
+    }
+}
+
+/// An iterator over the fields in a byte string, separated by whitespace.
+///
+/// This iterator splits on contiguous runs of whitespace, such that the fields
+/// in `foo\t\t\n \nbar` are `foo` and `bar`.
+///
+/// `'a` is the lifetime of the byte string being split.
+#[derive(Debug)] +pub struct Fields<'a> { + it: FieldsWith<'a, fn(char) -> bool>, +} + +impl<'a> Fields<'a> { + fn new(bytes: &'a [u8]) -> Fields<'a> { + Fields { it: bytes.fields_with(|ch| ch.is_whitespace()) } + } +} + +impl<'a> Iterator for Fields<'a> { + type Item = &'a [u8]; + + #[inline] + fn next(&mut self) -> Option<&'a [u8]> { + self.it.next() + } +} + +/// An iterator over fields in the byte string, separated by a predicate over +/// codepoints. +/// +/// This iterator splits a byte string based on its predicate function such +/// that the elements returned are separated by contiguous runs of codepoints +/// for which the predicate returns true. +/// +/// `'a` is the lifetime of the byte string being split, while `F` is the type +/// of the predicate, i.e., `FnMut(char) -> bool`. +#[derive(Debug)] +pub struct FieldsWith<'a, F> { + f: F, + bytes: &'a [u8], + chars: CharIndices<'a>, +} + +impl<'a, F: FnMut(char) -> bool> FieldsWith<'a, F> { + fn new(bytes: &'a [u8], f: F) -> FieldsWith<'a, F> { + FieldsWith { + f: f, + bytes: bytes, + chars: bytes.char_indices(), + } + } +} + +impl<'a, F: FnMut(char) -> bool> Iterator for FieldsWith<'a, F> { + type Item = &'a [u8]; + + #[inline] + fn next(&mut self) -> Option<&'a [u8]> { + let (start, mut end); + loop { + match self.chars.next() { + None => return None, + Some((s, e, ch)) => { + if !(self.f)(ch) { + start = s; + end = e; + break; + } + } + } + } + while let Some((_, e, ch)) = self.chars.next() { + if (self.f)(ch) { + break; + } + end = e; + } + Some(&self.bytes[start..end]) + } +} + +/// An iterator over substrings in a byte string, split by a separator. +/// +/// `'a` is the shorter of two lifetimes: the byte string being split or the +/// byte string used as the separator. +#[derive(Debug)] +pub struct Split<'a> { + finder: Find<'a>, + /// The end position of the previous match of our splitter.
The element + /// we yield corresponds to the substring starting at `last` up to the + /// beginning of the next match of the splitter. + last: usize, + /// Only set when iteration is complete. A corner case here is when a + /// splitter is matched at the end of the haystack. At that point, we still + /// need to yield an empty string following it. + done: bool, +} + +impl<'a> Split<'a> { + fn new(haystack: &'a [u8], splitter: &'a [u8]) -> Split<'a> { + let finder = haystack.find_iter(splitter); + Split { finder, last: 0, done: false } + } +} + +impl<'a> Iterator for Split<'a> { + type Item = &'a [u8]; + + #[inline] + fn next(&mut self) -> Option<&'a [u8]> { + let haystack = self.finder.haystack; + match self.finder.next() { + Some(start) => { + let next = &haystack[self.last..start]; + self.last = start + self.finder.searcher.needle().len(); + Some(next) + } + None => { + if self.last >= haystack.len() { + if !self.done { + self.done = true; + Some(b"") + } else { + None + } + } else { + let s = &haystack[self.last..]; + self.last = haystack.len(); + self.done = true; + Some(s) + } + } + } + } +} + +/// An iterator over substrings in a byte string, split by a separator, in +/// reverse. +/// +/// `'a` is the shorter of two lifetimes: the byte string being split or the +/// byte string used as the separator. +#[derive(Debug)] +pub struct SplitReverse<'a> { + finder: FindReverse<'a>, + /// The start position of the previous match of our splitter. The element + /// we yield corresponds to the substring starting at the end of the next + /// match of the splitter (searching in reverse) up to `last`. + last: usize, + /// Only set when iteration is complete. A corner case here is when a + /// splitter is matched at the start of the haystack. At that point, we still + /// need to yield an empty string preceding it.
+ done: bool, +} + +impl<'a> SplitReverse<'a> { + fn new(haystack: &'a [u8], splitter: &'a [u8]) -> SplitReverse<'a> { + let finder = haystack.rfind_iter(splitter); + SplitReverse { finder, last: haystack.len(), done: false } + } +} + +impl<'a> Iterator for SplitReverse<'a> { + type Item = &'a [u8]; + + #[inline] + fn next(&mut self) -> Option<&'a [u8]> { + let haystack = self.finder.haystack(); + match self.finder.next() { + Some(start) => { + let nlen = self.finder.needle().len(); + let next = &haystack[start + nlen..self.last]; + self.last = start; + Some(next) + } + None => { + if self.last == 0 { + if !self.done { + self.done = true; + Some(b"") + } else { + None + } + } else { + let s = &haystack[..self.last]; + self.last = 0; + self.done = true; + Some(s) + } + } + } + } +} + +/// An iterator over at most `n` substrings in a byte string, split by a +/// separator. +/// +/// `'a` is the shorter of two lifetimes: the byte string being split or the +/// byte string used as the separator. +#[derive(Debug)] +pub struct SplitN<'a> { + split: Split<'a>, + limit: usize, + count: usize, +} + +impl<'a> SplitN<'a> { + fn new( + haystack: &'a [u8], + splitter: &'a [u8], + limit: usize, + ) -> SplitN<'a> { + let split = haystack.split_str(splitter); + SplitN { split, limit, count: 0 } + } +} + +impl<'a> Iterator for SplitN<'a> { + type Item = &'a [u8]; + + #[inline] + fn next(&mut self) -> Option<&'a [u8]> { + self.count += 1; + if self.count > self.limit { + None + } else if self.count == self.limit { + Some(&self.split.finder.haystack[self.split.last..]) + } else { + self.split.next() + } + } +} + + +/// An iterator over at most `n` substrings in a byte string, split by a +/// separator, in reverse. +/// +/// `'a` is the shorter of two lifetimes: the byte string being split or the +/// byte string used as the separator.
+#[derive(Debug)] +pub struct SplitNReverse<'a> { + split: SplitReverse<'a>, + limit: usize, + count: usize, +} + +impl<'a> SplitNReverse<'a> { + fn new( + haystack: &'a [u8], + splitter: &'a [u8], + limit: usize, + ) -> SplitNReverse<'a> { + let split = haystack.rsplit_str(splitter); + SplitNReverse { split, limit, count: 0 } + } +} + +impl<'a> Iterator for SplitNReverse<'a> { + type Item = &'a [u8]; + + #[inline] + fn next(&mut self) -> Option<&'a [u8]> { + self.count += 1; + if self.count > self.limit { + None + } else if self.count == self.limit { + Some(&self.split.finder.haystack()[..self.split.last]) + } else { + self.split.next() + } + } +} + +/// An iterator over all lines in a byte string, without their terminators. +/// +/// For this iterator, the only line terminators recognized are `\r\n` and +/// `\n`. +/// +/// `'a` is the lifetime of the byte string being iterated over. +pub struct Lines<'a> { + it: LinesWithTerminator<'a>, +} + +impl<'a> Lines<'a> { + fn new(bytes: &'a [u8]) -> Lines<'a> { + Lines { it: LinesWithTerminator::new(bytes) } + } +} + +impl<'a> Iterator for Lines<'a> { + type Item = &'a [u8]; + + #[inline] + fn next(&mut self) -> Option<&'a [u8]> { + let mut line = self.it.next()?; + if line.last_byte() == Some(b'\n') { + line = &line[..line.len() - 1]; + if line.last_byte() == Some(b'\r') { + line = &line[..line.len() - 1]; + } + } + Some(line) + } +} + +/// An iterator over all lines in a byte string, including their terminators. +/// +/// For this iterator, the only line terminator recognized is `\n`. (Since +/// line terminators are included, this also handles `\r\n` line endings.) +/// +/// Line terminators are only included if they are present in the original +/// byte string. For example, the last line in a byte string may not end with +/// a line terminator. +/// +/// Concatenating all elements yielded by this iterator is guaranteed to yield +/// the original byte string. 
+/// +/// `'a` is the lifetime of the byte string being iterated over. +pub struct LinesWithTerminator<'a> { + bytes: &'a [u8], +} + +impl<'a> LinesWithTerminator<'a> { + fn new(bytes: &'a [u8]) -> LinesWithTerminator<'a> { + LinesWithTerminator { bytes } + } +} + +impl<'a> Iterator for LinesWithTerminator<'a> { + type Item = &'a [u8]; + + #[inline] + fn next(&mut self) -> Option<&'a [u8]> { + match self.bytes.find_byte(b'\n') { + None if self.bytes.is_empty() => None, + None => { + let line = self.bytes; + self.bytes = b""; + Some(line) + } + Some(end) => { + let line = &self.bytes[..end + 1]; + self.bytes = &self.bytes[end + 1..]; + Some(line) + } + } + } +} + +#[cfg(test)] +mod tests { + use ext_slice::{B, ByteSlice}; + use tests::LOSSY_TESTS; + + #[test] + fn to_str_lossy() { + for (i, &(expected, input)) in LOSSY_TESTS.iter().enumerate() { + let got = B(input).to_str_lossy(); + assert_eq!( + expected.as_bytes(), + got.as_bytes(), + "to_str_lossy(ith: {:?}, given: {:?})", + i, input, + ); + + let mut got = String::new(); + B(input).to_str_lossy_into(&mut got); + assert_eq!( + expected.as_bytes(), got.as_bytes(), "to_str_lossy_into", + ); + + let got = String::from_utf8_lossy(input); + assert_eq!(expected.as_bytes(), got.as_bytes(), "std"); + } + } + + #[test] + #[should_panic] + fn copy_within_fail1() { + let mut buf = *b"foobar"; + let s = &mut buf; + s.copy_within_str(0..2, 5); + } + + #[test] + #[should_panic] + fn copy_within_fail2() { + let mut buf = *b"foobar"; + let s = &mut buf; + s.copy_within_str(3..2, 0); + } + + #[test] + #[should_panic] + fn copy_within_fail3() { + let mut buf = *b"foobar"; + let s = &mut buf; + s.copy_within_str(5..7, 0); + } + + #[test] + #[should_panic] + fn copy_within_fail4() { + let mut buf = *b"foobar"; + let s = &mut buf; + s.copy_within_str(0..1, 6); + } +} diff --git a/src/ext_vec.rs b/src/ext_vec.rs new file mode 100644 index 0000000..85e07a9 --- /dev/null +++ b/src/ext_vec.rs @@ -0,0 +1,1070 @@ 
+#![allow(unused_imports)] + +use std::borrow::Cow; +use std::error; +use std::ffi::{OsStr, OsString}; +use std::fmt; +use std::iter; +use std::ops; +use std::path::{Path, PathBuf}; +use std::ptr; +use std::str; +use std::vec; + +use ext_slice::ByteSlice; +use utf8::{self, Utf8Error}; + +/// Concatenate the elements given by the iterator together into a single +/// `Vec`. +/// +/// The elements may be any type that can be cheaply converted into an `&[u8]`. +/// This includes, but is not limited to, `&str`, `&BStr` and `&[u8]` itself. +/// +/// # Examples +/// +/// Basic usage: +/// +/// ``` +/// use bstr; +/// +/// let s = bstr::concat(&["foo", "bar", "baz"]); +/// assert_eq!(s, "foobarbaz".as_bytes()); +/// ``` +#[inline] +pub fn concat( + elements: I, +) -> Vec +where T: AsRef<[u8]>, + I: IntoIterator +{ + let mut dest = vec![]; + for element in elements { + dest.push_str(element); + } + dest +} + +/// Join the elements given by the iterator with the given separator into a +/// single `Vec`. +/// +/// Both the separator and the elements may be any type that can be cheaply +/// converted into an `&[u8]`. This includes, but is not limited to, +/// `&str`, `&BStr` and `&[u8]` itself. 
+/// +/// # Examples +/// +/// Basic usage: +/// +/// ``` +/// use bstr; +/// +/// let s = bstr::join(",", &["foo", "bar", "baz"]); +/// assert_eq!(s, "foo,bar,baz".as_bytes()); +/// ``` +#[inline] +pub fn join( + separator: B, + elements: I, +) -> Vec +where B: AsRef<[u8]>, + T: AsRef<[u8]>, + I: IntoIterator +{ + let mut it = elements.into_iter(); + let mut dest = vec![]; + match it.next() { + None => return dest, + Some(first) => { + dest.push_str(first); + } + } + for element in it { + dest.push_str(&separator); + dest.push_str(element); + } + dest +} + +impl ByteVec for Vec { + fn as_vec(&self) -> &Vec { self } + fn as_vec_mut(&mut self) -> &mut Vec { self } + fn into_vec(self) -> Vec { self } +} + +/// Ensure that callers cannot implement `ByteVec` by making an +/// unimplementable trait its super trait. +pub trait Sealed {} +impl Sealed for Vec {} + +/// A trait that extends a vector of bytes with string oriented methods. +pub trait ByteVec: Sealed { + /// A method for accessing the raw vector bytes of this type. This is + /// always a no-op and callers shouldn't care about it. This only exists + /// for making the extension trait work. + #[doc(hidden)] + fn as_vec(&self) -> &Vec; + + /// A method for accessing the raw vector bytes of this type, mutably. This + /// is always a no-op and callers shouldn't care about it. This only exists + /// for making the extension trait work. + #[doc(hidden)] + fn as_vec_mut(&mut self) -> &mut Vec; + + /// A method for consuming ownership of this vector. This is always a no-op + /// and callers shouldn't care about it. This only exists for making the + /// extension trait work. + #[doc(hidden)] + fn into_vec(self) -> Vec where Self: Sized; + + /// Create a new owned byte string from the given byte slice.
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::{B, ByteVec}; + /// + /// let s = >::from_slice(b"abc"); + /// assert_eq!(s, B("abc")); + /// ``` + fn from_slice>(bytes: B) -> Vec { + bytes.as_ref().to_vec() + } + + /// Create a new byte string from an owned OS string. + /// + /// On Unix, this always succeeds and is zero cost. On non-Unix systems, + /// this returns the original OS string if it is not valid UTF-8. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use std::ffi::OsString; + /// + /// use bstr::{B, ByteVec}; + /// + /// let os_str = OsString::from("foo"); + /// let bs = Vec::from_os_string(os_str).expect("valid UTF-8"); + /// assert_eq!(bs, B("foo")); + /// ``` + #[inline] + fn from_os_string(os_str: OsString) -> Result, OsString> { + #[cfg(unix)] + #[inline] + fn imp(os_str: OsString) -> Result, OsString> { + use std::os::unix::ffi::OsStringExt; + + Ok(Vec::from(os_str.into_vec())) + } + + #[cfg(not(unix))] + #[inline] + fn imp(os_str: OsString) -> Result, OsString> { + os_str.into_string().map(Vec::from) + } + + imp(os_str) + } + + /// Lossily create a new byte string from an OS string slice. + /// + /// On Unix, this always succeeds, is zero cost and always returns a slice. + /// On non-Unix systems, this does a UTF-8 check. If the given OS string + /// slice is not valid UTF-8, then it is lossily decoded into valid UTF-8 + /// (with invalid bytes replaced by the Unicode replacement codepoint). 
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use std::ffi::OsStr; + /// + /// use bstr::{B, ByteVec}; + /// + /// let os_str = OsStr::new("foo"); + /// let bs = Vec::from_os_str_lossy(os_str); + /// assert_eq!(bs, B("foo")); + /// ``` + #[inline] + fn from_os_str_lossy<'a>(os_str: &'a OsStr) -> Cow<'a, [u8]> { + #[cfg(unix)] + #[inline] + fn imp<'a>(os_str: &'a OsStr) -> Cow<'a, [u8]> { + use std::os::unix::ffi::OsStrExt; + + Cow::Borrowed(os_str.as_bytes()) + } + + #[cfg(not(unix))] + #[inline] + fn imp<'a>(os_str: &'a OsStr) -> Cow<'a, [u8]> { + match os_str.to_string_lossy() { + Cow::Borrowed(x) => Cow::Borrowed(x.as_bytes()), + Cow::Owned(x) => Cow::Owned(Vec::from(x)), + } + } + + imp(os_str) + } + + /// Create a new byte string from an owned file path. + /// + /// On Unix, this always succeeds and is zero cost. On non-Unix systems, + /// this returns the original path if it is not valid UTF-8. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use std::path::PathBuf; + /// + /// use bstr::{B, ByteVec}; + /// + /// let path = PathBuf::from("foo"); + /// let bs = Vec::from_path_buf(path).expect("must be valid UTF-8"); + /// assert_eq!(bs, B("foo")); + /// ``` + #[inline] + fn from_path_buf(path: PathBuf) -> Result, PathBuf> { + Vec::from_os_string(path.into_os_string()).map_err(PathBuf::from) + } + + /// Lossily create a new byte string from a file path. + /// + /// On Unix, this always succeeds, is zero cost and always returns a slice. + /// On non-Unix systems, this does a UTF-8 check. If the given path is not + /// valid UTF-8, then it is lossily decoded into valid UTF-8 (with invalid + /// bytes replaced by the Unicode replacement codepoint). 
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use std::path::Path; + /// + /// use bstr::{B, ByteVec}; + /// + /// let path = Path::new("foo"); + /// let bs = Vec::from_path_lossy(path); + /// assert_eq!(bs, B("foo")); + /// ``` + #[inline] + fn from_path_lossy<'a>(path: &'a Path) -> Cow<'a, [u8]> { + Vec::from_os_str_lossy(path.as_os_str()) + } + + /// Appends the given byte to the end of this byte string. + /// + /// Note that this is equivalent to the generic `Vec::push` method. This + /// method is provided to permit callers to explicitly differentiate + /// between pushing bytes, codepoints and strings. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::ByteVec; + /// + /// let mut s = >::from("abc"); + /// s.push_byte(b'\xE2'); + /// s.push_byte(b'\x98'); + /// s.push_byte(b'\x83'); + /// assert_eq!(s, "abc☃".as_bytes()); + /// ``` + #[inline] + fn push_byte(&mut self, byte: u8) { + self.as_vec_mut().push(byte); + } + + /// Appends the given `char` to the end of this byte string. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::ByteVec; + /// + /// let mut s = >::from("abc"); + /// s.push_char('1'); + /// s.push_char('2'); + /// s.push_char('3'); + /// assert_eq!(s, "abc123".as_bytes()); + /// ``` + #[inline] + fn push_char(&mut self, ch: char) { + if ch.len_utf8() == 1 { + self.push_byte(ch as u8); + return; + } + self.as_vec_mut().extend_from_slice( + ch.encode_utf8(&mut [0; 4]).as_bytes(), + ); + } + + /// Appends the given slice to the end of this byte string. This accepts + /// any type that can be converted to a `&[u8]`. This includes, but is not + /// limited to, `&str`, `&BStr`, and of course, `&[u8]` itself.
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::ByteVec; + /// + /// let mut s = >::from("abc"); + /// s.push_str(b"123"); + /// assert_eq!(s, "abc123".as_bytes()); + /// ``` + #[inline] + fn push_str>(&mut self, bytes: B) { + self.as_vec_mut().extend_from_slice(bytes.as_ref()); + } + + /// Converts a `Vec` into a `String` if and only if this byte string is + /// valid UTF-8. + /// + /// If it is not valid UTF-8, then a + /// [`FromUtf8Error`](struct.FromUtf8Error.html) + /// is returned. (This error can be used to examine why UTF-8 validation + /// failed, or to regain the original byte string.) + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::ByteVec; + /// + /// # fn example() -> Result<(), Box> { + /// let bytes = Vec::from("hello"); + /// let string = bytes.into_string()?; + /// + /// assert_eq!("hello", string); + /// # Ok(()) }; example().unwrap() + /// ``` + /// + /// If this byte string is not valid UTF-8, then an error will be returned. + /// That error can then be used to inspect the location at which invalid + /// UTF-8 was found, or to regain the original byte string: + /// + /// ``` + /// use bstr::{B, ByteVec}; + /// + /// let bytes = Vec::from_slice(b"foo\xFFbar"); + /// let err = bytes.into_string().unwrap_err(); + /// + /// assert_eq!(err.utf8_error().valid_up_to(), 3); + /// assert_eq!(err.utf8_error().error_len(), Some(1)); + /// + /// // At no point in this example is an allocation performed. + /// let bytes = Vec::from(err.into_vec()); + /// assert_eq!(bytes, B(b"foo\xFFbar")); + /// ``` + #[inline] + fn into_string(self) -> Result where Self: Sized { + match utf8::validate(self.as_vec()) { + Err(err) => { + Err(FromUtf8Error { original: self.into_vec(), err: err }) + } + Ok(()) => { + // SAFETY: This is safe because of the guarantees provided by + // utf8::validate. + unsafe { Ok(self.into_string_unchecked()) } + } + } + } + + /// Lossily converts a `Vec` into a `String`. 
If this byte string + /// contains invalid UTF-8, then the invalid bytes are replaced with the + /// Unicode replacement codepoint. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::ByteVec; + /// + /// let bytes = Vec::from_slice(b"foo\xFFbar"); + /// let string = bytes.into_string_lossy(); + /// assert_eq!(string, "foo\u{FFFD}bar"); + /// ``` + #[inline] + fn into_string_lossy(self) -> String where Self: Sized { + let v = self.as_vec(); + if let Ok(allutf8) = v.to_str() { + return allutf8.to_string(); + } + let mut dst = String::with_capacity(v.len()); + for ch in v.chars() { + dst.push(ch); + } + dst + } + + /// Unsafely convert this byte string into a `String`, without checking for + /// valid UTF-8. + /// + /// # Safety + /// + /// Callers *must* ensure that this byte string is valid UTF-8 before + /// calling this method. Converting a byte string into a `String` that is + /// not valid UTF-8 is considered undefined behavior. + /// + /// This routine is useful in performance sensitive contexts where the + /// UTF-8 validity of the byte string is already known and it is + /// undesirable to pay the cost of an additional UTF-8 validation check + /// that [`into_string`](#method.into_string) performs. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::ByteVec; + /// + /// // SAFETY: This is safe because string literals are guaranteed to be + /// // valid UTF-8 by the Rust compiler. + /// let s = unsafe { Vec::from("☃βツ").into_string_unchecked() }; + /// assert_eq!("☃βツ", s); + /// ``` + unsafe fn into_string_unchecked(self) -> String where Self: Sized { + String::from_utf8_unchecked(self.into_vec()) + } + + /// Converts this byte string into an OS string, in place. + /// + /// On Unix, this always succeeds and is zero cost. On non-Unix systems, + /// this returns the original byte string if it is not valid UTF-8. 
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use std::ffi::OsStr; + /// + /// use bstr::ByteVec; + /// + /// let bs = Vec::from("foo"); + /// let os_str = bs.into_os_string().expect("should be valid UTF-8"); + /// assert_eq!(os_str, OsStr::new("foo")); + /// ``` + #[inline] + fn into_os_string(self) -> Result> where Self: Sized { + #[cfg(unix)] + #[inline] + fn imp(v: Vec) -> Result> { + use std::os::unix::ffi::OsStringExt; + + Ok(OsString::from_vec(v)) + } + + #[cfg(not(unix))] + #[inline] + fn imp(v: Vec) -> Result> { + match v.into_string() { + Ok(s) => Ok(OsString::from(s)), + Err(err) => Err(err.into_vec()), + } + } + + imp(self.into_vec()) + } + + /// Lossily converts this byte string into an OS string, in place. + /// + /// On Unix, this always succeeds and is zero cost. On non-Unix systems, + /// this will perform a UTF-8 check and lossily convert this byte string + /// into valid UTF-8 using the Unicode replacement codepoint. + /// + /// Note that this can prevent the correct roundtripping of file paths on + /// non-Unix systems such as Windows, where file paths are an arbitrary + /// sequence of 16-bit integers. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::ByteVec; + /// + /// let bs = Vec::from_slice(b"foo\xFFbar"); + /// let os_str = bs.into_os_string_lossy(); + /// assert_eq!(os_str.to_string_lossy(), "foo\u{FFFD}bar"); + /// ``` + #[inline] + fn into_os_string_lossy(self) -> OsString where Self: Sized { + #[cfg(unix)] + #[inline] + fn imp(v: Vec) -> OsString { + use std::os::unix::ffi::OsStringExt; + + OsString::from_vec(v) + } + + #[cfg(not(unix))] + #[inline] + fn imp(v: Vec) -> OsString { + OsString::from(v.into_string_lossy()) + } + + imp(self.into_vec()) + } + + /// Converts this byte string into an owned file path, in place. + /// + /// On Unix, this always succeeds and is zero cost. On non-Unix systems, + /// this returns the original byte string if it is not valid UTF-8. 
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::ByteVec; + /// + /// let bs = Vec::from("foo"); + /// let path = bs.into_path_buf().expect("should be valid UTF-8"); + /// assert_eq!(path.as_os_str(), "foo"); + /// ``` + #[inline] + fn into_path_buf(self) -> Result> where Self: Sized { + self.into_os_string().map(PathBuf::from) + } + + /// Lossily converts this byte string into an owned file path, in place. + /// + /// On Unix, this always succeeds and is zero cost. On non-Unix systems, + /// this will perform a UTF-8 check and lossily convert this byte string + /// into valid UTF-8 using the Unicode replacement codepoint. + /// + /// Note that this can prevent the correct roundtripping of file paths on + /// non-Unix systems such as Windows, where file paths are an arbitrary + /// sequence of 16-bit integers. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::ByteVec; + /// + /// let bs = Vec::from_slice(b"foo\xFFbar"); + /// let path = bs.into_path_buf_lossy(); + /// assert_eq!(path.to_string_lossy(), "foo\u{FFFD}bar"); + /// ``` + #[inline] + fn into_path_buf_lossy(self) -> PathBuf where Self: Sized { + PathBuf::from(self.into_os_string_lossy()) + } + + /// Removes the last byte from this `Vec` and returns it. + /// + /// If this byte string is empty, then `None` is returned. + /// + /// If the last codepoint in this byte string is not ASCII, then removing + /// the last byte could make this byte string contain invalid UTF-8. + /// + /// Note that this is equivalent to the generic `Vec::pop` method. This + /// method is provided to permit callers to explicitly differentiate + /// between popping bytes and codepoints. 
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::ByteVec; + /// + /// let mut s = Vec::from("foo"); + /// assert_eq!(s.pop_byte(), Some(b'o')); + /// assert_eq!(s.pop_byte(), Some(b'o')); + /// assert_eq!(s.pop_byte(), Some(b'f')); + /// assert_eq!(s.pop_byte(), None); + /// ``` + #[inline] + fn pop_byte(&mut self) -> Option { + self.as_vec_mut().pop() + } + + /// Removes the last codepoint from this `Vec` and returns it. + /// + /// If this byte string is empty, then `None` is returned. If the last + /// bytes of this byte string do not correspond to a valid UTF-8 code unit + /// sequence, then the Unicode replacement codepoint is yielded instead in + /// accordance with the + /// [replacement codepoint substitution policy](index.html#handling-of-invalid-utf8-8). + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::ByteVec; + /// + /// let mut s = Vec::from("foo"); + /// assert_eq!(s.pop_char(), Some('o')); + /// assert_eq!(s.pop_char(), Some('o')); + /// assert_eq!(s.pop_char(), Some('f')); + /// assert_eq!(s.pop_char(), None); + /// ``` + /// + /// This shows the replacement codepoint substitution policy. Note that + /// the first pop yields a replacement codepoint but actually removes two + /// bytes. This is in contrast with subsequent pops when encountering + /// `\xFF` since `\xFF` is never a valid prefix for any valid UTF-8 + /// code unit sequence. 
+ /// + /// ``` + /// use bstr::ByteVec; + /// + /// let mut s = Vec::from_slice(b"f\xFF\xFF\xFFoo\xE2\x98"); + /// assert_eq!(s.pop_char(), Some('\u{FFFD}')); + /// assert_eq!(s.pop_char(), Some('o')); + /// assert_eq!(s.pop_char(), Some('o')); + /// assert_eq!(s.pop_char(), Some('\u{FFFD}')); + /// assert_eq!(s.pop_char(), Some('\u{FFFD}')); + /// assert_eq!(s.pop_char(), Some('\u{FFFD}')); + /// assert_eq!(s.pop_char(), Some('f')); + /// assert_eq!(s.pop_char(), None); + /// ``` + #[inline] + fn pop_char(&mut self) -> Option { + let (ch, size) = utf8::decode_last_lossy(self.as_vec()); + if size == 0 { + return None; + } + let new_len = self.as_vec().len() - size; + self.as_vec_mut().truncate(new_len); + Some(ch) + } + + /// Removes a `char` from this `Vec` at the given byte position and + /// returns it. + /// + /// If the bytes at the given position do not lead to a valid UTF-8 code + /// unit sequence, then a + /// [replacement codepoint is returned instead](index.html#handling-of-invalid-utf8-8). + /// + /// # Panics + /// + /// Panics if `at` is larger than or equal to this byte string's length. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::ByteVec; + /// + /// let mut s = Vec::from("foo☃bar"); + /// assert_eq!(s.remove_char(3), '☃'); + /// assert_eq!(s, b"foobar"); + /// ``` + /// + /// This example shows how the Unicode replacement codepoint policy is + /// used: + /// + /// ``` + /// use bstr::ByteVec; + /// + /// let mut s = Vec::from_slice(b"foo\xFFbar"); + /// assert_eq!(s.remove_char(3), '\u{FFFD}'); + /// assert_eq!(s, b"foobar"); + /// ``` + #[inline] + fn remove_char(&mut self, at: usize) -> char { + let (ch, size) = utf8::decode_lossy(&self.as_vec()[at..]); + assert!( + size > 0, + "expected {} to be less than {}", + at, + self.as_vec().len(), + ); + self.as_vec_mut().drain(at..at + size); + ch + } + + /// Inserts the given codepoint into this `Vec` at a particular byte + /// position. 
+ /// + /// This is an `O(n)` operation as it may copy a number of elements in this + /// byte string proportional to its length. + /// + /// # Panics + /// + /// Panics if `at` is larger than the byte string's length. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::ByteVec; + /// + /// let mut s = Vec::from("foobar"); + /// s.insert_char(3, '☃'); + /// assert_eq!(s, "foo☃bar".as_bytes()); + /// ``` + #[inline] + fn insert_char(&mut self, at: usize, ch: char) { + self.insert_str(at, ch.encode_utf8(&mut [0; 4]).as_bytes()); + } + + /// Inserts the given byte string into this byte string at a particular + /// byte position. + /// + /// This is an `O(n)` operation as it may copy a number of elements in this + /// byte string proportional to its length. + /// + /// The given byte string may be any type that can be cheaply converted + /// into a `&[u8]`. This includes, but is not limited to, `&str` and + /// `&[u8]`. + /// + /// # Panics + /// + /// Panics if `at` is larger than the byte string's length. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::ByteVec; + /// + /// let mut s = Vec::from("foobar"); + /// s.insert_str(3, "☃☃☃"); + /// assert_eq!(s, "foo☃☃☃bar".as_bytes()); + /// ``` + #[inline] + fn insert_str>(&mut self, at: usize, bytes: B) { + let bytes = bytes.as_ref(); + let len = self.as_vec().len(); + assert!(at <= len, "expected {} to be <= {}", at, len); + + // SAFETY: We'd like to efficiently splice in the given bytes into + // this byte string. Since we are only working with `u8` elements here, + // we only need to consider whether our bounds are correct and whether + // our byte string has enough space. + self.as_vec_mut().reserve(bytes.len()); + unsafe { + // Shift bytes after `at` over by the length of `bytes` to make + // room for it. This requires referencing two regions of memory + // that may overlap, so we use ptr::copy. 
+ ptr::copy( + self.as_vec().as_ptr().add(at), + self.as_vec_mut().as_mut_ptr().add(at + bytes.len()), + len - at, + ); + // Now copy the bytes given into the room we made above. In this + // case, we know that the given bytes cannot possibly overlap + // with this byte string since we have a mutable borrow of the + // latter. Thus, we can use a nonoverlapping copy. + ptr::copy_nonoverlapping( + bytes.as_ptr(), + self.as_vec_mut().as_mut_ptr().add(at), + bytes.len(), + ); + self.as_vec_mut().set_len(len + bytes.len()); + } + } + + /// Removes the specified range in this byte string and replaces it with + /// the given bytes. The given bytes do not need to have the same length + /// as the range provided. + /// + /// # Panics + /// + /// Panics if the given range is invalid. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::ByteVec; + /// + /// let mut s = Vec::from("foobar"); + /// s.replace_range(2..4, "xxxxx"); + /// assert_eq!(s, "foxxxxxar".as_bytes()); + /// ``` + #[inline] + fn replace_range( + &mut self, + range: R, + replace_with: B, + ) where R: ops::RangeBounds, + B: AsRef<[u8]> + { + self.as_vec_mut().splice(range, replace_with.as_ref().iter().cloned()); + } + + /// Creates a draining iterator that removes the specified range in this + /// `Vec` and yields each of the removed bytes. + /// + /// Note that the elements specified by the given range are removed + /// regardless of whether the returned iterator is fully exhausted. + /// + /// Also note that it is unspecified how many bytes are removed from the + /// `Vec` if the `DrainBytes` iterator is leaked. + /// + /// # Panics + /// + /// Panics if the given range is not valid. 
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::ByteVec; + /// + /// let mut s = Vec::from("foobar"); + /// { + /// let mut drainer = s.drain_bytes(2..4); + /// assert_eq!(drainer.next(), Some(b'o')); + /// assert_eq!(drainer.next(), Some(b'b')); + /// assert_eq!(drainer.next(), None); + /// } + /// assert_eq!(s, "foar".as_bytes()); + /// ``` + #[inline] + fn drain_bytes( + &mut self, + range: R, + ) -> DrainBytes + where R: ops::RangeBounds + { + DrainBytes { it: self.as_vec_mut().drain(range) } + } +} + +/// A draining byte oriented iterator for `Vec`. +/// +/// This iterator is created by +/// [`ByteVec::drain_bytes`](trait.ByteVec.html#method.drain_bytes). +/// +/// # Examples +/// +/// Basic usage: +/// +/// ``` +/// use bstr::ByteVec; +/// +/// let mut s = Vec::from("foobar"); +/// { +/// let mut drainer = s.drain_bytes(2..4); +/// assert_eq!(drainer.next(), Some(b'o')); +/// assert_eq!(drainer.next(), Some(b'b')); +/// assert_eq!(drainer.next(), None); +/// } +/// assert_eq!(s, "foar".as_bytes()); +/// ``` +#[derive(Debug)] +pub struct DrainBytes<'a> { + it: vec::Drain<'a, u8>, +} + +impl<'a> iter::FusedIterator for DrainBytes<'a> {} + +impl<'a> Iterator for DrainBytes<'a> { + type Item = u8; + + #[inline] + fn next(&mut self) -> Option { + self.it.next() + } +} + +impl<'a> DoubleEndedIterator for DrainBytes<'a> { + #[inline] + fn next_back(&mut self) -> Option { + self.it.next_back() + } +} + +impl<'a> ExactSizeIterator for DrainBytes<'a> { + #[inline] + fn len(&self) -> usize { + self.it.len() + } +} + +/// An error that may occur when converting a `Vec` to a `String`. +/// +/// This error includes the original `Vec` that failed to convert to a +/// `String`. This permits callers to recover the allocation used even if it +/// is not valid UTF-8. 
+/// +/// # Examples +/// +/// Basic usage: +/// +/// ``` +/// use bstr::{B, ByteVec}; +/// +/// let bytes = Vec::from_slice(b"foo\xFFbar"); +/// let err = bytes.into_string().unwrap_err(); +/// +/// assert_eq!(err.utf8_error().valid_up_to(), 3); +/// assert_eq!(err.utf8_error().error_len(), Some(1)); +/// +/// // At no point in this example is an allocation performed. +/// let bytes = Vec::from(err.into_vec()); +/// assert_eq!(bytes, B(b"foo\xFFbar")); +/// ``` +#[derive(Debug, Eq, PartialEq)] +pub struct FromUtf8Error { + original: Vec, + err: Utf8Error, +} + +impl FromUtf8Error { + /// Return the original bytes as a slice that failed to convert to a + /// `String`. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::{B, ByteVec}; + /// + /// let bytes = Vec::from_slice(b"foo\xFFbar"); + /// let err = bytes.into_string().unwrap_err(); + /// + /// // At no point in this example is an allocation performed. + /// assert_eq!(err.as_bytes(), B(b"foo\xFFbar")); + /// ``` + #[inline] + pub fn as_bytes(&self) -> &[u8] { + &self.original + } + + /// Consume this error and return the original byte string that failed to + /// convert to a `String`. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::{B, ByteVec}; + /// + /// let bytes = Vec::from_slice(b"foo\xFFbar"); + /// let err = bytes.into_string().unwrap_err(); + /// let original = err.into_vec(); + /// + /// // At no point in this example is an allocation performed. + /// assert_eq!(original, B(b"foo\xFFbar")); + /// ``` + #[inline] + pub fn into_vec(self) -> Vec { + self.original + } + + /// Return the underlying UTF-8 error that occurred. This error provides + /// information on the nature and location of the invalid UTF-8 detected. 
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::{B, ByteVec}; + /// + /// let bytes = Vec::from_slice(b"foo\xFFbar"); + /// let err = bytes.into_string().unwrap_err(); + /// + /// assert_eq!(err.utf8_error().valid_up_to(), 3); + /// assert_eq!(err.utf8_error().error_len(), Some(1)); + /// ``` + #[inline] + pub fn utf8_error(&self) -> &Utf8Error { + &self.err + } +} + +impl error::Error for FromUtf8Error { + #[inline] + fn description(&self) -> &str { "invalid UTF-8 vector" } +} + +impl fmt::Display for FromUtf8Error { + #[inline] + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", self.err) + } +} + +#[cfg(test)] +mod tests { + use ext_slice::B; + use ext_vec::ByteVec; + + #[test] + fn insert() { + let mut s = vec![]; + s.insert_str(0, "foo"); + assert_eq!(s, "foo".as_bytes()); + + let mut s = Vec::from("a"); + s.insert_str(0, "foo"); + assert_eq!(s, "fooa".as_bytes()); + + let mut s = Vec::from("a"); + s.insert_str(1, "foo"); + assert_eq!(s, "afoo".as_bytes()); + + let mut s = Vec::from("foobar"); + s.insert_str(3, "quux"); + assert_eq!(s, "fooquuxbar".as_bytes()); + + let mut s = Vec::from("foobar"); + s.insert_str(3, "x"); + assert_eq!(s, "fooxbar".as_bytes()); + + let mut s = Vec::from("foobar"); + s.insert_str(0, "x"); + assert_eq!(s, "xfoobar".as_bytes()); + + let mut s = Vec::from("foobar"); + s.insert_str(6, "x"); + assert_eq!(s, "foobarx".as_bytes()); + + let mut s = Vec::from("foobar"); + s.insert_str(3, "quuxbazquux"); + assert_eq!(s, "fooquuxbazquuxbar".as_bytes()); + } + + #[test] + #[should_panic] + fn insert_fail1() { + let mut s = vec![]; + s.insert_str(1, "foo"); + } + + #[test] + #[should_panic] + fn insert_fail2() { + let mut s = Vec::from("a"); + s.insert_str(2, "foo"); + } + + #[test] + #[should_panic] + fn insert_fail3() { + let mut s = Vec::from("foobar"); + s.insert_str(7, "foo"); + } +} diff --git a/src/freqs.rs b/src/freqs.rs deleted file mode 100644 index bad6aaf..0000000 --- 
a/src/freqs.rs +++ /dev/null @@ -1,258 +0,0 @@ -pub const BYTE_FREQUENCIES: [u8; 256] = [ - 55, // '\x00' - 52, // '\x01' - 51, // '\x02' - 50, // '\x03' - 49, // '\x04' - 48, // '\x05' - 47, // '\x06' - 46, // '\x07' - 45, // '\x08' - 103, // '\t' - 242, // '\n' - 66, // '\x0b' - 67, // '\x0c' - 229, // '\r' - 44, // '\x0e' - 43, // '\x0f' - 42, // '\x10' - 41, // '\x11' - 40, // '\x12' - 39, // '\x13' - 38, // '\x14' - 37, // '\x15' - 36, // '\x16' - 35, // '\x17' - 34, // '\x18' - 33, // '\x19' - 56, // '\x1a' - 32, // '\x1b' - 31, // '\x1c' - 30, // '\x1d' - 29, // '\x1e' - 28, // '\x1f' - 255, // ' ' - 148, // '!' - 164, // '"' - 149, // '#' - 136, // '$' - 160, // '%' - 155, // '&' - 173, // "'" - 221, // '(' - 222, // ')' - 134, // '*' - 122, // '+' - 232, // ',' - 202, // '-' - 215, // '.' - 224, // '/' - 208, // '0' - 220, // '1' - 204, // '2' - 187, // '3' - 183, // '4' - 179, // '5' - 177, // '6' - 168, // '7' - 178, // '8' - 200, // '9' - 226, // ':' - 195, // ';' - 154, // '<' - 184, // '=' - 174, // '>' - 126, // '?' 
- 120, // '@' - 191, // 'A' - 157, // 'B' - 194, // 'C' - 170, // 'D' - 189, // 'E' - 162, // 'F' - 161, // 'G' - 150, // 'H' - 193, // 'I' - 142, // 'J' - 137, // 'K' - 171, // 'L' - 176, // 'M' - 185, // 'N' - 167, // 'O' - 186, // 'P' - 112, // 'Q' - 175, // 'R' - 192, // 'S' - 188, // 'T' - 156, // 'U' - 140, // 'V' - 143, // 'W' - 123, // 'X' - 133, // 'Y' - 128, // 'Z' - 147, // '[' - 138, // '\\' - 146, // ']' - 114, // '^' - 223, // '_' - 151, // '`' - 249, // 'a' - 216, // 'b' - 238, // 'c' - 236, // 'd' - 253, // 'e' - 227, // 'f' - 218, // 'g' - 230, // 'h' - 247, // 'i' - 135, // 'j' - 180, // 'k' - 241, // 'l' - 233, // 'm' - 246, // 'n' - 244, // 'o' - 231, // 'p' - 139, // 'q' - 245, // 'r' - 243, // 's' - 251, // 't' - 235, // 'u' - 201, // 'v' - 196, // 'w' - 240, // 'x' - 214, // 'y' - 152, // 'z' - 182, // '{' - 205, // '|' - 181, // '}' - 127, // '~' - 27, // '\x7f' - 212, // '\x80' - 211, // '\x81' - 210, // '\x82' - 213, // '\x83' - 228, // '\x84' - 197, // '\x85' - 169, // '\x86' - 159, // '\x87' - 131, // '\x88' - 172, // '\x89' - 105, // '\x8a' - 80, // '\x8b' - 98, // '\x8c' - 96, // '\x8d' - 97, // '\x8e' - 81, // '\x8f' - 207, // '\x90' - 145, // '\x91' - 116, // '\x92' - 115, // '\x93' - 144, // '\x94' - 130, // '\x95' - 153, // '\x96' - 121, // '\x97' - 107, // '\x98' - 132, // '\x99' - 109, // '\x9a' - 110, // '\x9b' - 124, // '\x9c' - 111, // '\x9d' - 82, // '\x9e' - 108, // '\x9f' - 118, // '\xa0' - 141, // '¡' - 113, // '¢' - 129, // '£' - 119, // '¤' - 125, // '¥' - 165, // '¦' - 117, // '§' - 92, // '¨' - 106, // '©' - 83, // 'ª' - 72, // '«' - 99, // '¬' - 93, // '\xad' - 65, // '®' - 79, // '¯' - 166, // '°' - 237, // '±' - 163, // '²' - 199, // '³' - 190, // '´' - 225, // 'µ' - 209, // '¶' - 203, // '·' - 198, // '¸' - 217, // '¹' - 219, // 'º' - 206, // '»' - 234, // '¼' - 248, // '½' - 158, // '¾' - 239, // '¿' - 255, // 'À' - 255, // 'Á' - 255, // 'Â' - 255, // 'Ã' - 255, // 'Ä' - 255, // 'Å' - 255, // 'Æ' - 255, // 'Ç' - 
255, // 'È' - 255, // 'É' - 255, // 'Ê' - 255, // 'Ë' - 255, // 'Ì' - 255, // 'Í' - 255, // 'Î' - 255, // 'Ï' - 255, // 'Ð' - 255, // 'Ñ' - 255, // 'Ò' - 255, // 'Ó' - 255, // 'Ô' - 255, // 'Õ' - 255, // 'Ö' - 255, // '×' - 255, // 'Ø' - 255, // 'Ù' - 255, // 'Ú' - 255, // 'Û' - 255, // 'Ü' - 255, // 'Ý' - 255, // 'Þ' - 255, // 'ß' - 255, // 'à' - 255, // 'á' - 255, // 'â' - 255, // 'ã' - 255, // 'ä' - 255, // 'å' - 255, // 'æ' - 255, // 'ç' - 255, // 'è' - 255, // 'é' - 255, // 'ê' - 255, // 'ë' - 255, // 'ì' - 255, // 'í' - 255, // 'î' - 255, // 'ï' - 255, // 'ð' - 255, // 'ñ' - 255, // 'ò' - 255, // 'ó' - 255, // 'ô' - 255, // 'õ' - 255, // 'ö' - 255, // '÷' - 255, // 'ø' - 255, // 'ù' - 255, // 'ú' - 255, // 'û' - 255, // 'ü' - 255, // 'ý' - 255, // 'þ' - 255, // 'ÿ' -]; diff --git a/src/impls.rs b/src/impls.rs index 614139b..0b51bdd 100644 --- a/src/impls.rs +++ b/src/impls.rs @@ -68,6 +68,7 @@ mod bstring { use bstr::BStr; use bstring::BString; + use ext_vec::ByteVec; impl fmt::Display for BString { #[inline] @@ -84,25 +85,25 @@ mod bstring { } impl ops::Deref for BString { - type Target = BStr; + type Target = Vec; #[inline] - fn deref(&self) -> &BStr { - self.as_bstr() + fn deref(&self) -> &Vec { + &self.bytes } } impl ops::DerefMut for BString { #[inline] - fn deref_mut(&mut self) -> &mut BStr { - self.as_mut_bstr() + fn deref_mut(&mut self) -> &mut Vec { + &mut self.bytes } } impl AsRef<[u8]> for BString { #[inline] fn as_ref(&self) -> &[u8] { - self.as_bytes() + &self.bytes } } @@ -116,7 +117,7 @@ mod bstring { impl AsMut<[u8]> for BString { #[inline] fn as_mut(&mut self) -> &mut [u8] { - self.as_bytes_mut() + &mut self.bytes } } @@ -139,49 +140,49 @@ mod bstring { #[inline] fn to_owned(&self) -> BString { - self.to_bstring() + BString::from(self) } } impl<'a> From<&'a [u8]> for BString { #[inline] fn from(s: &'a [u8]) -> BString { - BString::from_vec(s.to_vec()) + BString::from(s.to_vec()) } } impl From> for BString { #[inline] fn from(s: Vec) -> 
BString { - BString::from_vec(s) + BString { bytes: s } } } impl From for Vec { #[inline] fn from(s: BString) -> Vec { - s.into_vec() + s.bytes } } impl<'a> From<&'a str> for BString { #[inline] fn from(s: &'a str) -> BString { - BString::from_vec(s.as_bytes().to_vec()) + BString::from(s.as_bytes().to_vec()) } } impl From for BString { #[inline] fn from(s: String) -> BString { - BString::from_vec(s.into_bytes()) + BString::from(s.into_bytes()) } } impl<'a> From<&'a BStr> for BString { #[inline] fn from(s: &'a BStr) -> BString { - s.to_bstring() + BString::from(s.bytes.to_vec()) } } @@ -209,44 +210,44 @@ mod bstring { impl<'a> FromIterator<&'a str> for BString { #[inline] fn from_iter>(iter: T) -> BString { - let mut buf = BString::new(); + let mut buf = vec![]; for b in iter { - buf.push(b); + buf.push_str(b); } - buf + BString::from(buf) } } impl<'a> FromIterator<&'a [u8]> for BString { #[inline] fn from_iter>(iter: T) -> BString { - let mut buf = BString::new(); + let mut buf = vec![]; for b in iter { - buf.push(b); + buf.push_str(b); } - buf + BString::from(buf) } } impl<'a> FromIterator<&'a BStr> for BString { #[inline] fn from_iter>(iter: T) -> BString { - let mut buf = BString::new(); + let mut buf = vec![]; for b in iter { - buf.push(b); + buf.push_str(b); } - buf + BString::from(buf) } } impl FromIterator for BString { #[inline] fn from_iter>(iter: T) -> BString { - let mut buf = BString::new(); + let mut buf = vec![]; for b in iter { - buf.push(b); + buf.push_str(b); } - buf + BString::from(buf) } } @@ -271,7 +272,7 @@ mod bstring { impl PartialOrd for BString { #[inline] fn partial_cmp(&self, other: &BString) -> Option { - PartialOrd::partial_cmp(self.as_bytes(), other.as_bytes()) + PartialOrd::partial_cmp(&self.bytes, &other.bytes) } } @@ -301,6 +302,7 @@ mod bstr { use core::ops; use bstr::BStr; + use ext_slice::ByteSlice; impl fmt::Display for BStr { #[inline] @@ -333,6 +335,22 @@ mod bstr { } } + impl ops::Deref for BStr { + type Target = [u8]; + + 
#[inline] + fn deref(&self) -> &[u8] { + &self.bytes + } + } + + impl ops::DerefMut for BStr { + #[inline] + fn deref_mut(&mut self) -> &mut [u8] { + &mut self.bytes + } + } + impl ops::Index for BStr { type Output = u8; @@ -399,7 +417,7 @@ mod bstr { impl ops::IndexMut for BStr { #[inline] fn index_mut(&mut self, idx: usize) -> &mut u8 { - &mut self.as_bytes_mut()[idx] + &mut self.bytes[idx] } } @@ -413,35 +431,35 @@ mod bstr { impl ops::IndexMut> for BStr { #[inline] fn index_mut(&mut self, r: ops::Range) -> &mut BStr { - BStr::from_bytes_mut(&mut self.as_bytes_mut()[r.start..r.end]) + BStr::from_bytes_mut(&mut self.bytes[r.start..r.end]) } } impl ops::IndexMut> for BStr { #[inline] fn index_mut(&mut self, r: ops::RangeInclusive) -> &mut BStr { - BStr::from_bytes_mut(&mut self.as_bytes_mut()[*r.start()..=*r.end()]) + BStr::from_bytes_mut(&mut self.bytes[*r.start()..=*r.end()]) } } impl ops::IndexMut> for BStr { #[inline] fn index_mut(&mut self, r: ops::RangeFrom) -> &mut BStr { - BStr::from_bytes_mut(&mut self.as_bytes_mut()[r.start..]) + BStr::from_bytes_mut(&mut self.bytes[r.start..]) } } impl ops::IndexMut> for BStr { #[inline] fn index_mut(&mut self, r: ops::RangeTo) -> &mut BStr { - BStr::from_bytes_mut(&mut self.as_bytes_mut()[..r.end]) + BStr::from_bytes_mut(&mut self.bytes[..r.end]) } } impl ops::IndexMut> for BStr { #[inline] fn index_mut(&mut self, r: ops::RangeToInclusive) -> &mut BStr { - BStr::from_bytes_mut(&mut self.as_bytes_mut()[..=r.end]) + BStr::from_bytes_mut(&mut self.bytes[..=r.end]) } } @@ -469,7 +487,7 @@ mod bstr { impl AsMut<[u8]> for BStr { #[inline] fn as_mut(&mut self) -> &mut [u8] { - self.as_bytes_mut() + &mut self.bytes } } @@ -724,7 +742,7 @@ mod bstring_arbitrary { } fn shrink(&self) -> Box> { - Box::new(self.as_vec().shrink().map(BString::from)) + Box::new(self.bytes.shrink().map(BString::from)) } } } diff --git a/src/io.rs b/src/io.rs index 6937be2..15ab7e7 100644 --- a/src/io.rs +++ b/src/io.rs @@ -9,8 +9,8 @@ More APIs may be 
added in the future. use std::io; -use bstr::BStr; -use bstring::BString; +use ext_slice::ByteSlice; +use ext_vec::ByteVec; /// An extention trait for /// [`std::io::BufRead`](https://doc.rust-lang.org/std/io/trait.BufRead.html) @@ -19,7 +19,7 @@ pub trait BufReadExt: io::BufRead { /// Returns an iterator over the lines of this reader, where each line /// is represented as a byte string. /// - /// Each item yielded by this iterator is a `io::Result`, where + /// Each item yielded by this iterator is a `io::Result>`, where /// an error is yielded if there was a problem reading from the underlying /// reader. /// @@ -44,9 +44,9 @@ pub trait BufReadExt: io::BufRead { /// lines.push(line); /// } /// assert_eq!(lines.len(), 3); - /// assert_eq!(lines[0], "lorem"); - /// assert_eq!(lines[1], "ipsum"); - /// assert_eq!(lines[2], "dolor"); + /// assert_eq!(lines[0], "lorem".as_bytes()); + /// assert_eq!(lines[1], "ipsum".as_bytes()); + /// assert_eq!(lines[2], "dolor".as_bytes()); /// # Ok(()) }; example().unwrap() /// ``` fn byte_lines(self) -> ByteLines where Self: Sized { @@ -80,13 +80,13 @@ pub trait BufReadExt: io::BufRead { /// /// let mut lines = vec![]; /// cursor.for_byte_line(|line| { - /// lines.push(line.to_bstring()); + /// lines.push(line.to_vec()); /// Ok(true) /// })?; /// assert_eq!(lines.len(), 3); - /// assert_eq!(lines[0], "lorem"); - /// assert_eq!(lines[1], "ipsum"); - /// assert_eq!(lines[2], "dolor"); + /// assert_eq!(lines[0], "lorem".as_bytes()); + /// assert_eq!(lines[1], "ipsum".as_bytes()); + /// assert_eq!(lines[2], "dolor".as_bytes()); /// # Ok(()) }; example().unwrap() /// ``` fn for_byte_line( @@ -94,10 +94,10 @@ pub trait BufReadExt: io::BufRead { mut for_each_line: F, ) -> io::Result<()> where Self: Sized, - F: FnMut(&BStr) -> io::Result + F: FnMut(&[u8]) -> io::Result { - let mut bytes = BString::new(); - while self.read_until(b'\n', bytes.as_mut_vec())? > 0 { + let mut bytes = vec![]; + while self.read_until(b'\n', &mut bytes)? 
> 0 { trim_line(&mut bytes); if !for_each_line(&bytes)? { break; @@ -135,13 +135,13 @@ pub trait BufReadExt: io::BufRead { /// /// let mut lines = vec![]; /// cursor.for_byte_line_with_terminator(|line| { - /// lines.push(line.to_bstring()); + /// lines.push(line.to_vec()); /// Ok(true) /// })?; /// assert_eq!(lines.len(), 3); - /// assert_eq!(lines[0], "lorem\n"); - /// assert_eq!(lines[1], "ipsum\r\n"); - /// assert_eq!(lines[2], "dolor"); + /// assert_eq!(lines[0], "lorem\n".as_bytes()); + /// assert_eq!(lines[1], "ipsum\r\n".as_bytes()); + /// assert_eq!(lines[2], "dolor".as_bytes()); /// # Ok(()) }; example().unwrap() /// ``` fn for_byte_line_with_terminator( @@ -149,10 +149,10 @@ pub trait BufReadExt: io::BufRead { mut for_each_line: F, ) -> io::Result<()> where Self: Sized, - F: FnMut(&BStr) -> io::Result + F: FnMut(&[u8]) -> io::Result { - let mut bytes = BString::new(); - while self.read_until(b'\n', bytes.as_mut_vec())? > 0 { + let mut bytes = vec![]; + while self.read_until(b'\n', &mut bytes)? > 0 { if !for_each_line(&bytes)? { break; } @@ -178,11 +178,11 @@ pub struct ByteLines { } impl Iterator for ByteLines { - type Item = io::Result; + type Item = io::Result>; - fn next(&mut self) -> Option> { - let mut bytes = BString::new(); - match self.buf.read_until(b'\n', bytes.as_mut_vec()) { + fn next(&mut self) -> Option>> { + let mut bytes = vec![]; + match self.buf.read_until(b'\n', &mut bytes) { Err(e) => Some(Err(e)), Ok(0) => None, Ok(_) => { @@ -193,10 +193,10 @@ impl Iterator for ByteLines { } } -fn trim_line(line: &mut BString) { - if line.last() == Some(b'\n') { +fn trim_line(line: &mut Vec) { + if line.last_byte() == Some(b'\n') { line.pop_byte(); - if line.last() == Some(b'\r') { + if line.last_byte() == Some(b'\r') { line.pop_byte(); } } diff --git a/src/lib.rs b/src/lib.rs index 89b6219..e4f45ed 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,32 +4,42 @@ An experimental byte string library. 
Byte strings are just like standard Unicode strings with one very important difference: byte strings are only *conventionally* UTF-8 while Rust's standard Unicode strings are *guaranteed* to be valid UTF-8. The primary motivation for -this type is for handling arbitrary bytes that are mostly UTF-8. +byte strings is for handling arbitrary bytes that are mostly UTF-8. # Overview -There are two primary types in this crate: +This crate provides two important traits that provide string oriented methods +on `&[u8]` and `Vec` types: +* [`ByteSlice`](trait.ByteSlice.html) extends the `[u8]` type with additional + string oriented methods. +* [`ByteVec`](trait.ByteVec.html) extends the `Vec` type with additional + string oriented methods. + +Additionally, this crate provides two concrete byte string types that deref to +`[u8]` and `Vec`. These are useful for storing byte string types, and come +with convenient `std::fmt::Debug` implementations: + +* [`BStr`](struct.BStr.html) is a byte string slice, analogous to `str`. * [`BString`](struct.BString.html) is an owned growable byte string buffer, analogous to `String`. -* [`BStr`](struct.BStr.html) is a byte string slice, analogous to `str`. Additionally, the free function [`B`](fn.B.html) serves as a convenient short hand for writing byte string literals. # Quick examples -Byte strings are effectively the same thing as a `Vec` or a `&[u8]`, except -they provide a string oriented API. Operations such as iterating over +Byte strings build on the existing APIs for `Vec` and `&[u8]`, with +additional string oriented methods. Operations such as iterating over graphemes, searching for substrings, replacing substrings, trimming and case -conversion are examples of things not provided on the standard `&[u8]` APIs -but are provided by this crate. For example, this code iterates over all of -occurrences of a subtring: +conversion are examples of things not provided on the standard library `&[u8]` +APIs but are provided by this crate. 
For example, this code iterates over all +of the occurrences of a substring: ``` -use bstr::B; +use bstr::ByteSlice; -let s = B("foo bar foo foo quux foo"); +let s = b"foo bar foo foo quux foo"; let mut matches = vec![]; for start in s.find_iter("foo") { @@ -38,28 +48,54 @@ for start in s.find_iter("foo") { assert_eq!(matches, [0, 8, 12, 21]); ``` -Here's another example showing how to do a search and replace: +Here's another example showing how to do a search and replace (and also showing +use of the `B` function): ``` -use bstr::B; +use bstr::{B, ByteSlice}; -let old = B("foo bar foo foo quux foo"); +let old = B("foo ☃☃☃ foo foo quux foo"); let new = old.replace("foo", "hello"); -assert_eq!(new, "hello bar hello hello quux hello"); +assert_eq!(new, B("hello ☃☃☃ hello hello quux hello")); ``` And here's an example that shows case conversion, even in the presence of invalid UTF-8: ``` -use bstr::{B, BString}; +use bstr::{ByteSlice, ByteVec}; -let mut lower = BString::from("hello β"); +let mut lower = Vec::from("hello β"); lower[0] = b'\xFF'; // lowercase β is uppercased to Β -assert_eq!(lower.to_uppercase(), B(b"\xFFELLO \xCE\x92")); +assert_eq!(lower.to_uppercase(), b"\xFFELLO \xCE\x92"); ``` +# Convenient debug representation + +When working with byte strings, it is often useful to be able to print them +as if they were byte strings and not sequences of integers. While this crate +cannot affect the `std::fmt::Debug` implementations for `[u8]` and `Vec`, +this crate does provide the `BStr` and `BString` types which have convenient +`std::fmt::Debug` implementations. + +For example, this + +``` +use bstr::ByteSlice; + +let mut bytes = Vec::from("hello β"); +bytes[0] = b'\xFF'; + +println!("{:?}", bytes.as_bstr()); +``` + +will output `"\xFFello β"`. + +This example works because the +[`ByteSlice::as_bstr`](trait.ByteSlice.html#method.as_bstr) +method converts any `&[u8]` to a `&BStr`. + # When should I use byte strings? 
This library is somewhat of an experiment that reflects my hypothesis that @@ -106,33 +142,36 @@ useful they are more broadly isn't clear yet. Since this library is still experimental, you should not use it in the public API of your crates until it hits `1.0` (unless you're OK with with tracking -breaking releases of `bstr`). It is a priority to move this crate to `1.0` -expediently so that `BString` and `BStr` may be used in the public APIs of -other crates. While both `BString` and `BStr` do provide zero cost ways of -converting between `Vec` and `&[u8]`, it is often convenient to provide -trait implementations for `BString` and `BStr`, which requires making `bstr` a -public dependency. +breaking releases of `bstr`). + +In general, it should be possible to avoid putting anything in this crate into +your public APIs. Namely, you should never need to use the `ByteSlice` or +`ByteVec` traits as bounds on public APIs, since their only purpose is to +extend the methods on the concrete types `[u8]` and `Vec`, respectively. +Similarly, it should not be necessary to put either the `BStr` or `BString` +types into public APIs. If you want to use them internally, then they can +be converted to/from `[u8]`/`Vec` as needed. # Differences with standard strings -The primary difference between `BStr` and `str` is that the former is +The primary difference between `[u8]` and `str` is that the former is conventionally UTF-8 while the latter is guaranteed to be UTF-8. The phrase -"conventionally UTF-8" means that a `BStr` may contain bytes that do not form -a valid UTF-8 sequence, but operations defined on the type are generally most -useful on valid UTF-8 sequences. For example, iterating over Unicode codepoints -or grapheme clusters is an operation that is only defined on valid UTF-8. -Therefore, when invalid UTF-8 is encountered, the Unicode replacement codepoint -is substituted. Thus, a byte string that is not UTF-8 at all is of limited -utility when using these methods. 
+"conventionally UTF-8" means that a `[u8]` may contain bytes that do not form +a valid UTF-8 sequence, but operations defined on the type in this crate are +generally most useful on valid UTF-8 sequences. For example, iterating over +Unicode codepoints or grapheme clusters is an operation that is only defined +on valid UTF-8. Therefore, when invalid UTF-8 is encountered, the Unicode +replacement codepoint is substituted. Thus, a byte string that is not UTF-8 at +all is of limited utility when using this crate. However, not all operations on byte strings are specifically Unicode aware. For example, substring search has no specific Unicode semantics ascribed to it. It works just as well for byte strings that are completely valid UTF-8 as for byte strings that contain no valid UTF-8 at all. Similarly for replacements and -various other operations. +various other operations that do not need any Unicode specific tailoring. -Aside from the difference in how UTF-8 is handled, the APIs between `BStr` and -`str` (and `BString` and `String`) are intentionally very similar, including +Aside from the difference in how UTF-8 is handled, the APIs between `[u8]` and +`str` (and `Vec` and `String`) are intentionally very similar, including maintaining the same behavior for corner cases in things like substring splitting. There are, however, some differences: @@ -155,9 +194,16 @@ splitting. There are, however, some differences: in this crate, as is consistent with treating byte strings as a sequence of bytes. This means callers are responsible for maintaining a UTF-8 invariant if that's important. +* Some routines provided by this crate, such as `starts_with_str`, have a + `_str` suffix to differentiate them from similar routines already defined + on the `[u8]` type. The difference is that `starts_with` requires its + parameter to be a `&[u8]`, whereas `starts_with_str` permits its parameter + to be anything that implements `AsRef<[u8]>`, which is more flexible. 
This + means you can write `bytes.starts_with_str("☃")` instead of + `bytes.starts_with("☃".as_bytes())`. Otherwise, you should find most of the APIs between this crate and the standard -library to be very similar, if not identical. +library string APIs to be very similar, if not identical. # Handling of invalid UTF-8 @@ -176,9 +222,9 @@ codepoint, `U+FFFD`, which looks like this: `�`. For example, an replacement codepoint whenever it comes across bytes that are not valid UTF-8: ``` -use bstr::B; +use bstr::ByteSlice; -let bs = B(b"a\xFF\xFFz"); +let bs = b"a\xFF\xFFz"; let chars: Vec = bs.chars().collect(); assert_eq!(vec!['a', '\u{FFFD}', '\u{FFFD}', 'z'], chars); ``` @@ -196,9 +242,9 @@ sequence, then all of those bytes (up to 3) are substituted with a single replacement codepoint. For example: ``` -use bstr::B; +use bstr::ByteSlice; -let bs = B(b"a\xF0\x9F\x87z"); +let bs = b"a\xF0\x9F\x87z"; let chars: Vec = bs.chars().collect(); // The bytes \xF0\x9F\x87 could lead to a valid UTF-8 sequence, but 3 of them // on their own are invalid. Only one replacement codepoint is substituted, @@ -212,9 +258,9 @@ the byte offsets containing the invalid UTF-8 bytes that were substituted with the replacement codepoint. 
For example: ``` -use bstr::{B, BStr}; +use bstr::{B, ByteSlice}; -let bs = B(b"a\xE2\x98z"); +let bs = b"a\xE2\x98z"; let chars: Vec<(usize, usize, char)> = bs.char_indices().collect(); // Even though the replacement codepoint is encoded as 3 bytes itself, the // byte range given here is only two bytes, corresponding to the original @@ -223,7 +269,7 @@ assert_eq!(vec![(0, 1, 'a'), (1, 3, '\u{FFFD}'), (3, 4, 'z')], chars); // Thus, getting the original raw bytes is as simple as slicing the original // byte string: -let chars: Vec<&BStr> = bs.char_indices().map(|(s, e, _)| &bs[s..e]).collect(); +let chars: Vec<&[u8]> = bs.char_indices().map(|(s, e, _)| &bs[s..e]).collect(); assert_eq!(vec![B("a"), B(b"\xE2\x98"), B("z")], chars); ``` @@ -281,25 +327,25 @@ they can do: While this library may provide facilities for (1) in the future, currently, this library only provides facilities for (2) and (3). In particular, a suite of conversion functions are provided that permit converting between byte -strings, OS strings and file paths. For owned `BString`s, they are: +strings, OS strings and file paths. 
For owned byte strings, they are: -* [`BString::from_os_string`](struct.BString.html#method.from_os_string) -* [`BString::from_os_str_lossy`](struct.BString.html#method.from_os_str_lossy) -* [`BString::from_path_buf`](struct.BString.html#method.from_path_buf) -* [`BString::from_path_lossy`](struct.BString.html#method.from_path_lossy) -* [`BString::into_os_string`](struct.BString.html#method.into_os_string) -* [`BString::into_os_string_lossy`](struct.BString.html#method.into_os_string_lossy) -* [`BString::into_path_buf`](struct.BString.html#method.into_path_buf) -* [`BString::into_path_buf_lossy`](struct.BString.html#method.into_path_buf_lossy) +* [`ByteVec::from_os_string`](trait.ByteVec.html#method.from_os_string) +* [`ByteVec::from_os_str_lossy`](trait.ByteVec.html#method.from_os_str_lossy) +* [`ByteVec::from_path_buf`](trait.ByteVec.html#method.from_path_buf) +* [`ByteVec::from_path_lossy`](trait.ByteVec.html#method.from_path_lossy) +* [`ByteVec::into_os_string`](trait.ByteVec.html#method.into_os_string) +* [`ByteVec::into_os_string_lossy`](trait.ByteVec.html#method.into_os_string_lossy) +* [`ByteVec::into_path_buf`](trait.ByteVec.html#method.into_path_buf) +* [`ByteVec::into_path_buf_lossy`](trait.ByteVec.html#method.into_path_buf_lossy) For byte string slices, they are: -* [`BStr::from_os_str`](struct.BStr.html#method.from_os_str) -* [`BStr::from_path`](struct.BStr.html#method.from_path) -* [`BStr::to_os_str`](struct.BStr.html#method.to_os_str) -* [`BStr::to_os_str_lossy`](struct.BStr.html#method.to_os_str_lossy) -* [`BStr::to_path`](struct.BStr.html#method.to_path) -* [`BStr::to_path_lossy`](struct.BStr.html#method.to_path_lossy) +* [`ByteSlice::from_os_str`](trait.ByteSlice.html#method.from_os_str) +* [`ByteSlice::from_path`](trait.ByteSlice.html#method.from_path) +* [`ByteSlice::to_os_str`](trait.ByteSlice.html#method.to_os_str) +* [`ByteSlice::to_os_str_lossy`](trait.ByteSlice.html#method.to_os_str_lossy) +* 
[`ByteSlice::to_path`](trait.ByteSlice.html#method.to_path) +* [`ByteSlice::to_path_lossy`](trait.ByteSlice.html#method.to_path_lossy) On Unix, all of these conversions are rigorously zero cost, which gives one a way to ergonomically deal with raw file paths exactly as they are using @@ -321,6 +367,7 @@ Windows. */ #![cfg_attr(not(feature = "std"), no_std)] +#![allow(dead_code)] #[cfg(feature = "std")] extern crate core; @@ -339,8 +386,10 @@ extern crate serde; #[cfg(test)] extern crate ucd_parse; -pub use bstr::{ - B, BStr, +pub use bstr::BStr; +pub use bstring::BString; +pub use ext_slice::{ + B, ByteSlice, Bytes, Finder, FinderReverse, Find, FindReverse, Split, SplitReverse, SplitN, SplitNReverse, @@ -348,8 +397,7 @@ pub use bstr::{ Lines, LinesWithTerminator, }; #[cfg(feature = "std")] -pub use bstring::{BString, DrainBytes, FromUtf8Error, concat, join}; -pub use slice_index::SliceIndex; +pub use ext_vec::{ByteVec, DrainBytes, FromUtf8Error, concat, join}; #[cfg(feature = "unicode")] pub use unicode::{ Graphemes, GraphemeIndices, @@ -367,11 +415,12 @@ mod bstr; #[cfg(feature = "std")] mod bstring; mod cow; +mod ext_slice; +mod ext_vec; mod impls; #[cfg(feature = "std")] pub mod io; mod search; -mod slice_index; #[cfg(test)] mod tests; #[cfg(feature = "unicode")] @@ -380,7 +429,9 @@ mod utf8; #[cfg(test)] mod apitests { - use super::*; + use bstr::BStr; + use bstring::BString; + use ext_slice::{Finder, FinderReverse}; #[test] fn oibits() { diff --git a/src/search/prefilter.rs b/src/search/prefilter.rs index 4331542..0a0965e 100644 --- a/src/search/prefilter.rs +++ b/src/search/prefilter.rs @@ -1,6 +1,6 @@ use core::mem; -use bstr::BStr; +use ext_slice::ByteSlice; use search::byte_frequencies::BYTE_FREQUENCIES; /// PrefilterState tracks state associated with the effectiveness of a @@ -148,7 +148,7 @@ impl Freqy { } /// Return search info for the given needle in the forward direction. 
- pub fn forward(needle: &BStr) -> Freqy { + pub fn forward(needle: &[u8]) -> Freqy { if needle.is_empty() { return Freqy::inert(); } @@ -184,7 +184,7 @@ impl Freqy { } /// Return search info for the given needle in the reverse direction. - pub fn reverse(needle: &BStr) -> Freqy { + pub fn reverse(needle: &[u8]) -> Freqy { if needle.is_empty() { return Freqy::inert(); } @@ -233,7 +233,7 @@ impl Freqy { pub fn find_candidate( &self, prestate: &mut PrefilterState, - haystack: &BStr, + haystack: &[u8], ) -> Option { debug_assert!(!self.inert); @@ -289,7 +289,7 @@ impl Freqy { pub fn rfind_candidate( &self, prestate: &mut PrefilterState, - haystack: &BStr, + haystack: &[u8], ) -> Option { debug_assert!(!self.inert); @@ -341,7 +341,7 @@ impl Freqy { #[cfg(test)] mod tests { - use bstr::B; + use ext_slice::B; use super::*; #[test] diff --git a/src/search/tests.rs b/src/search/tests.rs index aa7f6b2..c6d647e 100644 --- a/src/search/tests.rs +++ b/src/search/tests.rs @@ -1,5 +1,3 @@ -use bstr::{B, BStr}; -use bstring::BString; use search::twoway::TwoWay; /// Each test is a (needle, haystack, expected_fwd, expected_rev) tuple. @@ -63,10 +61,10 @@ fn unit_twoway_rev() { /// needle in the haystack, or `None` if one doesn't exist. fn run_search_tests_fwd( name: &str, - mut search: impl FnMut(&BStr, &BStr) -> Option, + mut search: impl FnMut(&[u8], &[u8]) -> Option, ) { for &(needle, haystack, expected_fwd, _) in SEARCH_TESTS { - let (n, h) = (B(needle), B(haystack)); + let (n, h) = (needle.as_bytes(), haystack.as_bytes()); assert_eq!( expected_fwd, search(n, h), @@ -82,10 +80,10 @@ fn run_search_tests_fwd( /// needle in the haystack, or `None` if one doesn't exist. 
fn run_search_tests_rev( name: &str, - mut search: impl FnMut(&BStr, &BStr) -> Option, + mut search: impl FnMut(&[u8], &[u8]) -> Option, ) { for &(needle, haystack, _, expected_rev) in SEARCH_TESTS { - let (n, h) = (B(needle), B(haystack)); + let (n, h) = (needle.as_bytes(), haystack.as_bytes()); assert_eq!( expected_rev, search(n, h), @@ -96,25 +94,25 @@ fn run_search_tests_rev( } quickcheck! { - fn qc_twoway_fwd_prefix_is_substring(bs: BString) -> bool { + fn qc_twoway_fwd_prefix_is_substring(bs: Vec) -> bool { prop_prefix_is_substring(false, &bs, |n, h| TwoWay::forward(n).find(h)) } - fn qc_twoway_fwd_suffix_is_substring(bs: BString) -> bool { + fn qc_twoway_fwd_suffix_is_substring(bs: Vec) -> bool { prop_suffix_is_substring(false, &bs, |n, h| TwoWay::forward(n).find(h)) } - fn qc_twoway_rev_prefix_is_substring(bs: BString) -> bool { + fn qc_twoway_rev_prefix_is_substring(bs: Vec) -> bool { prop_prefix_is_substring(true, &bs, |n, h| TwoWay::reverse(n).rfind(h)) } - fn qc_twoway_rev_suffix_is_substring(bs: BString) -> bool { + fn qc_twoway_rev_suffix_is_substring(bs: Vec) -> bool { prop_suffix_is_substring(true, &bs, |n, h| TwoWay::reverse(n).rfind(h)) } fn qc_twoway_fwd_matches_naive( - needle: BString, - haystack: BString + needle: Vec, + haystack: Vec ) -> bool { prop_matches_naive( false, @@ -125,8 +123,8 @@ quickcheck! { } fn qc_twoway_rev_matches_naive( - needle: BString, - haystack: BString + needle: Vec, + haystack: Vec ) -> bool { prop_matches_naive( true, @@ -140,8 +138,8 @@ quickcheck! { /// Check that every prefix of the given byte string is a substring. fn prop_prefix_is_substring( reverse: bool, - bs: &BStr, - mut search: impl FnMut(&BStr, &BStr) -> Option, + bs: &[u8], + mut search: impl FnMut(&[u8], &[u8]) -> Option, ) -> bool { if bs.is_empty() { return true; @@ -160,8 +158,8 @@ fn prop_prefix_is_substring( /// Check that every suffix of the given byte string is a substring. 
fn prop_suffix_is_substring( reverse: bool, - bs: &BStr, - mut search: impl FnMut(&BStr, &BStr) -> Option, + bs: &[u8], + mut search: impl FnMut(&[u8], &[u8]) -> Option, ) -> bool { if bs.is_empty() { return true; @@ -181,9 +179,9 @@ fn prop_suffix_is_substring( /// algorithm. fn prop_matches_naive( reverse: bool, - needle: &BStr, - haystack: &BStr, - mut search: impl FnMut(&BStr, &BStr) -> Option, + needle: &[u8], + haystack: &[u8], + mut search: impl FnMut(&[u8], &[u8]) -> Option, ) -> bool { if reverse { naive_rfind(needle, haystack) == search(needle, haystack) @@ -193,7 +191,7 @@ fn prop_matches_naive( } /// Naively search forwards for the given needle in the given haystack. -fn naive_find(needle: &BStr, haystack: &BStr) -> Option { +fn naive_find(needle: &[u8], haystack: &[u8]) -> Option { if needle.is_empty() { return Some(0); } else if haystack.len() < needle.len() { @@ -208,7 +206,7 @@ fn naive_find(needle: &BStr, haystack: &BStr) -> Option { } /// Naively search in reverse for the given needle in the given haystack. -fn naive_rfind(needle: &BStr, haystack: &BStr) -> Option { +fn naive_rfind(needle: &[u8], haystack: &[u8]) -> Option { if needle.is_empty() { return Some(haystack.len()); } else if haystack.len() < needle.len() { diff --git a/src/search/twoway.rs b/src/search/twoway.rs index b3b45e3..8d98cab 100644 --- a/src/search/twoway.rs +++ b/src/search/twoway.rs @@ -1,7 +1,7 @@ use core::cmp; -use bstr::BStr; -use cow::CowBStr; +use cow::CowBytes; +use ext_slice::ByteSlice; use search::prefilter::{Freqy, PrefilterState}; /// An implementation of the TwoWay substring search algorithm, with heuristics @@ -30,7 +30,7 @@ use search::prefilter::{Freqy, PrefilterState}; #[derive(Clone, Debug)] pub struct TwoWay<'b> { /// The needle that we're looking for. - needle: CowBStr<'b>, + needle: CowBytes<'b>, /// An implementation of a fast skip loop based on hard-coded frequency /// data. This is only used when conditions are deemed favorable. 
freqy: Freqy, @@ -51,11 +51,11 @@ pub struct TwoWay<'b> { impl<'b> TwoWay<'b> { /// Create a searcher that uses the Two-Way algorithm by searching forwards /// through any haystack. - pub fn forward(needle: &'b BStr) -> TwoWay<'b> { + pub fn forward(needle: &'b [u8]) -> TwoWay<'b> { let freqy = Freqy::forward(needle); if needle.is_empty() { return TwoWay { - needle: CowBStr::new(needle), + needle: CowBytes::new(needle), freqy, critical_pos: 0, shift: Shift::Large { shift: 0 }, @@ -71,17 +71,17 @@ impl<'b> TwoWay<'b> { (max_suffix.period, max_suffix.pos) }; let shift = Shift::forward(needle, period_lower_bound, critical_pos); - let needle = CowBStr::new(needle); + let needle = CowBytes::new(needle); TwoWay { needle, freqy, critical_pos, shift } } /// Create a searcher that uses the Two-Way algorithm by searching in /// reverse through any haystack. - pub fn reverse(needle: &'b BStr) -> TwoWay<'b> { + pub fn reverse(needle: &'b [u8]) -> TwoWay<'b> { let freqy = Freqy::reverse(needle); if needle.is_empty() { return TwoWay { - needle: CowBStr::new(needle), + needle: CowBytes::new(needle), freqy, critical_pos: 0, shift: Shift::Large { shift: 0 }, @@ -97,7 +97,7 @@ impl<'b> TwoWay<'b> { (max_suffix.period, max_suffix.pos) }; let shift = Shift::reverse(needle, period_lower_bound, critical_pos); - let needle = CowBStr::new(needle); + let needle = CowBytes::new(needle); TwoWay { needle, freqy, critical_pos, shift } } @@ -115,8 +115,8 @@ impl<'b> TwoWay<'b> { } /// Return the needle used by this searcher. - pub fn needle(&self) -> &BStr { - self.needle.as_bstr() + pub fn needle(&self) -> &[u8] { + self.needle.as_slice() } /// Convert this searched into an owned version, where the needle is @@ -136,7 +136,7 @@ impl<'b> TwoWay<'b> { /// /// This will automatically initialize prefilter state. This should only /// be used for one-off searches. 
- pub fn find(&self, haystack: &BStr) -> Option { + pub fn find(&self, haystack: &[u8]) -> Option { self.find_with(&mut self.prefilter_state(), haystack) } @@ -145,7 +145,7 @@ impl<'b> TwoWay<'b> { /// /// This will automatically initialize prefilter state. This should only /// be used for one-off searches. - pub fn rfind(&self, haystack: &BStr) -> Option { + pub fn rfind(&self, haystack: &[u8]) -> Option { self.rfind_with(&mut self.prefilter_state(), haystack) } @@ -157,7 +157,7 @@ impl<'b> TwoWay<'b> { pub fn find_with( &self, prestate: &mut PrefilterState, - haystack: &BStr, + haystack: &[u8], ) -> Option { if self.needle.is_empty() { return Some(0); @@ -184,7 +184,7 @@ impl<'b> TwoWay<'b> { pub fn rfind_with( &self, prestate: &mut PrefilterState, - haystack: &BStr, + haystack: &[u8], ) -> Option { if self.needle.is_empty() { return Some(haystack.len()); @@ -218,7 +218,7 @@ impl<'b> TwoWay<'b> { fn find_small( &self, prestate: &mut PrefilterState, - haystack: &BStr, + haystack: &[u8], period: usize, ) -> Option { if prestate.is_effective() { @@ -233,10 +233,10 @@ impl<'b> TwoWay<'b> { &self, prestate: &mut PrefilterState, prefilter: bool, - haystack: &BStr, + haystack: &[u8], period: usize, ) -> Option { - let needle = self.needle.as_bstr(); + let needle = self.needle.as_slice(); let mut pos = 0; let mut shift = 0; while pos + needle.len() <= haystack.len() { @@ -279,7 +279,7 @@ impl<'b> TwoWay<'b> { fn find_large( &self, prestate: &mut PrefilterState, - haystack: &BStr, + haystack: &[u8], shift: usize, ) -> Option { if prestate.is_effective() { @@ -294,10 +294,10 @@ impl<'b> TwoWay<'b> { &self, prestate: &mut PrefilterState, prefilter: bool, - haystack: &BStr, + haystack: &[u8], shift: usize, ) -> Option { - let needle = self.needle.as_bstr(); + let needle = self.needle.as_slice(); let mut pos = 0; while pos + needle.len() <= haystack.len() { let mut i = self.critical_pos; @@ -335,7 +335,7 @@ impl<'b> TwoWay<'b> { fn rfind_small( &self, prestate: &mut 
PrefilterState, - haystack: &BStr, + haystack: &[u8], period: usize, ) -> Option { if prestate.is_effective() { @@ -350,7 +350,7 @@ impl<'b> TwoWay<'b> { &self, prestate: &mut PrefilterState, prefilter: bool, - haystack: &BStr, + haystack: &[u8], period: usize, ) -> Option { let needle = &*self.needle; @@ -397,7 +397,7 @@ impl<'b> TwoWay<'b> { fn rfind_large( &self, prestate: &mut PrefilterState, - haystack: &BStr, + haystack: &[u8], shift: usize, ) -> Option { if prestate.is_effective() { @@ -412,7 +412,7 @@ impl<'b> TwoWay<'b> { &self, prestate: &mut PrefilterState, prefilter: bool, - haystack: &BStr, + haystack: &[u8], shift: usize, ) -> Option { let needle = &*self.needle; @@ -496,7 +496,7 @@ impl Shift { /// lexicographic suffixes, and choosing the right-most starting position. /// The lower bound on the period is then the period of the chosen suffix. fn forward( - needle: &BStr, + needle: &[u8], period_lower_bound: usize, critical_pos: usize, ) -> Shift { @@ -519,7 +519,7 @@ impl Shift { /// lexicographic suffixes, and choosing the left-most starting position. /// The lower bound on the period is then the period of the chosen suffix. fn reverse( - needle: &BStr, + needle: &[u8], period_lower_bound: usize, critical_pos: usize, ) -> Shift { @@ -555,7 +555,7 @@ struct Suffix { } impl Suffix { - fn forward(needle: &BStr, kind: SuffixKind) -> Suffix { + fn forward(needle: &[u8], kind: SuffixKind) -> Suffix { debug_assert!(!needle.is_empty()); // suffix represents our maximal (or minimal) suffix, along with @@ -605,7 +605,7 @@ impl Suffix { suffix } - fn reverse(needle: &BStr, kind: SuffixKind) -> Suffix { + fn reverse(needle: &[u8], kind: SuffixKind) -> Suffix { debug_assert!(!needle.is_empty()); // See the comments in `forward` for how this works. @@ -703,30 +703,28 @@ impl SuffixKind { // N.B. There are more holistic tests in src/search/tests.rs. 
#[cfg(test)] mod tests { - use bstr::{B, BStr}; - use bstring::BString; - + use ext_slice::B; use super::*; /// Convenience wrapper for computing the suffix as a byte string. - fn get_suffix_forward(needle: &BStr, kind: SuffixKind) -> (&BStr, usize) { + fn get_suffix_forward(needle: &[u8], kind: SuffixKind) -> (&[u8], usize) { let s = Suffix::forward(needle, kind); (&needle[s.pos..], s.period) } /// Convenience wrapper for computing the reverse suffix as a byte string. - fn get_suffix_reverse(needle: &BStr, kind: SuffixKind) -> (&BStr, usize) { + fn get_suffix_reverse(needle: &[u8], kind: SuffixKind) -> (&[u8], usize) { let s = Suffix::reverse(needle, kind); (&needle[..s.pos], s.period) } /// Return all of the non-empty suffixes in the given byte string. - fn suffixes(bytes: &BStr) -> Vec<&BStr> { + fn suffixes(bytes: &[u8]) -> Vec<&[u8]> { (0..bytes.len()).map(|i| &bytes[i..]).collect() } /// Return the lexicographically maximal suffix of the given byte string. - fn naive_maximal_suffix_forward(needle: &BStr) -> &BStr { + fn naive_maximal_suffix_forward(needle: &[u8]) -> &[u8] { let mut sufs = suffixes(needle); sufs.sort(); sufs.pop().unwrap() @@ -734,11 +732,11 @@ mod tests { /// Return the lexicographically maximal suffix of the reverse of the given /// byte string. - fn naive_maximal_suffix_reverse(needle: &BStr) -> BString { - let mut reversed = needle.to_bstring(); - reversed.reverse_bytes(); - let mut got = naive_maximal_suffix_forward(&reversed).to_bstring(); - got.reverse_bytes(); + fn naive_maximal_suffix_reverse(needle: &[u8]) -> Vec { + let mut reversed = needle.to_vec(); + reversed.reverse(); + let mut got = naive_maximal_suffix_forward(&reversed).to_vec(); + got.reverse(); got } @@ -747,7 +745,7 @@ mod tests { macro_rules! 
assert_suffix_min { ($given:expr, $expected:expr, $period:expr) => { let (got_suffix, got_period) = get_suffix_forward( - B($given), + $given.as_bytes(), SuffixKind::Minimal, ); assert_eq!((B($expected), $period), (got_suffix, got_period)); @@ -757,7 +755,7 @@ mod tests { macro_rules! assert_suffix_max { ($given:expr, $expected:expr, $period:expr) => { let (got_suffix, got_period) = get_suffix_forward( - B($given), + $given.as_bytes(), SuffixKind::Maximal, ); assert_eq!((B($expected), $period), (got_suffix, got_period)); @@ -806,7 +804,7 @@ mod tests { macro_rules! assert_suffix_min { ($given:expr, $expected:expr, $period:expr) => { let (got_suffix, got_period) = get_suffix_reverse( - B($given), + $given.as_bytes(), SuffixKind::Minimal, ); assert_eq!((B($expected), $period), (got_suffix, got_period)); @@ -816,7 +814,7 @@ mod tests { macro_rules! assert_suffix_max { ($given:expr, $expected:expr, $period:expr) => { let (got_suffix, got_period) = get_suffix_reverse( - B($given), + $given.as_bytes(), SuffixKind::Maximal, ); assert_eq!((B($expected), $period), (got_suffix, got_period)); @@ -859,7 +857,6 @@ mod tests { quickcheck! { fn qc_suffix_forward_maximal(bytes: Vec) -> bool { - let bytes = BString::from(bytes); if bytes.is_empty() { return true; } @@ -870,14 +867,13 @@ mod tests { } fn qc_suffix_reverse_maximal(bytes: Vec) -> bool { - let bytes = BString::from(bytes); if bytes.is_empty() { return true; } let (got, _) = get_suffix_reverse(&bytes, SuffixKind::Maximal); let expected = naive_maximal_suffix_reverse(&bytes); - got == expected + expected == got } } } diff --git a/src/slice_index.rs b/src/slice_index.rs deleted file mode 100644 index 300ff60..0000000 --- a/src/slice_index.rs +++ /dev/null @@ -1,292 +0,0 @@ -use core::ops; - -use bstr::BStr; - -/// Ensure that callers cannot implement `SliceIndex` by making an -/// umplementable trait its super trait. 
-pub trait Sealed {} -impl Sealed for usize {} -impl Sealed for ops::Range {} -impl Sealed for ops::RangeTo {} -impl Sealed for ops::RangeFrom {} -impl Sealed for ops::RangeFull {} -impl Sealed for ops::RangeInclusive {} -impl Sealed for ops::RangeToInclusive {} - -/// A trait that parameterizes the different types of indexing a byte string. -/// -/// In general, this trait makes it possible to define generic routines like -/// `get` that can accept either single positions or ranges, and return single -/// bytes or slices, respectively. -/// -/// This trait is sealed such that callers cannot implement it. In general, -/// callers should not need to interact with this trait directly unless you're -/// defining generic functions that index or slice a byte string. -pub trait SliceIndex: Sealed { - /// The output type returned by methods. For indexing by position, this - /// is always a single byte (`u8`). For ranges, this is always a slice - /// (`BStr`). - type Output: ?Sized; - - /// Returns a shared reference to the output at this location, if in - /// bounds. - fn get(self, slice: &BStr) -> Option<&Self::Output>; - - /// Returns a mutable reference to the output at this location, if in - /// bounds. - fn get_mut(self, slice: &mut BStr) -> Option<&mut Self::Output>; - - /// Returns a shared reference to the output at this location, without - /// performing any bounds checking. - unsafe fn get_unchecked(self, slice: &BStr) -> &Self::Output; - - /// Returns a mutable reference to the output at this location, without - /// performing any bounds checking. - unsafe fn get_unchecked_mut(self, slice: &mut BStr) -> &mut Self::Output; - - /// Returns a shared reference to the output at this location, panicking - /// if out of bounds. - fn index(self, slice: &BStr) -> &Self::Output; - - /// Returns a mutable reference to the output at this location, panicking - /// if out of bounds. 
- fn index_mut(self, slice: &mut BStr) -> &mut Self::Output; -} - -impl SliceIndex for usize { - type Output = u8; - - #[inline] - fn get(self, slice: &BStr) -> Option<&u8> { - slice.as_bytes().get(self) - } - - #[inline] - fn get_mut(self, slice: &mut BStr) -> Option<&mut u8> { - slice.as_bytes_mut().get_mut(self) - } - - #[inline] - unsafe fn get_unchecked(self, slice: &BStr) -> &u8 { - slice.as_bytes().get_unchecked(self) - } - - #[inline] - unsafe fn get_unchecked_mut(self, slice: &mut BStr) -> &mut u8 { - slice.as_bytes_mut().get_unchecked_mut(self) - } - - #[inline] - fn index(self, slice: &BStr) -> &u8 { - &slice.as_bytes()[self] - } - - #[inline] - fn index_mut(self, slice: &mut BStr) -> &mut u8 { - &mut slice.as_bytes_mut()[self] - } -} - -impl SliceIndex for ops::Range { - type Output = BStr; - - #[inline] - fn get(self, slice: &BStr) -> Option<&BStr> { - slice.as_bytes().get(self).map(BStr::new) - } - - #[inline] - fn get_mut(self, slice: &mut BStr) -> Option<&mut BStr> { - slice.as_bytes_mut().get_mut(self).map(BStr::new_mut) - } - - #[inline] - unsafe fn get_unchecked(self, slice: &BStr) -> &BStr { - BStr::new(slice.as_bytes().get_unchecked(self)) - } - - #[inline] - unsafe fn get_unchecked_mut(self, slice: &mut BStr) -> &mut BStr { - BStr::new_mut(slice.as_bytes_mut().get_unchecked_mut(self)) - } - - #[inline] - fn index(self, slice: &BStr) -> &BStr { - &slice[self] - } - - #[inline] - fn index_mut(self, slice: &mut BStr) -> &mut BStr { - &mut slice[self] - } -} - -impl SliceIndex for ops::RangeTo { - type Output = BStr; - - #[inline] - fn get(self, slice: &BStr) -> Option<&BStr> { - slice.as_bytes().get(self).map(BStr::new) - } - - #[inline] - fn get_mut(self, slice: &mut BStr) -> Option<&mut BStr> { - slice.as_bytes_mut().get_mut(self).map(BStr::new_mut) - } - - #[inline] - unsafe fn get_unchecked(self, slice: &BStr) -> &BStr { - BStr::new(slice.as_bytes().get_unchecked(self)) - } - - #[inline] - unsafe fn get_unchecked_mut(self, slice: &mut BStr) 
-> &mut BStr { - BStr::new_mut(slice.as_bytes_mut().get_unchecked_mut(self)) - } - - #[inline] - fn index(self, slice: &BStr) -> &BStr { - &slice[self] - } - - #[inline] - fn index_mut(self, slice: &mut BStr) -> &mut BStr { - &mut slice[self] - } -} - -impl SliceIndex for ops::RangeFrom { - type Output = BStr; - - #[inline] - fn get(self, slice: &BStr) -> Option<&BStr> { - slice.as_bytes().get(self).map(BStr::new) - } - - #[inline] - fn get_mut(self, slice: &mut BStr) -> Option<&mut BStr> { - slice.as_bytes_mut().get_mut(self).map(BStr::new_mut) - } - - #[inline] - unsafe fn get_unchecked(self, slice: &BStr) -> &BStr { - BStr::new(slice.as_bytes().get_unchecked(self)) - } - - #[inline] - unsafe fn get_unchecked_mut(self, slice: &mut BStr) -> &mut BStr { - BStr::new_mut(slice.as_bytes_mut().get_unchecked_mut(self)) - } - - #[inline] - fn index(self, slice: &BStr) -> &BStr { - &slice[self] - } - - #[inline] - fn index_mut(self, slice: &mut BStr) -> &mut BStr { - &mut slice[self] - } -} - -impl SliceIndex for ops::RangeFull { - type Output = BStr; - - #[inline] - fn get(self, slice: &BStr) -> Option<&BStr> { - slice.as_bytes().get(self).map(BStr::new) - } - - #[inline] - fn get_mut(self, slice: &mut BStr) -> Option<&mut BStr> { - slice.as_bytes_mut().get_mut(self).map(BStr::new_mut) - } - - #[inline] - unsafe fn get_unchecked(self, slice: &BStr) -> &BStr { - BStr::new(slice.as_bytes().get_unchecked(self)) - } - - #[inline] - unsafe fn get_unchecked_mut(self, slice: &mut BStr) -> &mut BStr { - BStr::new_mut(slice.as_bytes_mut().get_unchecked_mut(self)) - } - - #[inline] - fn index(self, slice: &BStr) -> &BStr { - &slice[self] - } - - #[inline] - fn index_mut(self, slice: &mut BStr) -> &mut BStr { - &mut slice[self] - } -} - -impl SliceIndex for ops::RangeInclusive { - type Output = BStr; - - #[inline] - fn get(self, slice: &BStr) -> Option<&BStr> { - slice.as_bytes().get(self).map(BStr::new) - } - - #[inline] - fn get_mut(self, slice: &mut BStr) -> Option<&mut BStr> { 
- slice.as_bytes_mut().get_mut(self).map(BStr::new_mut) - } - - #[inline] - unsafe fn get_unchecked(self, slice: &BStr) -> &BStr { - BStr::new(slice.as_bytes().get_unchecked(self)) - } - - #[inline] - unsafe fn get_unchecked_mut(self, slice: &mut BStr) -> &mut BStr { - BStr::new_mut(slice.as_bytes_mut().get_unchecked_mut(self)) - } - - #[inline] - fn index(self, slice: &BStr) -> &BStr { - &slice[self] - } - - #[inline] - fn index_mut(self, slice: &mut BStr) -> &mut BStr { - &mut slice[self] - } -} - -impl SliceIndex for ops::RangeToInclusive { - type Output = BStr; - - #[inline] - fn get(self, slice: &BStr) -> Option<&BStr> { - slice.as_bytes().get(self).map(BStr::new) - } - - #[inline] - fn get_mut(self, slice: &mut BStr) -> Option<&mut BStr> { - slice.as_bytes_mut().get_mut(self).map(BStr::new_mut) - } - - #[inline] - unsafe fn get_unchecked(self, slice: &BStr) -> &BStr { - BStr::new(slice.as_bytes().get_unchecked(self)) - } - - #[inline] - unsafe fn get_unchecked_mut(self, slice: &mut BStr) -> &mut BStr { - BStr::new_mut(slice.as_bytes_mut().get_unchecked_mut(self)) - } - - #[inline] - fn index(self, slice: &BStr) -> &BStr { - &slice[self] - } - - #[inline] - fn index_mut(self, slice: &mut BStr) -> &mut BStr { - &mut slice[self] - } -} diff --git a/src/unicode/grapheme.rs b/src/unicode/grapheme.rs index fe9a852..ec2c647 100644 --- a/src/unicode/grapheme.rs +++ b/src/unicode/grapheme.rs @@ -1,6 +1,6 @@ use regex_automata::DFA; -use bstr::BStr; +use ext_slice::ByteSlice; use unicode::fsm::grapheme_break_fwd::GRAPHEME_BREAK_FWD; use unicode::fsm::grapheme_break_rev::GRAPHEME_BREAK_REV; use unicode::fsm::regional_indicator_rev::REGIONAL_INDICATOR_REV; @@ -9,7 +9,7 @@ use utf8; /// An iterator over grapheme clusters in a byte string. /// /// This iterator is typically constructed by -/// [`bstr::graphemes`](struct.BStr.html#method.graphemes). +/// [`ByteSlice::graphemes`](trait.ByteSlice.html#method.graphemes). 
/// /// Unicode defines a grapheme cluster as an *approximation* to a single user /// visible character. A grapheme cluster, or just "grapheme," is made up of @@ -28,11 +28,11 @@ use utf8; /// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Grapheme_Cluster_Boundaries). #[derive(Clone, Debug)] pub struct Graphemes<'a> { - bs: &'a BStr, + bs: &'a [u8], } impl<'a> Graphemes<'a> { - pub(crate) fn new(bs: &'a BStr) -> Graphemes<'a> { + pub(crate) fn new(bs: &'a [u8]) -> Graphemes<'a> { Graphemes { bs } } @@ -44,19 +44,19 @@ impl<'a> Graphemes<'a> { /// # Examples /// /// ``` - /// use bstr::B; + /// use bstr::ByteSlice; /// - /// let mut it = B("abc").graphemes(); + /// let mut it = b"abc".graphemes(); /// - /// assert_eq!("abc", it.as_bstr()); + /// assert_eq!(b"abc", it.as_bytes()); /// it.next(); - /// assert_eq!("bc", it.as_bstr()); + /// assert_eq!(b"bc", it.as_bytes()); /// it.next(); /// it.next(); - /// assert_eq!("", it.as_bstr()); + /// assert_eq!(b"", it.as_bytes()); /// ``` #[inline] - pub fn as_bstr(&self) -> &'a BStr { + pub fn as_bytes(&self) -> &'a [u8] { self.bs } } @@ -91,7 +91,7 @@ impl<'a> DoubleEndedIterator for Graphemes<'a> { /// positions. /// /// This iterator is typically constructed by -/// [`bstr::grapheme_indices`](struct.BStr.html#method.grapheme_indices). +/// [`ByteSlice::grapheme_indices`](trait.ByteSlice.html#method.grapheme_indices). /// /// Unicode defines a grapheme cluster as an *approximation* to a single user /// visible character. A grapheme cluster, or just "grapheme," is made up of @@ -118,13 +118,13 @@ impl<'a> DoubleEndedIterator for Graphemes<'a> { /// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Grapheme_Cluster_Boundaries). 
#[derive(Clone, Debug)] pub struct GraphemeIndices<'a> { - bs: &'a BStr, + bs: &'a [u8], forward_index: usize, reverse_index: usize, } impl<'a> GraphemeIndices<'a> { - pub(crate) fn new(bs: &'a BStr) -> GraphemeIndices<'a> { + pub(crate) fn new(bs: &'a [u8]) -> GraphemeIndices<'a> { GraphemeIndices { bs: bs, forward_index: 0, reverse_index: bs.len() } } @@ -136,19 +136,19 @@ impl<'a> GraphemeIndices<'a> { /// # Examples /// /// ``` - /// use bstr::B; + /// use bstr::ByteSlice; /// - /// let mut it = B("abc").grapheme_indices(); + /// let mut it = b"abc".grapheme_indices(); /// - /// assert_eq!("abc", it.as_bstr()); + /// assert_eq!(b"abc", it.as_bytes()); /// it.next(); - /// assert_eq!("bc", it.as_bstr()); + /// assert_eq!(b"bc", it.as_bytes()); /// it.next(); /// it.next(); - /// assert_eq!("", it.as_bstr()); + /// assert_eq!(b"", it.as_bytes()); /// ``` #[inline] - pub fn as_bstr(&self) -> &'a BStr { + pub fn as_bytes(&self) -> &'a [u8] { self.bs } } @@ -188,25 +188,25 @@ impl<'a> DoubleEndedIterator for GraphemeIndices<'a> { /// codepoint if invalid UTF-8 was found), along with the number of bytes /// decoded in the byte string. The number of bytes decoded may not be the /// same as the length of grapheme in the case where invalid UTF-8 is found. -pub fn decode_grapheme(bs: &BStr) -> (&str, usize) { +pub fn decode_grapheme(bs: &[u8]) -> (&str, usize) { if bs.is_empty() { ("", 0) - } else if let Some(end) = GRAPHEME_BREAK_FWD.find(bs.as_bytes()) { + } else if let Some(end) = GRAPHEME_BREAK_FWD.find(bs) { // Safe because a match can only occur for valid UTF-8. let grapheme = unsafe { bs[..end].to_str_unchecked() }; (grapheme, grapheme.len()) } else { const INVALID: &'static str = "\u{FFFD}"; // No match on non-empty bytes implies we found invalid UTF-8. 
- let (_, size) = utf8::decode_lossy(bs.as_bytes()); + let (_, size) = utf8::decode_lossy(bs); (INVALID, size) } } -fn decode_last_grapheme(bs: &BStr) -> (&str, usize) { +fn decode_last_grapheme(bs: &[u8]) -> (&str, usize) { if bs.is_empty() { ("", 0) - } else if let Some(mut start) = GRAPHEME_BREAK_REV.rfind(bs.as_bytes()) { + } else if let Some(mut start) = GRAPHEME_BREAK_REV.rfind(bs) { start = adjust_rev_for_regional_indicator(bs, start); // Safe because a match can only occur for valid UTF-8. let grapheme = unsafe { bs[start..].to_str_unchecked() }; @@ -214,7 +214,7 @@ fn decode_last_grapheme(bs: &BStr) -> (&str, usize) { } else { const INVALID: &'static str = "\u{FFFD}"; // No match on non-empty bytes implies we found invalid UTF-8. - let (_, size) = utf8::decode_last_lossy(bs.as_bytes()); + let (_, size) = utf8::decode_last_lossy(bs); (INVALID, size) } } @@ -232,7 +232,7 @@ fn decode_last_grapheme(bs: &BStr) -> (&str, usize) { /// occur between regional indicators where it would cause an odd number of /// regional indicators to exist before the break from the *start* of the /// string. A reverse regex cannot detect this case easily without look-around. -fn adjust_rev_for_regional_indicator(mut bs: &BStr, i: usize) -> usize { +fn adjust_rev_for_regional_indicator(mut bs: &[u8], i: usize) -> usize { // All regional indicators use a 4 byte encoding, and we only care about // the case where we found a pair of regional indicators. if bs.len() - i != 8 { @@ -246,7 +246,7 @@ fn adjust_rev_for_regional_indicator(mut bs: &BStr, i: usize) -> usize { // regional indicator codepoints. A fix probably requires refactoring this // code a bit such that we don't rescan regional indicators. 
let mut count = 0; - while let Some(start) = REGIONAL_INDICATOR_REV.rfind(bs.as_bytes()) { + while let Some(start) = REGIONAL_INDICATOR_REV.rfind(bs) { bs = &bs[..start]; count += 1; } @@ -261,7 +261,7 @@ fn adjust_rev_for_regional_indicator(mut bs: &BStr, i: usize) -> usize { mod tests { use ucd_parse::GraphemeClusterBreakTest; - use bstr::B; + use ext_slice::ByteSlice; use tests::LOSSY_TESTS; use super::*; @@ -269,7 +269,7 @@ mod tests { fn forward_ucd() { for (i, test) in ucdtests().into_iter().enumerate() { let given = test.grapheme_clusters.concat(); - let got: Vec = Graphemes::new(B(&given)) + let got: Vec = Graphemes::new(given.as_bytes()) .map(|cluster| cluster.to_string()) .collect(); assert_eq!( @@ -291,7 +291,7 @@ mod tests { fn reverse_ucd() { for (i, test) in ucdtests().into_iter().enumerate() { let given = test.grapheme_clusters.concat(); - let mut got: Vec = Graphemes::new(B(&given)) + let mut got: Vec = Graphemes::new(given.as_bytes()) .rev() .map(|cluster| cluster.to_string()) .collect(); @@ -314,7 +314,7 @@ mod tests { #[test] fn forward_lossy() { for &(expected, input) in LOSSY_TESTS { - let got = Graphemes::new(B(input)).collect::(); + let got = Graphemes::new(input.as_bytes()).collect::(); assert_eq!(expected, got); } } @@ -323,7 +323,7 @@ mod tests { fn reverse_lossy() { for &(expected, input) in LOSSY_TESTS { let expected: String = expected.chars().rev().collect(); - let got = Graphemes::new(B(input)) + let got = Graphemes::new(input.as_bytes()) .rev() .collect::(); assert_eq!(expected, got); diff --git a/src/unicode/sentence.rs b/src/unicode/sentence.rs index f732f08..ccd72b2 100644 --- a/src/unicode/sentence.rs +++ b/src/unicode/sentence.rs @@ -1,13 +1,13 @@ use regex_automata::DFA; -use bstr::BStr; +use ext_slice::ByteSlice; use unicode::fsm::sentence_break_fwd::SENTENCE_BREAK_FWD; use utf8; /// An iterator over sentences in a byte string. 
/// /// This iterator is typically constructed by -/// [`bstr::sentences`](struct.BStr.html#method.sentences). +/// [`ByteSlice::sentences`](trait.ByteSlice.html#method.sentences). /// /// Sentences typically include their trailing punctuation and whitespace. /// @@ -20,11 +20,11 @@ use utf8; /// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Sentence_Boundaries). #[derive(Clone, Debug)] pub struct Sentences<'a> { - bs: &'a BStr, + bs: &'a [u8], } impl<'a> Sentences<'a> { - pub(crate) fn new(bs: &'a BStr) -> Sentences<'a> { + pub(crate) fn new(bs: &'a [u8]) -> Sentences<'a> { Sentences { bs } } @@ -36,19 +36,19 @@ impl<'a> Sentences<'a> { /// # Examples /// /// ``` - /// use bstr::B; + /// use bstr::ByteSlice; /// - /// let mut it = B("I want this. Not that. Right now.").sentences(); + /// let mut it = b"I want this. Not that. Right now.".sentences(); /// - /// assert_eq!("I want this. Not that. Right now.", it.as_bstr()); + /// assert_eq!(&b"I want this. Not that. Right now."[..], it.as_bytes()); /// it.next(); - /// assert_eq!("Not that. Right now.", it.as_bstr()); + /// assert_eq!(b"Not that. Right now.", it.as_bytes()); /// it.next(); /// it.next(); - /// assert_eq!("", it.as_bstr()); + /// assert_eq!(b"", it.as_bytes()); /// ``` #[inline] - pub fn as_bstr(&self) -> &'a BStr { + pub fn as_bytes(&self) -> &'a [u8] { self.bs } } @@ -70,7 +70,7 @@ impl<'a> Iterator for Sentences<'a> { /// An iterator over sentences in a byte string, along with their byte offsets. /// /// This iterator is typically constructed by -/// [`bstr::sentence_indices`](struct.BStr.html#method.sentence_indices). +/// [`ByteSlice::sentence_indices`](trait.ByteSlice.html#method.sentence_indices). /// /// Sentences typically include their trailing punctuation and whitespace. /// @@ -91,12 +91,12 @@ impl<'a> Iterator for Sentences<'a> { /// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Sentence_Boundaries). 
#[derive(Clone, Debug)] pub struct SentenceIndices<'a> { - bs: &'a BStr, + bs: &'a [u8], forward_index: usize, } impl<'a> SentenceIndices<'a> { - pub(crate) fn new(bs: &'a BStr) -> SentenceIndices<'a> { + pub(crate) fn new(bs: &'a [u8]) -> SentenceIndices<'a> { SentenceIndices { bs: bs, forward_index: 0 } } @@ -108,19 +108,19 @@ impl<'a> SentenceIndices<'a> { /// # Examples /// /// ``` - /// use bstr::B; + /// use bstr::ByteSlice; /// - /// let mut it = B("I want this. Not that. Right now.").sentence_indices(); + /// let mut it = b"I want this. Not that. Right now.".sentence_indices(); /// - /// assert_eq!("I want this. Not that. Right now.", it.as_bstr()); + /// assert_eq!(&b"I want this. Not that. Right now."[..], it.as_bytes()); /// it.next(); - /// assert_eq!("Not that. Right now.", it.as_bstr()); + /// assert_eq!(b"Not that. Right now.", it.as_bytes()); /// it.next(); /// it.next(); - /// assert_eq!("", it.as_bstr()); + /// assert_eq!(b"", it.as_bytes()); /// ``` #[inline] - pub fn as_bstr(&self) -> &'a BStr { + pub fn as_bytes(&self) -> &'a [u8] { self.bs } } @@ -141,17 +141,17 @@ impl<'a> Iterator for SentenceIndices<'a> { } } -fn decode_sentence(bs: &BStr) -> (&str, usize) { +fn decode_sentence(bs: &[u8]) -> (&str, usize) { if bs.is_empty() { ("", 0) - } else if let Some(end) = SENTENCE_BREAK_FWD.find(bs.as_bytes()) { + } else if let Some(end) = SENTENCE_BREAK_FWD.find(bs) { // Safe because a match can only occur for valid UTF-8. let sentence = unsafe { bs[..end].to_str_unchecked() }; (sentence, sentence.len()) } else { const INVALID: &'static str = "\u{FFFD}"; // No match on non-empty bytes implies we found invalid UTF-8. 
- let (_, size) = utf8::decode_lossy(bs.as_bytes()); + let (_, size) = utf8::decode_lossy(bs); (INVALID, size) } } @@ -160,7 +160,7 @@ fn decode_sentence(bs: &BStr) -> (&str, usize) { mod tests { use ucd_parse::SentenceBreakTest; - use bstr::{B, BStr}; + use ext_slice::ByteSlice; #[test] fn forward_ucd() { @@ -175,7 +175,7 @@ mod tests { expected: {:?}\n\ got: {:?}\n", i, - BStr::new(&given), + given, strs_to_bstrs(&test.sentences), strs_to_bstrs(&got), ); @@ -195,11 +195,11 @@ mod tests { } fn sentences(bytes: &[u8]) -> Vec<&str> { - BStr::new(bytes).sentences().collect() + bytes.sentences().collect() } - fn strs_to_bstrs>(strs: &[S]) -> Vec<&BStr> { - strs.iter().map(|s| B(s.as_ref())).collect() + fn strs_to_bstrs>(strs: &[S]) -> Vec<&[u8]> { + strs.iter().map(|s| s.as_ref().as_bytes()).collect() } /// Return all of the UCD for sentence breaks. diff --git a/src/unicode/word.rs b/src/unicode/word.rs index d55dfc5..85c9787 100644 --- a/src/unicode/word.rs +++ b/src/unicode/word.rs @@ -1,6 +1,6 @@ use regex_automata::DFA; -use bstr::BStr; +use ext_slice::ByteSlice; use unicode::fsm::simple_word_fwd::SIMPLE_WORD_FWD; use unicode::fsm::word_break_fwd::WORD_BREAK_FWD; use utf8; @@ -8,7 +8,7 @@ use utf8; /// An iterator over words in a byte string. /// /// This iterator is typically constructed by -/// [`bstr::words`](struct.BStr.html#method.words). +/// [`ByteSlice::words`](trait.ByteSlice.html#method.words). /// /// This is similar to the [`WordsWithBreaks`](struct.WordsWithBreaks.html) /// iterator, except it only returns elements that contain a "word" character. 
@@ -29,7 +29,7 @@ use utf8; pub struct Words<'a>(WordsWithBreaks<'a>); impl<'a> Words<'a> { - pub(crate) fn new(bs: &'a BStr) -> Words<'a> { + pub(crate) fn new(bs: &'a [u8]) -> Words<'a> { Words(WordsWithBreaks::new(bs)) } @@ -41,20 +41,20 @@ impl<'a> Words<'a> { /// # Examples /// /// ``` - /// use bstr::B; + /// use bstr::ByteSlice; /// - /// let mut it = B("foo bar baz").words(); + /// let mut it = b"foo bar baz".words(); /// - /// assert_eq!("foo bar baz", it.as_bstr()); + /// assert_eq!(b"foo bar baz", it.as_bytes()); /// it.next(); /// it.next(); - /// assert_eq!(" baz", it.as_bstr()); + /// assert_eq!(b" baz", it.as_bytes()); /// it.next(); - /// assert_eq!("", it.as_bstr()); + /// assert_eq!(b"", it.as_bytes()); /// ``` #[inline] - pub fn as_bstr(&self) -> &'a BStr { - self.0.as_bstr() + pub fn as_bytes(&self) -> &'a [u8] { + self.0.as_bytes() } } @@ -75,7 +75,7 @@ impl<'a> Iterator for Words<'a> { /// An iterator over words in a byte string and their byte index positions. /// /// This iterator is typically constructed by -/// [`bstr::word_indices`](struct.BStr.html#method.word_indices). +/// [`ByteSlice::word_indices`](trait.ByteSlice.html#method.word_indices). 
/// /// This is similar to the /// [`WordsWithBreakIndices`](struct.WordsWithBreakIndices.html) iterator, @@ -104,7 +104,7 @@ impl<'a> Iterator for Words<'a> { pub struct WordIndices<'a>(WordsWithBreakIndices<'a>); impl<'a> WordIndices<'a> { - pub(crate) fn new(bs: &'a BStr) -> WordIndices<'a> { + pub(crate) fn new(bs: &'a [u8]) -> WordIndices<'a> { WordIndices(WordsWithBreakIndices::new(bs)) } @@ -116,21 +116,21 @@ impl<'a> WordIndices<'a> { /// # Examples /// /// ``` - /// use bstr::B; + /// use bstr::ByteSlice; /// - /// let mut it = B("foo bar baz").word_indices(); + /// let mut it = b"foo bar baz".word_indices(); /// - /// assert_eq!("foo bar baz", it.as_bstr()); + /// assert_eq!(b"foo bar baz", it.as_bytes()); /// it.next(); /// it.next(); - /// assert_eq!(" baz", it.as_bstr()); + /// assert_eq!(b" baz", it.as_bytes()); /// it.next(); /// it.next(); - /// assert_eq!("", it.as_bstr()); + /// assert_eq!(b"", it.as_bytes()); /// ``` #[inline] - pub fn as_bstr(&self) -> &'a BStr { - self.0.as_bstr() + pub fn as_bytes(&self) -> &'a [u8] { + self.0.as_bytes() } } @@ -151,7 +151,7 @@ impl<'a> Iterator for WordIndices<'a> { /// An iterator over all word breaks in a byte string. /// /// This iterator is typically constructed by -/// [`bstr::words_with_breaks`](struct.BStr.html#method.words_with_breaks). +/// [`ByteSlice::words_with_breaks`](trait.ByteSlice.html#method.words_with_breaks). /// /// This iterator yields not only all words, but the content that comes between /// words. In particular, if all elements yielded by this iterator are @@ -169,11 +169,11 @@ impl<'a> Iterator for WordIndices<'a> { /// that do not use spaces between words. 
#[derive(Clone, Debug)] pub struct WordsWithBreaks<'a> { - bs: &'a BStr, + bs: &'a [u8], } impl<'a> WordsWithBreaks<'a> { - pub(crate) fn new(bs: &'a BStr) -> WordsWithBreaks<'a> { + pub(crate) fn new(bs: &'a [u8]) -> WordsWithBreaks<'a> { WordsWithBreaks { bs } } @@ -185,22 +185,22 @@ impl<'a> WordsWithBreaks<'a> { /// # Examples /// /// ``` - /// use bstr::B; + /// use bstr::ByteSlice; /// - /// let mut it = B("foo bar baz").words_with_breaks(); + /// let mut it = b"foo bar baz".words_with_breaks(); /// - /// assert_eq!("foo bar baz", it.as_bstr()); + /// assert_eq!(b"foo bar baz", it.as_bytes()); /// it.next(); - /// assert_eq!(" bar baz", it.as_bstr()); + /// assert_eq!(b" bar baz", it.as_bytes()); /// it.next(); /// it.next(); - /// assert_eq!(" baz", it.as_bstr()); + /// assert_eq!(b" baz", it.as_bytes()); /// it.next(); /// it.next(); - /// assert_eq!("", it.as_bstr()); + /// assert_eq!(b"", it.as_bytes()); /// ``` #[inline] - pub fn as_bstr(&self) -> &'a BStr { + pub fn as_bytes(&self) -> &'a [u8] { self.bs } } @@ -223,7 +223,7 @@ impl<'a> Iterator for WordsWithBreaks<'a> { /// index positions. /// /// This iterator is typically constructed by -/// [`bstr::words_with_break_indices`](struct.BStr.html#method.words_with_break_indices). +/// [`ByteSlice::words_with_break_indices`](trait.ByteSlice.html#method.words_with_break_indices). /// /// This iterator yields not only all words, but the content that comes between /// words. In particular, if all elements yielded by this iterator are @@ -248,12 +248,12 @@ impl<'a> Iterator for WordsWithBreaks<'a> { /// that do not use spaces between words. 
#[derive(Clone, Debug)] pub struct WordsWithBreakIndices<'a> { - bs: &'a BStr, + bs: &'a [u8], forward_index: usize, } impl<'a> WordsWithBreakIndices<'a> { - pub(crate) fn new(bs: &'a BStr) -> WordsWithBreakIndices<'a> { + pub(crate) fn new(bs: &'a [u8]) -> WordsWithBreakIndices<'a> { WordsWithBreakIndices { bs: bs, forward_index: 0 } } @@ -265,22 +265,22 @@ impl<'a> WordsWithBreakIndices<'a> { /// # Examples /// /// ``` - /// use bstr::B; + /// use bstr::ByteSlice; /// - /// let mut it = B("foo bar baz").words_with_break_indices(); + /// let mut it = b"foo bar baz".words_with_break_indices(); /// - /// assert_eq!("foo bar baz", it.as_bstr()); + /// assert_eq!(b"foo bar baz", it.as_bytes()); /// it.next(); - /// assert_eq!(" bar baz", it.as_bstr()); + /// assert_eq!(b" bar baz", it.as_bytes()); /// it.next(); /// it.next(); - /// assert_eq!(" baz", it.as_bstr()); + /// assert_eq!(b" baz", it.as_bytes()); /// it.next(); /// it.next(); - /// assert_eq!("", it.as_bstr()); + /// assert_eq!(b"", it.as_bytes()); /// ``` #[inline] - pub fn as_bstr(&self) -> &'a BStr { + pub fn as_bytes(&self) -> &'a [u8] { self.bs } } @@ -301,17 +301,17 @@ impl<'a> Iterator for WordsWithBreakIndices<'a> { } } -fn decode_word(bs: &BStr) -> (&str, usize) { +fn decode_word(bs: &[u8]) -> (&str, usize) { if bs.is_empty() { ("", 0) - } else if let Some(end) = WORD_BREAK_FWD.find(bs.as_bytes()) { + } else if let Some(end) = WORD_BREAK_FWD.find(bs) { // Safe because a match can only occur for valid UTF-8. let word = unsafe { bs[..end].to_str_unchecked() }; (word, word.len()) } else { const INVALID: &'static str = "\u{FFFD}"; // No match on non-empty bytes implies we found invalid UTF-8. 
- let (_, size) = utf8::decode_lossy(bs.as_bytes()); + let (_, size) = utf8::decode_lossy(bs); (INVALID, size) } } @@ -320,7 +320,7 @@ fn decode_word(bs: &BStr) -> (&str, usize) { mod tests { use ucd_parse::WordBreakTest; - use bstr::BStr; + use ext_slice::ByteSlice; #[test] fn forward_ucd() { @@ -335,7 +335,7 @@ mod tests { expected: {:?}\n\ got: {:?}\n", i, - BStr::new(&given), + given, strs_to_bstrs(&test.words), strs_to_bstrs(&got), ); @@ -421,11 +421,11 @@ mod tests { } fn words(bytes: &[u8]) -> Vec<&str> { - BStr::new(bytes).words_with_breaks().collect() + bytes.words_with_breaks().collect() } - fn strs_to_bstrs>(strs: &[S]) -> Vec<&BStr> { - strs.iter().map(|s| BStr::new(s.as_ref())).collect() + fn strs_to_bstrs>(strs: &[S]) -> Vec<&[u8]> { + strs.iter().map(|s| s.as_ref().as_bytes()).collect() } /// Return all of the UCD for word breaks. diff --git a/src/utf8.rs b/src/utf8.rs index e35da36..2194141 100644 --- a/src/utf8.rs +++ b/src/utf8.rs @@ -5,7 +5,6 @@ use std::error; use core::fmt; use ascii; -use bstr::BStr; // The UTF-8 decoder provided here is based on the one presented here: // https://bjoern.hoehrmann.de/utf-8/decoder/dfa/ @@ -70,15 +69,15 @@ static STATES_FORWARD: &'static [u8] = &[ /// ["maximal subpart" strategy](http://www.unicode.org/review/pr-121.html). /// /// This iterator is created by the -/// [`chars`](struct.BStr.html#method.chars) method on -/// [`BStr`](struct.BStr.html). +/// [`chars`](trait.ByteSlice.html#method.chars) method provided by the +/// [`ByteSlice`](trait.ByteSlice.html) extension trait for `&[u8]`. 
#[derive(Clone, Debug)] pub struct Chars<'a> { - bs: &'a BStr, + bs: &'a [u8], } impl<'a> Chars<'a> { - pub(crate) fn new(bs: &'a BStr) -> Chars<'a> { + pub(crate) fn new(bs: &'a [u8]) -> Chars<'a> { Chars { bs } } @@ -90,19 +89,19 @@ impl<'a> Chars<'a> { /// # Examples /// /// ``` - /// use bstr::BStr; + /// use bstr::ByteSlice; /// - /// let mut chars = BStr::new("abc").chars(); + /// let mut chars = b"abc".chars(); /// - /// assert_eq!("abc", chars.as_bstr()); + /// assert_eq!(b"abc", chars.as_bytes()); /// chars.next(); - /// assert_eq!("bc", chars.as_bstr()); + /// assert_eq!(b"bc", chars.as_bytes()); /// chars.next(); /// chars.next(); - /// assert_eq!("", chars.as_bstr()); + /// assert_eq!(b"", chars.as_bytes()); /// ``` #[inline] - pub fn as_bstr(&self) -> &'a BStr { + pub fn as_bytes(&self) -> &'a [u8] { self.bs } } @@ -112,7 +111,7 @@ impl<'a> Iterator for Chars<'a> { #[inline] fn next(&mut self) -> Option { - let (ch, size) = decode_lossy(self.bs.as_bytes()); + let (ch, size) = decode_lossy(self.bs); if size == 0 { return None; } @@ -124,7 +123,7 @@ impl<'a> Iterator for Chars<'a> { impl<'a> DoubleEndedIterator for Chars<'a> { #[inline] fn next_back(&mut self) -> Option { - let (ch, size) = decode_last_lossy(self.bs.as_bytes()); + let (ch, size) = decode_last_lossy(self.bs); if size == 0 { return None; } @@ -149,17 +148,17 @@ impl<'a> DoubleEndedIterator for Chars<'a> { /// substitute anywhere from 1 to 3 invalid bytes (inclusive). /// /// This iterator is created by the -/// [`char_indices`](struct.BStr.html#method.char_indices) method on -/// [`BStr`](struct.BStr.html). +/// [`char_indices`](trait.ByteSlice.html#method.char_indices) method provided +/// by the [`ByteSlice`](trait.ByteSlice.html) extension trait for `&[u8]`. 
#[derive(Clone, Debug)] pub struct CharIndices<'a> { - bs: &'a BStr, + bs: &'a [u8], forward_index: usize, reverse_index: usize, } impl<'a> CharIndices<'a> { - pub(crate) fn new(bs: &'a BStr) -> CharIndices<'a> { + pub(crate) fn new(bs: &'a [u8]) -> CharIndices<'a> { CharIndices { bs: bs, forward_index: 0, reverse_index: bs.len() } } @@ -171,19 +170,19 @@ impl<'a> CharIndices<'a> { /// # Examples /// /// ``` - /// use bstr::B; + /// use bstr::ByteSlice; /// - /// let mut it = B("abc").char_indices(); + /// let mut it = b"abc".char_indices(); /// - /// assert_eq!("abc", it.as_bstr()); + /// assert_eq!(b"abc", it.as_bytes()); /// it.next(); - /// assert_eq!("bc", it.as_bstr()); + /// assert_eq!(b"bc", it.as_bytes()); /// it.next(); /// it.next(); - /// assert_eq!("", it.as_bstr()); + /// assert_eq!(b"", it.as_bytes()); /// ``` #[inline] - pub fn as_bstr(&self) -> &'a BStr { + pub fn as_bytes(&self) -> &'a [u8] { self.bs } } @@ -194,7 +193,7 @@ impl<'a> Iterator for CharIndices<'a> { #[inline] fn next(&mut self) -> Option<(usize, usize, char)> { let index = self.forward_index; - let (ch, size) = decode_lossy(self.bs.as_bytes()); + let (ch, size) = decode_lossy(self.bs); if size == 0 { return None; } @@ -207,7 +206,7 @@ impl<'a> Iterator for CharIndices<'a> { impl<'a> DoubleEndedIterator for CharIndices<'a> { #[inline] fn next_back(&mut self) -> Option<(usize, usize, char)> { - let (ch, size) = decode_last_lossy(self.bs.as_bytes()); + let (ch, size) = decode_last_lossy(self.bs); if size == 0 { return None; } @@ -221,7 +220,7 @@ impl<'a> DoubleEndedIterator for CharIndices<'a> { /// /// This error occurs when attempting to convert a non-UTF-8 byte /// string to a Rust string that must be valid UTF-8. For example, -/// [`to_str`](struct.BStr.html#method.to_str) is one such method. +/// [`to_str`](trait.ByteSlice.html#method.to_str) is one such method. 
/// /// # Example /// @@ -229,7 +228,7 @@ impl<'a> DoubleEndedIterator for CharIndices<'a> { /// but ends with a sequence that is a possible prefix of valid UTF-8. /// /// ``` -/// use bstr::B; +/// use bstr::{B, ByteSlice}; /// /// let s = B(b"foobar\xF1\x80\x80"); /// let err = s.to_str().unwrap_err(); @@ -241,9 +240,9 @@ impl<'a> DoubleEndedIterator for CharIndices<'a> { /// invalid UTF-8. /// /// ``` -/// use bstr::B; +/// use bstr::ByteSlice; /// -/// let s = B(b"foobar\xF1\x80\x80quux"); +/// let s = b"foobar\xF1\x80\x80quux"; /// let err = s.to_str().unwrap_err(); /// assert_eq!(err.valid_up_to(), 6); /// // The error length reports the maximum number of bytes that correspond to @@ -253,14 +252,14 @@ impl<'a> DoubleEndedIterator for CharIndices<'a> { /// // In contrast to the above which contains a single invalid prefix, /// // consider the case of multiple individal bytes that are never valid /// // prefixes. Note how the value of error_len changes! -/// let s = B(b"foobar\xFF\xFFquux"); +/// let s = b"foobar\xFF\xFFquux"; /// let err = s.to_str().unwrap_err(); /// assert_eq!(err.valid_up_to(), 6); /// assert_eq!(err.error_len(), Some(1)); /// /// // The fact that it's an invalid prefix does not change error_len even /// // when it immediately precedes the end of the string. -/// let s = B(b"foobar\xFF"); +/// let s = b"foobar\xFF"; /// let err = s.to_str().unwrap_err(); /// assert_eq!(err.valid_up_to(), 6); /// assert_eq!(err.error_len(), Some(1)); @@ -281,9 +280,9 @@ impl Utf8Error { /// possibly empty prefix that is guaranteed to be valid UTF-8: /// /// ``` - /// use bstr::B; + /// use bstr::ByteSlice; /// - /// let s = B(b"foobar\xF1\x80\x80quux"); + /// let s = b"foobar\xF1\x80\x80quux"; /// let err = s.to_str().unwrap_err(); /// /// // This is guaranteed to never panic. 
@@ -455,9 +454,9 @@ pub fn validate(slice: &[u8]) -> Result<(), Utf8Error> { /// codepoint: /// /// ``` -/// use bstr::decode_utf8; +/// use bstr::{B, decode_utf8}; /// -/// let mut bytes = &b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61"[..]; +/// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61"); /// let mut chars = vec![]; /// while !bytes.is_empty() { /// let (ch, size) = decode_utf8(bytes); @@ -529,9 +528,9 @@ pub fn decode>(slice: B) -> (Option, usize) { /// codepoint: /// /// ```ignore -/// use bstr::decode_utf8_lossy; +/// use bstr::{B, decode_utf8_lossy}; /// -/// let mut bytes = &b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61"[..]; +/// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61"); /// let mut chars = vec![]; /// while !bytes.is_empty() { /// let (ch, size) = decode_utf8_lossy(bytes); @@ -582,9 +581,9 @@ pub fn decode_lossy>(slice: B) -> (char, usize) { /// replacement codepoint: /// /// ``` -/// use bstr::decode_last_utf8; +/// use bstr::{B, decode_last_utf8}; /// -/// let mut bytes = &b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61"[..]; +/// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61"); /// let mut chars = vec![]; /// while !bytes.is_empty() { /// let (ch, size) = decode_last_utf8(bytes); @@ -655,7 +654,7 @@ pub fn decode_last>(slice: B) -> (Option, usize) { /// ```ignore /// use bstr::decode_last_utf8_lossy; /// -/// let mut bytes = &b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61"[..]; +/// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61"); /// let mut chars = vec![]; /// while !bytes.is_empty() { /// let (ch, size) = decode_last_utf8_lossy(bytes); @@ -694,7 +693,7 @@ fn is_leading_utf8_byte(b: u8) -> bool { mod tests { use std::char; - use bstr::B; + use ext_slice::{B, ByteSlice}; use tests::LOSSY_TESTS; use utf8::{self, Utf8Error};