Skip to content

Commit

Permalink
Auto merge of #21488 - aturon:os-str, r=alexcrichton
Browse files Browse the repository at this point in the history
Per [RFC 517](rust-lang/rfcs#575), this commit introduces platform-native strings. The API is essentially as described in the RFC.

The WTF-8 implementation is adapted from @SimonSapin's [implementation](https://github.com/SimonSapin/rust-wtf8). To make this work, some encodign and decoding functionality in `libcore` is now exported in a "raw" fashion reusable for WTF-8. These exports are *not* reexported in `std`, nor are they stable.
  • Loading branch information
bors committed Jan 24, 2015
2 parents 76fbb35 + c5369eb commit bb7cc4e
Show file tree
Hide file tree
Showing 12 changed files with 1,850 additions and 92 deletions.
96 changes: 58 additions & 38 deletions src/libcore/char.rs
Expand Up @@ -258,49 +258,69 @@ impl CharExt for char {
#[inline]
#[unstable = "pending decision about Iterator/Writer/Reader"]
fn encode_utf8(self, dst: &mut [u8]) -> Option<uint> {
// Marked #[inline] to allow llvm optimizing it away
let code = self as u32;
if code < MAX_ONE_B && dst.len() >= 1 {
dst[0] = code as u8;
Some(1)
} else if code < MAX_TWO_B && dst.len() >= 2 {
dst[0] = (code >> 6u & 0x1F_u32) as u8 | TAG_TWO_B;
dst[1] = (code & 0x3F_u32) as u8 | TAG_CONT;
Some(2)
} else if code < MAX_THREE_B && dst.len() >= 3 {
dst[0] = (code >> 12u & 0x0F_u32) as u8 | TAG_THREE_B;
dst[1] = (code >> 6u & 0x3F_u32) as u8 | TAG_CONT;
dst[2] = (code & 0x3F_u32) as u8 | TAG_CONT;
Some(3)
} else if dst.len() >= 4 {
dst[0] = (code >> 18u & 0x07_u32) as u8 | TAG_FOUR_B;
dst[1] = (code >> 12u & 0x3F_u32) as u8 | TAG_CONT;
dst[2] = (code >> 6u & 0x3F_u32) as u8 | TAG_CONT;
dst[3] = (code & 0x3F_u32) as u8 | TAG_CONT;
Some(4)
} else {
None
}
encode_utf8_raw(self as u32, dst)
}

#[inline]
#[unstable = "pending decision about Iterator/Writer/Reader"]
fn encode_utf16(self, dst: &mut [u16]) -> Option<uint> {
// Marked #[inline] to allow llvm optimizing it away
let mut ch = self as u32;
if (ch & 0xFFFF_u32) == ch && dst.len() >= 1 {
// The BMP falls through (assuming non-surrogate, as it should)
dst[0] = ch as u16;
Some(1)
} else if dst.len() >= 2 {
// Supplementary planes break into surrogates.
ch -= 0x1_0000_u32;
dst[0] = 0xD800_u16 | ((ch >> 10) as u16);
dst[1] = 0xDC00_u16 | ((ch as u16) & 0x3FF_u16);
Some(2)
} else {
None
}
encode_utf16_raw(self as u32, dst)
}
}

/// Encodes a raw u32 value as UTF-8 into the provided byte buffer,
/// and then returns the number of bytes written.
///
/// If the buffer is not large enough, nothing will be written into it
/// and a `None` will be returned.
#[inline]
#[unstable]
pub fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> Option<uint> {
// Marked #[inline] to allow llvm optimizing it away
if code < MAX_ONE_B && dst.len() >= 1 {
dst[0] = code as u8;
Some(1)
} else if code < MAX_TWO_B && dst.len() >= 2 {
dst[0] = (code >> 6u & 0x1F_u32) as u8 | TAG_TWO_B;
dst[1] = (code & 0x3F_u32) as u8 | TAG_CONT;
Some(2)
} else if code < MAX_THREE_B && dst.len() >= 3 {
dst[0] = (code >> 12u & 0x0F_u32) as u8 | TAG_THREE_B;
dst[1] = (code >> 6u & 0x3F_u32) as u8 | TAG_CONT;
dst[2] = (code & 0x3F_u32) as u8 | TAG_CONT;
Some(3)
} else if dst.len() >= 4 {
dst[0] = (code >> 18u & 0x07_u32) as u8 | TAG_FOUR_B;
dst[1] = (code >> 12u & 0x3F_u32) as u8 | TAG_CONT;
dst[2] = (code >> 6u & 0x3F_u32) as u8 | TAG_CONT;
dst[3] = (code & 0x3F_u32) as u8 | TAG_CONT;
Some(4)
} else {
None
}
}

/// Encodes a raw u32 value as UTF-16 into the provided `u16` buffer,
/// and then returns the number of `u16`s written.
///
/// If the buffer is not large enough, nothing will be written into it
/// and a `None` will be returned.
#[inline]
#[unstable]
pub fn encode_utf16_raw(mut ch: u32, dst: &mut [u16]) -> Option<uint> {
// Marked #[inline] to allow llvm optimizing it away
if (ch & 0xFFFF_u32) == ch && dst.len() >= 1 {
// The BMP falls through (assuming non-surrogate, as it should)
dst[0] = ch as u16;
Some(1)
} else if dst.len() >= 2 {
// Supplementary planes break into surrogates.
ch -= 0x1_0000_u32;
dst[0] = 0xD800_u16 | ((ch >> 10) as u16);
dst[1] = 0xDC00_u16 | ((ch as u16) & 0x3FF_u16);
Some(2)
} else {
None
}
}

Expand Down
116 changes: 67 additions & 49 deletions src/libcore/str/mod.rs
Expand Up @@ -305,43 +305,52 @@ fn unwrap_or_0(opt: Option<&u8>) -> u8 {
}
}

/// Reads the next code point out of a byte iterator (assuming a
/// UTF-8-like encoding).
#[unstable]
pub fn next_code_point(bytes: &mut slice::Iter<u8>) -> Option<u32> {
// Decode UTF-8
let x = match bytes.next() {
None => return None,
Some(&next_byte) if next_byte < 128 => return Some(next_byte as u32),
Some(&next_byte) => next_byte,
};

// Multibyte case follows
// Decode from a byte combination out of: [[[x y] z] w]
// NOTE: Performance is sensitive to the exact formulation here
let init = utf8_first_byte!(x, 2);
let y = unwrap_or_0(bytes.next());
let mut ch = utf8_acc_cont_byte!(init, y);
if x >= 0xE0 {
// [[x y z] w] case
// 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
let z = unwrap_or_0(bytes.next());
let y_z = utf8_acc_cont_byte!((y & CONT_MASK) as u32, z);
ch = init << 12 | y_z;
if x >= 0xF0 {
// [x y z w] case
// use only the lower 3 bits of `init`
let w = unwrap_or_0(bytes.next());
ch = (init & 7) << 18 | utf8_acc_cont_byte!(y_z, w);
}
}

Some(ch)
}

#[stable]
impl<'a> Iterator for Chars<'a> {
type Item = char;

#[inline]
fn next(&mut self) -> Option<char> {
// Decode UTF-8, using the valid UTF-8 invariant
let x = match self.iter.next() {
None => return None,
Some(&next_byte) if next_byte < 128 => return Some(next_byte as char),
Some(&next_byte) => next_byte,
};

// Multibyte case follows
// Decode from a byte combination out of: [[[x y] z] w]
// NOTE: Performance is sensitive to the exact formulation here
let init = utf8_first_byte!(x, 2);
let y = unwrap_or_0(self.iter.next());
let mut ch = utf8_acc_cont_byte!(init, y);
if x >= 0xE0 {
// [[x y z] w] case
// 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
let z = unwrap_or_0(self.iter.next());
let y_z = utf8_acc_cont_byte!((y & CONT_MASK) as u32, z);
ch = init << 12 | y_z;
if x >= 0xF0 {
// [x y z w] case
// use only the lower 3 bits of `init`
let w = unwrap_or_0(self.iter.next());
ch = (init & 7) << 18 | utf8_acc_cont_byte!(y_z, w);
next_code_point(&mut self.iter).map(|ch| {
// str invariant says `ch` is a valid Unicode Scalar Value
unsafe {
mem::transmute(ch)
}
}

// str invariant says `ch` is a valid Unicode Scalar Value
unsafe {
Some(mem::transmute(ch))
}
})
}

#[inline]
Expand Down Expand Up @@ -1517,25 +1526,8 @@ impl StrExt for str {

#[inline]
fn char_range_at(&self, i: uint) -> CharRange {
if self.as_bytes()[i] < 128u8 {
return CharRange {ch: self.as_bytes()[i] as char, next: i + 1 };
}

// Multibyte case is a fn to allow char_range_at to inline cleanly
fn multibyte_char_range_at(s: &str, i: uint) -> CharRange {
let mut val = s.as_bytes()[i] as u32;
let w = UTF8_CHAR_WIDTH[val as uint] as uint;
assert!((w != 0));

val = utf8_first_byte!(val, w);
val = utf8_acc_cont_byte!(val, s.as_bytes()[i + 1]);
if w > 2 { val = utf8_acc_cont_byte!(val, s.as_bytes()[i + 2]); }
if w > 3 { val = utf8_acc_cont_byte!(val, s.as_bytes()[i + 3]); }

return CharRange {ch: unsafe { mem::transmute(val) }, next: i + w};
}

return multibyte_char_range_at(self, i);
let (c, n) = char_range_at_raw(self.as_bytes(), i);
CharRange { ch: unsafe { mem::transmute(c) }, next: n }
}

#[inline]
Expand Down Expand Up @@ -1653,6 +1645,32 @@ impl StrExt for str {
fn parse<T: FromStr>(&self) -> Option<T> { FromStr::from_str(self) }
}

/// Pluck a code point out of a UTF-8-like byte slice and return the
/// index of the next code point.
#[inline]
#[unstable]
pub fn char_range_at_raw(bytes: &[u8], i: uint) -> (u32, usize) {
if bytes[i] < 128u8 {
return (bytes[i] as u32, i + 1);
}

// Multibyte case is a fn to allow char_range_at to inline cleanly
fn multibyte_char_range_at(bytes: &[u8], i: uint) -> (u32, usize) {
let mut val = bytes[i] as u32;
let w = UTF8_CHAR_WIDTH[val as uint] as uint;
assert!((w != 0));

val = utf8_first_byte!(val, w);
val = utf8_acc_cont_byte!(val, bytes[i + 1]);
if w > 2 { val = utf8_acc_cont_byte!(val, bytes[i + 2]); }
if w > 3 { val = utf8_acc_cont_byte!(val, bytes[i + 3]); }

return (val, i + w);
}

multibyte_char_range_at(bytes, i)
}

#[stable]
impl<'a> Default for &'a str {
#[stable]
Expand Down
5 changes: 5 additions & 0 deletions src/libstd/ffi/mod.rs
Expand Up @@ -17,4 +17,9 @@ pub use self::c_str::CString;
pub use self::c_str::c_str_to_bytes;
pub use self::c_str::c_str_to_bytes_with_nul;

pub use self::os_str::OsString;
pub use self::os_str::OsStr;
pub use self::os_str::AsOsStr;

mod c_str;
mod os_str;

0 comments on commit bb7cc4e

Please sign in to comment.