Skip to content

Commit

Permalink
Auto merge of #32204 - alexcrichton:redesign-char-encoding-types, r=a…
Browse files Browse the repository at this point in the history
…turon

std: Change `encode_utf{8,16}` to return iterators

Currently these have non-traditional APIs which take a buffer and report how
much was filled in, but they're not necessarily ergonomic to use. Returning an
iterator which *also* exposes an underlying slice shouldn't result in any
performance loss as it's just a lazy version of the same implementation, and
it's also much more ergonomic!

cc #27784
  • Loading branch information
bors committed Mar 22, 2016
2 parents e3f2dfd + 48d5fe9 commit 0dcc413
Show file tree
Hide file tree
Showing 10 changed files with 195 additions and 201 deletions.
25 changes: 5 additions & 20 deletions src/libcollections/string.rs
Expand Up @@ -61,7 +61,6 @@ use core::iter::FromIterator;
use core::mem;
use core::ops::{self, Add, Index, IndexMut};
use core::ptr;
use core::slice;
use core::str::pattern::Pattern;
use rustc_unicode::char::{decode_utf16, REPLACEMENT_CHARACTER};
use rustc_unicode::str as unicode_str;
Expand Down Expand Up @@ -970,22 +969,7 @@ impl String {
pub fn push(&mut self, ch: char) {
match ch.len_utf8() {
1 => self.vec.push(ch as u8),
ch_len => {
let cur_len = self.len();
// This may use up to 4 bytes.
self.vec.reserve(ch_len);

unsafe {
// Attempt to not use an intermediate buffer by just pushing bytes
// directly onto this string.
let slice = slice::from_raw_parts_mut(self.vec
.as_mut_ptr()
.offset(cur_len as isize),
ch_len);
let used = ch.encode_utf8(slice).unwrap_or(0);
self.vec.set_len(cur_len + used);
}
}
_ => self.vec.extend_from_slice(ch.encode_utf8().as_slice()),
}
}

Expand Down Expand Up @@ -1136,9 +1120,10 @@ impl String {
let len = self.len();
assert!(idx <= len);
assert!(self.is_char_boundary(idx));
self.vec.reserve(4);
let mut bits = [0; 4];
let amt = ch.encode_utf8(&mut bits).unwrap();
let bits = ch.encode_utf8();
let bits = bits.as_slice();
let amt = bits.len();
self.vec.reserve(amt);

unsafe {
ptr::copy(self.vec.as_ptr().offset(idx as isize),
Expand Down
10 changes: 4 additions & 6 deletions src/libcollectionstest/str.rs
Expand Up @@ -794,10 +794,9 @@ fn test_rev_iterator() {

#[test]
fn test_chars_decoding() {
let mut bytes = [0; 4];
for c in (0..0x110000).filter_map(::std::char::from_u32) {
let len = c.encode_utf8(&mut bytes).unwrap_or(0);
let s = ::std::str::from_utf8(&bytes[..len]).unwrap();
let bytes = c.encode_utf8();
let s = ::std::str::from_utf8(bytes.as_slice()).unwrap();
if Some(c) != s.chars().next() {
panic!("character {:x}={} does not decode correctly", c as u32, c);
}
Expand All @@ -806,10 +805,9 @@ fn test_chars_decoding() {

#[test]
fn test_chars_rev_decoding() {
let mut bytes = [0; 4];
for c in (0..0x110000).filter_map(::std::char::from_u32) {
let len = c.encode_utf8(&mut bytes).unwrap_or(0);
let s = ::std::str::from_utf8(&bytes[..len]).unwrap();
let bytes = c.encode_utf8();
let s = ::std::str::from_utf8(bytes.as_slice()).unwrap();
if Some(c) != s.chars().rev().next() {
panic!("character {:x}={} does not decode correctly", c as u32, c);
}
Expand Down
189 changes: 119 additions & 70 deletions src/libcore/char.rs
Expand Up @@ -269,10 +269,10 @@ pub trait CharExt {
fn len_utf8(self) -> usize;
#[stable(feature = "core", since = "1.6.0")]
fn len_utf16(self) -> usize;
#[stable(feature = "core", since = "1.6.0")]
fn encode_utf8(self, dst: &mut [u8]) -> Option<usize>;
#[stable(feature = "core", since = "1.6.0")]
fn encode_utf16(self, dst: &mut [u16]) -> Option<usize>;
#[unstable(feature = "unicode", issue = "27784")]
fn encode_utf8(self) -> EncodeUtf8;
#[unstable(feature = "unicode", issue = "27784")]
fn encode_utf16(self) -> EncodeUtf16;
}

#[stable(feature = "core", since = "1.6.0")]
Expand Down Expand Up @@ -336,75 +336,47 @@ impl CharExt for char {
}

#[inline]
fn encode_utf8(self, dst: &mut [u8]) -> Option<usize> {
encode_utf8_raw(self as u32, dst)
fn encode_utf8(self) -> EncodeUtf8 {
let code = self as u32;
let mut buf = [0; 4];
let pos = if code < MAX_ONE_B {
buf[3] = code as u8;
3
} else if code < MAX_TWO_B {
buf[2] = (code >> 6 & 0x1F) as u8 | TAG_TWO_B;
buf[3] = (code & 0x3F) as u8 | TAG_CONT;
2
} else if code < MAX_THREE_B {
buf[1] = (code >> 12 & 0x0F) as u8 | TAG_THREE_B;
buf[2] = (code >> 6 & 0x3F) as u8 | TAG_CONT;
buf[3] = (code & 0x3F) as u8 | TAG_CONT;
1
} else {
buf[0] = (code >> 18 & 0x07) as u8 | TAG_FOUR_B;
buf[1] = (code >> 12 & 0x3F) as u8 | TAG_CONT;
buf[2] = (code >> 6 & 0x3F) as u8 | TAG_CONT;
buf[3] = (code & 0x3F) as u8 | TAG_CONT;
0
};
EncodeUtf8 { buf: buf, pos: pos }
}

#[inline]
fn encode_utf16(self, dst: &mut [u16]) -> Option<usize> {
encode_utf16_raw(self as u32, dst)
}
}

/// Encodes a raw u32 value as UTF-8 into the provided byte buffer,
/// and then returns the number of bytes written.
///
/// If the buffer is not large enough, nothing will be written into it
/// and a `None` will be returned.
#[inline]
#[unstable(feature = "char_internals",
reason = "this function should not be exposed publicly",
issue = "0")]
#[doc(hidden)]
pub fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> Option<usize> {
// Marked #[inline] to allow llvm optimizing it away
if code < MAX_ONE_B && !dst.is_empty() {
dst[0] = code as u8;
Some(1)
} else if code < MAX_TWO_B && dst.len() >= 2 {
dst[0] = (code >> 6 & 0x1F) as u8 | TAG_TWO_B;
dst[1] = (code & 0x3F) as u8 | TAG_CONT;
Some(2)
} else if code < MAX_THREE_B && dst.len() >= 3 {
dst[0] = (code >> 12 & 0x0F) as u8 | TAG_THREE_B;
dst[1] = (code >> 6 & 0x3F) as u8 | TAG_CONT;
dst[2] = (code & 0x3F) as u8 | TAG_CONT;
Some(3)
} else if dst.len() >= 4 {
dst[0] = (code >> 18 & 0x07) as u8 | TAG_FOUR_B;
dst[1] = (code >> 12 & 0x3F) as u8 | TAG_CONT;
dst[2] = (code >> 6 & 0x3F) as u8 | TAG_CONT;
dst[3] = (code & 0x3F) as u8 | TAG_CONT;
Some(4)
} else {
None
}
}

/// Encodes a raw u32 value as UTF-16 into the provided `u16` buffer,
/// and then returns the number of `u16`s written.
///
/// If the buffer is not large enough, nothing will be written into it
/// and a `None` will be returned.
#[inline]
#[unstable(feature = "char_internals",
reason = "this function should not be exposed publicly",
issue = "0")]
#[doc(hidden)]
pub fn encode_utf16_raw(mut ch: u32, dst: &mut [u16]) -> Option<usize> {
// Marked #[inline] to allow llvm optimizing it away
if (ch & 0xFFFF) == ch && !dst.is_empty() {
// The BMP falls through (assuming non-surrogate, as it should)
dst[0] = ch as u16;
Some(1)
} else if dst.len() >= 2 {
// Supplementary planes break into surrogates.
ch -= 0x1_0000;
dst[0] = 0xD800 | ((ch >> 10) as u16);
dst[1] = 0xDC00 | ((ch as u16) & 0x3FF);
Some(2)
} else {
None
fn encode_utf16(self) -> EncodeUtf16 {
let mut buf = [0; 2];
let mut code = self as u32;
let pos = if (code & 0xFFFF) == code {
// The BMP falls through (assuming non-surrogate, as it should)
buf[1] = code as u16;
1
} else {
// Supplementary planes break into surrogates.
code -= 0x1_0000;
buf[0] = 0xD800 | ((code >> 10) as u16);
buf[1] = 0xDC00 | ((code as u16) & 0x3FF);
0
};
EncodeUtf16 { buf: buf, pos: pos }
}
}

Expand Down Expand Up @@ -583,3 +555,80 @@ impl Iterator for EscapeDefault {
}
}
}

/// An iterator over `u8` entries representing the UTF-8 encoding of a `char`
/// value.
///
/// Constructed via the `.encode_utf8()` method on `char`.
///
/// Besides being iterated, the bytes that have not been yielded yet can be
/// borrowed all at once through `as_slice`.
#[unstable(feature = "unicode", issue = "27784")]
#[derive(Debug)]
pub struct EncodeUtf8 {
/// Backing storage for the encoded bytes. `encode_utf8` writes the sequence
/// right-aligned, so it always ends at `buf[3]`; leading unused slots stay 0.
buf: [u8; 4],
/// Index into `buf` of the next byte to yield; equal to `buf.len()` once the
/// iterator is exhausted.
pos: usize,
}

impl EncodeUtf8 {
    /// Borrows every byte that this iterator has not yielded yet.
    ///
    /// On a freshly constructed value this is the complete UTF-8 encoding of
    /// the character; the slice shrinks from the front as `next` is called.
    #[unstable(feature = "unicode", issue = "27784")]
    pub fn as_slice(&self) -> &[u8] {
        // Everything before `pos` is either padding or already consumed.
        let (_, remaining) = self.buf.split_at(self.pos);
        remaining
    }
}

#[unstable(feature = "unicode", issue = "27784")]
impl Iterator for EncodeUtf8 {
    type Item = u8;

    /// Yields the next encoded byte, or `None` once all of them were handed out.
    fn next(&mut self) -> Option<u8> {
        let idx = self.pos;
        if idx != self.buf.len() {
            let byte = self.buf[idx];
            self.pos = idx + 1;
            return Some(byte);
        }
        None
    }

    /// Reports the exact number of bytes still to be yielded.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let remaining = self.as_slice().len();
        (remaining, Some(remaining))
    }
}

/// An iterator over `u16` entries representing the UTF-16 encoding of a `char`
/// value.
///
/// Constructed via the `.encode_utf16()` method on `char`.
///
/// Besides being iterated, the code units that have not been yielded yet can
/// be borrowed all at once through `as_slice`.
#[unstable(feature = "unicode", issue = "27784")]
#[derive(Debug)]
pub struct EncodeUtf16 {
/// Backing storage for the encoded units. `encode_utf16` writes the sequence
/// right-aligned: a single unit lands in `buf[1]`, a surrogate pair fills
/// both slots.
buf: [u16; 2],
/// Index into `buf` of the next unit to yield; equal to `buf.len()` once the
/// iterator is exhausted.
pos: usize,
}

impl EncodeUtf16 {
    /// Borrows every `u16` code unit that this iterator has not yielded yet.
    ///
    /// On a freshly constructed value this is the complete UTF-16 encoding of
    /// the character; the slice shrinks from the front as `next` is called.
    #[unstable(feature = "unicode", issue = "27784")]
    pub fn as_slice(&self) -> &[u16] {
        // Everything before `pos` is either padding or already consumed.
        let (_, remaining) = self.buf.split_at(self.pos);
        remaining
    }
}


#[unstable(feature = "unicode", issue = "27784")]
impl Iterator for EncodeUtf16 {
    type Item = u16;

    /// Yields the next code unit, or `None` once all of them were handed out.
    fn next(&mut self) -> Option<u16> {
        let idx = self.pos;
        if idx != self.buf.len() {
            let unit = self.buf[idx];
            self.pos = idx + 1;
            return Some(unit);
        }
        None
    }

    /// Reports the exact number of code units still to be yielded.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let remaining = self.as_slice().len();
        (remaining, Some(remaining))
    }
}
27 changes: 13 additions & 14 deletions src/libcore/fmt/mod.rs
Expand Up @@ -99,9 +99,9 @@ pub trait Write {
/// This function will return an instance of `Error` on error.
#[stable(feature = "fmt_write_char", since = "1.1.0")]
fn write_char(&mut self, c: char) -> Result {
let mut utf_8 = [0u8; 4];
let bytes_written = c.encode_utf8(&mut utf_8).unwrap_or(0);
self.write_str(unsafe { str::from_utf8_unchecked(&utf_8[..bytes_written]) })
self.write_str(unsafe {
str::from_utf8_unchecked(c.encode_utf8().as_slice())
})
}

/// Glue for usage of the `write!` macro with implementors of this trait.
Expand Down Expand Up @@ -897,10 +897,9 @@ impl<'a> Formatter<'a> {
// Writes the sign if it exists, and then the prefix if it was requested
let write_prefix = |f: &mut Formatter| {
if let Some(c) = sign {
let mut b = [0; 4];
let n = c.encode_utf8(&mut b).unwrap_or(0);
let b = unsafe { str::from_utf8_unchecked(&b[..n]) };
try!(f.buf.write_str(b));
try!(f.buf.write_str(unsafe {
str::from_utf8_unchecked(c.encode_utf8().as_slice())
}));
}
if prefixed { f.buf.write_str(prefix) }
else { Ok(()) }
Expand Down Expand Up @@ -1003,9 +1002,10 @@ impl<'a> Formatter<'a> {
rt::v1::Alignment::Center => (padding / 2, (padding + 1) / 2),
};

let mut fill = [0; 4];
let len = self.fill.encode_utf8(&mut fill).unwrap_or(0);
let fill = unsafe { str::from_utf8_unchecked(&fill[..len]) };
let fill = self.fill.encode_utf8();
let fill = unsafe {
str::from_utf8_unchecked(fill.as_slice())
};

for _ in 0..pre_pad {
try!(self.buf.write_str(fill));
Expand Down Expand Up @@ -1391,10 +1391,9 @@ impl Display for char {
if f.width.is_none() && f.precision.is_none() {
f.write_char(*self)
} else {
let mut utf8 = [0; 4];
let amt = self.encode_utf8(&mut utf8).unwrap_or(0);
let s: &str = unsafe { str::from_utf8_unchecked(&utf8[..amt]) };
f.pad(s)
f.pad(unsafe {
str::from_utf8_unchecked(self.encode_utf8().as_slice())
})
}
}
}
Expand Down
14 changes: 8 additions & 6 deletions src/libcoretest/char.rs
Expand Up @@ -175,9 +175,10 @@ fn test_escape_unicode() {
#[test]
fn test_encode_utf8() {
fn check(input: char, expect: &[u8]) {
let mut buf = [0; 4];
let n = input.encode_utf8(&mut buf).unwrap_or(0);
assert_eq!(&buf[..n], expect);
assert_eq!(input.encode_utf8().as_slice(), expect);
for (a, b) in input.encode_utf8().zip(expect) {
assert_eq!(a, *b);
}
}

check('x', &[0x78]);
Expand All @@ -189,9 +190,10 @@ fn test_encode_utf8() {
#[test]
fn test_encode_utf16() {
fn check(input: char, expect: &[u16]) {
let mut buf = [0; 2];
let n = input.encode_utf16(&mut buf).unwrap_or(0);
assert_eq!(&buf[..n], expect);
assert_eq!(input.encode_utf16().as_slice(), expect);
for (a, b) in input.encode_utf16().zip(expect) {
assert_eq!(a, *b);
}
}

check('x', &[0x0078]);
Expand Down

0 comments on commit 0dcc413

Please sign in to comment.