Skip to content

Commit

Permalink
Refactor low-level UTF-16 decoding.
Browse files Browse the repository at this point in the history
* Rename `utf16_items` to `decode_utf16`. "Items" is meaningless.
* Move it to `rustc_unicode::char`, exposed in `std::char`.
* Generalize it to any `u16` iterable, not just `&[u16]`.
* Make it yield `Result` instead of a custom `Utf16Item` enum that was isomorphic to `Result`. This enables using the `FromIterator for Result` impl.
* Add a `REPLACEMENT_CHARACTER` constant.
* Document how `result.unwrap_or(REPLACEMENT_CHARACTER)` replaces `Utf16Item::to_char_lossy`.
  • Loading branch information
SimonSapin committed Aug 22, 2015
1 parent c408b78 commit 6174b8d
Show file tree
Hide file tree
Showing 10 changed files with 164 additions and 61 deletions.
1 change: 1 addition & 0 deletions src/libcollections/lib.rs
Expand Up @@ -56,6 +56,7 @@
#![feature(unicode)]
#![feature(unique)]
#![feature(unsafe_no_drop_flag, filling_drop)]
#![feature(decode_utf16)]
#![feature(utf8_error)]
#![cfg_attr(test, feature(rand, test))]

Expand Down
13 changes: 3 additions & 10 deletions src/libcollections/string.rs
Expand Up @@ -20,8 +20,8 @@ use core::ops::{self, Deref, Add, Index};
use core::ptr;
use core::slice;
use core::str::pattern::Pattern;
use rustc_unicode::char::{decode_utf16, REPLACEMENT_CHARACTER};
use rustc_unicode::str as unicode_str;
use rustc_unicode::str::Utf16Item;

use borrow::{Cow, IntoCow};
use range::RangeArgument;
Expand Down Expand Up @@ -267,14 +267,7 @@ impl String {
/// ```
#[stable(feature = "rust1", since = "1.0.0")]
pub fn from_utf16(v: &[u16]) -> Result<String, FromUtf16Error> {
let mut s = String::with_capacity(v.len());
for c in unicode_str::utf16_items(v) {
match c {
Utf16Item::ScalarValue(c) => s.push(c),
Utf16Item::LoneSurrogate(_) => return Err(FromUtf16Error(())),
}
}
Ok(s)
decode_utf16(v.iter().cloned()).collect::<Result<_, _>>().map_err(|_| FromUtf16Error(()))
}

/// Decode a UTF-16 encoded vector `v` into a string, replacing
Expand All @@ -294,7 +287,7 @@ impl String {
#[inline]
#[stable(feature = "rust1", since = "1.0.0")]
pub fn from_utf16_lossy(v: &[u16]) -> String {
unicode_str::utf16_items(v).map(|c| c.to_char_lossy()).collect()
decode_utf16(v.iter().cloned()).map(|r| r.unwrap_or(REPLACEMENT_CHARACTER)).collect()
}

/// Creates a new `String` from a length, capacity, and pointer.
Expand Down
9 changes: 9 additions & 0 deletions src/libcoretest/char.rs
Expand Up @@ -211,3 +211,12 @@ fn test_len_utf16() {
assert!('\u{a66e}'.len_utf16() == 1);
assert!('\u{1f4a9}'.len_utf16() == 2);
}

#[test]
fn test_decode_utf16() {
    // Decodes `s` in full and compares every yielded item against `expected`.
    fn check(s: &[u16], expected: &[Result<char, u16>]) {
        let decoded: Vec<_> = ::std::char::decode_utf16(s.iter().cloned()).collect();
        assert_eq!(decoded, expected);
    }

    // A lone leading surrogate is reported as `Err`, and the code units that
    // follow it are still decoded.
    check(&[0xD800, 0x41, 0x42], &[Err(0xD800), Ok('A'), Ok('B')]);
    // The code unit right after the lone surrogate must not be lost, even when it is NUL.
    check(&[0xD800, 0], &[Err(0xD800), Ok('\0')]);
}
1 change: 1 addition & 0 deletions src/libcoretest/lib.rs
Expand Up @@ -19,6 +19,7 @@
#![feature(float_from_str_radix)]
#![feature(flt2dec)]
#![feature(dec2flt)]
#![feature(decode_utf16)]
#![feature(fmt_radix)]
#![feature(iter_arith)]
#![feature(iter_arith)]
Expand Down
113 changes: 113 additions & 0 deletions src/librustc_unicode/char.rs
Expand Up @@ -503,3 +503,116 @@ impl char {
ToUppercase(CaseMappingIter::new(conversions::to_upper(self)))
}
}

/// An iterator that decodes UTF-16 encoded codepoints from an iterator of `u16`s.
///
/// Each item is `Ok(char)` for a decoded codepoint, or `Err(u16)` carrying a
/// surrogate code unit that could not be paired.
#[unstable(feature = "decode_utf16", reason = "recently exposed", issue = "27830")]
#[derive(Clone)]
pub struct DecodeUtf16<I> where I: Iterator<Item=u16> {
// The underlying source of UTF-16 code units.
iter: I,
// A code unit that was read while looking for the second half of a surrogate
// pair but was not a trailing surrogate; it is decoded by the next call to `next`.
buf: Option<u16>,
}

/// Builds a decoder over the UTF-16 code units produced by `iterable`.
///
/// The returned iterator yields `Ok(char)` for every successfully decoded
/// codepoint and `Err(u16)` for every unpaired surrogate.
///
/// # Examples
///
/// ```
/// #![feature(decode_utf16)]
///
/// use std::char::decode_utf16;
///
/// fn main() {
///     // 𝄞mus<invalid>ic<invalid>
///     let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
///              0x0073, 0xDD1E, 0x0069, 0x0063,
///              0xD834];
///
///     assert_eq!(decode_utf16(v.iter().cloned()).collect::<Vec<_>>(),
///                vec![Ok('𝄞'),
///                     Ok('m'), Ok('u'), Ok('s'),
///                     Err(0xDD1E),
///                     Ok('i'), Ok('c'),
///                     Err(0xD834)]);
/// }
/// ```
///
/// Mapping each `Err` to the replacement character gives a lossy decoder:
///
/// ```
/// #![feature(decode_utf16)]
///
/// use std::char::{decode_utf16, REPLACEMENT_CHARACTER};
///
/// fn main() {
///     // 𝄞mus<invalid>ic<invalid>
///     let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
///              0x0073, 0xDD1E, 0x0069, 0x0063,
///              0xD834];
///
///     assert_eq!(decode_utf16(v.iter().cloned())
///                    .map(|r| r.unwrap_or(REPLACEMENT_CHARACTER))
///                    .collect::<String>(),
///                "𝄞mus�ic�");
/// }
/// ```
#[unstable(feature = "decode_utf16", reason = "recently exposed", issue = "27830")]
#[inline]
pub fn decode_utf16<I>(iterable: I) -> DecodeUtf16<I::IntoIter>
    where I: IntoIterator<Item=u16>
{
    let units = iterable.into_iter();
    // Nothing has been read ahead yet, so the push-back slot starts out empty.
    DecodeUtf16 { iter: units, buf: None }
}

#[unstable(feature = "decode_utf16", reason = "recently exposed", issue = "27830")]
impl<I: Iterator<Item=u16>> Iterator for DecodeUtf16<I> {
    type Item = Result<char, u16>;

    /// Decodes the next codepoint from the underlying code units.
    ///
    /// Returns `Some(Ok(c))` for a non-surrogate code unit or a well-formed
    /// surrogate pair, `Some(Err(u))` for a surrogate `u` that is not part of a
    /// pair, and `None` once the input is exhausted.
    fn next(&mut self) -> Option<Result<char, u16>> {
        // A code unit pushed back by the previous call takes priority over
        // reading a fresh one from the underlying iterator.
        let u = match self.buf.take() {
            Some(buf) => buf,
            None => match self.iter.next() {
                Some(u) => u,
                None => return None
            }
        };

        if u < 0xD800 || 0xDFFF < u {
            // not a surrogate
            // SAFETY: `u` lies outside 0xD800..=0xDFFF, so it is a valid scalar value.
            Some(Ok(unsafe { from_u32_unchecked(u as u32) }))
        } else if u >= 0xDC00 {
            // a trailing surrogate with no leading surrogate in front of it
            Some(Err(u))
        } else {
            // `u` is a leading surrogate, so a trailing surrogate must follow.
            let u2 = match self.iter.next() {
                Some(u2) => u2,
                // eof
                None => return Some(Err(u))
            };
            if u2 < 0xDC00 || u2 > 0xDFFF {
                // not a trailing surrogate so we're not a valid
                // surrogate pair, so rewind to redecode u2 next time.
                self.buf = Some(u2);
                return Some(Err(u))
            }

            // all ok, so let's decode it.
            let c = (((u - 0xD800) as u32) << 10 | (u2 - 0xDC00) as u32) + 0x1_0000;
            // SAFETY: `u` is in 0xD800..=0xDBFF and `u2` is in 0xDC00..=0xDFFF,
            // so `c` lies in 0x1_0000..=0x10_FFFF and is a valid scalar value.
            Some(Ok(unsafe { from_u32_unchecked(c) }))
        }
    }

    /// Bounds the number of items still to be yielded.
    ///
    /// The pushed-back code unit in `buf`, if any, is counted in addition to
    /// what the underlying iterator reports; without it the upper bound could
    /// be smaller than the real number of remaining items.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let (low, high) = self.iter.size_hint();
        // A buffered code unit always results in exactly one more item: it is
        // either yielded on its own or as the leading half of a pair (it is
        // never a trailing surrogate, see `next`).
        let buffered = if self.buf.is_some() { 1 } else { 0 };
        // we could be entirely valid surrogates (2 elements per
        // char), or entirely non-surrogates (1 element per char)
        //
        // `low / 2 + 1` cannot overflow; an overflowing upper bound is
        // reported as `None`, i.e. unknown.
        (low / 2 + buffered, high.and_then(|h| h.checked_add(buffered)))
    }
}

/// U+FFFD REPLACEMENT CHARACTER (�) is used in Unicode to represent a decoding error.
///
/// It can occur, for example, when giving ill-formed UTF-8 bytes to `String::from_utf8_lossy`.
/// A lossy UTF-16 decoder can substitute it for each `Err` yielded by `decode_utf16`,
/// e.g. with `result.unwrap_or(REPLACEMENT_CHARACTER)`.
#[unstable(feature = "decode_utf16", reason = "recently added", issue = "27830")]
pub const REPLACEMENT_CHARACTER: char = '\u{FFFD}';
1 change: 1 addition & 0 deletions src/librustc_unicode/lib.rs
Expand Up @@ -46,6 +46,7 @@ mod tables;
mod u_str;
pub mod char;

#[allow(deprecated)]
pub mod str {
pub use u_str::{UnicodeStr, SplitWhitespace};
pub use u_str::{utf8_char_width, is_utf16, Utf16Items, Utf16Item};
Expand Down
63 changes: 24 additions & 39 deletions src/librustc_unicode/u_str.rs
Expand Up @@ -13,8 +13,9 @@
//! This module provides functionality to `str` that requires the Unicode methods provided by the
//! unicode parts of the CharExt trait.

use char::{DecodeUtf16, decode_utf16};
use core::char;
use core::iter::Filter;
use core::iter::{Cloned, Filter};
use core::slice;
use core::str::Split;

Expand Down Expand Up @@ -119,11 +120,18 @@ pub fn is_utf16(v: &[u16]) -> bool {

/// An iterator that decodes UTF-16 encoded codepoints from a vector
/// of `u16`s.
#[deprecated(since = "1.4.0", reason = "renamed to `char::DecodeUtf16`")]
#[unstable(feature = "decode_utf16", reason = "not exposed in std", issue = "27830")]
#[allow(deprecated)]
#[derive(Clone)]
pub struct Utf16Items<'a> {
iter: slice::Iter<'a, u16>
decoder: DecodeUtf16<Cloned<slice::Iter<'a, u16>>>
}

/// The possibilities for values decoded from a `u16` stream.
#[deprecated(since = "1.4.0", reason = "`char::DecodeUtf16` uses `Result<char, u16>` instead")]
#[unstable(feature = "decode_utf16", reason = "not exposed in std", issue = "27830")]
#[allow(deprecated)]
#[derive(Copy, PartialEq, Eq, Clone, Debug)]
pub enum Utf16Item {
/// A valid codepoint.
Expand All @@ -132,6 +140,7 @@ pub enum Utf16Item {
LoneSurrogate(u16)
}

#[allow(deprecated)]
impl Utf16Item {
/// Convert `self` to a `char`, taking `LoneSurrogate`s to the
/// replacement character (U+FFFD).
Expand All @@ -144,49 +153,22 @@ impl Utf16Item {
}
}

#[deprecated(since = "1.4.0", reason = "use `char::DecodeUtf16` instead")]
#[unstable(feature = "decode_utf16", reason = "not exposed in std", issue = "27830")]
#[allow(deprecated)]
impl<'a> Iterator for Utf16Items<'a> {
type Item = Utf16Item;

fn next(&mut self) -> Option<Utf16Item> {
let u = match self.iter.next() {
Some(u) => *u,
None => return None
};

if u < 0xD800 || 0xDFFF < u {
// not a surrogate
Some(Utf16Item::ScalarValue(unsafe { char::from_u32_unchecked(u as u32) }))
} else if u >= 0xDC00 {
// a trailing surrogate
Some(Utf16Item::LoneSurrogate(u))
} else {
// preserve state for rewinding.
let old = self.iter.clone();

let u2 = match self.iter.next() {
Some(u2) => *u2,
// eof
None => return Some(Utf16Item::LoneSurrogate(u))
};
if u2 < 0xDC00 || u2 > 0xDFFF {
// not a trailing surrogate so we're not a valid
// surrogate pair, so rewind to redecode u2 next time.
self.iter = old.clone();
return Some(Utf16Item::LoneSurrogate(u))
}

// all ok, so lets decode it.
let c = (((u - 0xD800) as u32) << 10 | (u2 - 0xDC00) as u32) + 0x1_0000;
Some(Utf16Item::ScalarValue(unsafe { char::from_u32_unchecked(c) }))
}
self.decoder.next().map(|result| match result {
Ok(c) => Utf16Item::ScalarValue(c),
Err(s) => Utf16Item::LoneSurrogate(s),
})
}

#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
let (low, high) = self.iter.size_hint();
// we could be entirely valid surrogates (2 elements per
// char), or entirely non-surrogates (1 element per char)
(low / 2, high)
self.decoder.size_hint()
}
}

Expand All @@ -196,7 +178,7 @@ impl<'a> Iterator for Utf16Items<'a> {
/// # Examples
///
/// ```
/// #![feature(unicode)]
/// #![feature(unicode, decode_utf16)]
///
/// extern crate rustc_unicode;
///
Expand All @@ -216,8 +198,11 @@ impl<'a> Iterator for Utf16Items<'a> {
/// LoneSurrogate(0xD834)]);
/// }
/// ```
#[deprecated(since = "1.4.0", reason = "renamed to `char::decode_utf16`")]
#[unstable(feature = "decode_utf16", reason = "not exposed in std", issue = "27830")]
#[allow(deprecated)]
pub fn utf16_items<'a>(v: &'a [u16]) -> Utf16Items<'a> {
Utf16Items { iter : v.iter() }
Utf16Items { decoder: decode_utf16(v.iter().cloned()) }
}

/// Iterator adaptor for encoding `char`s to UTF-16.
Expand Down
12 changes: 6 additions & 6 deletions src/libserialize/json.rs
Expand Up @@ -209,8 +209,6 @@ use std::str::FromStr;
use std::string;
use std::{char, f64, fmt, str};
use std;
use rustc_unicode::str as unicode_str;
use rustc_unicode::str::Utf16Item;

use Encodable;

Expand Down Expand Up @@ -1712,11 +1710,13 @@ impl<T: Iterator<Item=char>> Parser<T> {
_ => return self.error(UnexpectedEndOfHexEscape),
}

let buf = [n1, try!(self.decode_hex_escape())];
match unicode_str::utf16_items(&buf).next() {
Some(Utf16Item::ScalarValue(c)) => res.push(c),
_ => return self.error(LoneLeadingSurrogateInHexEscape),
let n2 = try!(self.decode_hex_escape());
if n2 < 0xDC00 || n2 > 0xDFFF {
return self.error(LoneLeadingSurrogateInHexEscape)
}
let c = (((n1 - 0xD800) as u32) << 10 |
(n2 - 0xDC00) as u32) + 0x1_0000;
res.push(char::from_u32(c).unwrap());
}

n => match char::from_u32(n as u32) {
Expand Down
1 change: 1 addition & 0 deletions src/libstd/lib.rs
Expand Up @@ -242,6 +242,7 @@
#![feature(unicode)]
#![feature(unique)]
#![feature(unsafe_no_drop_flag, filling_drop)]
#![feature(decode_utf16)]
#![feature(vec_push_all)]
#![feature(vec_resize)]
#![feature(wrapping)]
Expand Down
11 changes: 5 additions & 6 deletions src/libstd/sys/common/wtf8.rs
Expand Up @@ -37,7 +37,6 @@ use hash::{Hash, Hasher};
use iter::FromIterator;
use mem;
use ops;
use rustc_unicode::str::{Utf16Item, utf16_items};
use slice;
use str;
use string::String;
Expand Down Expand Up @@ -186,14 +185,14 @@ impl Wtf8Buf {
/// will always return the original code units.
pub fn from_wide(v: &[u16]) -> Wtf8Buf {
let mut string = Wtf8Buf::with_capacity(v.len());
for item in utf16_items(v) {
for item in char::decode_utf16(v.iter().cloned()) {
match item {
Utf16Item::ScalarValue(c) => string.push_char(c),
Utf16Item::LoneSurrogate(s) => {
Ok(ch) => string.push_char(ch),
Err(surrogate) => {
// Surrogates are known to be in the code point range.
let code_point = unsafe { CodePoint::from_u32_unchecked(s as u32) };
let code_point = unsafe { CodePoint::from_u32_unchecked(surrogate as u32) };
// Skip the WTF-8 concatenation check,
// surrogate pairs are already decoded by utf16_items
// surrogate pairs are already decoded by decode_utf16
string.push_code_point_unchecked(code_point)
}
}
Expand Down

0 comments on commit 6174b8d

Please sign in to comment.