Refactor low-level UTF-16 decoding.

* Rename `utf16_items` to `decode_utf16`. "Items" is meaningless.
* Move it to `rustc_unicode::char`, exposed in `std::char`.
* Generalize it to any `u16` iterable, not just `&[u16]`.
* Make it yield `Result` instead of a custom `Utf16Item` enum that was isomorphic to `Result`. This enables using the `FromIterator for Result` impl.
* Add a `REPLACEMENT_CHARACTER` constant.
* Document how `result.unwrap_or(REPLACEMENT_CHARACTER)` replaces `Utf16Item::to_char_lossy`.
rust-lang · Aug 22, 2015 · 6174b8d · 6174b8d
1 parent c408b78
commit 6174b8d
Show file tree

Hide file tree

Showing 10 changed files with 164 additions and 61 deletions.
diff --git a/src/libcollections/lib.rs b/src/libcollections/lib.rs
@@ -56,6 +56,7 @@
 #![feature(unicode)]
 #![feature(unique)]
 #![feature(unsafe_no_drop_flag, filling_drop)]
+#![feature(decode_utf16)]
 #![feature(utf8_error)]
 #![cfg_attr(test, feature(rand, test))]
 

diff --git a/src/libcollections/string.rs b/src/libcollections/string.rs
@@ -20,8 +20,8 @@ use core::ops::{self, Deref, Add, Index};
 use core::ptr;
 use core::slice;
 use core::str::pattern::Pattern;
+use rustc_unicode::char::{decode_utf16, REPLACEMENT_CHARACTER};
 use rustc_unicode::str as unicode_str;
-use rustc_unicode::str::Utf16Item;
 
 use borrow::{Cow, IntoCow};
 use range::RangeArgument;
@@ -267,14 +267,7 @@ impl String {
     /// ```
     #[stable(feature = "rust1", since = "1.0.0")]
     pub fn from_utf16(v: &[u16]) -> Result<String, FromUtf16Error> {
-        let mut s = String::with_capacity(v.len());
-        for c in unicode_str::utf16_items(v) {
-            match c {
-                Utf16Item::ScalarValue(c) => s.push(c),
-                Utf16Item::LoneSurrogate(_) => return Err(FromUtf16Error(())),
-            }
-        }
-        Ok(s)
+        decode_utf16(v.iter().cloned()).collect::<Result<_, _>>().map_err(|_| FromUtf16Error(()))
     }
 
     /// Decode a UTF-16 encoded vector `v` into a string, replacing
@@ -294,7 +287,7 @@ impl String {
     #[inline]
     #[stable(feature = "rust1", since = "1.0.0")]
     pub fn from_utf16_lossy(v: &[u16]) -> String {
-        unicode_str::utf16_items(v).map(|c| c.to_char_lossy()).collect()
+        decode_utf16(v.iter().cloned()).map(|r| r.unwrap_or(REPLACEMENT_CHARACTER)).collect()
     }
 
     /// Creates a new `String` from a length, capacity, and pointer.

diff --git a/src/libcoretest/char.rs b/src/libcoretest/char.rs
@@ -211,3 +211,12 @@ fn test_len_utf16() {
     assert!('\u{a66e}'.len_utf16() == 1);
     assert!('\u{1f4a9}'.len_utf16() == 2);
 }
+
+#[test]
+fn test_decode_utf16() {
+    fn check(s: &[u16], expected: &[Result<char, u16>]) {
+        assert_eq!(::std::char::decode_utf16(s.iter().cloned()).collect::<Vec<_>>(), expected);
+    }
+    check(&[0xD800, 0x41, 0x42], &[Err(0xD800), Ok('A'), Ok('B')]);
+    check(&[0xD800, 0], &[Err(0xD800), Ok('\0')]);
+}
diff --git a/src/libcoretest/lib.rs b/src/libcoretest/lib.rs
@@ -19,6 +19,7 @@
 #![feature(float_from_str_radix)]
 #![feature(flt2dec)]
 #![feature(dec2flt)]
+#![feature(decode_utf16)]
 #![feature(fmt_radix)]
 #![feature(iter_arith)]
 #![feature(iter_arith)]

diff --git a/src/librustc_unicode/char.rs b/src/librustc_unicode/char.rs
@@ -503,3 +503,116 @@ impl char {
         ToUppercase(CaseMappingIter::new(conversions::to_upper(self)))
     }
 }
+
+/// An iterator that decodes UTF-16 encoded codepoints from an iterator of `u16`s.
+#[unstable(feature = "decode_utf16", reason = "recently exposed", issue = "27830")]
+#[derive(Clone)]
+pub struct DecodeUtf16<I> where I: Iterator<Item=u16> {
+    iter: I, // source of UTF-16 code units
+    buf: Option<u16>, // unit read after an unpaired leading surrogate; decoded first on the next call
+}
+
+/// Create an iterator over the UTF-16 encoded codepoints in `iterable`,
+/// returning unpaired surrogates as `Err`s.
+///
+/// # Examples
+///
+/// ```
+/// #![feature(decode_utf16)]
+///
+/// use std::char::decode_utf16;
+///
+/// fn main() {
+///     // 𝄞mus<invalid>ic<invalid>
+///     let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
+///              0x0073, 0xDD1E, 0x0069, 0x0063,
+///              0xD834];
+///
+///     assert_eq!(decode_utf16(v.iter().cloned()).collect::<Vec<_>>(),
+///                vec![Ok('𝄞'),
+///                     Ok('m'), Ok('u'), Ok('s'),
+///                     Err(0xDD1E),
+///                     Ok('i'), Ok('c'),
+///                     Err(0xD834)]);
+/// }
+/// ```
+///
+/// A lossy decoder can be obtained by replacing `Err` results with the replacement character:
+///
+/// ```
+/// #![feature(decode_utf16)]
+///
+/// use std::char::{decode_utf16, REPLACEMENT_CHARACTER};
+///
+/// fn main() {
+///     // 𝄞mus<invalid>ic<invalid>
+///     let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
+///              0x0073, 0xDD1E, 0x0069, 0x0063,
+///              0xD834];
+///
+///     assert_eq!(decode_utf16(v.iter().cloned())
+///                    .map(|r| r.unwrap_or(REPLACEMENT_CHARACTER))
+///                    .collect::<String>(),
+///                "𝄞mus�ic�");
+/// }
+/// ```
+#[unstable(feature = "decode_utf16", reason = "recently exposed", issue = "27830")]
+#[inline]
+pub fn decode_utf16<I: IntoIterator<Item=u16>>(iterable: I) -> DecodeUtf16<I::IntoIter> {
+    DecodeUtf16 {
+        iter: iterable.into_iter(),
+        buf: None,
+    }
+}
+
+#[unstable(feature = "decode_utf16", reason = "recently exposed", issue = "27830")]
+impl<I: Iterator<Item=u16>> Iterator for DecodeUtf16<I> {
+    type Item = Result<char, u16>;
+
+    fn next(&mut self) -> Option<Result<char, u16>> {
+        let u = match self.buf.take() {
+            Some(buf) => buf,
+            None => match self.iter.next() {
+                Some(u) => u,
+                None => return None
+            }
+        };
+
+        if u < 0xD800 || 0xDFFF < u {
+            // SAFETY: not a surrogate, so `u` is itself a valid scalar value
+            Some(Ok(unsafe { from_u32_unchecked(u as u32) }))
+        } else if u >= 0xDC00 {
+            // a trailing surrogate with no leading surrogate before it
+            Some(Err(u))
+        } else {
+            let u2 = match self.iter.next() {
+                Some(u2) => u2,
+                // eof: the leading surrogate has nothing to pair with
+                None => return Some(Err(u))
+            };
+            if u2 < 0xDC00 || u2 > 0xDFFF {
+                // not a trailing surrogate so we're not a valid
+                // surrogate pair, so rewind to redecode u2 next time.
+                self.buf = Some(u2);
+                return Some(Err(u))
+            }
+
+            // SAFETY: a valid pair always decodes into 0x1_0000..=0x10_FFFF.
+            let c = (((u - 0xD800) as u32) << 10 | (u2 - 0xDC00) as u32) + 0x1_0000;
+            Some(Ok(unsafe { from_u32_unchecked(c) }))
+        }
+    }
+
+    #[inline]
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        let (low, high) = self.iter.size_hint();
+        // lower: all surrogate pairs (2 units per item); upper: 1 unit per
+        // item, plus the rewound unit in `buf` that `iter` no longer counts.
+        (low / 2, high.and_then(|h| h.checked_add(self.buf.is_some() as usize)))
+    }
+}
+
+/// U+FFFD REPLACEMENT CHARACTER (�) is used in Unicode to represent a decoding error.
+/// It can occur, for example, when giving ill-formed UTF-8 bytes to `String::from_utf8_lossy`.
+#[unstable(feature = "decode_utf16", reason = "recently added", issue = "27830")]
+pub const REPLACEMENT_CHARACTER: char = '\u{FFFD}';
diff --git a/src/librustc_unicode/lib.rs b/src/librustc_unicode/lib.rs
@@ -46,6 +46,7 @@ mod tables;
 mod u_str;
 pub mod char;
 
+#[allow(deprecated)]
 pub mod str {
     pub use u_str::{UnicodeStr, SplitWhitespace};
     pub use u_str::{utf8_char_width, is_utf16, Utf16Items, Utf16Item};

diff --git a/src/librustc_unicode/u_str.rs b/src/librustc_unicode/u_str.rs
@@ -13,8 +13,9 @@
 //! This module provides functionality to `str` that requires the Unicode methods provided by the
 //! unicode parts of the CharExt trait.
 
+use char::{DecodeUtf16, decode_utf16};
 use core::char;
-use core::iter::Filter;
+use core::iter::{Cloned, Filter};
 use core::slice;
 use core::str::Split;
 
@@ -119,11 +120,18 @@ pub fn is_utf16(v: &[u16]) -> bool {
 
 /// An iterator that decodes UTF-16 encoded codepoints from a vector
 /// of `u16`s.
+#[deprecated(since = "1.4.0", reason = "renamed to `char::DecodeUtf16`")]
+#[unstable(feature = "decode_utf16", reason = "not exposed in std", issue = "27830")]
+#[allow(deprecated)]
 #[derive(Clone)]
 pub struct Utf16Items<'a> {
-    iter: slice::Iter<'a, u16>
+    decoder: DecodeUtf16<Cloned<slice::Iter<'a, u16>>>
 }
+
 /// The possibilities for values decoded from a `u16` stream.
+#[deprecated(since = "1.4.0", reason = "`char::DecodeUtf16` uses `Result<char, u16>` instead")]
+#[unstable(feature = "decode_utf16", reason = "not exposed in std", issue = "27830")]
+#[allow(deprecated)]
 #[derive(Copy, PartialEq, Eq, Clone, Debug)]
 pub enum Utf16Item {
     /// A valid codepoint.
@@ -132,6 +140,7 @@ pub enum Utf16Item {
     LoneSurrogate(u16)
 }
 
+#[allow(deprecated)]
 impl Utf16Item {
     /// Convert `self` to a `char`, taking `LoneSurrogate`s to the
     /// replacement character (U+FFFD).
@@ -144,49 +153,22 @@ impl Utf16Item {
     }
 }
 
+#[deprecated(since = "1.4.0", reason = "use `char::DecodeUtf16` instead")]
+#[unstable(feature = "decode_utf16", reason = "not exposed in std", issue = "27830")]
+#[allow(deprecated)]
 impl<'a> Iterator for Utf16Items<'a> {
     type Item = Utf16Item;
 
     fn next(&mut self) -> Option<Utf16Item> {
-        let u = match self.iter.next() {
-            Some(u) => *u,
-            None => return None
-        };
-
-        if u < 0xD800 || 0xDFFF < u {
-            // not a surrogate
-            Some(Utf16Item::ScalarValue(unsafe { char::from_u32_unchecked(u as u32) }))
-        } else if u >= 0xDC00 {
-            // a trailing surrogate
-            Some(Utf16Item::LoneSurrogate(u))
-        } else {
-            // preserve state for rewinding.
-            let old = self.iter.clone();
-
-            let u2 = match self.iter.next() {
-                Some(u2) => *u2,
-                // eof
-                None => return Some(Utf16Item::LoneSurrogate(u))
-            };
-            if u2 < 0xDC00 || u2 > 0xDFFF {
-                // not a trailing surrogate so we're not a valid
-                // surrogate pair, so rewind to redecode u2 next time.
-                self.iter = old.clone();
-                return Some(Utf16Item::LoneSurrogate(u))
-            }
-
-            // all ok, so lets decode it.
-            let c = (((u - 0xD800) as u32) << 10 | (u2 - 0xDC00) as u32) + 0x1_0000;
-            Some(Utf16Item::ScalarValue(unsafe { char::from_u32_unchecked(c) }))
-        }
+        self.decoder.next().map(|result| match result {
+            Ok(c) => Utf16Item::ScalarValue(c),
+            Err(s) => Utf16Item::LoneSurrogate(s),
+        })
     }
 
     #[inline]
     fn size_hint(&self) -> (usize, Option<usize>) {
-        let (low, high) = self.iter.size_hint();
-        // we could be entirely valid surrogates (2 elements per
-        // char), or entirely non-surrogates (1 element per char)
-        (low / 2, high)
+        self.decoder.size_hint()
     }
 }
 
@@ -196,7 +178,7 @@ impl<'a> Iterator for Utf16Items<'a> {
 /// # Examples
 ///
 /// ```
-/// #![feature(unicode)]
+/// #![feature(unicode, decode_utf16)]
 ///
 /// extern crate rustc_unicode;
 ///
@@ -216,8 +198,11 @@ impl<'a> Iterator for Utf16Items<'a> {
 ///                     LoneSurrogate(0xD834)]);
 /// }
 /// ```
+#[deprecated(since = "1.4.0", reason = "renamed to `char::decode_utf16`")]
+#[unstable(feature = "decode_utf16", reason = "not exposed in std", issue = "27830")]
+#[allow(deprecated)]
 pub fn utf16_items<'a>(v: &'a [u16]) -> Utf16Items<'a> {
-    Utf16Items { iter : v.iter() }
+    Utf16Items { decoder: decode_utf16(v.iter().cloned()) }
 }
 
 /// Iterator adaptor for encoding `char`s to UTF-16.

diff --git a/src/libserialize/json.rs b/src/libserialize/json.rs
@@ -209,8 +209,6 @@ use std::str::FromStr;
 use std::string;
 use std::{char, f64, fmt, str};
 use std;
-use rustc_unicode::str as unicode_str;
-use rustc_unicode::str::Utf16Item;
 
 use Encodable;
 
@@ -1712,11 +1710,13 @@ impl<T: Iterator<Item=char>> Parser<T> {
                                 _ => return self.error(UnexpectedEndOfHexEscape),
                             }
 
-                            let buf = [n1, try!(self.decode_hex_escape())];
-                            match unicode_str::utf16_items(&buf).next() {
-                                Some(Utf16Item::ScalarValue(c)) => res.push(c),
-                                _ => return self.error(LoneLeadingSurrogateInHexEscape),
+                            let n2 = try!(self.decode_hex_escape());
+                            if n2 < 0xDC00 || n2 > 0xDFFF {
+                                return self.error(LoneLeadingSurrogateInHexEscape)
                             }
+                            let c = (((n1 - 0xD800) as u32) << 10 |
+                                     (n2 - 0xDC00) as u32) + 0x1_0000;
+                            res.push(char::from_u32(c).unwrap());
                         }
 
                         n => match char::from_u32(n as u32) {

diff --git a/src/libstd/lib.rs b/src/libstd/lib.rs
@@ -242,6 +242,7 @@
 #![feature(unicode)]
 #![feature(unique)]
 #![feature(unsafe_no_drop_flag, filling_drop)]
+#![feature(decode_utf16)]
 #![feature(vec_push_all)]
 #![feature(vec_resize)]
 #![feature(wrapping)]

diff --git a/src/libstd/sys/common/wtf8.rs b/src/libstd/sys/common/wtf8.rs
@@ -37,7 +37,6 @@ use hash::{Hash, Hasher};
 use iter::FromIterator;
 use mem;
 use ops;
-use rustc_unicode::str::{Utf16Item, utf16_items};
 use slice;
 use str;
 use string::String;
@@ -186,14 +185,14 @@ impl Wtf8Buf {
     /// will always return the original code units.
     pub fn from_wide(v: &[u16]) -> Wtf8Buf {
         let mut string = Wtf8Buf::with_capacity(v.len());
-        for item in utf16_items(v) {
+        for item in char::decode_utf16(v.iter().cloned()) {
             match item {
-                Utf16Item::ScalarValue(c) => string.push_char(c),
-                Utf16Item::LoneSurrogate(s) => {
+                Ok(ch) => string.push_char(ch),
+                Err(surrogate) => {
                     // Surrogates are known to be in the code point range.
-                    let code_point = unsafe { CodePoint::from_u32_unchecked(s as u32) };
+                    let code_point = unsafe { CodePoint::from_u32_unchecked(surrogate as u32) };
                     // Skip the WTF-8 concatenation check,
-                    // surrogate pairs are already decoded by utf16_items
+                    // surrogate pairs are already decoded by decode_utf16
                     string.push_code_point_unchecked(code_point)
                 }
             }