From f358407dcfe4cb50a65110aad31bf88c87e72d39 Mon Sep 17 00:00:00 2001
From: Steve Klabnik <steve@steveklabnik.com>
Date: Mon, 22 Sep 2014 15:55:55 -0400
Subject: [PATCH] Some String Guide improvements.

Fixes #17340
---
 src/doc/guide-strings.md | 142 +++++++++++++++++++++++++++++++++------
 1 file changed, 120 insertions(+), 22 deletions(-)

diff --git a/src/doc/guide-strings.md b/src/doc/guide-strings.md
index afca75a76d444..44fc0d8304475 100644
--- a/src/doc/guide-strings.md
+++ b/src/doc/guide-strings.md
@@ -96,12 +96,11 @@ need, and it can make your lifetimes more complex.
 
 ## Generic functions
 
-To write a function that's generic over types of strings, use [the `Str`
-trait](http://doc.rust-lang.org/std/str/trait.Str.html):
+To write a function that's generic over types of strings, use `&str`.
 
 ```{rust}
-fn some_string_length<T: Str>(x: T) -> uint {
-        x.as_slice().len()
+fn some_string_length(x: &str) -> uint {
+        x.len()
 }
 
 fn main() {
@@ -111,15 +110,12 @@ fn main() {
 
     let s = "Hello, world".to_string();
 
-    println!("{}", some_string_length(s));
+    println!("{}", some_string_length(s.as_slice()));
 }
 ```
 
 Both of these lines will print `12`. 
 
-The only method that the `Str` trait has is `as_slice()`, which gives you
-access to a `&str` value from the underlying string.
-
 ## Comparisons
 
 To compare a String to a constant string, prefer `as_slice()`...
@@ -161,25 +157,93 @@ indexing is basically never what you want to do. The reason is that each
 character can be a variable number of bytes. This means that you have to iterate
 through the characters anyway, which is a O(n) operation. 
 
-To iterate over a string, use the `graphemes()` method on `&str`:
+There are three basic levels of Unicode (and its encodings):
+
+- code units, the underlying data type used to store everything
+- code points/Unicode scalar values (`char`)
+- graphemes (visible characters)
+
+Rust provides an iterator for each of these levels:
+
+- `.bytes()` will iterate over the underlying bytes
+- `.chars()` will iterate over the code points
+- `.graphemes()` will iterate over each grapheme
+
+Usually, the `graphemes()` method on `&str` is what you want:
 
 ```{rust}
-let s = "αἰθήρ";
+let s = "u͔n͈̰̎i̙̮͚̦c͚̉o̼̩̰͗d͔̆̓ͥé";
 
 for l in s.graphemes(true) {
     println!("{}", l);
 }
 ```
 
+This prints:
+
+```{notrust,ignore}
+u͔
+n͈̰̎
+i̙̮͚̦
+c͚̉
+o̼̩̰͗
+d͔̆̓ͥ
+é
+```
+
 Note that `l` has the type `&str` here, since a single grapheme can consist of
 multiple codepoints, so a `char` wouldn't be appropriate.
 
-This will print out each character in turn, as you'd expect: first "α", then
-"ἰ", etc. You can see that this is different than just the individual bytes.
-Here's a version that prints out each byte:
+This will print out each visible character in turn, as you'd expect: first "u͔", then
+"n͈̰̎", etc. If you want each individual codepoint of each grapheme, you can use `.chars()`:
 
 ```{rust}
-let s = "αἰθήρ";
+let s = "u͔n͈̰̎i̙̮͚̦c͚̉o̼̩̰͗d͔̆̓ͥé";
+
+for l in s.chars() {
+    println!("{}", l);
+}
+```
+
+This prints:
+
+```{notrust,ignore}
+u
+͔
+n
+̎
+͈
+̰
+i
+̙
+̮
+͚
+̦
+c
+̉
+͚
+o
+͗
+̼
+̩
+̰
+d
+̆
+̓
+ͥ
+͔
+e
+́
+```
+
+You can see how some of them are combining characters, and therefore the output
+looks a bit odd.
+
+If you want the individual byte representation of each codepoint, you can use
+`.bytes()`:
+
+```{rust}
+let s = "u͔n͈̰̎i̙̮͚̦c͚̉o̼̩̰͗d͔̆̓ͥé";
 
 for l in s.bytes() {
     println!("{}", l);
@@ -189,16 +253,50 @@ for l in s.bytes() {
 This will print:
 
 ```{notrust,ignore}
-206
-177
-225
-188
+117
+205
+148
+110
+204
+142
+205
+136
+204
 176
-206
-184
-206
+105
+204
+153
+204
 174
-207
+205
+154
+204
+166
+99
+204
+137
+205
+154
+111
+205
+151
+204
+188
+204
+169
+204
+176
+100
+204
+134
+205
+131
+205
+165
+205
+148
+101
+204
 129
 ```