From 89feb6d5fd38aa9b493d6fc3ca5b546c373aac31 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pawe=C5=82=20Romanowski?= <pawroman@gmail.com>
Date: Thu, 18 Apr 2019 15:30:50 +0200
Subject: [PATCH 1/6] Clean up unicode.py script

---
 .gitignore                     |   1 +
 src/libcore/unicode/unicode.py | 372 ++++++++++++++++++++++++---------
 2 files changed, 270 insertions(+), 103 deletions(-)

diff --git a/.gitignore b/.gitignore
index 67e0dd8e795bb..51f3e722ca7d8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -36,6 +36,7 @@ __pycache__/
 /src/libcore/unicode/Scripts.txt
 /src/libcore/unicode/SpecialCasing.txt
 /src/libcore/unicode/UnicodeData.txt
+/src/libcore/unicode/downloaded
 /stage[0-9]+/
 /target
 target/
diff --git a/src/libcore/unicode/unicode.py b/src/libcore/unicode/unicode.py
index ae356c3ff4459..97c11fb795ea8 100755
--- a/src/libcore/unicode/unicode.py
+++ b/src/libcore/unicode/unicode.py
@@ -1,35 +1,71 @@
 #!/usr/bin/env python
 
-# This script uses the following Unicode tables:
-# - DerivedCoreProperties.txt
-# - DerivedNormalizationProps.txt
-# - EastAsianWidth.txt
-# - auxiliary/GraphemeBreakProperty.txt
-# - PropList.txt
-# - ReadMe.txt
-# - Scripts.txt
-# - UnicodeData.txt
-#
+"""
+Regenerate Unicode tables (tables.rs).
+"""
+
+# This script uses the Unicode tables as defined
+# in the UnicodeFiles class.
+
 # Since this should not require frequent updates, we just store this
 # out-of-line and check the tables.rs file into git.
 
-import fileinput, re, os, sys, operator, math, datetime
+# Note that the "curl" program is required for operation.
+# This script is compatible with Python 2.7 and 3.x.
+
+import argparse
+import datetime
+import fileinput
+import operator
+import os
+import re
+import textwrap
+import subprocess
+
+from collections import namedtuple
+
+
+# we don't use enum.Enum because of Python 2.7 compatibility
+class UnicodeFiles(object):
+    # ReadMe does not contain any unicode data, we
+    # use it to extract versions.
+    README = "ReadMe.txt"
+
+    DERIVED_CORE_PROPERTIES = "DerivedCoreProperties.txt"
+    DERIVED_NORMALIZATION_PROPS = "DerivedNormalizationProps.txt"
+    SPECIAL_CASING = "SpecialCasing.txt"
+    SCRIPTS = "Scripts.txt"
+    PROPS = "PropList.txt"
+    UNICODE_DATA = "UnicodeData.txt"
+
+
+UnicodeFiles.ALL_FILES = tuple(
+    getattr(UnicodeFiles, name) for name in dir(UnicodeFiles)
+    if not name.startswith("_")
+)
 
-# The directory in which this file resides.
-fdir = os.path.dirname(os.path.realpath(__file__)) + "/"
+# The directory this file is located in.
+THIS_DIR = os.path.dirname(os.path.realpath(__file__))
 
-preamble = '''
+# Where to download the Unicode data.  The downloaded files
+# will be placed in sub-directories named after Unicode version.
+FETCH_DIR = os.path.join(THIS_DIR, "downloaded")
+
+FETCH_URL_LATEST = "ftp://ftp.unicode.org/Public/UNIDATA/{filename}"
+FETCH_URL_VERSION = "ftp://ftp.unicode.org/Public/{version}/ucd/{filename}"
+
+PREAMBLE = """\
 // NOTE: The following code was generated by "./unicode.py", do not edit directly
 
 #![allow(missing_docs, non_upper_case_globals, non_snake_case)]
 
 use unicode::version::UnicodeVersion;
 use unicode::bool_trie::{{BoolTrie, SmallBoolTrie}};
-'''.format(year = datetime.datetime.now().year)
+""".format(year=datetime.datetime.now().year)
 
 # Mapping taken from Table 12 from:
 # http://www.unicode.org/reports/tr44/#General_Category_Values
-expanded_categories = {
+EXPANDED_CATEGORIES = {
     'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
     'Lm': ['L'], 'Lo': ['L'],
     'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
@@ -42,22 +78,101 @@
 }
 
 # these are the surrogate codepoints, which are not valid rust characters
-surrogate_codepoints = (0xd800, 0xdfff)
+SURROGATE_CODEPOINTS = (0xd800, 0xdfff)
+
+UnicodeData = namedtuple(
+    "UnicodeData", ("canon_decomp", "compat_decomp", "gencats", "combines",
+                    "to_upper", "to_lower", "to_title", )
+)
+
+UnicodeVersion = namedtuple(
+    "UnicodeVersion", ("major", "minor", "micro", "as_str")
+)
+
+
+def fetch_files(version=None):
+    """
+    Fetch all the Unicode files from unicode.org
+
+    :param version: The desired Unicode version, as string.
+        (If None, defaults to latest final release available).
+    :return: The version downloaded (UnicodeVersion object).
+    """
+    have_version = should_skip_fetch(version)
+    if have_version:
+        return have_version
+
+    if version:
+        # check if the desired version exists on the server
+        get_fetch_url = lambda name: FETCH_URL_VERSION.format(version=version, filename=name)
+    else:
+        # extract the latest version
+        get_fetch_url = lambda name: FETCH_URL_LATEST.format(filename=name)
+
+    readme_url = get_fetch_url(UnicodeFiles.README)
+
+    print("Fetching: {}".format(readme_url))
+    readme_content = subprocess.check_output(("curl", readme_url))
+
+    unicode_version = parse_unicode_version(
+        str(readme_content, "utf8")
+    )
+
+    download_dir = os.path.join(FETCH_DIR, unicode_version.as_str)
+    if not os.path.exists(download_dir):
+        # for 2.7 compat, we don't use exist_ok=True
+        os.makedirs(download_dir)
+
+    for filename in UnicodeFiles.ALL_FILES:
+        file_path = os.path.join(download_dir, filename)
+
+        if filename == UnicodeFiles.README:
+            with open(file_path, "wb") as fd:
+                fd.write(readme_content)
+        elif not os.path.exists(file_path):
+            url = get_fetch_url(filename)
+            print("Fetching: {}".format(url))
+            subprocess.check_call(("curl", "-o", file_path, url))
+
+    return unicode_version
+
+
+def should_skip_fetch(version):
+    if not version:
+        # should always check latest version
+        return False
+
+    fetch_dir = os.path.join(FETCH_DIR, version)
+
+    for filename in UnicodeFiles.ALL_FILES:
+        file_path = os.path.join(fetch_dir, filename)
+
+        if not os.path.exists(file_path):
+            return False
+
+    with open(os.path.join(fetch_dir, UnicodeFiles.README)) as fd:
+        return parse_unicode_version(fd.read())
+
+
+def parse_unicode_version(readme_content):
+    # "raw string" is necessary for \d not being treated as escape char
+    # (for the sake of compat with future Python versions)
+    # see: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior
+    pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
+    groups = re.search(pattern, readme_content).groups()
+
+    return UnicodeVersion(*map(int, groups), as_str=".".join(groups))
+
 
-def fetch(f):
-    path = fdir + os.path.basename(f)
-    if not os.path.exists(path):
-        os.system("curl -o {0}{1} ftp://ftp.unicode.org/Public/UNIDATA/{1}".format(fdir, f))
+def get_unicode_file_path(unicode_version, filename):
+    return os.path.join(FETCH_DIR, unicode_version.as_str, filename)
 
-    if not os.path.exists(path):
-        sys.stderr.write("cannot load %s" % f)
-        exit(1)
 
 def is_surrogate(n):
-    return surrogate_codepoints[0] <= n <= surrogate_codepoints[1]
+    return SURROGATE_CODEPOINTS[0] <= n <= SURROGATE_CODEPOINTS[1]
 
-def load_unicode_data(f):
-    fetch(f)
+
+def load_unicode_data(file_path):
     gencats = {}
     to_lower = {}
     to_upper = {}
@@ -68,8 +183,8 @@ def load_unicode_data(f):
 
     udict = {}
     range_start = -1
-    for line in fileinput.input(fdir + f):
-        data = line.split(';')
+    for line in fileinput.input(file_path):
+        data = line.split(";")
         if len(data) != 15:
             continue
         cp = int(data[0], 16)
@@ -104,7 +219,7 @@ def load_unicode_data(f):
 
         # store decomposition, if given
         if decomp != "":
-            if decomp.startswith('<'):
+            if decomp.startswith("<"):
                 seq = []
                 for i in decomp.split()[1:]:
                     seq.append(int(i, 16))
@@ -116,7 +231,7 @@ def load_unicode_data(f):
                 canon_decomp[code] = seq
 
         # place letter in categories as appropriate
-        for cat in [gencat, "Assigned"] + expanded_categories.get(gencat, []):
+        for cat in [gencat, "Assigned"] + EXPANDED_CATEGORIES.get(gencat, []):
             if cat not in gencats:
                 gencats[cat] = []
             gencats[cat].append(code)
@@ -136,12 +251,15 @@ def load_unicode_data(f):
     gencats = group_cats(gencats)
     combines = to_combines(group_cats(combines))
 
-    return (canon_decomp, compat_decomp, gencats, combines, to_upper, to_lower, to_title)
+    return UnicodeData(
+        canon_decomp, compat_decomp, gencats, combines, to_upper,
+        to_lower, to_title,
+    )
+
 
-def load_special_casing(f, to_upper, to_lower, to_title):
-    fetch(f)
-    for line in fileinput.input(fdir + f):
-        data = line.split('#')[0].split(';')
+def load_special_casing(file_path, unicode_data):
+    for line in fileinput.input(file_path):
+        data = line.split("#")[0].split(";")
         if len(data) == 5:
             code, lower, title, upper, _comment = data
         elif len(data) == 6:
@@ -155,7 +273,9 @@ def load_special_casing(f, to_upper, to_lower, to_title):
         title = title.strip()
         upper = upper.strip()
         key = int(code, 16)
-        for (map_, values) in [(to_lower, lower), (to_upper, upper), (to_title, title)]:
+        for (map_, values) in ((unicode_data.to_lower, lower),
+                               (unicode_data.to_upper, upper),
+                               (unicode_data.to_title, title)):
             if values != code:
                 values = [int(i, 16) for i in values.split()]
                 for _ in range(len(values), 3):
@@ -163,12 +283,14 @@ def load_special_casing(f, to_upper, to_lower, to_title):
                 assert len(values) == 3
                 map_[key] = values
 
+
 def group_cats(cats):
     cats_out = {}
     for cat in cats:
         cats_out[cat] = group_cat(cats[cat])
     return cats_out
 
+
 def group_cat(cat):
     cat_out = []
     letters = sorted(set(cat))
@@ -185,6 +307,7 @@ def group_cat(cat):
     cat_out.append((cur_start, cur_end))
     return cat_out
 
+
 def ungroup_cat(cat):
     cat_out = []
     for (lo, hi) in cat:
@@ -193,21 +316,24 @@ def ungroup_cat(cat):
             lo += 1
     return cat_out
 
+
 def gen_unassigned(assigned):
     assigned = set(assigned)
     return ([i for i in range(0, 0xd800) if i not in assigned] +
             [i for i in range(0xe000, 0x110000) if i not in assigned])
 
+
 def to_combines(combs):
     combs_out = []
     for comb in combs:
         for (lo, hi) in combs[comb]:
             combs_out.append((lo, hi, comb))
-    combs_out.sort(key=lambda comb: comb[0])
+    combs_out.sort(key=lambda c: c[0])
     return combs_out
 
+
 def format_table_content(f, content, indent):
-    line = " "*indent
+    line = " " * indent
     first = True
     for chunk in content.split(","):
         if len(line) + len(chunk) < 98:
@@ -218,16 +344,19 @@ def format_table_content(f, content, indent):
             first = False
         else:
             f.write(line + ",\n")
-            line = " "*indent + chunk
+            line = " " * indent + chunk
     f.write(line)
 
-def load_properties(f, interestingprops):
-    fetch(f)
+
+def load_properties(file_path, interestingprops):
     props = {}
-    re1 = re.compile("^ *([0-9A-F]+) *; *(\w+)")
-    re2 = re.compile("^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")
+    # "raw string" is necessary for \w not to be treated as escape char
+    # (for the sake of compat with future Python versions)
+    # see: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior
+    re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
+    re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")
 
-    for line in fileinput.input(fdir + os.path.basename(f)):
+    for line in fileinput.input(file_path):
         prop = None
         d_lo = 0
         d_hi = 0
@@ -258,10 +387,12 @@ def load_properties(f, interestingprops):
 
     return props
 
+
 def escape_char(c):
     return "'\\u{%x}'" % c if c != 0 else "'\\0'"
 
-def emit_table(f, name, t_data, t_type = "&[(char, char)]", is_pub=True,
+
+def emit_table(f, name, t_data, t_type="&[(char, char)]", is_pub=True,
         pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1]))):
     pub_string = ""
     if is_pub:
@@ -277,6 +408,7 @@ def emit_table(f, name, t_data, t_type = "&[(char, char)]", is_pub=True,
     format_table_content(f, data, 8)
     f.write("\n    ];\n\n")
 
+
 def compute_trie(rawdata, chunksize):
     root = []
     childmap = {}
@@ -288,10 +420,11 @@ def compute_trie(rawdata, chunksize):
             childmap[child] = len(childmap)
             child_data.extend(data)
         root.append(childmap[child])
-    return (root, child_data)
+    return root, child_data
+
 
 def emit_bool_trie(f, name, t_data, is_pub=True):
-    CHUNK = 64
+    chunk_size = 64
     rawdata = [False] * 0x110000
     for (lo, hi) in t_data:
         for cp in range(lo, hi + 1):
@@ -299,7 +432,7 @@ def emit_bool_trie(f, name, t_data, is_pub=True):
 
     # convert to bitmap chunks of 64 bits each
     chunks = []
-    for i in range(0x110000 // CHUNK):
+    for i in range(0x110000 // chunk_size):
         chunk = 0
         for j in range(64):
             if rawdata[i * 64 + j]:
@@ -311,12 +444,12 @@ def emit_bool_trie(f, name, t_data, is_pub=True):
         pub_string = "pub "
     f.write("    %sconst %s: &super::BoolTrie = &super::BoolTrie {\n" % (pub_string, name))
     f.write("        r1: [\n")
-    data = ','.join('0x%016x' % chunk for chunk in chunks[0:0x800 // CHUNK])
+    data = ','.join('0x%016x' % chunk for chunk in chunks[0:0x800 // chunk_size])
     format_table_content(f, data, 12)
     f.write("\n        ],\n")
 
     # 0x800..0x10000 trie
-    (r2, r3) = compute_trie(chunks[0x800 // CHUNK : 0x10000 // CHUNK], 64 // CHUNK)
+    (r2, r3) = compute_trie(chunks[0x800 // chunk_size : 0x10000 // chunk_size], 64 // chunk_size)
     f.write("        r2: [\n")
     data = ','.join(str(node) for node in r2)
     format_table_content(f, data, 12)
@@ -327,7 +460,7 @@ def emit_bool_trie(f, name, t_data, is_pub=True):
     f.write("\n        ],\n")
 
     # 0x10000..0x110000 trie
-    (mid, r6) = compute_trie(chunks[0x10000 // CHUNK : 0x110000 // CHUNK], 64 // CHUNK)
+    (mid, r6) = compute_trie(chunks[0x10000 // chunk_size : 0x110000 // chunk_size], 64 // chunk_size)
     (r4, r5) = compute_trie(mid, 64)
     f.write("        r4: [\n")
     data = ','.join(str(node) for node in r4)
@@ -344,6 +477,7 @@ def emit_bool_trie(f, name, t_data, is_pub=True):
 
     f.write("    };\n\n")
 
+
 def emit_small_bool_trie(f, name, t_data, is_pub=True):
     last_chunk = max(hi // 64 for (lo, hi) in t_data)
     n_chunks = last_chunk + 1
@@ -374,6 +508,7 @@ def emit_small_bool_trie(f, name, t_data, is_pub=True):
 
     f.write("    };\n\n")
 
+
 def emit_property_module(f, mod, tbl, emit):
     f.write("pub mod %s {\n" % mod)
     for cat in sorted(emit):
@@ -389,7 +524,8 @@ def emit_property_module(f, mod, tbl, emit):
             f.write("    }\n\n")
     f.write("}\n\n")
 
-def emit_conversions_module(f, to_upper, to_lower, to_title):
+
+def emit_conversions_module(f, unicode_data):
     f.write("pub mod conversions {")
     f.write("""
     pub fn to_lower(c: char) -> [char; 3] {
@@ -414,74 +550,104 @@ def emit_conversions_module(f, to_upper, to_lower, to_title):
     t_type = "&[(char, [char; 3])]"
     pfun = lambda x: "(%s,[%s,%s,%s])" % (
         escape_char(x[0]), escape_char(x[1][0]), escape_char(x[1][1]), escape_char(x[1][2]))
-    emit_table(f, "to_lowercase_table",
-        sorted(to_lower.items(), key=operator.itemgetter(0)),
-        is_pub=False, t_type = t_type, pfun=pfun)
-    emit_table(f, "to_uppercase_table",
-        sorted(to_upper.items(), key=operator.itemgetter(0)),
-        is_pub=False, t_type = t_type, pfun=pfun)
-    f.write("}\n\n")
 
-def emit_norm_module(f, canon, compat, combine, norm_props):
-    canon_keys = sorted(canon.keys())
+    emit_table(f,
+               name="to_lowercase_table",
+               t_data=sorted(unicode_data.to_lower.items(), key=operator.itemgetter(0)),
+               t_type=t_type,
+               is_pub=False,
+               pfun=pfun)
 
-    compat_keys = sorted(compat.keys())
+    emit_table(f,
+               name="to_uppercase_table",
+               t_data=sorted(unicode_data.to_upper.items(), key=operator.itemgetter(0)),
+               t_type=t_type,
+               is_pub=False,
+               pfun=pfun)
+
+    f.write("}\n")
+
+
+def emit_norm_module(f, unicode_data, norm_props):
+    canon_keys = sorted(unicode_data.canon_decomp.keys())
 
     canon_comp = {}
     comp_exclusions = norm_props["Full_Composition_Exclusion"]
     for char in canon_keys:
         if any(lo <= char <= hi for lo, hi in comp_exclusions):
             continue
-        decomp = canon[char]
+        decomp = unicode_data.canon_decomp[char]
         if len(decomp) == 2:
             if decomp[0] not in canon_comp:
                 canon_comp[decomp[0]] = []
-            canon_comp[decomp[0]].append( (decomp[1], char) )
-    canon_comp_keys = sorted(canon_comp.keys())
+            canon_comp[decomp[0]].append((decomp[1], char))
 
-if __name__ == "__main__":
-    r = fdir + "tables.rs"
-    if os.path.exists(r):
-        os.remove(r)
-    with open(r, "w") as rf:
+
+def parse_args():
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("-v", "--version", default=None, type=str,
+                        help="Unicode version to use (if not specified,"
+                             " defaults to latest available final release).")
+
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+
+    unicode_version = fetch_files(args.version)
+    print("Using Unicode version: {}".format(unicode_version.as_str))
+
+    tables_rs_path = os.path.join(THIS_DIR, "tables.rs")
+    if os.path.exists(tables_rs_path):
+        os.remove(tables_rs_path)
+
+    with open(tables_rs_path, "w") as rf:
         # write the file's preamble
-        rf.write(preamble)
-
-        # download and parse all the data
-        fetch("ReadMe.txt")
-        with open(fdir + "ReadMe.txt") as readme:
-            pattern = "for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
-            unicode_version = re.search(pattern, readme.read()).groups()
-        rf.write("""
-/// The version of [Unicode](http://www.unicode.org/) that the Unicode parts of
-/// `char` and `str` methods are based on.
-#[unstable(feature = "unicode_version", issue = "49726")]
-pub const UNICODE_VERSION: UnicodeVersion = UnicodeVersion {
-    major: %s,
-    minor: %s,
-    micro: %s,
-    _priv: (),
-};
-""" % unicode_version)
-        (canon_decomp, compat_decomp, gencats, combines,
-                to_upper, to_lower, to_title) = load_unicode_data("UnicodeData.txt")
-        load_special_casing("SpecialCasing.txt", to_upper, to_lower, to_title)
+        rf.write(PREAMBLE)
+
+        unicode_version_notice = textwrap.dedent("""
+        /// The version of [Unicode](http://www.unicode.org/) that the Unicode parts of
+        /// `char` and `str` methods are based on.
+        #[unstable(feature = "unicode_version", issue = "49726")]
+        pub const UNICODE_VERSION: UnicodeVersion = UnicodeVersion {{
+            major: {version.major},
+            minor: {version.minor},
+            micro: {version.micro},
+            _priv: (),
+        }};
+        """).format(version=unicode_version)
+        rf.write(unicode_version_notice)
+
+        get_path = lambda f: get_unicode_file_path(unicode_version, f)
+
+        unicode_data = load_unicode_data(get_path(UnicodeFiles.UNICODE_DATA))
+        load_special_casing(get_path(UnicodeFiles.SPECIAL_CASING), unicode_data)
+
         want_derived = ["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase",
                         "Cased", "Case_Ignorable", "Grapheme_Extend"]
-        derived = load_properties("DerivedCoreProperties.txt", want_derived)
-        scripts = load_properties("Scripts.txt", [])
-        props = load_properties("PropList.txt",
-                ["White_Space", "Join_Control", "Noncharacter_Code_Point", "Pattern_White_Space"])
-        norm_props = load_properties("DerivedNormalizationProps.txt",
-                     ["Full_Composition_Exclusion"])
+        derived = load_properties(get_path(UnicodeFiles.DERIVED_CORE_PROPERTIES), want_derived)
+
+        # TODO scripts not used?
+        scripts = load_properties(get_path(UnicodeFiles.SCRIPTS), [])
+        props = load_properties(get_path(UnicodeFiles.PROPS),
+                                ["White_Space", "Join_Control", "Noncharacter_Code_Point",
+                                 "Pattern_White_Space"])
+        norm_props = load_properties(get_path(UnicodeFiles.DERIVED_NORMALIZATION_PROPS),
+                                     ["Full_Composition_Exclusion"])
 
         # category tables
-        for (name, cat, pfuns) in ("general_category", gencats, ["N", "Cc"]), \
-                                  ("derived_property", derived, want_derived), \
-                                  ("property", props, ["White_Space", "Pattern_White_Space"]):
+        for (name, cat, pfuns) in (("general_category", unicode_data.gencats, ["N", "Cc"]),
+                                   ("derived_property", derived, want_derived),
+                                   ("property", props, ["White_Space", "Pattern_White_Space"])):
             emit_property_module(rf, name, cat, pfuns)
 
         # normalizations and conversions module
-        emit_norm_module(rf, canon_decomp, compat_decomp, combines, norm_props)
-        emit_conversions_module(rf, to_upper, to_lower, to_title)
+        emit_norm_module(rf, unicode_data, norm_props)
+        emit_conversions_module(rf, unicode_data)
+
     print("Regenerated tables.rs.")
+
+
+if __name__ == "__main__":
+    main()

From a580421afbd6ee93aaab0ad01dee3df8343a88dc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pawe=C5=82=20Romanowski?= <pawroman@gmail.com>
Date: Thu, 18 Apr 2019 16:16:34 +0200
Subject: [PATCH 2/6] More cleanups for unicode.py

---
 src/libcore/unicode/unicode.py | 48 ++++++++++++++++------------------
 1 file changed, 23 insertions(+), 25 deletions(-)

diff --git a/src/libcore/unicode/unicode.py b/src/libcore/unicode/unicode.py
index 97c11fb795ea8..447f4274c18da 100755
--- a/src/libcore/unicode/unicode.py
+++ b/src/libcore/unicode/unicode.py
@@ -28,14 +28,14 @@
 # we don't use enum.Enum because of Python 2.7 compatibility
 class UnicodeFiles(object):
     # ReadMe does not contain any unicode data, we
-    # use it to extract versions.
+    # only use it to extract versions.
     README = "ReadMe.txt"
 
     DERIVED_CORE_PROPERTIES = "DerivedCoreProperties.txt"
     DERIVED_NORMALIZATION_PROPS = "DerivedNormalizationProps.txt"
-    SPECIAL_CASING = "SpecialCasing.txt"
-    SCRIPTS = "Scripts.txt"
     PROPS = "PropList.txt"
+    SCRIPTS = "Scripts.txt"
+    SPECIAL_CASING = "SpecialCasing.txt"
     UNICODE_DATA = "UnicodeData.txt"
 
 
@@ -66,15 +66,15 @@ class UnicodeFiles(object):
 # Mapping taken from Table 12 from:
 # http://www.unicode.org/reports/tr44/#General_Category_Values
 EXPANDED_CATEGORIES = {
-    'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
-    'Lm': ['L'], 'Lo': ['L'],
-    'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
-    'Nd': ['N'], 'Nl': ['N'], 'No': ['N'],
-    'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'],
-    'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'],
-    'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'],
-    'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'],
-    'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
+    "Lu": ["LC", "L"], "Ll": ["LC", "L"], "Lt": ["LC", "L"],
+    "Lm": ["L"], "Lo": ["L"],
+    "Mn": ["M"], "Mc": ["M"], "Me": ["M"],
+    "Nd": ["N"], "Nl": ["N"], "No": ["N"],
+    "Pc": ["P"], "Pd": ["P"], "Ps": ["P"], "Pe": ["P"],
+    "Pi": ["P"], "Pf": ["P"], "Po": ["P"],
+    "Sm": ["S"], "Sc": ["S"], "Sk": ["S"], "So": ["S"],
+    "Zs": ["Z"], "Zl": ["Z"], "Zp": ["Z"],
+    "Cc": ["C"], "Cf": ["C"], "Cs": ["C"], "Co": ["C"], "Cn": ["C"],
 }
 
 # these are the surrogate codepoints, which are not valid rust characters
@@ -115,7 +115,7 @@ def fetch_files(version=None):
     readme_content = subprocess.check_output(("curl", readme_url))
 
     unicode_version = parse_unicode_version(
-        str(readme_content, "utf8")
+        readme_content.decode("utf8")
     )
 
     download_dir = os.path.join(FETCH_DIR, unicode_version.as_str)
@@ -415,7 +415,7 @@ def compute_trie(rawdata, chunksize):
     child_data = []
     for i in range(len(rawdata) // chunksize):
         data = rawdata[i * chunksize: (i + 1) * chunksize]
-        child = '|'.join(map(str, data))
+        child = "|".join(map(str, data))
         if child not in childmap:
             childmap[child] = len(childmap)
             child_data.extend(data)
@@ -444,18 +444,18 @@ def emit_bool_trie(f, name, t_data, is_pub=True):
         pub_string = "pub "
     f.write("    %sconst %s: &super::BoolTrie = &super::BoolTrie {\n" % (pub_string, name))
     f.write("        r1: [\n")
-    data = ','.join('0x%016x' % chunk for chunk in chunks[0:0x800 // chunk_size])
+    data = ",".join("0x%016x" % chunk for chunk in chunks[0:0x800 // chunk_size])
     format_table_content(f, data, 12)
     f.write("\n        ],\n")
 
     # 0x800..0x10000 trie
     (r2, r3) = compute_trie(chunks[0x800 // chunk_size : 0x10000 // chunk_size], 64 // chunk_size)
     f.write("        r2: [\n")
-    data = ','.join(str(node) for node in r2)
+    data = ",".join(str(node) for node in r2)
     format_table_content(f, data, 12)
     f.write("\n        ],\n")
     f.write("        r3: &[\n")
-    data = ','.join('0x%016x' % chunk for chunk in r3)
+    data = ",".join("0x%016x" % chunk for chunk in r3)
     format_table_content(f, data, 12)
     f.write("\n        ],\n")
 
@@ -463,15 +463,15 @@ def emit_bool_trie(f, name, t_data, is_pub=True):
     (mid, r6) = compute_trie(chunks[0x10000 // chunk_size : 0x110000 // chunk_size], 64 // chunk_size)
     (r4, r5) = compute_trie(mid, 64)
     f.write("        r4: [\n")
-    data = ','.join(str(node) for node in r4)
+    data = ",".join(str(node) for node in r4)
     format_table_content(f, data, 12)
     f.write("\n        ],\n")
     f.write("        r5: &[\n")
-    data = ','.join(str(node) for node in r5)
+    data = ",".join(str(node) for node in r5)
     format_table_content(f, data, 12)
     f.write("\n        ],\n")
     f.write("        r6: &[\n")
-    data = ','.join('0x%016x' % chunk for chunk in r6)
+    data = ",".join("0x%016x" % chunk for chunk in r6)
     format_table_content(f, data, 12)
     f.write("\n        ],\n")
 
@@ -497,12 +497,12 @@ def emit_small_bool_trie(f, name, t_data, is_pub=True):
     (r1, r2) = compute_trie(chunks, 1)
 
     f.write("        r1: &[\n")
-    data = ','.join(str(node) for node in r1)
+    data = ",".join(str(node) for node in r1)
     format_table_content(f, data, 12)
     f.write("\n        ],\n")
 
     f.write("        r2: &[\n")
-    data = ','.join('0x%016x' % node for node in r2)
+    data = ",".join("0x%016x" % node for node in r2)
     format_table_content(f, data, 12)
     f.write("\n        ],\n")
 
@@ -599,11 +599,9 @@ def main():
     print("Using Unicode version: {}".format(unicode_version.as_str))
 
     tables_rs_path = os.path.join(THIS_DIR, "tables.rs")
-    if os.path.exists(tables_rs_path):
-        os.remove(tables_rs_path)
 
+    # will overwrite the file if it exists
     with open(tables_rs_path, "w") as rf:
-        # write the file's preamble
         rf.write(PREAMBLE)
 
         unicode_version_notice = textwrap.dedent("""

From edbc27da2dc5a75f8e0ac70e7a5e07aa6f6f0a16 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pawe=C5=82=20Romanowski?= <pawroman@gmail.com>
Date: Thu, 18 Apr 2019 17:14:31 +0200
Subject: [PATCH 3/6] Fix tidy errors

---
 src/libcore/unicode/unicode.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/libcore/unicode/unicode.py b/src/libcore/unicode/unicode.py
index 447f4274c18da..e645c3f33c84c 100755
--- a/src/libcore/unicode/unicode.py
+++ b/src/libcore/unicode/unicode.py
@@ -460,7 +460,8 @@ def emit_bool_trie(f, name, t_data, is_pub=True):
     f.write("\n        ],\n")
 
     # 0x10000..0x110000 trie
-    (mid, r6) = compute_trie(chunks[0x10000 // chunk_size : 0x110000 // chunk_size], 64 // chunk_size)
+    (mid, r6) = compute_trie(chunks[0x10000 // chunk_size : 0x110000 // chunk_size],
+                             64 // chunk_size)
     (r4, r5) = compute_trie(mid, 64)
     f.write("        r4: [\n")
     data = ",".join(str(node) for node in r4)
@@ -626,7 +627,7 @@ def main():
                         "Cased", "Case_Ignorable", "Grapheme_Extend"]
         derived = load_properties(get_path(UnicodeFiles.DERIVED_CORE_PROPERTIES), want_derived)
 
-        # TODO scripts not used?
+        # FIXME scripts not used?
         scripts = load_properties(get_path(UnicodeFiles.SCRIPTS), [])
         props = load_properties(get_path(UnicodeFiles.PROPS),
                                 ["White_Space", "Join_Control", "Noncharacter_Code_Point",

From 2c9c978e1d4a3541d8df593346c7520c8ef4d69e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pawe=C5=82=20Romanowski?= <pawroman@gmail.com>
Date: Fri, 19 Apr 2019 11:42:08 +0200
Subject: [PATCH 4/6] Refactor and document unicode.py script

---
 src/libcore/unicode/unicode.py | 820 +++++++++++++++++++++------------
 1 file changed, 518 insertions(+), 302 deletions(-)

diff --git a/src/libcore/unicode/unicode.py b/src/libcore/unicode/unicode.py
index e645c3f33c84c..f66e82299100d 100755
--- a/src/libcore/unicode/unicode.py
+++ b/src/libcore/unicode/unicode.py
@@ -16,13 +16,31 @@
 import argparse
 import datetime
 import fileinput
-import operator
+import itertools
 import os
 import re
 import textwrap
 import subprocess
 
-from collections import namedtuple
+from collections import defaultdict, namedtuple
+
+try:
+    # Python 3
+    from itertools import zip_longest
+    from io import StringIO
+except ImportError:
+    # Python 2 compatibility
+    zip_longest = itertools.izip_longest
+    from StringIO import StringIO
+
+try:
+    # completely optional type hinting
+    # (Python 2 compatible using comments,
+    #  see: https://mypy.readthedocs.io/en/latest/python2.html)
+    # This is very helpful in typing-aware IDE like PyCharm.
+    from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Set, Tuple
+except ImportError:
+    pass
 
 
 # we don't use enum.Enum because of Python 2.7 compatibility
@@ -77,12 +95,21 @@ class UnicodeFiles(object):
     "Cc": ["C"], "Cf": ["C"], "Cs": ["C"], "Co": ["C"], "Cn": ["C"],
 }
 
-# these are the surrogate codepoints, which are not valid rust characters
-SURROGATE_CODEPOINTS = (0xd800, 0xdfff)
+# this is the surrogate codepoints range (both ends inclusive)
+# - they are not valid Rust characters
+SURROGATE_CODEPOINTS_RANGE = (0xd800, 0xdfff)
 
 UnicodeData = namedtuple(
-    "UnicodeData", ("canon_decomp", "compat_decomp", "gencats", "combines",
-                    "to_upper", "to_lower", "to_title", )
+    "UnicodeData", (
+        # conversions:
+        "to_upper", "to_lower", "to_title",
+
+        # decompositions: canonical decompositions, compatibility decomp
+        "canon_decomp", "compat_decomp",
+
+        # grouped: general categories and combining characters
+        "general_categories", "combines",
+    )
 )
 
 UnicodeVersion = namedtuple(
@@ -91,14 +118,19 @@ class UnicodeFiles(object):
 
 
 def fetch_files(version=None):
+    # type: (Optional[str]) -> UnicodeVersion
     """
-    Fetch all the Unicode files from unicode.org
+    Fetch all the Unicode files from unicode.org.
+
+    This will use cached files (stored in FETCH_DIR) if they exist,
+    creating them if they don't.  In any case, the Unicode version
+    is always returned.
 
     :param version: The desired Unicode version, as string.
-        (If None, defaults to latest final release available).
-    :return: The version downloaded (UnicodeVersion object).
+        (If None, defaults to latest final release available,
+         querying the unicode.org service).
     """
-    have_version = should_skip_fetch(version)
+    have_version = check_stored_version(version)
     if have_version:
         return have_version
 
@@ -114,22 +146,26 @@ def fetch_files(version=None):
     print("Fetching: {}".format(readme_url))
     readme_content = subprocess.check_output(("curl", readme_url))
 
-    unicode_version = parse_unicode_version(
+    unicode_version = parse_readme_unicode_version(
         readme_content.decode("utf8")
     )
 
-    download_dir = os.path.join(FETCH_DIR, unicode_version.as_str)
+    download_dir = get_unicode_dir(unicode_version)
     if not os.path.exists(download_dir):
         # for 2.7 compat, we don't use exist_ok=True
         os.makedirs(download_dir)
 
     for filename in UnicodeFiles.ALL_FILES:
-        file_path = os.path.join(download_dir, filename)
+        file_path = get_unicode_file_path(unicode_version, filename)
+
+        if os.path.exists(file_path):
+            # assume file on the server didn't change if it's been saved before
+            continue
 
         if filename == UnicodeFiles.README:
             with open(file_path, "wb") as fd:
                 fd.write(readme_content)
-        elif not os.path.exists(file_path):
+        else:
             url = get_fetch_url(filename)
             print("Fetching: {}".format(url))
             subprocess.check_call(("curl", "-o", file_path, url))
@@ -137,10 +173,15 @@ def fetch_files(version=None):
     return unicode_version
 
 
-def should_skip_fetch(version):
+def check_stored_version(version):
+    # type: (Optional[str]) -> Optional[UnicodeVersion]
+    """
+    Given desired Unicode version, return the version
+    if stored files are all present, and None otherwise.
+    """
     if not version:
         # should always check latest version
-        return False
+        return None
 
     fetch_dir = os.path.join(FETCH_DIR, version)
 
@@ -148,13 +189,17 @@ def should_skip_fetch(version):
         file_path = os.path.join(fetch_dir, filename)
 
         if not os.path.exists(file_path):
-            return False
+            return None
 
     with open(os.path.join(fetch_dir, UnicodeFiles.README)) as fd:
-        return parse_unicode_version(fd.read())
+        return parse_readme_unicode_version(fd.read())
 
 
-def parse_unicode_version(readme_content):
+def parse_readme_unicode_version(readme_content):
+    # type: (str) -> UnicodeVersion
+    """
+    Parse the Unicode version out of the contents of the Unicode ReadMe.txt file.
+    """
     # "raw string" is necessary for \d not being treated as escape char
     # (for the sake of compat with future Python versions)
     # see: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior
@@ -164,45 +209,78 @@ def parse_unicode_version(readme_content):
     return UnicodeVersion(*map(int, groups), as_str=".".join(groups))
 
 
+def get_unicode_dir(unicode_version):
+    # type: (UnicodeVersion) -> str
+    """
+    Indicate where the unicode data files should be stored.
+
+    This returns a full, absolute path.
+    """
+    return os.path.join(FETCH_DIR, unicode_version.as_str)
+
+
 def get_unicode_file_path(unicode_version, filename):
-    return os.path.join(FETCH_DIR, unicode_version.as_str, filename)
+    # type: (UnicodeVersion, str) -> str
+    """
+    Indicate where the unicode data file should be stored.
+    """
+    return os.path.join(get_unicode_dir(unicode_version), filename)
 
 
 def is_surrogate(n):
-    return SURROGATE_CODEPOINTS[0] <= n <= SURROGATE_CODEPOINTS[1]
+    # type: (int) -> bool
+    """
+    Tell if given codepoint is a surrogate (not a valid Rust character).
+    """
+    return SURROGATE_CODEPOINTS_RANGE[0] <= n <= SURROGATE_CODEPOINTS_RANGE[1]
 
 
 def load_unicode_data(file_path):
-    gencats = {}
-    to_lower = {}
-    to_upper = {}
-    to_title = {}
-    combines = {}
-    canon_decomp = {}
-    compat_decomp = {}
-
-    udict = {}
+    # type: (str) -> UnicodeData
+    """
+    Load main unicode data.
+    """
+    # conversions
+    to_lower = {}   # type: Dict[int, Tuple[int, int, int]]
+    to_upper = {}   # type: Dict[int, Tuple[int, int, int]]
+    to_title = {}   # type: Dict[int, Tuple[int, int, int]]
+
+    # decompositions
+    compat_decomp = {}   # type: Dict[int, List[int]]
+    canon_decomp = {}    # type: Dict[int, List[int]]
+
+    # combining characters
+    # FIXME: combines are not used
+    combines = defaultdict(set)   # type: Dict[str, Set[int]]
+
+    # categories
+    general_categories = defaultdict(set)   # type: Dict[str, Set[int]]
+    category_assigned_codepoints = set()    # type: Set[int]
+
+    all_codepoints = {}
+
     range_start = -1
+
     for line in fileinput.input(file_path):
         data = line.split(";")
         if len(data) != 15:
             continue
-        cp = int(data[0], 16)
-        if is_surrogate(cp):
+        codepoint = int(data[0], 16)
+        if is_surrogate(codepoint):
             continue
         if range_start >= 0:
-            for i in range(range_start, cp):
-                udict[i] = data
+            for i in range(range_start, codepoint):
+                all_codepoints[i] = data
             range_start = -1
         if data[1].endswith(", First>"):
-            range_start = cp
+            range_start = codepoint
             continue
-        udict[cp] = data
+        all_codepoints[codepoint] = data
 
-    for code in udict:
+    for code, data in all_codepoints.items():
         (code_org, name, gencat, combine, bidi,
          decomp, deci, digit, num, mirror,
-         old, iso, upcase, lowcase, titlecase) = udict[code]
+         old, iso, upcase, lowcase, titlecase) = data
 
         # generate char to char direct common and simple conversions
         # uppercase to lowercase
@@ -218,46 +296,47 @@ def load_unicode_data(file_path):
             to_title[code] = (int(titlecase, 16), 0, 0)
 
         # store decomposition, if given
-        if decomp != "":
+        if decomp:
+            decompositions = decomp.split()[1:]
+            decomp_code_points = [int(i, 16) for i in decompositions]
+
             if decomp.startswith("<"):
-                seq = []
-                for i in decomp.split()[1:]:
-                    seq.append(int(i, 16))
-                compat_decomp[code] = seq
+                # compatibility decomposition
+                compat_decomp[code] = decomp_code_points
             else:
-                seq = []
-                for i in decomp.split():
-                    seq.append(int(i, 16))
-                canon_decomp[code] = seq
+                # canonical decomposition (FIXME: the [1:] above drops its first codepoint)
+                canon_decomp[code] = decomp_code_points
 
         # place letter in categories as appropriate
-        for cat in [gencat, "Assigned"] + EXPANDED_CATEGORIES.get(gencat, []):
-            if cat not in gencats:
-                gencats[cat] = []
-            gencats[cat].append(code)
+        for cat in itertools.chain((gencat, ), EXPANDED_CATEGORIES.get(gencat, [])):
+            general_categories[cat].add(code)
+            category_assigned_codepoints.add(code)
 
         # record combining class, if any
         if combine != "0":
-            if combine not in combines:
-                combines[combine] = []
-            combines[combine].append(code)
+            combines[combine].add(code)
 
     # generate Not_Assigned from Assigned
-    gencats["Cn"] = gen_unassigned(gencats["Assigned"])
-    # Assigned is not a real category
-    del(gencats["Assigned"])
+    general_categories["Cn"] = get_unassigned_codepoints(category_assigned_codepoints)
+
     # Other contains Not_Assigned
-    gencats["C"].extend(gencats["Cn"])
-    gencats = group_cats(gencats)
-    combines = to_combines(group_cats(combines))
+    general_categories["C"].update(general_categories["Cn"])
+
+    grouped_categories = group_categories(general_categories)
 
+    # FIXME: combines are not used
     return UnicodeData(
-        canon_decomp, compat_decomp, gencats, combines, to_upper,
-        to_lower, to_title,
+        to_lower=to_lower, to_upper=to_upper, to_title=to_title,
+        compat_decomp=compat_decomp, canon_decomp=canon_decomp,
+        general_categories=grouped_categories, combines=combines,
     )
 
 
 def load_special_casing(file_path, unicode_data):
+    # type: (str, UnicodeData) -> None
+    """
+    Load special casing data and enrich given unicode data.
+    """
     for line in fileinput.input(file_path):
         data = line.split("#")[0].split(";")
         if len(data) == 5:
@@ -277,258 +356,395 @@ def load_special_casing(file_path, unicode_data):
                                (unicode_data.to_upper, upper),
                                (unicode_data.to_title, title)):
             if values != code:
-                values = [int(i, 16) for i in values.split()]
-                for _ in range(len(values), 3):
-                    values.append(0)
-                assert len(values) == 3
-                map_[key] = values
-
-
-def group_cats(cats):
-    cats_out = {}
-    for cat in cats:
-        cats_out[cat] = group_cat(cats[cat])
-    return cats_out
-
-
-def group_cat(cat):
-    cat_out = []
-    letters = sorted(set(cat))
-    cur_start = letters.pop(0)
-    cur_end = cur_start
-    for letter in letters:
-        assert letter > cur_end, \
-            "cur_end: %s, letter: %s" % (hex(cur_end), hex(letter))
-        if letter == cur_end + 1:
-            cur_end = letter
-        else:
-            cat_out.append((cur_start, cur_end))
-            cur_start = cur_end = letter
-    cat_out.append((cur_start, cur_end))
-    return cat_out
+                split = values.split()
+
+                codepoints = list(itertools.chain(
+                    (int(i, 16) for i in split),
+                    (0 for _ in range(len(split), 3))
+                ))
+
+                assert len(codepoints) == 3
+                map_[key] = codepoints
+
+
+def group_categories(mapping):
+    # type: (Dict[Any, Iterable[int]]) -> Dict[str, List[Tuple[int, int]]]
+    """
+    Group codepoints mapped in "categories".
+    """
+    return {category: group_codepoints(codepoints)
+            for category, codepoints in mapping.items()}
+
+
+def group_codepoints(codepoints):
+    # type: (Iterable[int]) -> List[Tuple[int, int]]
+    """
+    Group integral values into contiguous, disjoint value ranges.
+
+    Performs value deduplication.
+
+    :return: sorted list of pairs denoting start and end of codepoint
+        group values, both ends inclusive.
+
+    >>> group_codepoints([1, 2, 10, 11, 12, 3, 4])
+    [(1, 4), (10, 12)]
+    >>> group_codepoints([1])
+    [(1, 1)]
+    >>> group_codepoints([1, 5, 6])
+    [(1, 1), (5, 6)]
+    >>> group_codepoints([])
+    []
+    """
+    sorted_codes = sorted(set(codepoints))
+    result = []     # type: List[Tuple[int, int]]
 
+    if not sorted_codes:
+        return result
 
-def ungroup_cat(cat):
-    cat_out = []
-    for (lo, hi) in cat:
-        while lo <= hi:
-            cat_out.append(lo)
-            lo += 1
-    return cat_out
+    next_codes = sorted_codes[1:]
+    start_code = sorted_codes[0]
 
+    for code, next_code in zip_longest(sorted_codes, next_codes, fillvalue=None):
+        if next_code is None or next_code - code != 1:
+            result.append((start_code, code))
+            start_code = next_code
 
-def gen_unassigned(assigned):
-    assigned = set(assigned)
-    return ([i for i in range(0, 0xd800) if i not in assigned] +
-            [i for i in range(0xe000, 0x110000) if i not in assigned])
+    return result
 
 
-def to_combines(combs):
-    combs_out = []
-    for comb in combs:
-        for (lo, hi) in combs[comb]:
-            combs_out.append((lo, hi, comb))
-    combs_out.sort(key=lambda c: c[0])
-    return combs_out
+def ungroup_codepoints(codepoint_pairs):
+    # type: (Iterable[Tuple[int, int]]) -> List[int]
+    """
+    The inverse of group_codepoints -- produce a flat list of values
+    from value range pairs.
+
+    >>> ungroup_codepoints([(1, 4), (10, 12)])
+    [1, 2, 3, 4, 10, 11, 12]
+    >>> ungroup_codepoints([(1, 1), (5, 6)])
+    [1, 5, 6]
+    >>> ungroup_codepoints(group_codepoints([1, 2, 7, 8]))
+    [1, 2, 7, 8]
+    >>> ungroup_codepoints([])
+    []
+    """
+    return list(itertools.chain.from_iterable(
+        range(lo, hi + 1) for lo, hi in codepoint_pairs
+    ))
+
+
+def get_unassigned_codepoints(assigned_codepoints):
+    # type: (Set[int]) -> Set[int]
+    """
+    Given a set of "assigned" codepoints, return the set of all
+    codepoints that are neither assigned nor surrogates.
+    """
+    return {i for i in range(0, 0x110000)
+            if i not in assigned_codepoints and not is_surrogate(i)}
+
+
+def generate_table_lines(items, indent, wrap=98):
+    # type: (Iterable[str], int, int) -> Iterator[str]
+    """
+    Given table items, generate wrapped lines of text with comma-separated items.
 
+    This is a generator function.
 
-def format_table_content(f, content, indent):
+    :param wrap: soft wrap limit (characters per line), integer.
+    """
     line = " " * indent
     first = True
-    for chunk in content.split(","):
-        if len(line) + len(chunk) < 98:
+    for item in items:
+        if len(line) + len(item) < wrap:
             if first:
-                line += chunk
+                line += item
             else:
-                line += ", " + chunk
+                line += ", " + item
             first = False
         else:
-            f.write(line + ",\n")
-            line = " " * indent + chunk
-    f.write(line)
+            yield line + ",\n"
+            line = " " * indent + item
 
+    yield line
 
-def load_properties(file_path, interestingprops):
-    props = {}
-    # "raw string" is necessary for \w not to be treated as escape char
+
+def load_properties(file_path, interesting_props):
+    # type: (str, Iterable[str]) -> Dict[str, List[Tuple[int, int]]]
+    """
+    Load properties data and return in grouped form.
+    """
+    props = defaultdict(list)   # type: Dict[str, List[Tuple[int, int]]]
+    # "raw string" is necessary for \. and \w not to be treated as escape chars
     # (for the sake of compat with future Python versions)
     # see: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior
     re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
     re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")
 
     for line in fileinput.input(file_path):
-        prop = None
-        d_lo = 0
-        d_hi = 0
-        m = re1.match(line)
-        if m:
-            d_lo = m.group(1)
-            d_hi = m.group(1)
-            prop = m.group(2)
-        else:
-            m = re2.match(line)
-            if m:
-                d_lo = m.group(1)
-                d_hi = m.group(2)
-                prop = m.group(3)
+        match = re1.match(line) or re2.match(line)
+        if match:
+            groups = match.groups()
+
+            if len(groups) == 2:
+                # re1 matched
+                d_lo, prop = groups
+                d_hi = d_lo
             else:
-                continue
-        if interestingprops and prop not in interestingprops:
+                d_lo, d_hi, prop = groups
+        else:
             continue
-        d_lo = int(d_lo, 16)
-        d_hi = int(d_hi, 16)
-        if prop not in props:
-            props[prop] = []
-        props[prop].append((d_lo, d_hi))
+
+        if interesting_props and prop not in interesting_props:
+            continue
+
+        lo_value = int(d_lo, 16)
+        hi_value = int(d_hi, 16)
+
+        props[prop].append((lo_value, hi_value))
 
     # optimize if possible
     for prop in props:
-        props[prop] = group_cat(ungroup_cat(props[prop]))
+        props[prop] = group_codepoints(ungroup_codepoints(props[prop]))
 
     return props
 
 
 def escape_char(c):
-    return "'\\u{%x}'" % c if c != 0 else "'\\0'"
+    # type: (int) -> str
+    r"""
+    Escape a codepoint for use as Rust char literal.
 
+    Outputs are OK to use as Rust source code as char literals
+    and they also include necessary quotes.
 
-def emit_table(f, name, t_data, t_type="&[(char, char)]", is_pub=True,
-        pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1]))):
+    >>> escape_char(97)
+    "'\\u{61}'"
+    >>> escape_char(0)
+    "'\\0'"
+    """
+    return r"'\u{%x}'" % c if c != 0 else r"'\0'"
+
+
+def format_char_pair(pair):
+    # type: (Tuple[int, int]) -> str
+    """
+    Format a pair of two Rust chars.
+    """
+    return "(%s,%s)" % (escape_char(pair[0]), escape_char(pair[1]))
+
+
+def generate_table(
+    name,   # type: str
+    items,  # type: List[Tuple[int, int]]
+    decl_type="&[(char, char)]",    # type: str
+    is_pub=True,                    # type: bool
+    format_item=format_char_pair,   # type: Callable[[Tuple[int, int]], str]
+):
+    # type: (...) -> Iterator[str]
+    """
+    Generate a nicely formatted Rust constant "table" array.
+
+    This generates actual Rust code.
+    """
     pub_string = ""
     if is_pub:
         pub_string = "pub "
-    f.write("    %sconst %s: %s = &[\n" % (pub_string, name, t_type))
-    data = ""
+
+    yield "    %sconst %s: %s = &[\n" % (pub_string, name, decl_type)
+
+    data = []
     first = True
-    for dat in t_data:
+    for item in items:
         if not first:
-            data += ","
+            data.append(",")
         first = False
-        data += pfun(dat)
-    format_table_content(f, data, 8)
-    f.write("\n    ];\n\n")
+        data.extend(format_item(item))
+
+    for table_line in generate_table_lines("".join(data).split(","), 8):
+        yield table_line
 
+    yield "\n    ];\n\n"
 
-def compute_trie(rawdata, chunksize):
+
+def compute_trie(raw_data, chunk_size):
+    # type: (List[int], int) -> Tuple[List[int], List[int]]
+    """
+    Compute postfix-compressed trie.
+
+    See: bool_trie.rs for more details.
+
+    >>> compute_trie([1, 2, 3, 1, 2, 3, 4, 5, 6], 3)
+    ([0, 0, 1], [1, 2, 3, 4, 5, 6])
+    >>> compute_trie([1, 2, 3, 1, 2, 4, 4, 5, 6], 3)
+    ([0, 1, 2], [1, 2, 3, 1, 2, 4, 4, 5, 6])
+    """
     root = []
-    childmap = {}
+    childmap = {}       # type: Dict[Tuple[int, ...], int]
     child_data = []
-    for i in range(len(rawdata) // chunksize):
-        data = rawdata[i * chunksize: (i + 1) * chunksize]
-        child = "|".join(map(str, data))
+
+    assert len(raw_data) % chunk_size == 0, "Chunks must be equally sized"
+
+    for i in range(len(raw_data) // chunk_size):
+        data = raw_data[i * chunk_size : (i + 1) * chunk_size]
+
+        # postfix compression of child nodes (data chunks)
+        # (identical child nodes are shared)
+
+        # make a tuple out of the list so it's hashable
+        child = tuple(data)
         if child not in childmap:
             childmap[child] = len(childmap)
             child_data.extend(data)
+
         root.append(childmap[child])
+
     return root, child_data
 
 
-def emit_bool_trie(f, name, t_data, is_pub=True):
+def generate_bool_trie(name, codepoint_ranges, is_pub=True):
+    # type: (str, List[Tuple[int, int]], bool) -> Iterator[str]
+    """
+    Generate Rust code for BoolTrie struct.
+
+    This yields string fragments that should be joined to produce
+    the final string.
+
+    See: bool_trie.rs
+    """
     chunk_size = 64
     rawdata = [False] * 0x110000
-    for (lo, hi) in t_data:
+    for (lo, hi) in codepoint_ranges:
         for cp in range(lo, hi + 1):
             rawdata[cp] = True
 
-    # convert to bitmap chunks of 64 bits each
+    # convert to bitmap chunks of chunk_size bits each
     chunks = []
     for i in range(0x110000 // chunk_size):
         chunk = 0
-        for j in range(64):
-            if rawdata[i * 64 + j]:
+        for j in range(chunk_size):
+            if rawdata[i * chunk_size + j]:
                 chunk |= 1 << j
         chunks.append(chunk)
 
     pub_string = ""
     if is_pub:
         pub_string = "pub "
-    f.write("    %sconst %s: &super::BoolTrie = &super::BoolTrie {\n" % (pub_string, name))
-    f.write("        r1: [\n")
-    data = ",".join("0x%016x" % chunk for chunk in chunks[0:0x800 // chunk_size])
-    format_table_content(f, data, 12)
-    f.write("\n        ],\n")
+    yield "    %sconst %s: &super::BoolTrie = &super::BoolTrie {\n" % (pub_string, name)
+    yield "        r1: [\n"
+    data = ("0x%016x" % chunk for chunk in chunks[:0x800 // chunk_size])
+    for fragment in generate_table_lines(data, 12):
+        yield fragment
+    yield "\n        ],\n"
 
     # 0x800..0x10000 trie
     (r2, r3) = compute_trie(chunks[0x800 // chunk_size : 0x10000 // chunk_size], 64 // chunk_size)
-    f.write("        r2: [\n")
-    data = ",".join(str(node) for node in r2)
-    format_table_content(f, data, 12)
-    f.write("\n        ],\n")
-    f.write("        r3: &[\n")
-    data = ",".join("0x%016x" % chunk for chunk in r3)
-    format_table_content(f, data, 12)
-    f.write("\n        ],\n")
+    yield "        r2: [\n"
+    data = map(str, r2)
+    for fragment in generate_table_lines(data, 12):
+        yield fragment
+    yield "\n        ],\n"
+
+    yield "        r3: &[\n"
+    data = ("0x%016x" % node for node in r3)
+    for fragment in generate_table_lines(data, 12):
+        yield fragment
+    yield "\n        ],\n"
 
     # 0x10000..0x110000 trie
     (mid, r6) = compute_trie(chunks[0x10000 // chunk_size : 0x110000 // chunk_size],
                              64 // chunk_size)
     (r4, r5) = compute_trie(mid, 64)
-    f.write("        r4: [\n")
-    data = ",".join(str(node) for node in r4)
-    format_table_content(f, data, 12)
-    f.write("\n        ],\n")
-    f.write("        r5: &[\n")
-    data = ",".join(str(node) for node in r5)
-    format_table_content(f, data, 12)
-    f.write("\n        ],\n")
-    f.write("        r6: &[\n")
-    data = ",".join("0x%016x" % chunk for chunk in r6)
-    format_table_content(f, data, 12)
-    f.write("\n        ],\n")
-
-    f.write("    };\n\n")
-
-
-def emit_small_bool_trie(f, name, t_data, is_pub=True):
-    last_chunk = max(hi // 64 for (lo, hi) in t_data)
+
+    yield "        r4: [\n"
+    data = map(str, r4)
+    for fragment in generate_table_lines(data, 12):
+        yield fragment
+    yield "\n        ],\n"
+
+    yield "        r5: &[\n"
+    data = map(str, r5)
+    for fragment in generate_table_lines(data, 12):
+        yield fragment
+    yield "\n        ],\n"
+
+    yield "        r6: &[\n"
+    data = ("0x%016x" % node for node in r6)
+    for fragment in generate_table_lines(data, 12):
+        yield fragment
+    yield "\n        ],\n"
+
+    yield "    };\n\n"
+
+
+def generate_small_bool_trie(name, codepoint_ranges, is_pub=True):
+    # type: (str, List[Tuple[int, int]], bool) -> Iterator[str]
+    """
+    Generate Rust code for SmallBoolTrie struct.
+
+    See: bool_trie.rs
+    """
+    last_chunk = max(hi // 64 for (lo, hi) in codepoint_ranges)
     n_chunks = last_chunk + 1
     chunks = [0] * n_chunks
-    for (lo, hi) in t_data:
+    for (lo, hi) in codepoint_ranges:
         for cp in range(lo, hi + 1):
-            if cp // 64 >= len(chunks):
-                print(cp, cp // 64, len(chunks), lo, hi)
+            assert cp // 64 < len(chunks)
             chunks[cp // 64] |= 1 << (cp & 63)
 
     pub_string = ""
     if is_pub:
         pub_string = "pub "
-    f.write("    %sconst %s: &super::SmallBoolTrie = &super::SmallBoolTrie {\n"
-            % (pub_string, name))
+
+    yield ("    %sconst %s: &super::SmallBoolTrie = &super::SmallBoolTrie {\n"
+           % (pub_string, name))
 
     (r1, r2) = compute_trie(chunks, 1)
 
-    f.write("        r1: &[\n")
-    data = ",".join(str(node) for node in r1)
-    format_table_content(f, data, 12)
-    f.write("\n        ],\n")
+    yield "        r1: &[\n"
+    data = (str(node) for node in r1)
+    for fragment in generate_table_lines(data, 12):
+        yield fragment
+    yield "\n        ],\n"
+
+    yield "        r2: &[\n"
+    data = ("0x%016x" % node for node in r2)
+    for fragment in generate_table_lines(data, 12):
+        yield fragment
+    yield "\n        ],\n"
 
-    f.write("        r2: &[\n")
-    data = ",".join("0x%016x" % node for node in r2)
-    format_table_content(f, data, 12)
-    f.write("\n        ],\n")
+    yield "    };\n\n"
 
-    f.write("    };\n\n")
 
+def generate_property_module(mod, grouped_categories, category_subset):
+    # type: (str, Dict[str, List[Tuple[int, int]]], Iterable[str]) -> Iterator[str]
+    """
+    Generate Rust code for module defining properties.
+    """
 
-def emit_property_module(f, mod, tbl, emit):
-    f.write("pub mod %s {\n" % mod)
-    for cat in sorted(emit):
-        if cat in ["Cc", "White_Space", "Pattern_White_Space"]:
-            emit_small_bool_trie(f, "%s_table" % cat, tbl[cat])
-            f.write("    pub fn %s(c: char) -> bool {\n" % cat)
-            f.write("        %s_table.lookup(c)\n" % cat)
-            f.write("    }\n\n")
+    yield "pub mod %s {\n" % mod
+    for cat in sorted(category_subset):
+        if cat in ("Cc", "White_Space", "Pattern_White_Space"):
+            generator = generate_small_bool_trie("%s_table" % cat, grouped_categories[cat])
         else:
-            emit_bool_trie(f, "%s_table" % cat, tbl[cat])
-            f.write("    pub fn %s(c: char) -> bool {\n" % cat)
-            f.write("        %s_table.lookup(c)\n" % cat)
-            f.write("    }\n\n")
-    f.write("}\n\n")
+            generator = generate_bool_trie("%s_table" % cat, grouped_categories[cat])
+
+        for fragment in generator:
+            yield fragment
+
+        yield "    pub fn %s(c: char) -> bool {\n" % cat
+        yield "        %s_table.lookup(c)\n" % cat
+        yield "    }\n\n"
+
+    yield "}\n\n"
+
 
+def generate_conversions_module(unicode_data):
+    # type: (UnicodeData) -> Iterator[str]
+    """
+    Generate Rust code for module defining conversions.
+    """
 
-def emit_conversions_module(f, unicode_data):
-    f.write("pub mod conversions {")
-    f.write("""
+    yield "pub mod conversions {"
+    yield """
     pub fn to_lower(c: char) -> [char; 3] {
         match bsearch_case_table(c, to_lowercase_table) {
             None        => [c, '\\0', '\\0'],
@@ -545,46 +761,39 @@ def emit_conversions_module(f, unicode_data):
 
     fn bsearch_case_table(c: char, table: &[(char, [char; 3])]) -> Option<usize> {
         table.binary_search_by(|&(key, _)| key.cmp(&c)).ok()
-    }
-
-""")
-    t_type = "&[(char, [char; 3])]"
-    pfun = lambda x: "(%s,[%s,%s,%s])" % (
-        escape_char(x[0]), escape_char(x[1][0]), escape_char(x[1][1]), escape_char(x[1][2]))
-
-    emit_table(f,
-               name="to_lowercase_table",
-               t_data=sorted(unicode_data.to_lower.items(), key=operator.itemgetter(0)),
-               t_type=t_type,
-               is_pub=False,
-               pfun=pfun)
-
-    emit_table(f,
-               name="to_uppercase_table",
-               t_data=sorted(unicode_data.to_upper.items(), key=operator.itemgetter(0)),
-               t_type=t_type,
-               is_pub=False,
-               pfun=pfun)
-
-    f.write("}\n")
-
-
-def emit_norm_module(f, unicode_data, norm_props):
-    canon_keys = sorted(unicode_data.canon_decomp.keys())
-
-    canon_comp = {}
-    comp_exclusions = norm_props["Full_Composition_Exclusion"]
-    for char in canon_keys:
-        if any(lo <= char <= hi for lo, hi in comp_exclusions):
-            continue
-        decomp = unicode_data.canon_decomp[char]
-        if len(decomp) == 2:
-            if decomp[0] not in canon_comp:
-                canon_comp[decomp[0]] = []
-            canon_comp[decomp[0]].append((decomp[1], char))
+    }\n\n"""
+
+    decl_type = "&[(char, [char; 3])]"
+    format_conversion = lambda x: "({},[{},{},{}])".format(*(
+        escape_char(c) for c in (x[0], x[1][0], x[1][1], x[1][2])
+    ))
+
+    for fragment in generate_table(
+        name="to_lowercase_table",
+        items=sorted(unicode_data.to_lower.items(), key=lambda x: x[0]),
+        decl_type=decl_type,
+        is_pub=False,
+        format_item=format_conversion
+    ):
+        yield fragment
+
+    for fragment in generate_table(
+        name="to_uppercase_table",
+        items=sorted(unicode_data.to_upper.items(), key=lambda x: x[0]),
+        decl_type=decl_type,
+        is_pub=False,
+        format_item=format_conversion
+    ):
+        yield fragment
+
+    yield "}\n"
 
 
 def parse_args():
+    # type: () -> argparse.Namespace
+    """
+    Parse command line arguments.
+    """
     parser = argparse.ArgumentParser(description=__doc__)
     parser.add_argument("-v", "--version", default=None, type=str,
                         help="Unicode version to use (if not specified,"
@@ -594,56 +803,63 @@ def parse_args():
 
 
 def main():
+    # type: () -> None
+    """
+    Script entry point.
+    """
     args = parse_args()
 
     unicode_version = fetch_files(args.version)
     print("Using Unicode version: {}".format(unicode_version.as_str))
 
+    # all the writing happens entirely in memory, we only write to file
+    # once we have generated the file content (it's not very large, <1 MB)
+    buf = StringIO()
+    buf.write(PREAMBLE)
+
+    unicode_version_notice = textwrap.dedent("""
+    /// The version of [Unicode](http://www.unicode.org/) that the Unicode parts of
+    /// `char` and `str` methods are based on.
+    #[unstable(feature = "unicode_version", issue = "49726")]
+    pub const UNICODE_VERSION: UnicodeVersion = UnicodeVersion {{
+        major: {version.major},
+        minor: {version.minor},
+        micro: {version.micro},
+        _priv: (),
+    }};
+    """).format(version=unicode_version)
+    buf.write(unicode_version_notice)
+
+    get_path = lambda f: get_unicode_file_path(unicode_version, f)
+
+    unicode_data = load_unicode_data(get_path(UnicodeFiles.UNICODE_DATA))
+    load_special_casing(get_path(UnicodeFiles.SPECIAL_CASING), unicode_data)
+
+    want_derived = {"XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase",
+                    "Cased", "Case_Ignorable", "Grapheme_Extend"}
+    derived = load_properties(get_path(UnicodeFiles.DERIVED_CORE_PROPERTIES), want_derived)
+
+    props = load_properties(get_path(UnicodeFiles.PROPS),
+                            {"White_Space", "Join_Control", "Noncharacter_Code_Point",
+                             "Pattern_White_Space"})
+
+    # category tables
+    for (name, categories, category_subset) in (
+            ("general_category", unicode_data.general_categories, ["N", "Cc"]),
+            ("derived_property", derived, want_derived),
+            ("property", props, ["White_Space", "Pattern_White_Space"])
+    ):
+        for fragment in generate_property_module(name, categories, category_subset):
+            buf.write(fragment)
+
+    for fragment in generate_conversions_module(unicode_data):
+        buf.write(fragment)
+
     tables_rs_path = os.path.join(THIS_DIR, "tables.rs")
 
     # will overwrite the file if it exists
-    with open(tables_rs_path, "w") as rf:
-        rf.write(PREAMBLE)
-
-        unicode_version_notice = textwrap.dedent("""
-        /// The version of [Unicode](http://www.unicode.org/) that the Unicode parts of
-        /// `char` and `str` methods are based on.
-        #[unstable(feature = "unicode_version", issue = "49726")]
-        pub const UNICODE_VERSION: UnicodeVersion = UnicodeVersion {{
-            major: {version.major},
-            minor: {version.minor},
-            micro: {version.micro},
-            _priv: (),
-        }};
-        """).format(version=unicode_version)
-        rf.write(unicode_version_notice)
-
-        get_path = lambda f: get_unicode_file_path(unicode_version, f)
-
-        unicode_data = load_unicode_data(get_path(UnicodeFiles.UNICODE_DATA))
-        load_special_casing(get_path(UnicodeFiles.SPECIAL_CASING), unicode_data)
-
-        want_derived = ["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase",
-                        "Cased", "Case_Ignorable", "Grapheme_Extend"]
-        derived = load_properties(get_path(UnicodeFiles.DERIVED_CORE_PROPERTIES), want_derived)
-
-        # FIXME scripts not used?
-        scripts = load_properties(get_path(UnicodeFiles.SCRIPTS), [])
-        props = load_properties(get_path(UnicodeFiles.PROPS),
-                                ["White_Space", "Join_Control", "Noncharacter_Code_Point",
-                                 "Pattern_White_Space"])
-        norm_props = load_properties(get_path(UnicodeFiles.DERIVED_NORMALIZATION_PROPS),
-                                     ["Full_Composition_Exclusion"])
-
-        # category tables
-        for (name, cat, pfuns) in (("general_category", unicode_data.gencats, ["N", "Cc"]),
-                                   ("derived_property", derived, want_derived),
-                                   ("property", props, ["White_Space", "Pattern_White_Space"])):
-            emit_property_module(rf, name, cat, pfuns)
-
-        # normalizations and conversions module
-        emit_norm_module(rf, unicode_data, norm_props)
-        emit_conversions_module(rf, unicode_data)
+    with open(tables_rs_path, "w") as fd:
+        fd.write(buf.getvalue())
 
     print("Regenerated tables.rs.")
 

From 60ccf89693037b3c010b027081d253b9c69a304c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pawe=C5=82=20Romanowski?= <pawroman@gmail.com>
Date: Mon, 10 Jun 2019 20:45:58 +0200
Subject: [PATCH 5/6] Apply suggestions from code review

Co-Authored-By: varkor <github@varkor.com>
---
 src/libcore/unicode/unicode.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/libcore/unicode/unicode.py b/src/libcore/unicode/unicode.py
index f66e82299100d..9eaf6eb9baa9e 100755
--- a/src/libcore/unicode/unicode.py
+++ b/src/libcore/unicode/unicode.py
@@ -36,7 +36,7 @@
 try:
     # completely optional type hinting
     # (Python 2 compatible using comments,
-    #  see: https://mypy.readthedocs.io/en/latest/python2.html)
+    # see: https://mypy.readthedocs.io/en/latest/python2.html)
     # This is very helpful in typing-aware IDE like PyCharm.
     from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Set, Tuple
 except ImportError:
@@ -95,7 +95,8 @@ class UnicodeFiles(object):
     "Cc": ["C"], "Cf": ["C"], "Cs": ["C"], "Co": ["C"], "Cn": ["C"],
 }
 
-# this is the surrogate codepoints range (both ends inclusive)
+# This is the (inclusive) range of surrogate codepoints.
+# These are not valid Rust characters.
 # - they are not valid Rust characters
 SURROGATE_CODEPOINTS_RANGE = (0xd800, 0xdfff)
 
@@ -122,7 +123,7 @@ def fetch_files(version=None):
     """
     Fetch all the Unicode files from unicode.org.
 
-    This will use cached files (stored in FETCH_DIR) if they exist,
+    This will use cached files (stored in `FETCH_DIR`) if they exist,
     creating them if they don't.  In any case, the Unicode version
     is always returned.
 
@@ -797,7 +798,7 @@ def parse_args():
     parser = argparse.ArgumentParser(description=__doc__)
     parser.add_argument("-v", "--version", default=None, type=str,
                         help="Unicode version to use (if not specified,"
-                             " defaults to latest available final release).")
+                             " defaults to latest release).")
 
     return parser.parse_args()
 

From 2b47a085dd418447f1dd79986df94dd051f27c79 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pawe=C5=82=20Romanowski?= <pawroman@gmail.com>
Date: Mon, 10 Jun 2019 21:13:01 +0200
Subject: [PATCH 6/6] Address review remarks in unicode.py

---
 src/libcore/unicode/unicode.py | 116 +++++++++++++++++----------------
 1 file changed, 61 insertions(+), 55 deletions(-)

diff --git a/src/libcore/unicode/unicode.py b/src/libcore/unicode/unicode.py
index 9eaf6eb9baa9e..a0539cd9ca9b6 100755
--- a/src/libcore/unicode/unicode.py
+++ b/src/libcore/unicode/unicode.py
@@ -34,7 +34,7 @@
     from StringIO import StringIO
 
 try:
-    # completely optional type hinting
+    # Completely optional type hinting
     # (Python 2 compatible using comments,
     # see: https://mypy.readthedocs.io/en/latest/python2.html)
     # This is very helpful in typing-aware IDE like PyCharm.
@@ -43,9 +43,9 @@
     pass
 
 
-# we don't use enum.Enum because of Python 2.7 compatibility
+# We don't use enum.Enum because of Python 2.7 compatibility.
 class UnicodeFiles(object):
-    # ReadMe does not contain any unicode data, we
+    # ReadMe does not contain any Unicode data; we
     # only use it to extract versions.
     README = "ReadMe.txt"
 
@@ -57,11 +57,15 @@ class UnicodeFiles(object):
     UNICODE_DATA = "UnicodeData.txt"
 
 
-UnicodeFiles.ALL_FILES = tuple(
-    getattr(UnicodeFiles, name) for name in dir(UnicodeFiles)
+# The order doesn't really matter (Python < 3.6 won't preserve it);
+# we only want to aggregate all the file names.
+ALL_UNICODE_FILES = tuple(
+    value for name, value in UnicodeFiles.__dict__.items()
     if not name.startswith("_")
 )
 
+assert len(ALL_UNICODE_FILES) == 7, "Unexpected number of unicode files"
+
 # The directory this file is located in.
 THIS_DIR = os.path.dirname(os.path.realpath(__file__))
 
@@ -97,18 +101,17 @@ class UnicodeFiles(object):
 
 # This is the (inclusive) range of surrogate codepoints.
 # These are not valid Rust characters.
-# - they are not valid Rust characters
 SURROGATE_CODEPOINTS_RANGE = (0xd800, 0xdfff)
 
 UnicodeData = namedtuple(
     "UnicodeData", (
-        # conversions:
+        # Conversions:
         "to_upper", "to_lower", "to_title",
 
-        # decompositions: canonical decompositions, compatibility decomp
+        # Decompositions: canonical decompositions, compatibility decomp
         "canon_decomp", "compat_decomp",
 
-        # grouped: general categories and combining characters
+        # Grouped: general categories and combining characters
         "general_categories", "combines",
     )
 )
@@ -136,10 +139,10 @@ def fetch_files(version=None):
         return have_version
 
     if version:
-        # check if the desired version exists on the server
+        # Check if the desired version exists on the server.
         get_fetch_url = lambda name: FETCH_URL_VERSION.format(version=version, filename=name)
     else:
-        # extract the latest version
+        # Extract the latest version.
         get_fetch_url = lambda name: FETCH_URL_LATEST.format(filename=name)
 
     readme_url = get_fetch_url(UnicodeFiles.README)
@@ -153,14 +156,14 @@ def fetch_files(version=None):
 
     download_dir = get_unicode_dir(unicode_version)
     if not os.path.exists(download_dir):
-        # for 2.7 compat, we don't use exist_ok=True
+        # For 2.7 compat, we don't use `exist_ok=True`.
         os.makedirs(download_dir)
 
-    for filename in UnicodeFiles.ALL_FILES:
+    for filename in ALL_UNICODE_FILES:
         file_path = get_unicode_file_path(unicode_version, filename)
 
         if os.path.exists(file_path):
-            # assume file on the server didn't change if it's been saved before
+            # Assume file on the server didn't change if it's been saved before.
             continue
 
         if filename == UnicodeFiles.README:
@@ -178,15 +181,16 @@ def check_stored_version(version):
     # type: (Optional[str]) -> Optional[UnicodeVersion]
     """
     Given desired Unicode version, return the version
-    if stored files are all present, and None otherwise.
+    if stored files are all present, and `None` otherwise.
     """
     if not version:
-        # should always check latest version
+        # If no desired version is specified, we should check what the latest
+        # version is, skipping stored version checks.
         return None
 
     fetch_dir = os.path.join(FETCH_DIR, version)
 
-    for filename in UnicodeFiles.ALL_FILES:
+    for filename in ALL_UNICODE_FILES:
         file_path = os.path.join(fetch_dir, filename)
 
         if not os.path.exists(file_path):
@@ -199,11 +203,11 @@ def check_stored_version(version):
 def parse_readme_unicode_version(readme_content):
     # type: (str) -> UnicodeVersion
     """
-    Parse the Unicode version contained in their ReadMe.txt file.
+    Parse the Unicode version contained in their `ReadMe.txt` file.
     """
-    # "raw string" is necessary for \d not being treated as escape char
-    # (for the sake of compat with future Python versions)
-    # see: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior
+    # "Raw string" is necessary for `\d` not to be treated as an escape char
+    # (for the sake of compat with future Python versions).
+    # See: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior
     pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
     groups = re.search(pattern, readme_content).groups()
 
@@ -213,7 +217,7 @@ def parse_readme_unicode_version(readme_content):
 def get_unicode_dir(unicode_version):
     # type: (UnicodeVersion) -> str
     """
-    Indicate where the unicode data files should be stored.
+    Indicate in which parent dir the Unicode data files should be stored.
 
     This returns a full, absolute path.
     """
@@ -223,7 +227,7 @@ def get_unicode_dir(unicode_version):
 def get_unicode_file_path(unicode_version, filename):
     # type: (UnicodeVersion, str) -> str
     """
-    Indicate where the unicode data file should be stored.
+    Indicate where the Unicode data file should be stored.
     """
     return os.path.join(get_unicode_dir(unicode_version), filename)
 
@@ -239,22 +243,22 @@ def is_surrogate(n):
 def load_unicode_data(file_path):
     # type: (str) -> UnicodeData
     """
-    Load main unicode data.
+    Load main Unicode data.
     """
-    # conversions
+    # Conversions
     to_lower = {}   # type: Dict[int, Tuple[int, int, int]]
     to_upper = {}   # type: Dict[int, Tuple[int, int, int]]
     to_title = {}   # type: Dict[int, Tuple[int, int, int]]
 
-    # decompositions
+    # Decompositions
     compat_decomp = {}   # type: Dict[int, List[int]]
     canon_decomp = {}    # type: Dict[int, List[int]]
 
-    # combining characters
+    # Combining characters
     # FIXME: combines are not used
     combines = defaultdict(set)   # type: Dict[str, Set[int]]
 
-    # categories
+    # Categories
     general_categories = defaultdict(set)   # type: Dict[str, Set[int]]
     category_assigned_codepoints = set()    # type: Set[int]
 
@@ -283,41 +287,42 @@ def load_unicode_data(file_path):
          decomp, deci, digit, num, mirror,
          old, iso, upcase, lowcase, titlecase) = data
 
-        # generate char to char direct common and simple conversions
-        # uppercase to lowercase
+        # Generate char to char direct common and simple conversions:
+
+        # Uppercase to lowercase
         if lowcase != "" and code_org != lowcase:
             to_lower[code] = (int(lowcase, 16), 0, 0)
 
-        # lowercase to uppercase
+        # Lowercase to uppercase
         if upcase != "" and code_org != upcase:
             to_upper[code] = (int(upcase, 16), 0, 0)
 
-        # title case
+        # Title case
         if titlecase.strip() != "" and code_org != titlecase:
             to_title[code] = (int(titlecase, 16), 0, 0)
 
-        # store decomposition, if given
+        # Store decomposition, if given
         if decomp:
             decompositions = decomp.split()[1:]
             decomp_code_points = [int(i, 16) for i in decompositions]
 
             if decomp.startswith("<"):
-                # compatibility decomposition
+                # Compatibility decomposition
                 compat_decomp[code] = decomp_code_points
             else:
-                # canonical decomposition
+                # Canonical decomposition
                 canon_decomp[code] = decomp_code_points
 
-        # place letter in categories as appropriate
+        # Place letter in categories as appropriate.
         for cat in itertools.chain((gencat, ), EXPANDED_CATEGORIES.get(gencat, [])):
             general_categories[cat].add(code)
             category_assigned_codepoints.add(code)
 
-        # record combining class, if any
+        # Record combining class, if any.
         if combine != "0":
             combines[combine].add(code)
 
-    # generate Not_Assigned from Assigned
+    # Generate Not_Assigned from Assigned.
     general_categories["Cn"] = get_unassigned_codepoints(category_assigned_codepoints)
 
     # Other contains Not_Assigned
@@ -336,7 +341,7 @@ def load_unicode_data(file_path):
 def load_special_casing(file_path, unicode_data):
     # type: (str, UnicodeData) -> None
     """
-    Load special casing data and enrich given unicode data.
+    Load special casing data and enrich given Unicode data.
     """
     for line in fileinput.input(file_path):
         data = line.split("#")[0].split(";")
@@ -474,9 +479,9 @@ def load_properties(file_path, interesting_props):
     Load properties data and return in grouped form.
     """
     props = defaultdict(list)   # type: Dict[str, List[Tuple[int, int]]]
-    # "raw string" is necessary for \. and \w not to be treated as escape chars
-    # (for the sake of compat with future Python versions)
-    # see: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior
+    # "Raw string" is necessary for `\.` and `\w` not to be treated as escape chars
+    # (for the sake of compat with future Python versions).
+    # See: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior
     re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
     re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")
 
@@ -486,7 +491,7 @@ def load_properties(file_path, interesting_props):
             groups = match.groups()
 
             if len(groups) == 2:
-                # re1 matched
+                # `re1` matched (2 groups).
                 d_lo, prop = groups
                 d_hi = d_lo
             else:
@@ -502,7 +507,7 @@ def load_properties(file_path, interesting_props):
 
         props[prop].append((lo_value, hi_value))
 
-    # optimize if possible
+    # Optimize if possible.
     for prop in props:
         props[prop] = group_codepoints(ungroup_codepoints(props[prop]))
 
@@ -587,10 +592,10 @@ def compute_trie(raw_data, chunk_size):
     for i in range(len(raw_data) // chunk_size):
         data = raw_data[i * chunk_size : (i + 1) * chunk_size]
 
-        # postfix compression of child nodes (data chunks)
-        # (identical child nodes are shared)
+        # Postfix compression of child nodes (data chunks)
+        # (identical child nodes are shared).
 
-        # make a tuple out of the list so it's hashable
+        # Make a tuple out of the list so it's hashable.
         child = tuple(data)
         if child not in childmap:
             childmap[child] = len(childmap)
@@ -609,7 +614,7 @@ def generate_bool_trie(name, codepoint_ranges, is_pub=True):
     This yields string fragments that should be joined to produce
     the final string.
 
-    See: bool_trie.rs
+    See: `bool_trie.rs`.
     """
     chunk_size = 64
     rawdata = [False] * 0x110000
@@ -617,7 +622,7 @@ def generate_bool_trie(name, codepoint_ranges, is_pub=True):
         for cp in range(lo, hi + 1):
             rawdata[cp] = True
 
-    # convert to bitmap chunks of chunk_size bits each
+    # Convert to bitmap chunks of `chunk_size` bits each.
     chunks = []
     for i in range(0x110000 // chunk_size):
         chunk = 0
@@ -679,9 +684,9 @@ def generate_bool_trie(name, codepoint_ranges, is_pub=True):
 def generate_small_bool_trie(name, codepoint_ranges, is_pub=True):
     # type: (str, List[Tuple[int, int]], bool) -> Iterator[str]
     """
-    Generate Rust code for SmallBoolTrie struct.
+    Generate Rust code for `SmallBoolTrie` struct.
 
-    See: bool_trie.rs
+    See: `bool_trie.rs`.
     """
     last_chunk = max(hi // 64 for (lo, hi) in codepoint_ranges)
     n_chunks = last_chunk + 1
@@ -813,8 +818,8 @@ def main():
     unicode_version = fetch_files(args.version)
     print("Using Unicode version: {}".format(unicode_version.as_str))
 
-    # all the writing happens entirely in memory, we only write to file
-    # once we have generated the file content (it's not very large, <1 MB)
+    # All the writing happens entirely in memory; we only write to the file
+    # once we have generated the file content (it's not very large, <1 MB).
     buf = StringIO()
     buf.write(PREAMBLE)
 
@@ -844,7 +849,7 @@ def main():
                             {"White_Space", "Join_Control", "Noncharacter_Code_Point",
                              "Pattern_White_Space"})
 
-    # category tables
+    # Category tables
     for (name, categories, category_subset) in (
             ("general_category", unicode_data.general_categories, ["N", "Cc"]),
             ("derived_property", derived, want_derived),
@@ -858,7 +863,8 @@ def main():
 
     tables_rs_path = os.path.join(THIS_DIR, "tables.rs")
 
-    # will overwrite the file if it exists
+    # Actually write out the file content.
+    # This will overwrite the file if it exists.
     with open(tables_rs_path, "w") as fd:
         fd.write(buf.getvalue())