Address review remarks in unicode.py
pawroman committed Jul 1, 2019
1 parent 60ccf89 commit 2b47a08
Showing 1 changed file with 61 additions and 55 deletions.
116 changes: 61 additions & 55 deletions src/libcore/unicode/unicode.py
@@ -34,7 +34,7 @@
from StringIO import StringIO

try:
-# completely optional type hinting
+# Completely optional type hinting
# (Python 2 compatible using comments,
# see: https://mypy.readthedocs.io/en/latest/python2.html)
# This is very helpful in typing-aware IDE like PyCharm.
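For context, the optional import enables mypy's comment-style annotations, which stay valid Python 2 syntax. A minimal sketch of that style (function and names are illustrative, not from this file):

    from typing import Dict, Optional  # used only by type checkers

    def lookup(table, key):
        # type: (Dict[str, int], str) -> Optional[int]
        # The comment above is the Python 2 compatible annotation form.
        return table.get(key)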
@@ -43,9 +43,9 @@
pass


-# we don't use enum.Enum because of Python 2.7 compatibility
+# We don't use enum.Enum because of Python 2.7 compatibility.
class UnicodeFiles(object):
-# ReadMe does not contain any unicode data, we
+# ReadMe does not contain any Unicode data, we
# only use it to extract versions.
README = "ReadMe.txt"

@@ -57,11 +57,15 @@ class UnicodeFiles(object):
UNICODE_DATA = "UnicodeData.txt"


-UnicodeFiles.ALL_FILES = tuple(
-    getattr(UnicodeFiles, name) for name in dir(UnicodeFiles)
+# The order doesn't really matter (Python < 3.6 won't preserve it),
+# we only want to aggregate all the file names.
+ALL_UNICODE_FILES = tuple(
+    value for name, value in UnicodeFiles.__dict__.items()
    if not name.startswith("_")
)

+assert len(ALL_UNICODE_FILES) == 7, "Unexpected number of unicode files"
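The new aggregation reads values straight out of the class __dict__, which, unlike dir(), does not pick up inherited names. The same idiom in isolation (class and file names are illustrative):

    class Files(object):
        A = "a.txt"
        B = "b.txt"

    ALL = tuple(
        value for name, value in Files.__dict__.items()
        if not name.startswith("_")  # skips __module__, __doc__, etc.
    )
    assert sorted(ALL) == ["a.txt", "b.txt"]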

# The directory this file is located in.
THIS_DIR = os.path.dirname(os.path.realpath(__file__))

@@ -97,18 +101,17 @@ class UnicodeFiles(object):

# This is the (inclusive) range of surrogate codepoints.
-# - they are not valid Rust characters
+# These are not valid Rust characters.
SURROGATE_CODEPOINTS_RANGE = (0xd800, 0xdfff)

UnicodeData = namedtuple(
"UnicodeData", (
-# conversions:
+# Conversions:
"to_upper", "to_lower", "to_title",

-# decompositions: canonical decompositions, compatibility decomp
+# Decompositions: canonical decompositions, compatibility decomp
"canon_decomp", "compat_decomp",

-# grouped: general categories and combining characters
+# Grouped: general categories and combining characters
"general_categories", "combines",
)
)
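Given the inclusive range above, checking for a surrogate is a plain bounds test; a sketch under that assumption (the helper name here is hypothetical, the file's own check is is_surrogate):

    LO, HI = (0xd800, 0xdfff)  # SURROGATE_CODEPOINTS_RANGE

    def in_surrogate_range(n):
        # type: (int) -> bool
        return LO <= n <= HI

    assert in_surrogate_range(0xd800) and not in_surrogate_range(0xe000)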
@@ -136,10 +139,10 @@ def fetch_files(version=None):
return have_version

if version:
-# check if the desired version exists on the server
+# Check if the desired version exists on the server.
get_fetch_url = lambda name: FETCH_URL_VERSION.format(version=version, filename=name)
else:
-# extract the latest version
+# Extract the latest version.
get_fetch_url = lambda name: FETCH_URL_LATEST.format(filename=name)

readme_url = get_fetch_url(UnicodeFiles.README)
@@ -153,14 +156,14 @@

download_dir = get_unicode_dir(unicode_version)
if not os.path.exists(download_dir):
-# for 2.7 compat, we don't use exist_ok=True
+# For 2.7 compat, we don't use `exist_ok=True`.
os.makedirs(download_dir)

-for filename in UnicodeFiles.ALL_FILES:
+for filename in ALL_UNICODE_FILES:
file_path = get_unicode_file_path(unicode_version, filename)

if os.path.exists(file_path):
-# assume file on the server didn't change if it's been saved before
+# Assume file on the server didn't change if it's been saved before.
continue

if filename == UnicodeFiles.README:
@@ -178,15 +181,16 @@ def check_stored_version(version):
# type: (Optional[str]) -> Optional[UnicodeVersion]
"""
Given desired Unicode version, return the version
-if stored files are all present, and None otherwise.
+if stored files are all present, and `None` otherwise.
"""
if not version:
-# should always check latest version
+# If no desired version specified, we should check what's the latest
+# version, skipping stored version checks.
return None

fetch_dir = os.path.join(FETCH_DIR, version)

-for filename in UnicodeFiles.ALL_FILES:
+for filename in ALL_UNICODE_FILES:
file_path = os.path.join(fetch_dir, filename)

if not os.path.exists(file_path):
@@ -199,11 +203,11 @@ def parse_readme_unicode_version(readme_content):
def parse_readme_unicode_version(readme_content):
# type: (str) -> UnicodeVersion
"""
-Parse the Unicode version contained in their ReadMe.txt file.
+Parse the Unicode version contained in their `ReadMe.txt` file.
"""
# "raw string" is necessary for \d not being treated as escape char
# (for the sake of compat with future Python versions)
# see: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior
# "Raw string" is necessary for \d not being treated as escape char
# (for the sake of compat with future Python versions).
# See: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior
pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
groups = re.search(pattern, readme_content).groups()
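The pattern can be exercised against a sentence shaped like the one in ReadMe.txt (the sample text is illustrative, not quoted from the commit):

    import re

    pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
    sample = "...data files for Version 12.1.0 of the Unicode Standard."
    major, minor, micro = map(int, re.search(pattern, sample).groups())
    assert (major, minor, micro) == (12, 1, 0)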

@@ -213,7 +217,7 @@ def parse_readme_unicode_version(readme_content):
def get_unicode_dir(unicode_version):
# type: (UnicodeVersion) -> str
"""
-Indicate where the unicode data files should be stored.
+Indicate in which parent dir the Unicode data files should be stored.
This returns a full, absolute path.
"""
@@ -223,7 +227,7 @@ def get_unicode_file_path(unicode_version, filename):
def get_unicode_file_path(unicode_version, filename):
# type: (UnicodeVersion, str) -> str
"""
-Indicate where the unicode data file should be stored.
+Indicate where the Unicode data file should be stored.
"""
return os.path.join(get_unicode_dir(unicode_version), filename)

@@ -239,22 +243,22 @@ def is_surrogate(n):
def load_unicode_data(file_path):
# type: (str) -> UnicodeData
"""
-Load main unicode data.
+Load main Unicode data.
"""
-# conversions
+# Conversions
to_lower = {} # type: Dict[int, Tuple[int, int, int]]
to_upper = {} # type: Dict[int, Tuple[int, int, int]]
to_title = {} # type: Dict[int, Tuple[int, int, int]]

-# decompositions
+# Decompositions
compat_decomp = {} # type: Dict[int, List[int]]
canon_decomp = {} # type: Dict[int, List[int]]

-# combining characters
+# Combining characters
# FIXME: combines are not used
combines = defaultdict(set) # type: Dict[str, Set[int]]

-# categories
+# Categories
general_categories = defaultdict(set) # type: Dict[str, Set[int]]
category_assigned_codepoints = set() # type: Set[int]
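UnicodeData.txt packs 15 semicolon-separated fields per codepoint; the tuple unpacking visible in the next hunk relies on exactly that shape. A representative real line, LATIN SMALL LETTER A:

    line = "0061;LATIN SMALL LETTER A;Ll;0;L;;;;;N;;;0041;;0041"
    fields = line.split(";")
    assert len(fields) == 15
    assert fields[2] == "Ll"              # general category
    assert int(fields[12], 16) == 0x0041  # simple uppercase mapping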

@@ -283,41 +287,42 @@ def load_unicode_data(file_path):
decomp, deci, digit, num, mirror,
old, iso, upcase, lowcase, titlecase) = data

-# generate char to char direct common and simple conversions
-# uppercase to lowercase
+# Generate char to char direct common and simple conversions:
+
+# Uppercase to lowercase
if lowcase != "" and code_org != lowcase:
to_lower[code] = (int(lowcase, 16), 0, 0)

-# lowercase to uppercase
+# Lowercase to uppercase
if upcase != "" and code_org != upcase:
to_upper[code] = (int(upcase, 16), 0, 0)

-# title case
+# Title case
if titlecase.strip() != "" and code_org != titlecase:
to_title[code] = (int(titlecase, 16), 0, 0)

-# store decomposition, if given
+# Store decomposition, if given
if decomp:
decompositions = decomp.split()[1:]
decomp_code_points = [int(i, 16) for i in decompositions]

if decomp.startswith("<"):
-# compatibility decomposition
+# Compatibility decomposition
compat_decomp[code] = decomp_code_points
else:
-# canonical decomposition
+# Canonical decomposition
canon_decomp[code] = decomp_code_points

-# place letter in categories as appropriate
+# Place letter in categories as appropriate.
for cat in itertools.chain((gencat, ), EXPANDED_CATEGORIES.get(gencat, [])):
general_categories[cat].add(code)
category_assigned_codepoints.add(code)

-# record combining class, if any
+# Record combining class, if any.
if combine != "0":
combines[combine].add(code)

-# generate Not_Assigned from Assigned
+# Generate Not_Assigned from Assigned.
general_categories["Cn"] = get_unassigned_codepoints(category_assigned_codepoints)

# Other contains Not_Assigned
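The decomposition field marks compatibility mappings with a leading angle-bracket tag, hence the startswith("<") test above. A short check using the real mapping for U+FB01 (LATIN SMALL LIGATURE FI):

    decomp = "<compat> 0066 0069"
    decomp_code_points = [int(i, 16) for i in decomp.split()[1:]]
    assert decomp.startswith("<")              # tagged: compatibility
    assert decomp_code_points == [0x66, 0x69]  # "f", "i"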
@@ -336,7 +341,7 @@ def load_unicode_data(file_path):
def load_special_casing(file_path, unicode_data):
# type: (str, UnicodeData) -> None
"""
-Load special casing data and enrich given unicode data.
+Load special casing data and enrich given Unicode data.
"""
for line in fileinput.input(file_path):
data = line.split("#")[0].split(";")
@@ -474,9 +479,9 @@ def load_properties(file_path, interesting_props):
Load properties data and return in grouped form.
"""
props = defaultdict(list) # type: Dict[str, List[Tuple[int, int]]]
# "raw string" is necessary for \. and \w not to be treated as escape chars
# (for the sake of compat with future Python versions)
# see: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior
# "Raw string" is necessary for `\.` and `\w` not to be treated as escape chars
# (for the sake of compat with future Python versions).
# See: https://docs.python.org/3.6/whatsnew/3.6.html#deprecated-python-behavior
re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")

Expand All @@ -486,7 +491,7 @@ def load_properties(file_path, interesting_props):
groups = match.groups()

if len(groups) == 2:
-# re1 matched
+# `re1` matched (2 groups).
d_lo, prop = groups
d_hi = d_lo
else:
Expand All @@ -502,7 +507,7 @@ def load_properties(file_path, interesting_props):

props[prop].append((lo_value, hi_value))

-# optimize if possible
+# Optimize if possible.
for prop in props:
props[prop] = group_codepoints(ungroup_codepoints(props[prop]))
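The two regexes separate single codepoints from ranges; a quick check with real White_Space entries shaped like PropList.txt lines (trailing comments omitted):

    import re

    re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
    re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")

    assert re1.match("0085     ; White_Space").groups() == ("0085", "White_Space")
    assert re2.match("0009..000D     ; White_Space").groups() == ("0009", "000D", "White_Space")
    assert re2.match("0085     ; White_Space") is None  # re2 only matches ranges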

@@ -587,10 +592,10 @@ def compute_trie(raw_data, chunk_size):
for i in range(len(raw_data) // chunk_size):
data = raw_data[i * chunk_size : (i + 1) * chunk_size]

-# postfix compression of child nodes (data chunks)
-# (identical child nodes are shared)
+# Postfix compression of child nodes (data chunks)
+# (identical child nodes are shared).

-# make a tuple out of the list so it's hashable
+# Make a tuple out of the list so it's hashable.
child = tuple(data)
if child not in childmap:
childmap[child] = len(childmap)
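childmap is what implements the sharing: chunks are hashed by value, so repeated chunks collapse into a single stored child. The idiom in isolation, with toy data:

    childmap = {}
    child_data = []
    root = []
    for chunk in ([1, 2], [3, 4], [1, 2]):
        child = tuple(chunk)  # tuples are hashable, lists are not
        if child not in childmap:
            childmap[child] = len(childmap)
            child_data.extend(chunk)
        root.append(childmap[child])

    assert root == [0, 1, 0]           # the repeated chunk reuses child 0
    assert child_data == [1, 2, 3, 4]  # each distinct chunk stored once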
@@ -609,15 +614,15 @@ def generate_bool_trie(name, codepoint_ranges, is_pub=True):
This yields string fragments that should be joined to produce
the final string.
-See: bool_trie.rs
+See: `bool_trie.rs`.
"""
chunk_size = 64
rawdata = [False] * 0x110000
for (lo, hi) in codepoint_ranges:
for cp in range(lo, hi + 1):
rawdata[cp] = True

-# convert to bitmap chunks of chunk_size bits each
+# Convert to bitmap chunks of `chunk_size` bits each.
chunks = []
for i in range(0x110000 // chunk_size):
chunk = 0
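Each chunk packs chunk_size booleans into one integer, setting bit j when codepoint i * chunk_size + j is in the set. A scaled-down sketch with 8-bit chunks (the real code uses 64):

    chunk_size = 8
    rawdata = [False] * 16
    for cp in (0, 3, 9):
        rawdata[cp] = True

    chunks = []
    for i in range(len(rawdata) // chunk_size):
        chunk = 0
        for j in range(chunk_size):
            if rawdata[i * chunk_size + j]:
                chunk |= 1 << j
        chunks.append(chunk)

    assert chunks == [0b1001, 0b10]  # bits 0 and 3, then bit 9 in chunk 1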
@@ -679,9 +684,9 @@ def generate_small_bool_trie(name, codepoint_ranges, is_pub=True):
def generate_small_bool_trie(name, codepoint_ranges, is_pub=True):
# type: (str, List[Tuple[int, int]], bool) -> Iterator[str]
"""
-Generate Rust code for SmallBoolTrie struct.
+Generate Rust code for `SmallBoolTrie` struct.

-See: bool_trie.rs
+See: `bool_trie.rs`.
"""
last_chunk = max(hi // 64 for (lo, hi) in codepoint_ranges)
n_chunks = last_chunk + 1
@@ -813,8 +818,8 @@ def main():
unicode_version = fetch_files(args.version)
print("Using Unicode version: {}".format(unicode_version.as_str))

-# all the writing happens entirely in memory, we only write to file
-# once we have generated the file content (it's not very large, <1 MB)
+# All the writing happens entirely in memory, we only write to file
+# once we have generated the file content (it's not very large, <1 MB).
buf = StringIO()
buf.write(PREAMBLE)
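The buffer-then-write pattern in miniature (io.StringIO and the output name are illustrative; the real script targets tables.rs):

    from io import StringIO  # Python 3 form of the import at the top

    buf = StringIO()
    buf.write("// example preamble\n")
    buf.write("pub const N: usize = 1;\n")

    with open("tables_example.rs", "w") as fd:
        fd.write(buf.getvalue())  # one write of the fully generated content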

@@ -844,7 +849,7 @@ def main():
{"White_Space", "Join_Control", "Noncharacter_Code_Point",
"Pattern_White_Space"})

-# category tables
+# Category tables
for (name, categories, category_subset) in (
("general_category", unicode_data.general_categories, ["N", "Cc"]),
("derived_property", derived, want_derived),
@@ -858,7 +863,8 @@

tables_rs_path = os.path.join(THIS_DIR, "tables.rs")

-# will overwrite the file if it exists
+# Actually write out the file content.
+# Will overwrite the file if it exists.
with open(tables_rs_path, "w") as fd:
fd.write(buf.getvalue())

