From 1b794713d0e3304f26b9e81dd69cf35cff2a79ce Mon Sep 17 00:00:00 2001
From: Thorsten Vitt
Date: Fri, 5 May 2017 16:40:06 +0200
Subject: [PATCH] Stats on unicode characters in XML based files.

---
 utils/collect_chars.py | 127 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 127 insertions(+)
 create mode 100755 utils/collect_chars.py

diff --git a/utils/collect_chars.py b/utils/collect_chars.py
new file mode 100755
index 0000000..f09c37c
--- /dev/null
+++ b/utils/collect_chars.py
@@ -0,0 +1,127 @@
+#!/usr/bin/env python3
+
+from lxml import etree, html
+from collections import Counter, defaultdict, OrderedDict
+from ftfy import fix_text
+import unicodedata
+import json
+import gzip
+import os
+import sys
+from tqdm import tqdm
+from multiprocessing import Pool
+
+def count_in_file(filename):
+    try:
+        tree = etree.parse(filename)
+    except etree.XMLSyntaxError:
+        tree = html.parse(filename)
+
+    return Counter("".join(fix_text(node, fix_latin_ligatures=False,
+                                          fix_character_width=False,
+                                          uncurl_quotes=False)
+                           for node in tree.xpath('//text()')))
+
+
+def files_by_ext(path, accept=['.html', '.xml', '.svg', '.php']):
+    for dirpath, dirnames, filenames in os.walk(path):
+        for filename in filenames:
+            __, ext = os.path.splitext(filename)
+            if ext in accept:
+                yield os.path.join(dirpath, filename)
+
+
+def collect_stats(path, accept=['.html', '.xml', '.svg', '.php']):
+    """
+    Returns a dict mapping filename → Counter(char → count).
+    """
+
+    with Pool() as pool:
+        files = list(files_by_ext(path, accept))
+        counts = pool.imap(count_in_file, files)
+        tuples = zip(files, counts)
+        return dict(tqdm(tuples, unit=' files', total=len(files)))
+
+
+def sum_values(counters):
+    total = Counter()
+    for counter in counters:
+        total.update(counter)
+    return total
+
+def file_by_char(stats):
+    result = defaultdict(Counter)
+    for file, stat in stats.items():
+        for char, count in stat.items():
+            result[char][file] = count
+    return result
+
+def ordered_by_char(bychar, totals):
+    result = OrderedDict()
+    for char, __ in reversed(totals.most_common()):
+        result[char] = OrderedDict(bychar[char].most_common())
+    return result
+
+def format_item(char, count=''):
+    """
+    Formats the given character & count as an info line with the following tab-separated fields:
+
+    - hexadecimal representation of the codepoint
+    - the character itself, if it is not a control character
+    - the count
+    - the character's Unicode name
+    """
+    return "{:0>4X}\t{}\t{}\t{}\n".format(
+        ord(char),
+        char if char >= ' ' else '',
+        count,
+        unicodedata.name(char, ''))
+
+def intervals(iterable):
+    """
+    Sorts the iterable and calculates continuous intervals.
+
+    Args:
+        iterable:
+            An iterable of integers (or anything else that supports +1 and <)
+
+    Yields:
+        (start, stop) tuples, start and stop are inclusive.
+
+    Example:
+        >>> list(intervals([20,7,25,1,2,3,4,6,12,13,14,15,5,5]))
+        [(1, 7), (12, 15), (20, 20), (25, 25)]
+    """
+    ordered = sorted(iterable)
+    if not ordered:
+        return
+    start = ordered[0]
+    stop = start
+    for item in ordered[1:]:
+        if stop+1 < item:
+            yield (start, stop)
+            start = item
+            stop = start
+        else:
+            stop = item
+    yield (start, stop)
+
+def _main(args):
+    stats = collect_stats(args[1])
+    totals = sum_values(stats.values())
+
+    with open("char_stats.tsv", "w", encoding="UTF-8") as statfile:
+        statfile.writelines(format_item(char, count)
+                            for char, count in totals.most_common())
+
+    with open("ranges.txt", "w", encoding="UTF-8") as ranges:
+        intervs = intervals(map(ord, totals.keys()))
+        formatted = ("{:0>4X}-{:0>4X}".format(*i) for i in intervs)
+        ranges.write(",\n".join(formatted) + "\n")
+
+    with gzip.open("by_char.json.gz", "wt", encoding="utf-8") as f:
+        json.dump(ordered_by_char(file_by_char(stats), totals), f, indent=2,
+                  ensure_ascii=False)
+
+if __name__ == "__main__":
+    _main(sys.argv)
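For reference, a minimal sketch of how the helpers compose when the module is imported (this is not part of the patch; it assumes utils/ is on the Python path, and the directory "corpus/" and the count in the comment are hypothetical):

    # Hypothetical interactive use; "corpus/" is a placeholder path.
    from collect_chars import collect_stats, sum_values, format_item

    stats = collect_stats("corpus/")       # {filename: Counter(char -> count)}
    totals = sum_values(stats.values())    # character counts summed over all files
    print(format_item("ä", totals["ä"]), end="")
    # -> '00E4\tä\t42\tLATIN SMALL LETTER A WITH DIAERESIS\n' (the count 42 is made up)

Run as a script (python3 utils/collect_chars.py PATH), _main writes char_stats.tsv, ranges.txt and by_char.json.gz to the current working directory.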