Added font support to character usage script
thvitt committed May 8, 2017
1 parent 1b79471 commit 016de29
Showing 1 changed file with 141 additions and 2 deletions.
143 changes: 141 additions & 2 deletions utils/collect_chars.py
@@ -10,6 +10,10 @@
import sys
from tqdm import tqdm
from multiprocessing import Pool
from subprocess import check_output
import pandas as pd
import argparse
import re

def count_in_file(filename):
try:
@@ -106,7 +110,32 @@ def intervals(iterable):
stop = item
yield (start, stop)

def _main(args):

def font_support_table(fontfile: str) -> pd.DataFrame:
"""
Runs otfinfo -u on the fontfile to get a table describing the font's character support
"""
def charinfo(line):
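        # otfinfo -u lines are expected to look like "uni0041 36 A":
        # a "uni"-prefixed hex codepoint, the glyph number, and the glyph name.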
codepoint_str, glyphno, glyphname = line.split()
codepoint = int(codepoint_str[3:], base=16)
return "{:>04X}".format(codepoint), chr(codepoint), glyphno, glyphname
otfinfo = check_output(['otfinfo', '-u', fontfile], universal_newlines=True)
return pd.DataFrame.from_records(
data=[charinfo(line) for line in otfinfo.split('\n') if line],
columns=['codepoint', 'char', 'glyphno', 'glyphname'],
index='codepoint')

def font_support(fontfile: str) -> pd.Series:
result = font_support_table(fontfile).glyphname
font_info = check_output(['otfinfo', '-i', fontfile], universal_newlines=True)
font_name = re.search(r'^Full name:\s*(.*)$', font_info, re.MULTILINE)
if font_name is None:
result.name = os.path.splitext(os.path.basename(fontfile))[0]
else:
result.name = font_name.group(1)
return result

def _main_old(args):
stats = collect_stats(args[1])
totals = sum_values(stats.values())

@@ -123,5 +152,115 @@ def _main(args):
json.dump(ordered_by_char(file_by_char(stats), totals), f, indent=2,
ensure_ascii=False)

def getargparser():
p = argparse.ArgumentParser(description="Analyze font usage",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
p.add_argument('-o', '--output', nargs='+', metavar='TSV', default=['char_stats.tsv'],
help='Write the character statistics table to these files')
ana = p.add_argument_group(
title='Collecting character groups',
description="""
If at least one -d option is given, the tool recursively parses all
files with one of the extensions from the -a option and counts all
characters it finds therein. The other options in this group create
various representations of this statistic. """)
ana.add_argument('-d', '--directory', nargs='+', metavar="DIR",
help="Analyze all XML/HTML files in this directory")
ana.add_argument('-a', '--accept', nargs='+', default=['.html', '.xml', '.svg', '.php'],
help='File extensions to accept')
ana.add_argument('-r', '--ranges',
help='write a list of used character ranges to this file')
ana.add_argument('-c', '--by-char',
help="""write a compressed JSON file that describes which
character occurs how often in which input file""")
font = p.add_argument_group(
title='Font analysis',
description="""
Uses otfinfo to analyze which characters each font supports and
adds this info to the result table. The content of each table cell
is the glyph name in that font.
Either uses the table created using -d, or reads one using -i.""")
font.add_argument('-i', '--input-table',
help="Input TSV file to augment, format as written by -o")
font.add_argument('-f', '--fonts', nargs='+',
help="Font files to analyze")
font.add_argument('-k', '--keep-unused', action='store_true',
help='Keep unused characters in the table')
return p
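
# Example invocations (hypothetical paths, based on the options defined above):
#   python utils/collect_chars.py -d data/xml -f fonts/Main.otf -o char_stats.tsv
#   python utils/collect_chars.py -i char_stats.tsv -f fonts/Extra.ttf -o char_stats.xlsx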

def main():
parser = getargparser()
options = parser.parse_args()
summary = None
if options.directory and options.input_table:
parser.error("Cannot combine -d and -i. Use --help for more info.")
if not(options.directory or options.input_table):
parser.error("Either -d or -i is required. Use --help for more info.")

if options.directory:
# collect statistics
stats = None
for directory in options.directory:
print("Collecting characters in {} ...".format(directory))
dir_stat = collect_stats(directory, accept=options.accept)
if stats is None:
stats = dir_stat
else:
stats.update(dir_stat)

print("Summarizing over all files ...")
totals = sum_values(stats.values())

if options.by_char:
fn = options.by_char
if ".json" not in fn: fn += ".json"
if not fn.endswith('.gz'): fn += '.gz'
print("Writing by-char statistics to {}...".format(fn))
with gzip.open(fn, "wt", encoding="utf-8") as f:
json.dump(ordered_by_char(file_by_char(stats), totals), f, indent=2,
ensure_ascii=False)

if options.ranges:
with open(options.ranges, "w", encoding="UTF-8") as ranges:
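                # writes one hex codepoint interval per line, comma-separated,
                # e.g. "0020-007E"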
intervs = intervals(map(ord, totals.keys()))
formatted = ("{:0>4X}-{:0>4X}".format(*i) for i in intervs)
ranges.write(",\n".join(formatted) + "\n")

print("Preparing summary table ...")
# now, prepare the support DF
summary = pd.DataFrame.from_records((
("{:>04X}".format(ord(char)),
char if char >= " " else "",
unicodedata.name(char, None),
count) for char, count in totals.items()),
columns=['codepoint', 'character', 'name', 'count'],
index='codepoint')

if options.input_table:
summary = pd.read_csv(options.input_table, sep='\t', index_col='codepoint')

if options.fonts:
for font in options.fonts:
print("Analyzing font {} ...".format(font))
support = font_support(font)
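            # the Series aligns on the shared codepoint index, adding one column
            # per font; characters the font does not cover remain NaN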
summary.loc[:,support.name] = support

if not(options.keep_unused):
summary.dropna(subset=['count'], inplace=True)

summary.sort_values(by='count', ascending=False, inplace=True)

for output in options.output:
print("Saving summary to {} ...".format(output))
ext = os.path.splitext(output)[1]
if ext == '.xls' or ext == '.xlsx':
summary.to_excel(output)
elif ext == '.html':
summary.to_html(output)
else:
summary.to_csv(output, sep='\t', encoding='utf-8')


if __name__ == "__main__":
_main(sys.argv)
main()
