Added font support to character usage script
thvitt committed May 8, 2017
1 parent 1b79471 commit 016de29
Showing 1 changed file with 141 additions and 2 deletions.
143 changes: 141 additions & 2 deletions utils/collect_chars.py
@@ -10,6 +10,10 @@
import sys
from tqdm import tqdm
from multiprocessing import Pool
from subprocess import check_output
import pandas as pd
import argparse
import re

def count_in_file(filename):
try:
@@ -106,7 +110,32 @@ def intervals(iterable):
stop = item
yield (start, stop)

def _main(args):

def font_support_table(fontfile: str) -> pd.DataFrame:
"""
Runs otfinfo -u on the fontfile to get a table describing the font's character support
"""
def charinfo(line):
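        # otfinfo -u lines are expected to look like "uni0041 36 A":
        # a "uni"-prefixed hex codepoint, the glyph number, and the glyph name.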
codepoint_str, glyphno, glyphname = line.split()
codepoint = int(codepoint_str[3:], base=16)
return "{:>04X}".format(codepoint), chr(codepoint), glyphno, glyphname
otfinfo = check_output(['otfinfo', '-u', fontfile], universal_newlines=True)
return pd.DataFrame.from_records(
data=[charinfo(line) for line in otfinfo.split('\n') if line],
columns=['codepoint', 'char', 'glyphno', 'glyphname'],
index='codepoint')

def font_support(fontfile: str) -> pd.Series:
result = font_support_table(fontfile).glyphname
font_info = check_output(['otfinfo', '-i', fontfile], universal_newlines=True)
font_name = re.search(r'^Full name:\s*(.*)$', font_info, re.MULTILINE)
if font_name is None:
result.name = os.path.splitext(os.path.basename(fontfile))[0]
else:
result.name = font_name.group(1)
return result

def _main_old(args):
stats = collect_stats(args[1])
totals = sum_values(stats.values())

@@ -123,5 +152,115 @@ def _main(args):
json.dump(ordered_by_char(file_by_char(stats), totals), f, indent=2,
ensure_ascii=False)

def getargparser():
p = argparse.ArgumentParser(description="Analyze font usage",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
p.add_argument('-o', '--output', nargs='+', metavar='TSV', default=['char_stats.tsv'],
help='Write the character statistics table to these files')
ana = p.add_argument_group(
title='Collecting character groups',
description="""
If at least one -d option is given, the tool recursively parses all
files with one of the extensions from the -a option and counts all
characters it finds therein. The other options in this group create
various representations of this statistic. """)
ana.add_argument('-d', '--directory', nargs='+', metavar="DIR",
help="Analyze all XML/HTML files in this directory")
ana.add_argument('-a', '--accept', nargs='+', default=['.html', '.xml', '.svg', '.php'],
help='File extensions to accept')
ana.add_argument('-r', '--ranges',
help='write a list of used character ranges to this file')
ana.add_argument('-c', '--by-char',
help="""write a compressed JSON file that describes which
character occurs how often in which input file""")
font = p.add_argument_group(
title='Font analysis',
description="""
Uses otfinfo to analyze which characters each font supports and
adds this info to the result table. The content of each table cell
is the glyph name in that font.
Either uses the table created using -d, or reads one using -i.""")
font.add_argument('-i', '--input-table',
help="Input TSV file to augment, format as written by -o")
font.add_argument('-f', '--fonts', nargs='+',
help="Font files to analyze")
font.add_argument('-k', '--keep-unused', action='store_true',
help='Keep unused characters in the table')
return p
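
# Example invocations (hypothetical paths, based on the options defined above):
#   python utils/collect_chars.py -d data/xml -f fonts/Main.otf -o char_stats.tsv
#   python utils/collect_chars.py -i char_stats.tsv -f fonts/Extra.ttf -o char_stats.xlsx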

def main():
parser = getargparser()
options = parser.parse_args()
summary = None
if options.directory and options.input_table:
parser.error("Cannot combine -d and -i. Use --help for more info.")
if not(options.directory or options.input_table):
parser.error("Either -d or -i is required. Use --help for more info.")

if options.directory:
# collect statistics
stats = None
for directory in options.directory:
print("Collecting characters in {} ...".format(directory))
dir_stat = collect_stats(directory, accept=options.accept)
if stats is None:
stats = dir_stat
else:
stats.update(dir_stat)

print("Summarizing over all files ...")
totals = sum_values(stats.values())

if options.by_char:
fn = options.by_char
if ".json" not in fn: fn += ".json"
if not fn.endswith('.gz'): fn += '.gz'
print("Writing by-char statistics to {}...".format(fn))
with gzip.open(fn, "wt", encoding="utf-8") as f:
json.dump(ordered_by_char(file_by_char(stats), totals), f, indent=2,
ensure_ascii=False)

if options.ranges:
with open(options.ranges, "w", encoding="UTF-8") as ranges:
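                # writes one hex codepoint interval per line, comma-separated,
                # e.g. "0020-007E"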
intervs = intervals(map(ord, totals.keys()))
formatted = ("{:0>4X}-{:0>4X}".format(*i) for i in intervs)
ranges.write(",\n".join(formatted) + "\n")

print("Preparing summary table ...")
# now, prepare the support DF
summary = pd.DataFrame.from_records((
("{:>04X}".format(ord(char)),
char if char >= " " else "",
unicodedata.name(char, None),
count) for char, count in totals.items()),
columns=['codepoint', 'character', 'name', 'count'],
index='codepoint')

if options.input_table:
summary = pd.read_csv(options.input_table, sep='\t', index_col='codepoint')

if options.fonts:
for font in options.fonts:
print("Analyzing font {} ...".format(font))
support = font_support(font)
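            # the Series aligns on the shared codepoint index, adding one column
            # per font; characters the font does not cover remain NaN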
summary.loc[:,support.name] = support

if not(options.keep_unused):
summary.dropna(subset=['count'], inplace=True)

summary.sort_values(by='count', ascending=False, inplace=True)

for output in options.output:
print("Saving summary to {} ...".format(output))
ext = os.path.splitext(output)[1]
if ext == '.xls' or ext == '.xlsx':
summary.to_excel(output)
elif ext == '.html':
summary.to_html(output)
else:
summary.to_csv(output, sep='\t', encoding='utf-8')


if __name__ == "__main__":
_main(sys.argv)
main()
