#!/usr/bin/ruby -w
require 'tileinfo.rb'
# Directory holding the helper programs that the rest of this script
# invokes (find-grid, crop-images, png-size, ...).  They are (re)built
# with make before anything else happens.
helpers = "png-find-grid-revised"
helpers_built = system("cd #{helpers} && make")
if !helpers_built
  puts "Failed to build the helper programs. Is libpng12-dev installed?"
  exit(-1)
end
# Fetch the HTML index page that links to every code chart PDF.
charts_index_page = "http://www.unicode.org/charts/PDF/"
# -f makes curl exit non-zero on an HTTP error status (without it an error
# page would be accepted as the index and the check below could never
# fire for such failures); -L follows redirects.
index_html = `curl -s -f -L #{charts_index_page}`
unless $?.success?
  puts "Couldn't fetch the charts index page from: #{charts_index_page}"
  exit(-1)
end
# Pull every "U<hex>.pdf" name out of the index page, drop duplicates, and
# order the names by the numeric value of their hexadecimal part.
pdf_re = Regexp.new( '(U([0-9A-F]+)\.pdf)' )
pdf_names = index_html.scan( pdf_re ).map { |full_name, _hex_digits| full_name }
pdf_names = pdf_names.sort.uniq
pdf_names.sort! do |left, right|
  left_value = left[pdf_re, 2].hex
  right_value = right[pdf_re, 2].hex
  left_value <=> right_value
end
# Download every chart PDF that is not already present in the current
# directory.  Any failure aborts the whole script.
pdf_names.each do |pdf_name|
  next if FileTest.exist?( pdf_name )
  puts "Downloading: #{pdf_name}"
  # -q has to stay first (curl only honours it in that position).  -f turns
  # an HTTP error status into a failure instead of saving the error page as
  # if it were the PDF, and -L follows redirects.
  unless system( "curl", "-q", "-f", "-L", charts_index_page + pdf_name, "-o", pdf_name )
    puts "Failed to download: #{pdf_name}"
    # Don't leave a partial file behind: the next run would see that it
    # exists and skip the download.
    File.delete( pdf_name ) if FileTest.exist?( pdf_name )
    exit( -1 )
  end
end
# Charts that are deliberately never converted, keyed by PDF filename with
# a short description as the value.
pdfs_to_skip = {
  'UD800.pdf' => "High Surrogate Area",
  'UDC00.pdf' => "Low Surrogate Area",
  'UE000.pdf' => "Private Use Area",
  'UF0000.pdf' => "Supplementary Private Use Area-A",
  'U100000.pdf' => "Supplementary Private Use Area-B"
}
# Render each remaining PDF to a series of greyscale PNGs with Ghostscript.
pdf_names.each do |pdf_name|
  # The %03d in the output name is filled in by gs with the page number.
  # The extension is matched with an escaped dot anchored at the end of the
  # name, so only a genuine ".pdf" suffix is ever rewritten.
  png_name = pdf_name.sub( /\.pdf\z/, '-%03d.png' )
  # The presence of page 002 is used as the "already converted" marker.
  # NOTE(review): a PDF with only one page would never produce this file
  # and so would be converted again on every run - confirm whether any
  # such chart exists.
  second_png_name = pdf_name.sub( /\.pdf\z/, '-002.png' )
  if FileTest.exist?( second_png_name )
    puts "#{pdf_name} has probably already been converted"
  elsif pdfs_to_skip[pdf_name]
    puts "We're supposed to skip #{pdf_name}"
  else
    puts "Converting #{pdf_name} to #{png_name}"
    converted = system( "gs",
                        "-sOutputFile=#{png_name}",
                        "-dNOPAUSE",
                        "-dSAFER",
                        "-dBATCH",
                        "-q",
                        "-sDEVICE=pnggray",
                        "-r800",
                        "-dGraphicsAlphaBits=4",
                        "-dTextAlphaBits=4",
                        pdf_name )
    unless converted
      puts "Failed to convert #{pdf_name} to a series of PNG files"
      exit(-1)
    end
  end
end
# List the page images and work out which of them have already been
# processed: a page counts as done when individual-characters/ holds either
# at least one tile cut from it or a ".png.empty" marker for it.
puts "Globbing directory..."
png_files = Dir['U*-*.png']
already_output_files = Dir['individual-characters/U*-*-*.png']
pngs_with_no_grid = Dir['individual-characters/U*-*.png.empty']
some_done_done_from = Hash.new
( already_output_files + pngs_with_no_grid ).each do |done_path|
  page_stem = done_path[/(U[0-9A-F]+-[0-9A-F]+)/, 1]
  some_done_done_from[page_stem + ".png"] = true if page_stem
end
# Cut every page image (other than page 001 of each chart) into one PNG per
# grid cell, written to individual-characters/.  The cell boundaries come
# from the find-grid helper, except for a few pages where hard-coded values
# are used instead.
png_files.each do |png_file|
  next if png_file =~ /U.*-001\.png/
  if some_done_done_from[png_file]
    puts "Looks as if we've already split #{png_file}"
    next
  end
  puts "Splitting characters out of each PNG file... "
  puts " Considering: "+png_file
  # %08d is filled in below with the running cell number.
  png_output_template = "individual-characters/" + png_file.gsub(/\.png/,"-%08d.png")
  last_in_each_cell_y = nil
  first_in_each_cell_y = nil
  last_in_each_cell_x = nil
  first_in_each_cell_x = nil
  # The find-grid program gets certain code pages wrong, and one of
  # the PDF files shouldn't be in there.
  if png_file =~ /UFEFF-.*\.png/
    next # The PDF shouldn't be in that directory...
  elsif png_file =~ /U2580-002\.png/
    # Fix this one...
    last_in_each_cell_x = [ 2247, 2592 ]
    last_in_each_cell_y = [ 1244, 1684, 2124, 2564, 3004, 3444, 3884, 4324, 4764, 5204, 5644, 6084, 6524, 6964, 7404, 7837 ]
    first_in_each_cell_x = [ 1908, 2252 ]
    first_in_each_cell_y = [ 816, 1249, 1689, 2129, 2569, 3009, 3449, 3889, 4329, 4769, 5209, 5649, 6089, 6529, 6969, 7409 ]
  elsif png_file =~ /UFFF0-002\.png/
    last_in_each_cell_x = [ 2414 ]
    last_in_each_cell_y = [ 5204, 5644, 6084, 6524, 6964 ]
    first_in_each_cell_x = [ 2085 ]
    first_in_each_cell_y = [ 4769, 5209, 5649, 6089, 6529 ]
  elsif png_file =~ /UFB50-004\.png/
    last_in_each_cell_x = [ 1387, 1739, 2090, 2442, 2794, 3146, 3497, 3849, 4200, 4552, 4904, 5952 ]
    last_in_each_cell_y = [ 1244, 1684, 2124, 2564, 3004, 3444, 3884, 4324, 4764, 5204, 5644, 6084, 6524, 6964, 7404, 7837 ]
    first_in_each_cell_x = [ 1041, 1392, 1744, 2095, 2447, 2799, 3151, 3502, 3854, 4205, 4557, 5612 ]
    first_in_each_cell_y = [ 816, 1249, 1689, 2129, 2569, 3009, 3449, 3889, 4329, 4769, 5209, 5649, 6089, 6529, 6969, 7409 ]
  else
    # NOTE(review): 333 and 427 are passed straight through to find-grid;
    # presumably the expected cell size in pixels - confirm against the
    # helper's source.
    grid_results_lines = `#{helpers}/find-grid #{png_file} 333 427`.split("\n")
    unless $?.success?
      puts "#{helpers}/find-grid #{png_file} 333 427 failed"
      exit(-1)
    end
    # Each line of interest looks like "<name>: v1 v2 ..."; lines with any
    # other name are ignored.
    grid_results_lines.each do |line|
      line.chomp!
      values = line.gsub(/^(.*): *()/,'\2').split(/ +/)
      name = line.gsub(/^ *(.*):.*$/,'\1')
      case name
      when 'last_in_each_cell_x'
        last_in_each_cell_x = values
      when 'last_in_each_cell_y'
        last_in_each_cell_y = values
      when 'first_in_each_cell_x'
        first_in_each_cell_x = values
      when 'first_in_each_cell_y'
        first_in_each_cell_y = values
      end
    end
  end
  # find-grid can exit successfully without printing all four lines; fail
  # with a clear message rather than a NoMethodError on nil further down.
  grid_edges = [ first_in_each_cell_x, last_in_each_cell_x,
                 first_in_each_cell_y, last_in_each_cell_y ]
  if grid_edges.any? { |edges| edges.nil? }
    puts "#{helpers}/find-grid #{png_file} 333 427 didn't report all of the grid edges"
    exit(-1)
  end
  cells_width = last_in_each_cell_x.length
  cells_height = last_in_each_cell_y.length
  # Exactly one dimension being empty means the grid detection went wrong.
  if (cells_width == 0) != (cells_height == 0)
    STDERR.puts "Broken: divided #{png_file} into #{cells_width} by #{cells_height}"
  end
  if cells_width == 0 || cells_height == 0
    # Leave a marker so that this page is skipped on later runs.
    system( "touch", "individual-characters/#{png_file}.empty" )
    next
  end
  puts " Cropping from #{png_file}"
  c_number = 0
  # crop-images reads one "<output file> <x> <y> <width> <height>" request
  # per line on its standard input.  The block form guarantees that the
  # pipe is closed (and $? set) even if an exception is raised.
  IO.popen( "#{helpers}/crop-images #{png_file}", "w" ) do |cropper|
    cells_width.times do |cell_x|
      right_x = Integer(last_in_each_cell_x[cell_x])
      left_x = Integer(first_in_each_cell_x[cell_x])
      cells_height.times do |cell_y|
        top_y = Integer(first_in_each_cell_y[cell_y])
        bottom_y = Integer(last_in_each_cell_y[cell_y])
        output_filename = sprintf( png_output_template, c_number )
        unless FileTest.exist?( output_filename )
          puts " Cropping to: #{output_filename}"
          cropper.puts( "#{output_filename} #{left_x} #{top_y} #{(right_x - left_x) + 1} #{(bottom_y - top_y) + 1}" )
          cropper.flush
        end
        # Cells are numbered column by column, whether or not they were
        # cropped on this run.
        c_number += 1
      end
    end
  end
  unless $?.success?
    puts "crop-images failed"
    exit(-1)
  end
end
# Ordering function for tile filenames of the form
# "...U<hex>-<hex>-<hex>....png".  The three hexadecimal-looking fields are
# glued together into a single hexadecimal number per filename and the two
# numbers are compared.  Returns -1, 0 or 1; raises ArgumentError when a
# filename does not have the expected shape.
def compare_tile_filenames( a, b )
  tile_pattern = /^.*U([0-9A-F]+)\-([0-9A-F]+)\-([0-9A-F]+).*\.png$/
  key_a, key_b = [ a, b ].map do |tile_name|
    Integer( tile_name.gsub( tile_pattern, '0x\1\2\3' ) )
  end
  key_a <=> key_b
end
# Collect every per-character tile.  On a repeat run the glob also picks
# up leftover -top and -bottom files, so anything with "top" or "bottom"
# in its path is dropped before the list is put into tile order.
files = Dir['individual-characters/U*-*-*.png'].reject do |tile_path|
  tile_path =~ /top/ || tile_path =~ /bottom/
end
files = files.sort { |left, right| compare_tile_filenames( left, right ) }
# ------------------------------------------------------------------------
# For every tile: split it into a top part and a bottom strip of text,
# record the size reported for the top part in top-sizes.yaml, and OCR the
# bottom strip into codepoints.yaml.  Tiles that are empty or look
# cross-hatched are skipped and their two parts deleted.
last_block = -1
File.open( "top-sizes.yaml", "w" ) do |o_top_sizes|
  o_top_sizes.puts "---"
  File.open( "codepoints.yaml", "w" ) do |o_codepoints|
    files.each do |fname|
      fname =~ /U([0-9A-F]+)-([0-9A-F]+)/
      block = Integer("0x"+$1)
      # Announce each block once, when its first tile is reached.
      if block != last_block
        puts "Extracting numbers from block: " + sprintf("0x%06X",block)
        last_block = block
      end
      block_name = name_of_page(fname.gsub(/^.*(U[0-9A-F]+).*$/,'\1'))
      next unless block_name
      info = `#{helpers}/png-size #{fname}`
      unless $?.success?
        puts "png-size #{fname} failed."
        next
      end
      # png-size is expected to print "<width>x<height>"; a tile whose size
      # cannot be parsed is passed over without any output.
      next unless info =~ /(\d+)x(\d+)/
      width = Integer($1)
      height = Integer($2)
      # Only a real ".png" extension at the end of the name is rewritten.
      top_part_fname = fname.sub( /\.png\z/, '-top.png' )
      bottom_part_fname = fname.sub( /\.png\z/, '-bottom.png' )
      # The text is assumed to occupy the last 75 pixel rows of the tile.
      guessed_text_size = 75
      text_starts_at = height - guessed_text_size
      # The block form closes the pipe (and sets $?) even on an exception.
      IO.popen( "#{helpers}/crop-images #{fname}", "w" ) do |cropper|
        cropper.puts( "#{top_part_fname} 0 0 #{width} #{text_starts_at}" )
        cropper.puts( "#{bottom_part_fname} 0 #{text_starts_at} #{width} #{guessed_text_size}" )
      end
      unless $?.success?
        puts "#{helpers}/crop-images #{fname} failed."
        exit(-1)
      end
      top_width = nil
      top_height = nil
      output = safe_backticks("#{helpers}/empty-image",top_part_fname)
      # A successful exit from empty-image means the top part is empty.
      if $?.success?
        puts "Empty, skipping: #{fname}"
        system("rm",top_part_fname)
        system("rm",bottom_part_fname)
        next
      end
      output.chomp!
      if output =~ /^(\d+) (\d+)/
        top_width = Integer($1)
        top_height = Integer($2)
      end
      # Check that the bottom part isn't hashed out, and delete
      # it if so:
      output = safe_backticks("#{helpers}/bottom-line-proportion",bottom_part_fname)
      proportion = Float(output)
      if proportion > 0.01
        puts "Probably cross-hatched, skipping: #{fname}"
        system("rm",top_part_fname)
        system("rm",bottom_part_fname)
        next
      end
      if top_width and top_height
        o_top_sizes.puts("#{top_part_fname}: !ruby/object:TileInfo")
        o_top_sizes.puts(" filename: #{top_part_fname}")
        o_top_sizes.puts(" h: #{top_height}")
        o_top_sizes.puts(" w: #{top_width}")
        o_top_sizes.puts(" block: #{block_name}")
      end
      # Now use OCR to try to parse the codepoint out of the bottom
      # part:
      result = `pngtopnm #{bottom_part_fname} | ocrad -`
      unless $?.success?
        puts "pngtopnm #{bottom_part_fname} | ocrad - failed."
        exit(-1)
      end
      # Strip all whitespace, then undo common OCR confusions (o/O -> 0,
      # a -> 8, g -> 9) before upper-casing the remaining hex digits.
      result = result.gsub( /\s/, '' ).tr( 'oOag', '0089' ).upcase
      o_codepoints.puts "-"
      o_codepoints.puts ' - "' + fname + '"'
      o_codepoints.puts " - " + "0x" + result
    end
  end
end