Skip to content

Commit

Permalink
Add --exposures option to tesstrain.sh
Browse files Browse the repository at this point in the history
This flag can be used to specify multiple different exposure levels
for a training run. There was already some code in tesstrain_utils.sh
to deal with multiple exposure levels, so it looks like this
functionality was always intended.

The default usage does not change: exposure level 0 is the only one
used if --exposures is not specified.
  • Loading branch information
nickjwhite committed Sep 10, 2015
1 parent 8e71c79 commit c0133ec
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 29 deletions.
1 change: 1 addition & 0 deletions training/tesstrain.sh
Expand Up @@ -24,6 +24,7 @@
# --output_dir OUTPUTDIR # Location of output traineddata file.
# --overwrite # Safe to overwrite files in output_dir.
# --run_shape_clustering # Run shape clustering (use for Indic langs).
# --exposures EXPOSURES # A list of exposure levels to use (e.g. "-1 0 1").
#
# OPTIONAL flags for input data. If unspecified we will look for them in
# the langdata_dir directory.
Expand Down
69 changes: 40 additions & 29 deletions training/tesstrain_utils.sh
Expand Up @@ -26,6 +26,7 @@ OVERWRITE=0
RUN_SHAPE_CLUSTERING=0
EXTRACT_FONT_PROPERTIES=1
WORKSPACE_DIR="/tmp/tesstrain"
EXPOSURES=0

# Logging helper functions.
tlog() {
Expand Down Expand Up @@ -98,6 +99,16 @@ parse_flags() {
FONTS=( ${ARGV[$j]} )
IFS=$ofs
i=$j ;;
--exposures)
exp=""
while test $j -lt ${#ARGV[@]}; do
test -z ${ARGV[$j]} && break
test `echo ${ARGV[$j]} | cut -c -2` = "--" && break
exp="$exp ${ARGV[$j]}"
j=$((j+1))
done
parse_value "EXPOSURES" "$exp"
i=$((j-1)) ;;
--fonts_dir)
parse_value "FONTS_DIR" ${ARGV[$j]}
i=$j ;;
Expand Down Expand Up @@ -226,35 +237,36 @@ phase_I_generate_image() {
err_exit "Could not find training text file ${TRAINING_TEXT}"
fi
CHAR_SPACING="0.0"
EXPOSURE="0"

if (( ${EXTRACT_FONT_PROPERTIES} )) && [[ -r ${BIGRAM_FREQS_FILE} ]]; then
# Parse .bigram_freqs file and compose a .train_ngrams file with text
# for tesseract to recognize during training. Take only the ngrams whose
# combined weight accounts for 95% of all the bigrams in the language.
NGRAM_FRAC=$(cat ${BIGRAM_FREQS_FILE} \
| awk '{s=s+$2}; END {print (s/100)*p}' p=99)
cat ${BIGRAM_FREQS_FILE} | sort -rnk2 \
| awk '{s=s+$2; if (s <= x) {printf "%s ", $1; } }' \
x=${NGRAM_FRAC} > ${TRAIN_NGRAMS_FILE}
check_file_readable ${TRAIN_NGRAMS_FILE}
fi

local counter=0
for font in "${FONTS[@]}"; do
generate_font_image "${font}" &
let counter=counter+1
let rem=counter%par_factor
if [[ "${rem}" -eq 0 ]]; then
wait
for EXPOSURE in $EXPOSURES; do
if (( ${EXTRACT_FONT_PROPERTIES} )) && [[ -r ${BIGRAM_FREQS_FILE} ]]; then
# Parse .bigram_freqs file and compose a .train_ngrams file with text
# for tesseract to recognize during training. Take only the ngrams whose
# combined weight accounts for 95% of all the bigrams in the language.
NGRAM_FRAC=$(cat ${BIGRAM_FREQS_FILE} \
| awk '{s=s+$2}; END {print (s/100)*p}' p=99)
cat ${BIGRAM_FREQS_FILE} | sort -rnk2 \
| awk '{s=s+$2; if (s <= x) {printf "%s ", $1; } }' \
x=${NGRAM_FRAC} > ${TRAIN_NGRAMS_FILE}
check_file_readable ${TRAIN_NGRAMS_FILE}
fi
done
wait
# Check that each process was successful.
for font in "${FONTS[@]}"; do
local fontname=$(echo ${font} | tr ' ' '_' | sed 's/,//g')
local outbase=${TRAINING_DIR}/${LANG_CODE}.${fontname}.exp${EXPOSURE}
check_file_readable ${outbase}.box ${outbase}.tif

local counter=0
for font in "${FONTS[@]}"; do
generate_font_image "${font}" &
let counter=counter+1
let rem=counter%par_factor
if [[ "${rem}" -eq 0 ]]; then
wait
fi
done
wait
# Check that each process was successful.
for font in "${FONTS[@]}"; do
local fontname=$(echo ${font} | tr ' ' '_' | sed 's/,//g')
local outbase=${TRAINING_DIR}/${LANG_CODE}.${fontname}.exp${EXPOSURE}
check_file_readable ${outbase}.box ${outbase}.tif
done
done
}

Expand Down Expand Up @@ -359,10 +371,9 @@ phase_E_extract_features() {
par_factor=1
fi
tlog "\n=== Phase E: Extracting features ==="
TRAIN_EXPOSURES='0'

local img_files=""
for exposure in ${TRAIN_EXPOSURES}; do
for exposure in ${EXPOSURES}; do
img_files=${img_files}' '$(ls ${TRAINING_DIR}/*.exp${exposure}.tif)
done

Expand Down

0 comments on commit c0133ec

Please sign in to comment.