# Tesseract. For a detailed description of the phases, see
# https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract
#

#######################################
# Print the command-line usage summary for tesstrain.sh.
# Arguments: none
# Outputs:   writes the usage text to stdout
# Returns:   exit status of cat (0 on success)
#######################################
display_usage() {
  # A quoted here-doc delimiter emits the text literally. This keeps the
  # double quotes around the "-1 0 1" example intact; inside a double-quoted
  # echo argument they would terminate the string and be dropped.
  cat <<'EOF'
USAGE: tesstrain.sh
    --fontlist FONTS           # A list of fontnames to train on.
    --fonts_dir FONTS_PATH     # Path to font files.
    --lang LANG_CODE           # ISO 639 code.
    --langdata_dir DATADIR     # Path to tesseract/training/langdata directory.
    --output_dir OUTPUTDIR     # Location of output traineddata file.
    --save_box_tiff            # Save box/tiff pairs along with lstmf files.
    --overwrite                # Safe to overwrite files in output_dir.
    --linedata_only            # Only generate training data for lstmtraining.
    --run_shape_clustering     # Run shape clustering (use for Indic langs).
    --exposures EXPOSURES      # A list of exposure levels to use (e.g. "-1 0 1").

  OPTIONAL flags for input data. If unspecified we will look for them in
  the langdata_dir directory.
    --training_text TEXTFILE   # Text to render and use for training.
    --wordlist WORDFILE        # Word list for the language ordered by
                               # decreasing frequency.

  OPTIONAL flag to specify location of existing traineddata files, required
  during feature extraction. If unspecified will use TESSDATA_PREFIX defined in
  the current environment.
    --tessdata_dir TESSDATADIR # Path to tesseract/tessdata directory.

  NOTE:
  The font names specified in --fontlist need to be recognizable by Pango using
  fontconfig. An easy way to list the canonical names of all fonts available on
  your system is to run text2image with --list_available_fonts and the
  appropriate --fonts_dir path.
EOF
}

# Quote $0 so a script path containing spaces or glob characters still resolves.
source "$(dirname "$0")/tesstrain_utils.sh"

# Explicit help request: show usage and exit successfully.
# ${1:-} keeps the comparison safe when no argument was passed.
if [[ "${1:-}" == "--help" || "${1:-}" == "-h" ]]; then
  display_usage
  exit 0
fi

# Running with no arguments at all is a usage error: show usage, exit non-zero.
if (( $# == 0 )); then
  display_usage
  exit 1
fi

# Keep a copy of the original arguments in ARGV, then call parse_flags
# (defined outside this block).
ARGV=("$@")
parse_flags