Skip to content

Commit

Permalink
Update tesseract traineddata loader with new path search.
Browse files Browse the repository at this point in the history
First, we look in TESSDATA_PREFIX (if defined).

If not found there, we look in ROMFS (in tessdata).

If not found there, we look at the configured "tessdata" path
(which defaults to ${datadir}/tessdata). (${datadir} defaults to
${prefix}/share on unix, and ${gsrootdir} on windows.)

If not found there, we look in the current directory.

Update doc/Devices.html (and fix some indexing).
  • Loading branch information
robinwatts committed Oct 13, 2020
1 parent 848077c commit 5af4f31
Show file tree
Hide file tree
Showing 8 changed files with 110 additions and 20 deletions.
5 changes: 5 additions & 0 deletions Makefile.in
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,11 @@ COMPILE_INITS=@COMPILE_INITS@

GS_LIB_DEFAULT=$(gsdatadir)/Resource/Init:$(gsdatadir)/lib:$(gsdatadir)/Resource/Font:$(gsdir)/fonts:@fontpath@

# Define the default search path for Tesseract data (traineddata) files.
# Separate multiple directories with a colon (:).

TESSDATA=@tessdata@

# Define the default directory for cached data files
# this must be a single path.

Expand Down
2 changes: 1 addition & 1 deletion base/lib.mak
Original file line number Diff line number Diff line change
Expand Up @@ -3359,7 +3359,7 @@ $(GLD)romfs0.dev : $(LIB_MAK) $(ECHOGS_XE) $(LIB_MAK) $(MAKEDIRS)
$(GLGEN)gsromfs1_.c : $(MKROMFS_XE) $(PS_ROMFS_DEPS) $(LIB_MAK) $(MAKEDIRS)
$(EXP)$(MKROMFS_XE) -o $(GLGEN)gsromfs1_.c \
$(MKROMFS_FLAGS) -X .svn -X CVS -P $(GLSRCDIR)$(D)..$(D) iccprofiles$(D)* \
$(PS_ROMFS_ARGS) $(PS_FONT_ROMFS_ARGS) $(GL_ROMFS_ARGS) $(TESS_ROMFS_ARGS)
$(TESS_ROMFS_ARGS) $(PS_ROMFS_ARGS) $(PS_FONT_ROMFS_ARGS) $(GL_ROMFS_ARGS)

$(GLGEN)gsromfs1_1.c : $(MKROMFS_XE) $(PS_ROMFS_DEPS) $(LIB_MAK) $(MAKEDIRS)
$(EXP)$(MKROMFS_XE) -o $(GLGEN)gsromfs1_1.c \
Expand Down
2 changes: 1 addition & 1 deletion base/ocr.mak
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ $(GLGEN)libocr.dev : $(LIBOCR_MAK) $(ECHOGS_XE)$(MAKEDIRS)\
# Tesseract veneer.
$(GLGEN)tessocr.$(OBJ) : $(GLSRC)tessocr.cpp $(GLSRC)tessocr.h $(LIBOCR_MAK) \
$(gsmemory_h) $(gxiodev_h) $(stream_h) $(TESSDEPS)
$(TESSCXX) $(D_)LEPTONICA_INTERCEPT_MALLOC=1$(_D) $(I_)$(LEPTONICADIR)$(D)src$(_I) $(GLO_)tessocr.$(OBJ) $(C_) $(GLSRC)tessocr.cpp
$(TESSCXX) $(D_)LEPTONICA_INTERCEPT_MALLOC=1$(_D) $(I_)$(LEPTONICADIR)$(D)src$(_I) $(GLO_)tessocr.$(OBJ) $(C_) $(D_)TESSDATA="$(TESSDATA)"$(_D) $(GLSRC)tessocr.cpp

# 0 = No version.

Expand Down
2 changes: 1 addition & 1 deletion base/tesseract.mak
Original file line number Diff line number Diff line change
Expand Up @@ -1165,4 +1165,4 @@ TESSERACT_LEGACY_OBJS=\
TESSERACT_LEGACY=

TESS_ROMFS_ARGS=\
-c -d Resource/ -P .$(D)Resource$(D) Tesseract$(D)*
-c -P $(GLSRCDIR)$(D)..$(D) tessdata$(D)*
64 changes: 57 additions & 7 deletions base/tessocr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -146,23 +146,72 @@ load_file(const char* filename, GenericVector<char>* data) {
return result;
}

/*
 * Try to load 'file' from each directory listed in 'path' in turn.
 *
 * path: one or more directory names, separated by
 *       gp_file_name_list_separator.
 * file: the leaf name appended (after a directory separator) to each
 *       directory.
 * out:  passed straight through to load_file() to receive the data.
 *
 * Returns true as soon as load_file() succeeds for one candidate name;
 * false if none of them load, or if the scratch buffer cannot be
 * allocated.
 */
static bool
load_file_from_path(const char *path, const char *file, GenericVector<char> *out)
{
    const char *sep = gp_file_name_directory_separator();
    size_t seplen = strlen(sep);
    /* No single entry can be longer than the whole list, so a buffer
     * sized for all of 'path' is big enough for every candidate. */
    size_t bufsize = strlen(path) + seplen + strlen(file) + 1;
    const char *s, *e;
    bool ret = false;
    char *buf = (char *)gs_alloc_bytes(leptonica_mem, bufsize, "load_file_from_path");
    if (buf == NULL)
        return false;

    s = path;
    do {
        /* Find the end of the current entry. The scan must start at 's'
         * (the start of this entry), not at 'path': restarting from the
         * head of the list would leave 'e' before 's' on every entry
         * after the first, making e-s negative and the memcpy length
         * enormous. */
        e = s;
        while (*e && *e != gp_file_name_list_separator)
            e++;
        /* Build "<entry><sep><file>". An empty entry gives "<sep><file>". */
        memcpy(buf, s, e-s);
        memcpy(&buf[e-s], sep, seplen);
        strcpy(&buf[e-s+seplen], file);
        ret = load_file(buf, out);
        if (ret)
            break;
        /* Step over the separator(s) to the start of the next entry. */
        s = e;
        while (*s == gp_file_name_list_separator)
            s++;
    } while (*s != 0);

    gs_free_object(leptonica_mem, buf, "load_file_from_path");

    return ret;
}

/* Compile-time default search path for Tesseract data files. TESSDATA is
 * expected to be supplied as a bare (unquoted) token sequence; when it is
 * not defined at all we fall back to the relative directory "tessdata". */
#ifndef TESSDATA
#define TESSDATA tessdata
#endif
/* Two-level stringification: the extra level makes the preprocessor expand
 * TESSDATA to its value before '#' turns it into a string literal. */
#define STRINGIFY2(S) #S
#define STRINGIFY(S) STRINGIFY2(S)
/* Points at a string literal, so it must never be written through. */
static const char *tessdata_prefix = STRINGIFY(TESSDATA);

static bool
tess_file_reader(const char *fname, GenericVector<char> *out)
{
const char *file = fname;
const char *s;
char text[PATH_MAX];
int code = 0;
bool found;
stream *ps;
gx_io_device *iodev;

/* fname, as supplied to us by Tesseract has TESSDATA_PREFIX prepended
* to it. Check that first. */
found = load_file(fname, out);
if (found)
return found;

/* Find file, fname with any prefix removed, and use that in
* the rest of the searches. */
for (s = fname; *s; s++)
if (*s == '\\' || *s == '/')
file = s+1;

/* FIXME: Try loading 'file' from gs specific paths */
/* Next look in romfs in the tessdata directory. */
iodev = gs_findiodevice(leptonica_mem, (const byte *)"%rom", 4);
gs_snprintf(text, sizeof(text), "Resource/Tesseract/%s", file);
gs_snprintf(text, sizeof(text), "tessdata/%s", file);
if (iodev) {
long size;
long i;
Expand Down Expand Up @@ -195,12 +244,13 @@ tess_file_reader(const char *fname, GenericVector<char> *out)
}
}

/* Fall back to gp_file access, first under Resource/Tesseract */
if (load_file(text, out))
return true;
/* Fall back to gp_file access under our configured tessdata path. */
found = load_file_from_path(tessdata_prefix, file, out);
if (found)
return found;

/* Then under TESSDATA */
return load_file(fname, out);
/* If all else fails, look in the current directory. */
return load_file(file, out);
}

int
Expand Down
10 changes: 10 additions & 0 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -3198,6 +3198,16 @@ fi

AC_SUBST(fontpath)

dnl Choose the default tessdata search path.
dnl --with-tessdata=<path> sets it explicitly. When the option is not
dnl given, or is given an empty value, tessdata is left empty here.
AC_ARG_WITH([tessdata], AC_HELP_STRING([--with-tessdata],
[set tesseract data search path]), tessdata="$withval", tessdata="")

dnl Nothing supplied: fall back to a tessdata directory under datadir.
if test "x$tessdata" = "x"; then
tessdata="${datadir}/tessdata"
fi

dnl Export the result so that @tessdata@ is substituted in generated files.
AC_SUBST(tessdata)

dnl --------------------------------------------------
dnl Check for library functions
dnl --------------------------------------------------
Expand Down
39 changes: 29 additions & 10 deletions doc/Devices.htm
Original file line number Diff line number Diff line change
Expand Up @@ -70,13 +70,17 @@ <h2>Table of contents</h2>
<li><a href="#BMP">BMP file format</a></li>
<li><a href="#PCX">PCX file format</a></li>
<li><a href="#PSD">PSD file format (DeviceN color model)</a></li>
<li><a href="#PDFimage">Bitmap PDF output, PCLm output</a></li>
</ul>
<li><a href="#OCR-Devices">OCR Devices</a></li>
<ul>
<li><a href="#OCR">OCR text output</a></li>
<li><a href="#PDFocr">Bitmap PDF output (with OCR text)</a></li>
</ul>
<li><a href="#High-level">High level formats</a></li>
<ul>
<li><a href="#PDF">PDF file output</a></li>
<li><a href="#PDFimage">Bitmap PDF output, PCLm output</a></li>
<li><a href="#OCR">OCR devices</a></li>
<li><a href="#PDFocr">Bitmap PDF output (with OCR text)</a></li>
<li><a href="#PS">PostScript file output</a></li>
<li><a href="#EPS">EPS file output</a></li>
<li><a href="#PXL">PCL-XL file output</a></li>
Expand Down Expand Up @@ -954,9 +958,11 @@ <h3><a name="PDFimage"></a>PDF image output</h3>
possible) the drawing elements of the input file maintaining flexibility,
resolution independence, and editability.</p>

<h2><a name="High-level"></a>High-level devices</h2>
<hr>

<h3><a name="OCR"></a>Optical Character Recognition (OCR) output</h3>
<h2><a name="OCR-Devices"></a>Optical Character Recognition (OCR) devices</h2>

<h3><a name="OCR"></a>OCR text output</h3>

<p>
These devices render internally in 8 bit greyscale, and then
Expand All @@ -974,12 +980,23 @@ <h3><a name="OCR"></a>Optical Character Recognition (OCR) output</h3>
standard Tesseract tools.
</p>
<p>
These files are looked for from a variety of places. Firstly,
any files placed in &quot;Resource/Tesseract/&quot; will be
included in the binary for any standard (COMPILE_INITS=1) build.
Secondly, files will be searched for in the current directory.
Thirdly, files will be searched for in the directory given by
the environment variable TESSDATA_PREFIX.
These files are searched for in several places, in the following order.
</p>
<ul>
<li>Firstly, files will be searched for in the directory given by the
environment variable TESSDATA_PREFIX.
<li>Next, they will be searched for within the ROM filing system. Any
files placed in &quot;tessdata&quot; will be included within the ROM
filing system in the binary for any standard (COMPILE_INITS=1) build.
<li>Next, files will be searched for in the configured 'tessdata' path. On
Unix, this can be specified at the configure stage using
'--with-tessdata=&lt;path&gt;' (where &lt;path&gt; is a list of
directories to search, separated by ':' (on Unix) or ';' (on Windows)).
<li>Finally, we resort to searching the current directory.
</ul>
<p>
Please note, this pattern of directory searching differs from the original
release of the OCR devices.
</p>
<p>
By default, the OCR process looks for English text,
Expand Down Expand Up @@ -1042,6 +1059,8 @@ <h3><a name="PDFocr"></a>PDF image output (with OCR text)</h3>
</p>
<p>

<hr>

<h2><a name="High-level"></a>High-level devices</h2>

<h3><a name="PDF"></a>PDF writer</h3>
Expand Down
6 changes: 6 additions & 0 deletions psi/msvc.mak
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,12 @@ AROOTDIR=c:/gs
GSROOTDIR=$(AROOTDIR)/gs$(GS_DOT_VERSION)
!endif

# Define the directory to look in for tesseract data.
# This is only a default: if TESSDATA is already defined, that value is
# kept.

!ifndef TESSDATA
TESSDATA=$(GSROOTDIR)/tessdata
!endif

# Define the directory that will hold documentation at runtime.

!ifndef GS_DOCDIR
Expand Down

0 comments on commit 5af4f31

Please sign in to comment.