Skip to content

Commit

Permalink
List available languages recursively
Browse files Browse the repository at this point in the history
Tesseract supports hierarchies of languages and has used them since
the new files best/*.traineddata were added.

Now `tesseract --list-langs` also shows any traineddata files in
subdirectories of the tessdata directory.

Signed-off-by: Stefan Weil <sw@weilnetz.de>
  • Loading branch information
stweil committed Aug 10, 2017
1 parent efa50da commit cc0d87c
Showing 1 changed file with 63 additions and 39 deletions.
102 changes: 63 additions & 39 deletions api/baseapi.cpp
Expand Up @@ -39,6 +39,9 @@
#include <dirent.h>
#include <libgen.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#endif // _WIN32

#include <fstream>
Expand Down Expand Up @@ -109,6 +112,65 @@ const int kMaxCredibleResolution = 2400;
/** Default resolution. */
const int kDefaultResolution = 300;

/**
 * Append a language name to langs if the given directory entry is a
 * traineddata file.
 *
 * @param prefix  Text put in front of the language name (either empty or a
 *                relative directory path ending in "/").
 * @param name    File name of the directory entry. If it ends in
 *                "." + kTrainedDataSuffix, it is truncated in place so that
 *                only the part before that extension remains.
 * @param langs   Vector receiving prefix + truncated name on a match.
 */
static void addLanguageIfTrainedData(const STRING &prefix, char *name,
                                     GenericVector<STRING>* langs)
{
  // Length of the extension including its leading dot. Computed with
  // strlen() so it stays correct whether the suffix constant is declared
  // as a char array or as a pointer.
  const size_t extlen = strlen(kTrainedDataSuffix) + 1;
  const size_t len = strlen(name);
  // Require at least one character in front of the extension.
  if (len > extlen && name[len - extlen] == '.' &&
      strcmp(&name[len - extlen + 1], kTrainedDataSuffix) == 0) {
    name[len - extlen] = '\0';
    langs->push_back(prefix + name);
  }
}

/**
 * Add all available languages recursively.
 *
 * Scans the directory datadir + base. Every file whose name ends in
 * "." + kTrainedDataSuffix is appended to langs as the path relative to
 * datadir with that extension removed (for example "best/eng"). Every
 * subdirectory is scanned recursively. Entries whose name starts with '.'
 * ('.', '..' and hidden files) are ignored. A directory which cannot be
 * opened is silently skipped.
 *
 * @param datadir  Root data directory. It is concatenated with base without
 *                 inserting a separator, so it presumably has to end with a
 *                 path separator (to be confirmed against the callers).
 * @param base     Directory to scan, relative to datadir, without a trailing
 *                 "/". An empty string scans datadir itself.
 * @param langs    Vector to which the language names are appended; existing
 *                 entries are kept.
 */
static void addAvailableLanguages(const STRING &datadir, const STRING &base,
                                  GenericVector<STRING>* langs)
{
  // Prefix for everything found in this directory: "" at the top level,
  // otherwise base + "/".
  const STRING base2 = (base.string()[0] == '\0') ? base : base + "/";
#ifdef _WIN32
  WIN32_FIND_DATA data;
  HANDLE handle = FindFirstFile((datadir + base2 + "*").string(), &data);
  if (handle == INVALID_HANDLE_VALUE) {
    return;
  }
  do {
    char *name = data.cFileName;
    // Skip '.', '..', and hidden files
    if (name[0] == '.') {
      continue;
    }
    if ((data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) != 0) {
      addAvailableLanguages(datadir, base2 + name, langs);
    } else {
      addLanguageIfTrainedData(base2, name, langs);
    }
  } while (FindNextFile(handle, &data));
  FindClose(handle);
#else  // _WIN32
  DIR* dir = opendir((datadir + base).string());
  if (dir == nullptr) {
    return;
  }
  dirent *de;
  while ((de = readdir(dir)) != nullptr) {
    char *name = de->d_name;
    // Skip '.', '..', and hidden files
    if (name[0] == '.') {
      continue;
    }
    struct stat st;
    // S_ISDIR() compares the whole file type field. Testing only the
    // S_IFDIR bit would also match block devices, because S_IFBLK
    // contains that bit.
    if (stat((datadir + base2 + name).string(), &st) == 0 &&
        S_ISDIR(st.st_mode)) {
      addAvailableLanguages(datadir, base2 + name, langs);
    } else {
      addLanguageIfTrainedData(base2, name, langs);
    }
  }
  closedir(dir);
#endif  // _WIN32
}

TessBaseAPI::TessBaseAPI()
: tesseract_(nullptr),
osd_tesseract_(nullptr),
Expand Down Expand Up @@ -393,45 +455,7 @@ void TessBaseAPI::GetAvailableLanguagesAsVector(
GenericVector<STRING>* langs) const {
langs->clear();
if (tesseract_ != NULL) {
#ifdef _WIN32
STRING pattern = tesseract_->datadir + "/*." + kTrainedDataSuffix;
char fname[_MAX_FNAME];
WIN32_FIND_DATA data;
BOOL result = TRUE;
HANDLE handle = FindFirstFile(pattern.string(), &data);
if (handle != INVALID_HANDLE_VALUE) {
for (; result; result = FindNextFile(handle, &data)) {
_splitpath(data.cFileName, NULL, NULL, fname, NULL);
langs->push_back(STRING(fname));
}
FindClose(handle);
}
#else // _WIN32
DIR *dir;
struct dirent *dirent;
char *dot;

STRING extension = STRING(".") + kTrainedDataSuffix;

dir = opendir(tesseract_->datadir.string());
if (dir != NULL) {
while ((dirent = readdir(dir))) {
// Skip '.', '..', and hidden files
if (dirent->d_name[0] != '.') {
if (strstr(dirent->d_name, extension.string()) != NULL) {
dot = strrchr(dirent->d_name, '.');
// This ensures that .traineddata is at the end of the file name
if (strncmp(dot, extension.string(),
strlen(extension.string())) == 0) {
*dot = '\0';
langs->push_back(STRING(dirent->d_name));
}
}
}
}
closedir(dir);
}
#endif
addAvailableLanguages(tesseract_->datadir, "", langs);
}
}

Expand Down

0 comments on commit cc0d87c

Please sign in to comment.