Skip to content

Commit

Permalink
Add support for ALTO output
Browse files Browse the repository at this point in the history
  • Loading branch information
jakesebright authored and stweil committed Nov 30, 2018
1 parent 685b136 commit d7cee03
Show file tree
Hide file tree
Showing 12 changed files with 309 additions and 2 deletions.
3 changes: 2 additions & 1 deletion CMakeLists.txt
Expand Up @@ -215,6 +215,7 @@ set(tesseract_src ${tesseract_src}
src/api/capi.cpp
src/api/renderer.cpp
src/api/pdfrenderer.cpp
src/api/altorenderer.cpp
)

if (WIN32)
Expand All @@ -223,7 +224,7 @@ if (WIN32)
set(tesseract_hdr
${tesseract_hdr}
${CMAKE_CURRENT_SOURCE_DIR}/src/vs2010/tesseract/resource.h)
set(tesseract_rsc ${CMAKE_CURRENT_BINARY_DIR}/vs2010/tesseract/libtesseract.rc)
set(tesseract_rsc ${CMAKE_CURRENT_BINARY_DIR}/vs2010/tesseract/libtesseract.rc src/api/altorenderer.cpp)
set_source_files_properties(
${CMAKE_CURRENT_SOURCE_DIR}/src/arch/dotproductsse.cpp
PROPERTIES COMPILE_DEFINITIONS __SSE4_1__)
Expand Down
1 change: 1 addition & 0 deletions android/jni/Android.mk
Expand Up @@ -31,6 +31,7 @@ LOCAL_SRC_FILES := $(wildcard $(LOCAL_PATH)/../../api/*.cpp $(LOCAL_PATH)/../../

EXPLICIT_SRC_EXCLUDES := \
$(LOCAL_PATH)/../../api/pdfrenderer.cpp \
$(LOCAL_PATH)/../../api/altorenderer.cpp \
$(LOCAL_PATH)/../../api/tesseractmain.cpp \

LOCAL_SRC_FILES := $(filter-out $(EXPLICIT_SRC_EXCLUDES), $(LOCAL_SRC_FILES))
Expand Down
2 changes: 1 addition & 1 deletion src/api/Makefile.am
Expand Up @@ -32,7 +32,7 @@ libtesseract_api_la_CPPFLAGS = $(AM_CPPFLAGS)
if VISIBILITY
libtesseract_api_la_CPPFLAGS += -DTESS_EXPORTS
endif
libtesseract_api_la_SOURCES = baseapi.cpp capi.cpp renderer.cpp pdfrenderer.cpp
libtesseract_api_la_SOURCES = baseapi.cpp capi.cpp renderer.cpp pdfrenderer.cpp altorenderer.cpp

lib_LTLIBRARIES += libtesseract.la
libtesseract_la_LDFLAGS = $(LEPTONICA_LIBS) $(OPENCL_LDFLAGS)
Expand Down
252 changes: 252 additions & 0 deletions src/api/altorenderer.cpp
@@ -0,0 +1,252 @@
// File: altorenderer.cpp
// Description: ALTO rendering interface
// Author: Jake Sebright

// (C) Copyright 2018
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "baseapi.h"
#include <memory>
#include "renderer.h"

namespace tesseract {

///
/// Add coordinates to specified TextBlock, TextLine, or String bounding box
/// Add word confidence if adding to a String bounding box
///
static void AddBoxToAlto(const ResultIterator *it, PageIteratorLevel level,
STRING *alto_str) {
int left, top, right, bottom;
it->BoundingBox(level, &left, &top, &right, &bottom);

int hpos = left;
int vpos = top;
int height = bottom - top;
int width = right - left;

*alto_str += " HPOS=\"";
alto_str->add_str_int("", hpos);
*alto_str += "\"";
*alto_str += " VPOS=\"";
alto_str->add_str_int("", vpos);
*alto_str += "\"";
*alto_str += " WIDTH=\"";
alto_str->add_str_int("", width);
*alto_str += "\"";
*alto_str += " HEIGHT=\"";
alto_str->add_str_int("", height);
*alto_str += "\"";

if (level == RIL_WORD) {
int wc = it->Confidence(RIL_WORD);
*alto_str += " WC=\"0.";
alto_str->add_str_int("", wc);
*alto_str += "\"";
}
if (level != RIL_WORD) {

*alto_str += ">";
}
}

///
/// Append an ID attribute of the form ID="<base>_<num1>" to alto_str.
///
/// The identifier is formatted into a fixed 64-byte buffer, so an overly
/// long base is truncated rather than overflowing.
///
static void AddIdToAlto(STRING *alto_str, const std::string base, int num1) {
  const size_t kIdCapacity = 64;
  char id_text[kIdCapacity];

  // Format "<base>_<num1>"; the final byte is reserved for the explicit
  // terminator written below.
  snprintf(id_text, kIdCapacity - 1, "%s_%d", base.c_str(), num1);
  id_text[kIdCapacity - 1] = '\0';

  *alto_str += " ID=\"";
  *alto_str += id_text;
  *alto_str += "\"";
}

///
/// Append the ALTO XML for the beginning of the document: the XML
/// declaration, the <alto> root element, the <Description> section (source
/// file name and processing software) and the opening <Layout> tag.
///
/// @return always true
///
bool TessAltoRenderer::BeginDocumentHandler() {
  AppendString(
      "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
      "<alto xmlns=\"http://www.loc.gov/standards/alto/ns-v3#\" xmlns:xlink=\"http://www.w3.org/1999/xlink\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"http://www.loc.gov/standards/alto/ns-v3# http://www.loc.gov/alto/v3/alto-3-0.xsd\">\n"
      "\t<Description>\n"
      "\t\t<MeasurementUnit>pixel</MeasurementUnit>\n"
      "\t\t<sourceImageInformation>\n"
      "\t\t\t<fileName>");

  // NOTE(review): title() is written verbatim; a title containing '&' or '<'
  // would yield malformed XML - consider escaping it.
  AppendString(title());

  // Close <fileName> right after the title. Indentation before the closing
  // tag would become part of the element's text content.
  AppendString(
      "</fileName>\n"
      "\t\t</sourceImageInformation>\n"
      "\t\t<OCRProcessing ID=\"OCR_0\">\n"
      "\t\t\t<ocrProcessingStep>\n"
      "\t\t\t\t<processingSoftware>\n"
      "\t\t\t\t\t<softwareName>tesseract ");
  AppendString(TessBaseAPI::Version());
  AppendString(
      "</softwareName>\n"
      "\t\t\t\t</processingSoftware>\n"
      "\t\t\t</ocrProcessingStep>\n"
      "\t\t</OCRProcessing>\n"
      "\t</Description>\n"
      "\t<Layout>\n");

  return true;
}

///
/// Append the ALTO XML describing the layout of the current image.
///
/// @param api  API object queried for the ALTO text of page imagenum()
/// @return false when the API yields no ALTO text, true otherwise
///
bool TessAltoRenderer::AddImageHandler(TessBaseAPI* api) {
  // The unique_ptr releases the array returned by GetAltoText() on scope exit.
  const std::unique_ptr<const char[]> alto_text(
      api->GetAltoText(imagenum()));

  if (alto_text != nullptr) {
    AppendString(alto_text.get());
    return true;
  }
  return false;
}

///
/// Append the ALTO XML that terminates the document: the closing <Layout>
/// and <alto> tags.
///
/// @return always true
///
bool TessAltoRenderer::EndDocumentHandler() {
  // Adjacent literals are joined by the compiler into a single string.
  AppendString(
      "\t</Layout>\n"
      "</alto>\n");
  return true;
}

/// Construct an ALTO renderer; `outputbase` and the "xml" extension are
/// forwarded to the TessResultRenderer base class.
TessAltoRenderer::TessAltoRenderer(const char *outputbase)
    : TessResultRenderer(outputbase, "xml")
{}

///
/// Make an XML-formatted string with ALTO markup from the internal
/// data structures.
///
char *TessBaseAPI::GetAltoText(int page_number) {
return GetAltoText(nullptr, page_number);
}

///
/// Make an XML-formatted string with ALTO markup from the internal
/// data structures.
///
/// Runs Recognize(monitor) first when no page results exist yet. The output
/// is a single <Page> element containing one <PrintSpace>, which in turn
/// holds <TextBlock>, <TextLine> and <String> elements. IDs are numbered
/// from zero within each call.
///
/// @param monitor      forwarded to Recognize() when recognition is needed
/// @param page_number  written as PHYSICAL_IMG_NR and used in the page ID
/// @return a new[]-allocated, NUL-terminated string that the caller must
///         release with delete[], or nullptr if the engine is not
///         initialised or recognition fails
///
char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) {
  if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0))
    return nullptr;

  int lcnt = 0, bcnt = 0, wcnt = 0;

  STRING alto_str("");

  if (input_file_ == nullptr)
    SetInputName(nullptr);

#ifdef _WIN32
  // convert input name from ANSI encoding to utf-8
  int str16_len =
      MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1, nullptr, 0);
  wchar_t *uni16_str = new WCHAR[str16_len];
  str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1,
                                  uni16_str, str16_len);
  int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr, 0,
                                     nullptr, nullptr);
  char *utf8_str = new char[utf8_len];
  WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str,
                      utf8_len, nullptr, nullptr);
  *input_file_ = utf8_str;
  delete[] uni16_str;
  delete[] utf8_str;
#endif

  alto_str += "\t\t<Page WIDTH=\"";
  alto_str.add_str_int("", rect_width_);
  alto_str += "\" HEIGHT=\"";
  alto_str.add_str_int("", rect_height_);
  alto_str += "\" PHYSICAL_IMG_NR=\"";
  // PHYSICAL_IMG_NR identifies the page within the document, so it must be
  // the page number rather than the image height.
  alto_str.add_str_int("", page_number);
  alto_str += "\"";
  AddIdToAlto(&alto_str, "page", page_number);
  alto_str += ">\n";
  alto_str += ("\t\t\t<PrintSpace HPOS=\"0\" "
               "VPOS=\"0\""
               " WIDTH=\"");
  alto_str.add_str_int("", rect_width_);
  alto_str += "\" HEIGHT=\"";
  alto_str.add_str_int("", rect_height_);
  alto_str += "\">\n";

  // Owned by a unique_ptr so the iterator is released on every exit path.
  const std::unique_ptr<ResultIterator> res_it(GetIterator());
  while (!res_it->Empty(RIL_BLOCK)) {
    // Skip positions that carry no word.
    if (res_it->Empty(RIL_WORD)) {
      res_it->Next(RIL_WORD);
      continue;
    }

    if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
      alto_str += "\t\t\t\t<TextBlock ";
      AddIdToAlto(&alto_str, "block", bcnt);
      AddBoxToAlto(res_it.get(), RIL_BLOCK, &alto_str);
      alto_str += "\n";
    }

    if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
      alto_str += "\t\t\t\t\t<TextLine ";
      AddIdToAlto(&alto_str, "line", lcnt);
      AddBoxToAlto(res_it.get(), RIL_TEXTLINE, &alto_str);
      alto_str += "\n";
    }

    alto_str += "\t\t\t\t\t\t<String ";
    AddIdToAlto(&alto_str, "string", wcnt);
    AddBoxToAlto(res_it.get(), RIL_WORD, &alto_str);
    alto_str += " CONTENT=\"";

    // Query these before the symbol loop below moves the iterator past the
    // current word.
    const bool last_word_in_line =
        res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
    const bool last_word_in_block =
        res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);

    // Concatenate the escaped symbols of the current word.
    do {
      const std::unique_ptr<const char[]> grapheme(
          res_it->GetUTF8Text(RIL_SYMBOL));
      if (grapheme && grapheme[0] != 0) {
        alto_str += HOcrEscape(grapheme.get());
      }
      res_it->Next(RIL_SYMBOL);
    } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));

    alto_str += "\"/>\n";

    wcnt++;

    if (last_word_in_line) {
      alto_str += "\t\t\t\t\t</TextLine>\n";
      lcnt++;
    }

    if (last_word_in_block) {
      alto_str += "\t\t\t\t</TextBlock>\n";
      bcnt++;
    }
  }

  alto_str += "\t\t\t</PrintSpace>\n";
  alto_str += "\t\t</Page>\n";

  // Hand back a plain new[] buffer; ownership passes to the caller.
  char *ret = new char[alto_str.length() + 1];
  strcpy(ret, alto_str.string());
  return ret;
}

}
13 changes: 13 additions & 0 deletions src/api/baseapi.h
Expand Up @@ -594,6 +594,19 @@ class TESS_API TessBaseAPI {
*/
char* GetHOCRText(int page_number);

/**
 * Make an XML-formatted string with ALTO markup from the internal
 * data structures.
 *
 * If no recognition results exist yet, recognition is run first and
 * monitor is passed to it. page_number is used to build the ID of the
 * <Page> element ("page_<page_number>").
 *
 * Returns a string allocated with new[] that the caller must release with
 * delete[], or nullptr if the engine is not initialized or recognition
 * fails.
 */
char* GetAltoText(ETEXT_DESC* monitor, int page_number);


/**
 * Make an XML-formatted string with ALTO markup from the internal
 * data structures.
 *
 * Same as GetAltoText(ETEXT_DESC*, int) called with a null monitor; the
 * returned string has the same ownership rules (release with delete[]).
 */
char* GetAltoText(int page_number);

/**
* Make a TSV-formatted string from the internal data structures.
* page_number is 0-based but will appear in the output as 1-based.
Expand Down
5 changes: 5 additions & 0 deletions src/api/capi.cpp
Expand Up @@ -66,6 +66,11 @@ TESS_API TessResultRenderer* TESS_CALL TessHOcrRendererCreate2(const char* outpu
return new TessHOcrRenderer(outputbase, font_info);
}

/* C wrapper: allocate a TessAltoRenderer for outputbase and return it
 * through the generic renderer handle type. */
TESS_API TessResultRenderer* TESS_CALL TessAltoRendererCreate(const char* outputbase)
{
    TessResultRenderer* alto_renderer = new TessAltoRenderer(outputbase);
    return alto_renderer;
}

TESS_API TessResultRenderer* TESS_CALL TessPDFRendererCreate(const char* outputbase, const char* datadir,
BOOL textonly)
{
Expand Down
4 changes: 4 additions & 0 deletions src/api/capi.h
Expand Up @@ -56,6 +56,7 @@ extern "C" {
typedef tesseract::TessResultRenderer TessResultRenderer;
typedef tesseract::TessTextRenderer TessTextRenderer;
typedef tesseract::TessHOcrRenderer TessHOcrRenderer;
typedef tesseract::TessAltoRenderer TessAltoRenderer;
typedef tesseract::TessPDFRenderer TessPDFRenderer;
typedef tesseract::TessUnlvRenderer TessUnlvRenderer;
typedef tesseract::TessBoxTextRenderer TessBoxTextRenderer;
Expand Down Expand Up @@ -126,6 +127,7 @@ TESS_API void TESS_CALL TessDeleteIntArray(int* arr);
TESS_API TessResultRenderer* TESS_CALL TessTextRendererCreate(const char* outputbase);
TESS_API TessResultRenderer* TESS_CALL TessHOcrRendererCreate(const char* outputbase);
TESS_API TessResultRenderer* TESS_CALL TessHOcrRendererCreate2(const char* outputbase, BOOL font_info);
TESS_API TessResultRenderer* TESS_CALL TessAltoRendererCreate(const char* outputbase);
TESS_API TessResultRenderer* TESS_CALL TessPDFRendererCreate(const char* outputbase, const char* datadir,
BOOL textonly);
TESS_API TessResultRenderer* TESS_CALL TessUnlvRendererCreate(const char* outputbase);
Expand Down Expand Up @@ -277,6 +279,8 @@ TESS_API TessMutableIterator*
TESS_API char* TESS_CALL TessBaseAPIGetUTF8Text(TessBaseAPI* handle);
TESS_API char* TESS_CALL TessBaseAPIGetHOCRText(TessBaseAPI* handle, int page_number);

TESS_API char* TESS_CALL TessBaseAPIGetAltoText(TessBaseAPI* handle, int page_number);

TESS_API char* TESS_CALL TessBaseAPIGetBoxText(TessBaseAPI* handle, int page_number);

TESS_API char* TESS_CALL TessBaseAPIGetUNLVText(TessBaseAPI* handle);
Expand Down
14 changes: 14 additions & 0 deletions src/api/renderer.h
Expand Up @@ -166,6 +166,20 @@ class TESS_API TessHOcrRenderer : public TessResultRenderer {
bool font_info_; // whether to print font information
};

/**
 * Renders Tesseract output into an ALTO XML document.
 *
 * The three handlers emit the document prologue, the markup for one page
 * image, and the document epilogue respectively.
 */
class TESS_API TessAltoRenderer : public TessResultRenderer {
 public:
  explicit TessAltoRenderer(const char *outputbase);

 protected:
  // `override` makes the compiler reject any signature drift from the
  // TessResultRenderer handlers these are meant to replace.
  bool BeginDocumentHandler() override;
  bool AddImageHandler(TessBaseAPI* api) override;
  bool EndDocumentHandler() override;
};

/**
* Renders Tesseract output into a TSV string
*/
Expand Down
13 changes: 13 additions & 0 deletions src/api/tesseractmain.cpp
Expand Up @@ -419,6 +419,19 @@ static void PreloadRenderers(
}
}

api->GetBoolVariable("tessedit_create_alto", &b);
if (b) {
tesseract::TessAltoRenderer* renderer =
new tesseract::TessAltoRenderer(outputbase);
if (renderer->happy()) {
renderers->push_back(renderer);
} else {
delete renderer;
tprintf("Error, could not create ALTO output file: %s\n",
strerror(errno));
}
}

api->GetBoolVariable("tessedit_create_tsv", &b);
if (b) {
bool font_info;
Expand Down
2 changes: 2 additions & 0 deletions src/ccmain/tesseractclass.cpp
Expand Up @@ -387,6 +387,8 @@ Tesseract::Tesseract()
this->params()),
BOOL_MEMBER(tessedit_create_hocr, false, "Write .html hOCR output file",
this->params()),
BOOL_MEMBER(tessedit_create_alto, false, "Write .xml ALTO file",
this->params()),
BOOL_MEMBER(tessedit_create_tsv, false, "Write .tsv output file",
this->params()),
BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file",
Expand Down

3 comments on commit d7cee03

@nguyenq
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

capi.cpp seems to be missing the implementation for TessBaseAPIGetAltoText method.

@zdenop
Copy link
Contributor

@zdenop zdenop commented on d7cee03 Dec 26, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks. Added in cc997b5. Can you please test Alto output via C-API?

@nguyenq
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems to be working fine. For eurotext.tif, it produces:

<Page WIDTH="1024" HEIGHT="800" PHYSICAL_IMG_NR="0" ID="page_0">
	<PrintSpace HPOS="0" VPOS="0" WIDTH="1024" HEIGHT="800">
		<TextBlock ID="block_0" HPOS="98" VPOS="66" WIDTH="820" HEIGHT="595">
			<TextLine ID="line_0" HPOS="105" VPOS="66" WIDTH="718" HEIGHT="47">
				<String ID="string_0" HPOS="105" VPOS="66" WIDTH="73" HEIGHT="31" WC="0.95" CONTENT="The"/><SP WIDTH="27" VPOS="66" HPOS="178"/>
				<String ID="string_1" HPOS="205" VPOS="67" WIDTH="142" HEIGHT="39" WC="0.95" CONTENT="(quick)"/><SP WIDTH="29" VPOS="67" HPOS="347"/>
				<String ID="string_2" HPOS="376" VPOS="69" WIDTH="152" HEIGHT="40" WC="0.96" CONTENT="[brown]"/><SP WIDTH="31" VPOS="69" HPOS="528"/>
				<String ID="string_3" HPOS="559" VPOS="71" WIDTH="104" HEIGHT="39" WC="0.96" CONTENT="{fox}"/><SP WIDTH="24" VPOS="71" HPOS="663"/>
				<String ID="string_4" HPOS="687" VPOS="73" WIDTH="136" HEIGHT="40" WC="0.96" CONTENT="jumps!"/>
			</TextLine>
			<TextLine ID="line_1" HPOS="104" VPOS="115" WIDTH="783" HEIGHT="50">
				<String ID="string_5" HPOS="104" VPOS="115" WIDTH="95" HEIGHT="32" WC="0.96" CONTENT="Over"/><SP WIDTH="25" VPOS="115" HPOS="199"/>
				<String ID="string_6" HPOS="224" VPOS="117" WIDTH="59" HEIGHT="31" WC="0.96" CONTENT="the"/><SP WIDTH="27" VPOS="117" HPOS="283"/>
				<String ID="string_7" HPOS="310" VPOS="117" WIDTH="223" HEIGHT="38" WC="0.93" CONTENT="$43,456.78"/><SP WIDTH="28" VPOS="117" HPOS="533"/>
				<String ID="string_8" HPOS="561" VPOS="121" WIDTH="135" HEIGHT="41" WC="0.92" CONTENT="&lt;lazy&gt;"/><SP WIDTH="26" VPOS="121" HPOS="696"/>
				<String ID="string_9" HPOS="722" VPOS="123" WIDTH="69" HEIGHT="31" WC="0.96" CONTENT="#90"/><SP WIDTH="27" VPOS="123" HPOS="791"/>
				<String ID="string_10" HPOS="818" VPOS="125" WIDTH="69" HEIGHT="40" WC="0.96" CONTENT="dog"/>
			</TextLine>
			<TextLine ID="line_2" HPOS="103" VPOS="165" WIDTH="732" HEIGHT="41">
				<String ID="string_11" HPOS="103" VPOS="165" WIDTH="31" HEIGHT="31" WC="0.92" CONTENT="&amp;"/><SP WIDTH="26" VPOS="165" HPOS="134"/>
				<String ID="string_12" HPOS="160" VPOS="166" WIDTH="236" HEIGHT="40" WC="0.90" CONTENT="duck/goose,"/><SP WIDTH="28" VPOS="166" HPOS="396"/>
				<String ID="string_13" HPOS="424" VPOS="178" WIDTH="39" HEIGHT="23" WC="0.95" CONTENT="as"/><SP WIDTH="30" VPOS="178" HPOS="463"/>
				<String ID="string_14" HPOS="493" VPOS="171" WIDTH="121" HEIGHT="32" WC="0.95" CONTENT="12.5%"/><SP WIDTH="24" VPOS="171" HPOS="614"/>
				<String ID="string_15" HPOS="638" VPOS="172" WIDTH="42" HEIGHT="32" WC="0.95" CONTENT="of"/><SP WIDTH="20" VPOS="172" HPOS="680"/>
				<String ID="string_16" HPOS="700" VPOS="174" WIDTH="135" HEIGHT="32" WC="0.96" CONTENT="E-mail"/>
			</TextLine>
			...
		</TextBlock>
	</PrintSpace>
</Page>

Please sign in to comment.