Skip to content

Commit

Permalink
Adds TessHOcrTsvRenderer class for rendering HOCR info in tsv format.
Browse files Browse the repository at this point in the history
  • Loading branch information
sundarcf authored and tfmorris committed Mar 1, 2016
1 parent d04e325 commit 4d13892
Show file tree
Hide file tree
Showing 2 changed files with 72 additions and 0 deletions.
55 changes: 55 additions & 0 deletions api/renderer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,61 @@ bool TessHOcrRenderer::AddImageHandler(TessBaseAPI* api) {
return true;
}

/**********************************************************************
* HOcr TSV Renderer interface implementation
**********************************************************************/
// Builds a renderer writing to `outputbase` with the "hocr.tsv" extension.
// Font information output is switched off for this overload.
TessHOcrTsvRenderer::TessHOcrTsvRenderer(const char *outputbase)
    : TessResultRenderer(outputbase, "hocr.tsv"),
      font_info_(false) {}

// Builds a renderer writing to `outputbase` with the "hocr.tsv" extension.
// `font_info` is stored and later decides whether BeginDocumentHandler()
// advertises the extra ocrp_* capabilities.
TessHOcrTsvRenderer::TessHOcrTsvRenderer(const char *outputbase, bool font_info)
    : TessResultRenderer(outputbase, "hocr.tsv"),
      font_info_(font_info) {}

bool TessHOcrTsvRenderer::BeginDocumentHandler() {
AppendString(
"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
"<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n"
" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n"
"<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" "
"lang=\"en\">\n <head>\n <title>\n");
AppendString(title());
AppendString(
"</title>\n"
"<meta http-equiv=\"Content-Type\" content=\"text/html;"
"charset=utf-8\" />\n"
" <meta name='ocr-system' content='tesseract " TESSERACT_VERSION_STR
"' />\n"
" <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par"
" ocr_line ocrx_word");
if (font_info_)
AppendString(
" ocrp_lang ocrp_dir ocrp_font ocrp_fsize ocrp_wconf");
AppendString(
"'/>\n"
"</head>\n<body>\n");

return true;
}

// Emits the closing </body> and </html> tags that pair with the preamble
// written by BeginDocumentHandler(). Always returns true.
bool TessHOcrTsvRenderer::EndDocumentHandler() {
  static const char kDocumentClose[] = " </body>\n</html>\n";
  AppendString(kDocumentClose);

  return true;
}

// Appends the hOCR text that `api` produces for the current image number.
// Returns false when the API hands back NULL, true otherwise.
bool TessHOcrTsvRenderer::AddImageHandler(TessBaseAPI* api) {
  char* const page_text = api->GetHOCRText(imagenum());
  if (page_text != NULL) {
    AppendString(page_text);
    // The buffer is owned by this function once returned; release it here.
    delete[] page_text;
    return true;
  }

  return false;
}

/**********************************************************************
* UNLV Text Renderer interface implementation
**********************************************************************/
Expand Down
17 changes: 17 additions & 0 deletions api/renderer.h
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,23 @@ class TESS_API TessHOcrRenderer : public TessResultRenderer {
bool font_info_; // whether to print font information
};

/**
 * Result renderer whose output file uses the "hocr.tsv" extension.
 *
 * NOTE(review): the handlers visible in renderer.cpp currently emit the same
 * XHTML/hOCR markup as the plain hOCR renderer; confirm whether real
 * tab-separated output is expected to follow.
 */
class TESS_API TessHOcrTsvRenderer : public TessResultRenderer {
 public:
  /** Creates a renderer for `outputbase` with font information disabled. */
  explicit TessHOcrTsvRenderer(const char *outputbase);

  /** Creates a renderer for `outputbase`; `font_info` is kept in font_info_. */
  explicit TessHOcrTsvRenderer(const char *outputbase, bool font_info);

 protected:
  /** Writes the document preamble. */
  virtual bool BeginDocumentHandler();
  /** Writes the hOCR text of the current image obtained from `api`. */
  virtual bool AddImageHandler(TessBaseAPI* api);
  /** Writes the document epilogue. */
  virtual bool EndDocumentHandler();

 private:
  bool font_info_;  // when true, the ocrp_* capabilities are advertised
};

/**
* Renders tesseract output into searchable PDF
*/
Expand Down

0 comments on commit 4d13892

Please sign in to comment.