diff --git a/ccmain/resultiterator.cpp b/ccmain/resultiterator.cpp index 02bf598423..15f6801be7 100644 --- a/ccmain/resultiterator.cpp +++ b/ccmain/resultiterator.cpp @@ -34,6 +34,12 @@ ResultIterator::ResultIterator(const LTRResultIterator &resit) : LTRResultIterator(resit) { in_minor_direction_ = false; at_beginning_of_minor_run_ = false; + + BoolParam *p = ParamUtils::FindParam( + "preserve_interword_spaces", GlobalParams()->bool_params, + tesseract_->params()->bool_params); + if (p != NULL) preserve_interword_spaces_ = (bool)(*p); + + current_paragraph_is_ltr_ = CurrentParagraphIsLtr(); MoveToLogicalStartOfTextline(); } @@ -629,14 +635,17 @@ void ResultIterator::IterateAndAppendUTF8TextlineText(STRING *text) { int words_appended = 0; do { + // Separators are written before each word; in the default mode the first word of the line gets none, so the line never starts with a stray space. + int numSpaces = preserve_interword_spaces_ ? it_->word()->word->space() : (words_appended > 0 ? 1 : 0); + for(int i = 0 ; i < numSpaces ; ++i) { + *text += " "; + } AppendUTF8WordText(text); words_appended++; - *text += " "; } while (Next(RIL_WORD) && !IsAtBeginningOf(RIL_TEXTLINE)); if (BidiDebug(1)) { tprintf("%d words printed\n", words_appended); } - text->truncate_at(text->length() - 1); *text += line_separator_; // If we just finished a paragraph, add an extra newline. if (it_->block() == NULL || IsAtBeginningOf(RIL_PARA)) diff --git a/ccmain/resultiterator.h b/ccmain/resultiterator.h index cde19f0825..9214140c98 100644 --- a/ccmain/resultiterator.h +++ b/ccmain/resultiterator.h @@ -46,8 +46,8 @@ class TESS_API ResultIterator : public LTRResultIterator { virtual ~ResultIterator() {} // ============= Moving around within the page ============. - /** - * Moves the iterator to point to the start of the page to begin + /** + * Moves the iterator to point to the start of the page to begin * an iteration. 
*/ virtual void Begin(); @@ -181,7 +181,7 @@ class TESS_API ResultIterator : public LTRResultIterator { void MoveToLogicalStartOfTextline(); /** - * Precondition: current_paragraph_is_ltr_ and in_minor_direction_ + * Precondition: current_paragraph_is_ltr_ and in_minor_direction_ * are set. */ void MoveToLogicalStartOfWord(); @@ -231,6 +231,12 @@ class TESS_API ResultIterator : public LTRResultIterator { /** Is the currently pointed-at character in a minor-direction sequence? */ bool in_minor_direction_; + + /** + * Should detected inter-word spaces be preserved, or "compressed" to a single + * space character (default behavior). + */ + bool preserve_interword_spaces_ = false; }; } // namespace tesseract. diff --git a/ccmain/tesseractclass.cpp b/ccmain/tesseractclass.cpp index d5949661d9..39103111cf 100644 --- a/ccmain/tesseractclass.cpp +++ b/ccmain/tesseractclass.cpp @@ -440,6 +440,8 @@ Tesseract::Tesseract() this->params()), INT_MEMBER(tessedit_parallelize, 0, "Run in parallel where possible", this->params()), + BOOL_MEMBER(preserve_interword_spaces, false, + "Preserve multiple interword spaces", this->params()), // The following parameters were deprecated and removed from their original // locations. The parameters are temporarily kept here to give Tesseract diff --git a/ccmain/tesseractclass.h b/ccmain/tesseractclass.h index 992c7ecdcc..8a7c686c03 100644 --- a/ccmain/tesseractclass.h +++ b/ccmain/tesseractclass.h @@ -1009,6 +1009,7 @@ class Tesseract : public Wordrec { double_VAR_H(textord_tabfind_aligned_gap_fraction, 0.75, "Fraction of height used as a minimum gap for aligned blobs."); INT_VAR_H(tessedit_parallelize, 0, "Run in parallel where possible"); + BOOL_VAR_H(preserve_interword_spaces, false, "Preserve multiple interword spaces"); // The following parameters were deprecated and removed from their original // locations. The parameters are temporarily kept here to give Tesseract