Fix Heap-buffer-overflow in GenericVector<int>::size (issue #2298)

Credit to OSS-Fuzz: This fixes a security issue which was reported by OSS-Fuzz; see details at https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=13590. Also add some assertions to catch similar bugs. Signed-off-by: Stefan Weil <sw@weilnetz.de>
tesseract-ocr · Mar 10, 2019 · 71d4990
1 parent b7279f6
commit 71d4990
Show file tree

Hide file tree

Showing 2 changed files with 14 additions and 7 deletions.
diff --git a/src/dict/dict.h b/src/dict/dict.h
@@ -107,17 +107,21 @@ class Dict {
 
   // Returns true if unichar_id is a word compounding character like - or /.
   inline bool compound_marker(UNICHAR_ID unichar_id) {
+    const UNICHARSET& unicharset = getUnicharset();
+    ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
     const GenericVector<UNICHAR_ID>& normed_ids =
-        getUnicharset().normed_ids(unichar_id);
+        unicharset.normed_ids(unichar_id);
     return normed_ids.size() == 1 &&
         (normed_ids[0] == hyphen_unichar_id_ ||
          normed_ids[0] == slash_unichar_id_);
   }
   // Returns true if unichar_id is an apostrophe-like character that may
   // separate prefix/suffix words from a main body word.
   inline bool is_apostrophe(UNICHAR_ID unichar_id) {
+    const UNICHARSET& unicharset = getUnicharset();
+    ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
     const GenericVector<UNICHAR_ID>& normed_ids =
-        getUnicharset().normed_ids(unichar_id);
+        unicharset.normed_ids(unichar_id);
     return normed_ids.size() == 1 && normed_ids[0] == apostrophe_unichar_id_;
   }
 
@@ -141,17 +145,20 @@ class Dict {
     }
   }
   /// Check whether the word has a hyphen at the end.
-  inline bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const {
+  inline bool has_hyphen_end(const UNICHARSET* unicharset,
+                             UNICHAR_ID unichar_id, bool first_pos) const {
     if (!last_word_on_line_ || first_pos)
       return false;
+    ASSERT_HOST(unicharset->contains_unichar_id(unichar_id));
     const GenericVector<UNICHAR_ID>& normed_ids =
-        getUnicharset().normed_ids(unichar_id);
+        unicharset->normed_ids(unichar_id);
     return normed_ids.size() == 1 && normed_ids[0] == hyphen_unichar_id_;
   }
   /// Same as above, but check the unichar at the end of the word.
   inline bool has_hyphen_end(const WERD_CHOICE &word) const {
     int word_index = word.length() - 1;
-    return has_hyphen_end(word.unichar_id(word_index), word_index == 0);
+    return has_hyphen_end(word.unicharset(), word.unichar_id(word_index),
+                          word_index == 0);
   }
   /// Unless the previous word was the last one on the line, and the current
   /// one is not (thus it is the first one on the line), erase hyphen_word_,

diff --git a/src/wordrec/language_model.cpp b/src/wordrec/language_model.cpp
@@ -3,7 +3,6 @@
 // Description: Functions that utilize the knowledge about the properties,
 //              structure and statistics of the language to help recognition.
 // Author:      Daria Antonova
-// Created:     Mon Nov 11 11:26:43 PST 2009
 //
 // (C) Copyright 2009, Google Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
@@ -803,7 +802,8 @@ LanguageModelDawgInfo *LanguageModel::GenerateDawgInfo(
   }
 
   // Deal with hyphenated words.
-  if (word_end && dict_->has_hyphen_end(b.unichar_id(), curr_col == 0)) {
+  if (word_end && dict_->has_hyphen_end(&dict_->getUnicharset(),
+                                        b.unichar_id(), curr_col == 0)) {
     if (language_model_debug_level > 0) tprintf("Hyphenated word found\n");
     return new LanguageModelDawgInfo(dawg_args_.active_dawgs, COMPOUND_PERM);
   }