python: optimize imports, reformat code

tesseract-ocr · Mar 31, 2019 · 5f06402 · 5f06402
1 parent 2e9fd69
commit 5f06402
Show file tree

Hide file tree

Showing 3 changed files with 63 additions and 67 deletions.
diff --git a/src/training/language_specific.py b/src/training/language_specific.py
@@ -19,8 +19,8 @@
 # Language specific info
 # =============================================================================
 
-import os
 import logging
+import os
 
 log = logging.getLogger(__name__)
 
@@ -875,6 +875,7 @@
 
 FLAGS_webtext_prefix = os.environ.get("FLAGS_webtext_prefix", "")
 
+
 # Set language-specific values for several global variables, including
 #   ${TEXT_CORPUS}
 #      holds the text corpus file for the language, used in phase F
@@ -1079,15 +1080,15 @@ def set_lang_specific_parameters(ctx, lang):
         NUMBER_DAWG_FACTOR = 0.05
         WORD_DAWG_SIZE = 1_000_000
     elif lang in (
-        "aze_cyrl",
-        "bel",
-        "bul",
-        "kaz",
-        "mkd",
-        "srp",
-        "tgk",
-        "ukr",
-        "uzb_cyrl",
+            "aze_cyrl",
+            "bel",
+            "bul",
+            "kaz",
+            "mkd",
+            "srp",
+            "tgk",
+            "ukr",
+            "uzb_cyrl",
     ):
         MIX_LANG = f"{lang}"
         if not FONTS:
@@ -1326,44 +1327,44 @@ def set_lang_specific_parameters(ctx, lang):
         EXPOSURES = [0]
     # Set right-to-left and normalization mode.
     if lang in (
-        "ara",
-        "div",
-        "fas",
-        "pus",
-        "snd",
-        "syr",
-        "uig",
-        "urd",
-        "kur_ara",
-        "heb",
-        "yid",
+            "ara",
+            "div",
+            "fas",
+            "pus",
+            "snd",
+            "syr",
+            "uig",
+            "urd",
+            "kur_ara",
+            "heb",
+            "yid",
     ):
         LANG_IS_RTL = True
         NORM_MODE = 2
     elif lang in (
-        "asm",
-        "ben",
-        "bih",
-        "hin",
-        "mar",
-        "nep",
-        "guj",
-        "kan",
-        "mal",
-        "tam",
-        "tel",
-        "pan",
-        "dzo",
-        "sin",
-        "san",
-        "bod",
-        "ori",
-        "khm",
-        "mya",
-        "tha",
-        "lao",
-        "jav ",
-        "jav_java",
+            "asm",
+            "ben",
+            "bih",
+            "hin",
+            "mar",
+            "nep",
+            "guj",
+            "kan",
+            "mal",
+            "tam",
+            "tel",
+            "pan",
+            "dzo",
+            "sin",
+            "san",
+            "bod",
+            "ori",
+            "khm",
+            "mya",
+            "tha",
+            "lao",
+            "jav ",
+            "jav_java",
     ):
         LANG_IS_RTL = False
         NORM_MODE = 2
@@ -1408,7 +1409,6 @@ def set_lang_specific_parameters(ctx, lang):
 
     return ctx
 
-
 # =============================================================================
 # END of Language specific info
 # =============================================================================
diff --git a/src/training/tesstrain.py b/src/training/tesstrain.py
@@ -15,10 +15,10 @@
 # This script provides an easy way to execute various phases of training
 # Tesseract.  For a detailed description of the phases, see
 # https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract
-#
-import sys
-import os
+
 import logging
+import os
+import sys
 
 if (sys.version_info.major < 3) or (sys.version_info.major == 3 and sys.version_info.minor < 6):
     raise Exception("Must be using Python minimum version 3.6!")
@@ -86,7 +86,6 @@ def main():
 if __name__ == "__main__":
     main()
 
-
 # _rc0 = subprocess.call(["tlog","\n=== Starting training for language '"+str(LANG_CODE.val)+"'"],shell=True)
 # _rc0 = subprocess.call(["source",os.popen("dirname "+__file__).read().rstrip("\n")+"/language-specific.sh"],shell=True)
 # _rc0 = subprocess.call(["set_lang_specific_parameters",str(LANG_CODE.val)],shell=True)

diff --git a/src/training/tesstrain_utils.py b/src/training/tesstrain_utils.py
@@ -14,19 +14,19 @@
 # https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract
 #
 
+import argparse
+import atexit
+import concurrent.futures
+import logging
 import os
+import pathlib
 import platform
+import shutil
+import subprocess
 import sys
 from datetime import date
-from tempfile import TemporaryDirectory, mkdtemp
-import pathlib
-import logging
-import subprocess
-import argparse
 from operator import itemgetter
-import concurrent.futures
-import shutil
-import atexit
+from tempfile import TemporaryDirectory, mkdtemp
 
 from tqdm import tqdm
 
@@ -247,18 +247,18 @@ def show_tmpdir_location(training_dir):
     # specified in the command-line.
     if not ctx.training_text:
         ctx.training_text = (
-            pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.training_text"
+                pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.training_text"
         )
     if not ctx.wordlist_file:
         ctx.wordlist_file = (
-            pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.wordlist"
+                pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.wordlist"
         )
 
     ctx.word_bigrams_file = (
-        pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.word.bigrams"
+            pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.word.bigrams"
     )
     ctx.numbers_file = (
-        pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.numbers"
+            pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.numbers"
     )
     ctx.punc_file = pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.punc"
     ctx.bigram_freqs_file = pathlib.Path(ctx.training_text).with_suffix(
@@ -307,7 +307,6 @@ def make_outbase(ctx, fontname, exposure):
 # Helper function for phase_I_generate_image. Generates the image for a single
 # language/font combination in a way that can be run in parallel.
 def generate_font_image(ctx, font, exposure, char_spacing):
-
     log.info(f"Rendering using {font}")
     fontname = make_fontname(font)
     outbase = make_outbase(ctx, fontname, exposure)
@@ -358,7 +357,6 @@ def generate_font_image(ctx, font, exposure, char_spacing):
 
 # Phase I : Generate (I)mages from training text for each font.
 def phase_I_generate_image(ctx, par_factor):
-
     if not par_factor or par_factor <= 0:
         par_factor = 1
 
@@ -387,8 +385,8 @@ def phase_I_generate_image(ctx, par_factor):
             check_file_readable(ctx.train_ngrams_file)
 
         with tqdm(
-            total=len(ctx.fonts)
-         ) as pbar, concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
+                total=len(ctx.fonts)
+        ) as pbar, concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
             futures = [
                 executor.submit(generate_font_image, ctx, font, exposure, char_spacing)
                 for font in ctx.fonts
@@ -533,7 +531,7 @@ def phase_E_extract_features(ctx, box_config, ext):
     log.info(f"Using TESSDATA_PREFIX={tessdata_environ['TESSDATA_PREFIX']}")
 
     with tqdm(total=len(img_files)) as pbar, concurrent.futures.ThreadPoolExecutor(
-        max_workers=2
+            max_workers=2
     ) as executor:
         futures = []
         for img_file in img_files:
@@ -693,7 +691,6 @@ def get_file_list():
     dir_listing = (str(p) for p in path_output.glob(f"{ctx.lang_code}.*.lstmf"))
     pathlib.Path(lstm_list).write_text("\n".join(dir_listing))
 
-
 # make__traineddata() {
 #   tlog "\n=== Making final traineddata file ==="
 #   local lang_prefix={ctx.langdata_dir}/${LANG_CODE}/${LANG_CODE}