Replace print statements with loguru library (#271)
* Move non-file prints over to logging

* migrate to loguru

* migrate to loguru

* Removed text length

* remove all two-argument logger calls

* remove all two-argument logger calls

* Add logger train.log

* Add logger train.log

* Redirect stderr and print out shell

* Revert

* Add Kaldi printouts to logs

* Add Kaldi printouts to logs

* Add Kaldi printouts to logs

* Add Kaldi printouts to logs

* Add Kaldi printouts to python console

* Add Kaldi printouts to python console

* Updated logging behaviour

* Revert to simpler behaviour

* Addressed PR comments

* Make (now longer) log output more readable

Co-authored-by: Ben Foley <ben@cbmm.io>
mattchrlw and benfoley committed Dec 10, 2021
1 parent 4e4fd57 commit 1f488f0
Showing 31 changed files with 11,279 additions and 10,867 deletions.
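The pattern this commit applies is the same everywhere: each bare print() becomes a call on loguru's shared logger, and a train.log sink captures the records for later inspection (the "Add logger train.log" step above). A minimal sketch of that setup follows; the sink path, level, and static_dir value are illustrative assumptions, not the exact Elpis configuration.

from loguru import logger

# Add a file sink alongside loguru's default stderr handler.
# "train.log" and the DEBUG level are assumptions for this sketch.
logger.add("train.log", level="DEBUG")

static_dir = "/js/build"  # hypothetical value, for illustration only

# Before: print('using static_dir:', static_dir)
logger.info(f"using static_dir: {static_dir}")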
18 changes: 10 additions & 8 deletions elpis/__init__.py
@@ -1,4 +1,7 @@
import os
+import logging
+from loguru import logger
+
from flask import redirect
from . import endpoints
from .app import Flask
@@ -18,6 +21,11 @@ def create_app(test_config=None):
# Variable to control the use of a proxy to support webpackdevserver
WEBPACK_DEV_SERVER_PROXY = os.environ.get("WEBPACK_DEV_SERVER_PROXY", None)

+log = logging.getLogger('werkzeug')
+log.setLevel(logging.DEBUG)
+# Prevent the HTTP request logs polluting more important train logs
+log.disabled = True
+
if WEBPACK_DEV_SERVER_PROXY:
app = Flask(__name__,
instance_relative_config=True,
@@ -38,20 +46,14 @@ def create_app(test_config=None):
# static_dir = static_dir_build
# else:
# static_dir = static_dir_watch
-print('using static_dir:', static_dir)
+logger.info(f'using static_dir: {static_dir}')
# Create a custom Flask instance defined in the app.py file. Same as a
# normal Flask class but with a specialised blueprint function.
app = Flask(__name__,
instance_relative_config=True,
static_folder=GUI_BUILD_DIR + static_dir,
static_url_path=static_dir)

-import logging
-log = logging.getLogger('werkzeug')
-log.setLevel(logging.DEBUG)
-# Prevent the HTTP request logs polluting more important train logs
-log.disabled = True
-
# When making this multi-user, the secret key would require to be a secure hash.
app.config.from_mapping(
SECRET_KEY='dev'
@@ -105,7 +107,7 @@ def index_file():
@app.route('/', defaults={'path': ''})
@app.route("/<path:path>")
def index(path):
-print('in index with:', path)
+logger.info(f'in index with: {path}')
if (WEBPACK_DEV_SERVER_PROXY):
# If we are running the webpack dev server,
# We proxy webpack requests through to the dev server
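The block that moves to the top of create_app silences werkzeug's per-request access log through the standard logging module, so HTTP noise stays out of the training output while the application's own records keep flowing through loguru. Condensed to its essentials, assuming nothing later re-enables the logger:

import logging

from loguru import logger

# werkzeug logs via the stdlib logging module, so it is silenced there;
# loguru is unaffected and keeps emitting the application's records.
logging.getLogger('werkzeug').disabled = True

logger.info('training output stays visible')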
7 changes: 4 additions & 3 deletions elpis/endpoints/model.py
@@ -1,6 +1,7 @@
from typing import Callable, Dict
from flask import request, current_app as app, jsonify
from ..blueprint import Blueprint
+from loguru import logger
import subprocess
from elpis.engines.common.objects.model import Model
from elpis.engines.common.errors import InterfaceError
@@ -33,7 +34,7 @@ def new():
interface = app.config['INTERFACE']
try:
model = interface.new_model(request.json["name"])
-print(f"New model created {model.name} {model.hash}")
+logger.info(f"New model created {model.name} {model.hash}")
except InterfaceError as e:
return jsonify({
"status": 500,
@@ -111,7 +112,7 @@ def build_data(model: Model):
@bp.route("/train", methods=['GET'])
def train():
def setup(model: Model):
-model.train(on_complete=lambda: print('Trained model!'))
+model.train(on_complete=lambda: logger.info('Trained model!'))

def build_data(model: Model):
return {
@@ -148,7 +149,7 @@ def results():
try:
results = model.get_train_results()
except FileNotFoundError:
-print("Results file not found.")
+logger.error("Results file not found.")
return jsonify(MISSING_LOG_RESPONSE)
data = {
"results": results
4 changes: 2 additions & 2 deletions elpis/endpoints/pron_dict.py
@@ -4,7 +4,7 @@
from elpis.engines import Interface
from elpis.engines.common.objects.pron_dict import PronDict
from elpis.engines.common.errors import InterfaceError
-
+from loguru import logger

bp = Blueprint("pron_dict", __name__, url_prefix="/pron-dict")

@@ -19,7 +19,7 @@ def new():
"status": 500,
"error": e.human_message
})
-print(f"****{request.json['name']}****")
+logger.info(f"****{request.json['name']}****")
dataset = interface.get_dataset(request.json['dataset_name'])
pron_dict.link(dataset)
app.config['CURRENT_PRON_DICT'] = pron_dict
7 changes: 4 additions & 3 deletions elpis/endpoints/transcription.py
@@ -1,11 +1,12 @@
from flask import request, current_app as app, jsonify
from ..blueprint import Blueprint
+from loguru import logger

from elpis.engines import Interface
from elpis.engines.common.objects.model import Model
from elpis.engines.common.objects.transcription import Transcription
from elpis.engines.common.utilities import hasher


bp = Blueprint("transcription", __name__, url_prefix="/transcription")

# TODO transcriptions have no name
@@ -17,7 +18,7 @@ def new():
transcription.link(model)
app.config['CURRENT_TRANSCRIPTION'] = transcription
file = request.files['file']
-transcription.prepare_audio(file, on_complete=lambda: print('Prepared audio file!'))
+transcription.prepare_audio(file, on_complete=lambda: logger.info('Prepared audio file!'))
data = {
"status": transcription.status,
"originalFilename": file.filename
@@ -31,7 +32,7 @@ def new():
@bp.route("/transcribe", methods=['GET'])
def transcribe():
transcription: Transcription = app.config['CURRENT_TRANSCRIPTION']
-transcription.transcribe(on_complete=lambda: print('Transcribed text!'))
+transcription.transcribe(on_complete=lambda: logger.info('Transcribed text!'))
data = {
"status": transcription.status,
"stage_status": transcription.stage_status
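Note that the transcription endpoints do not call the logger directly: the logger.info(...) call is wrapped in a lambda and handed over as a completion callback, so the message is only emitted once the background work finishes. A self-contained sketch of that pattern (prepare_audio below is a hypothetical stand-in, not the real Transcription method):

from loguru import logger

def prepare_audio(data: bytes, on_complete=None) -> None:
    # ... hypothetical audio processing would happen here ...
    if on_complete is not None:
        on_complete()  # fires only after the work is done

prepare_audio(b"\x00\x01", on_complete=lambda: logger.info('Prepared audio file!'))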
13 changes: 7 additions & 6 deletions elpis/engines/common/input/clean_json.py
@@ -21,6 +21,7 @@
import nltk
from argparse import ArgumentParser
from langid.langid import LanguageIdentifier, model
+from loguru import logger
from nltk.corpus import words
from typing import Dict, List, Set
from ..utilities import load_json_file, write_data_to_json_file
@@ -96,7 +97,7 @@ def are_words_valid(clean_words: List[str],

# Exclude utterance if > 10% english
if remove_english and len(clean_words) > 0 and english_word_count / len(clean_words) > 0.1:
-# print(round(english_word_count / len(clean_words)), trans, file=sys.stderr)
+# logger.debug(round(english_word_count / len(clean_words)), trans, file=sys.stderr)
return False

# Exclude utterance if langid thinks its english
@@ -203,14 +204,14 @@ def extract_additional_corpora(additional_corpus: str = '',
:param punctuation_to_collapse_by: punctuation marks to strip
:param punctuation_to_explode_by: punctuation marks to replace with spaces
"""
-print("corpus_txt", corpus_txt)
+logger.info(f"{corpus_txt=}")
if os.path.exists(corpus_txt):
write_mode = 'a' # append if already exists
else:
write_mode = 'w' # make a new file if not
with open(corpus_txt, write_mode) as corpus_txt_file:
if os.path.exists(additional_corpus):
-print(f"Extracting corpus examples from: {additional_corpus}")
+logger.info(f"Extracting corpus examples from: {additional_corpus}")
with open(additional_corpus, "r", encoding="utf-8", ) as file_:
for line in file_.readlines():
# clean the text along the way
@@ -222,7 +223,7 @@
line = line + '\n'
corpus_txt_file.writelines(line)
else:
-print(f"Provided additional text additional_corpus file path invalid: "
+logger.warning(f"Provided additional text additional_corpus file path invalid: "
f"{additional_corpus}")


@@ -280,7 +281,7 @@ def main() -> None:
dirty_json_data: List[Dict[str, str]] = load_json_file(arguments.infile)
outfile = arguments.outfile if arguments.outfile else sys.stdout

-print(f"Filtering dirty json data {arguments.infile}...")
+logger.info(f"Filtering dirty json data {arguments.infile}...")

filtered_data = clean_json_data(json_data=dirty_json_data,
remove_english=arguments.remove_english,
@@ -291,7 +292,7 @@ def main() -> None:
write_data_to_json_file(data=list(filtered_data),
file_name=outfile)

-print(f"Finished! Wrote {str(len(filtered_data))} transcriptions.")
+logger.info(f"Finished! Wrote {str(len(filtered_data))} transcriptions.")


if __name__ == "__main__":
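The replacement for print("corpus_txt", corpus_txt) leans on the self-documenting f-string specifier introduced in Python 3.8: f"{corpus_txt=}" renders the variable name together with its value, so the log line keeps the label the old two-argument print provided. For example, with a hypothetical path:

from loguru import logger

corpus_txt = "/tmp/corpus.txt"  # hypothetical path, for illustration

# Emits: corpus_txt='/tmp/corpus.txt' (name plus repr, via the = specifier)
logger.info(f"{corpus_txt=}")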
23 changes: 12 additions & 11 deletions elpis/engines/common/input/elan_to_json.py
@@ -13,6 +13,7 @@

import argparse
import glob
+from loguru import logger
import os
import sys
from typing import List, Dict, Tuple, Optional
@@ -67,15 +68,15 @@ def process_eaf(input_elan_file: str = '',
:return: a list of dictionaries, where each dictionary is an annotation
"""

-print(f"processing eaf {input_elan_file} using {tier_order} {tier_type} {tier_name}")
+logger.info(f"processing eaf {input_elan_file} using {tier_order} {tier_type} {tier_name}")

# Get paths to files
input_directory, full_file_name = os.path.split(input_elan_file)
file_name, extension = os.path.splitext(full_file_name)

# Look for wav file matching the eaf file in same directory
if os.path.isfile(os.path.join(input_directory, file_name + ".wav")):
-print("WAV file found for " + file_name, file=sys.stderr)
+logger.info(f"WAV file found for {file_name}")
else:
raise ValueError(f"WAV file not found for {full_file_name}. "
f"Please put it next to the eaf file in {input_directory}.")
@@ -101,38 +102,38 @@
# tier_order is 1-index but List indexing is 0-index
try:
tier_name = tier_names[tier_order - 1]
-print(f"using tier order {tier_order} to get tier name {tier_name}")
+logger.info(f"using tier order {tier_order} to get tier name {tier_name}")
except IndexError:
-print("couldn't find a tier")
+logger.warning("couldn't find a tier")
pass
else:
# else use tier type to get a tier name
if tier_type in tier_types:
-print(f"found tier type {tier_type}")
+logger.info(f"found tier type {tier_type}")
tier_names = input_eaf.get_tier_ids_for_linguistic_type(tier_type)
tier_name = tier_names[0]
if tier_name:
-print(f"found tier name {tier_name}")
+logger.info(f"found tier name {tier_name}")
else:
-print("tier type not found in this file")
+logger.warning("tier type not found in this file")

if tier_name in tier_names:
-print(f"using tier name {tier_name}")
+logger.info(f"using tier name {tier_name}")
annotations = input_eaf.get_annotation_data_for_tier(tier_name)

if annotations:
-print(f"annotations {annotations}")
+logger.info(f"annotations {annotations}")
annotations = sorted(annotations)
parameters: Dict[str, str] = input_eaf.get_parameters_for_tier(tier_name)
-print(f"parameters {parameters}")
+logger.info(f"parameters {parameters}")
speaker_id: str = parameters.get("PARTICIPANT", "")

for annotation in annotations:
start: str = annotation[0]
end: str = annotation[1]
annotation_text: str = annotation[2]

-print(f"annotation {annotation} {start} {end}")
+logger.info(f"annotation {annotation} {start} {end}")
obj = {
"audio_file_name": f"{file_name}.wav",
"transcript": annotation_text,
5 changes: 3 additions & 2 deletions elpis/engines/common/input/make_prn_dict.py
@@ -9,6 +9,7 @@
"""

import argparse
+from loguru import logger
import sys
from typing import List, Tuple, Set, TextIO

@@ -105,9 +106,9 @@ def generate_pronunciation_dictionary(word_list: str,
missing_characters=missing_characters)

for character in missing_characters:
-print(f"Unexpected character: {character}", file=sys.stderr)
+logger.warning(f"Unexpected character: {character}")

-print(f"Wrote lexicon to {pronunciation_dictionary}", file=sys.stderr)
+logger.info(f"Wrote lexicon to {pronunciation_dictionary}")


def main():
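Dropping file=sys.stderr from these calls is behaviour-preserving: loguru's single default handler writes to sys.stderr, so the warnings and the final status line still land on standard error, now with timestamps and level names attached. A quick sketch, assuming the default sink is still installed:

from loguru import logger

# loguru's default handler targets sys.stderr, so these replace
# print(..., file=sys.stderr) without changing the output stream.
logger.warning("Unexpected character: ŋ")    # hypothetical character
logger.info("Wrote lexicon to lexicon.txt")  # hypothetical path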
15 changes: 8 additions & 7 deletions elpis/engines/common/input/make_wordlist.py
@@ -12,6 +12,7 @@
"""

import argparse
+from loguru import logger
import os
import sys
from typing import List, Dict
@@ -27,7 +28,7 @@ def save_word_list(word_list: List[str], file_name: str) -> None:
with open(file_name, "w", encoding='utf-8') as f:
for word in word_list:
f.write(word + "\n",)
-print(f"Wrote word list to {file_name}")
+logger.info(f"Wrote word list to {file_name}")


def extract_word_list(json_data: List[Dict[str, str]]) -> List[str]:
@@ -55,12 +56,12 @@ def extract_additional_words(file_name: str) -> List[str]:
words = []
if os.path.exists(file_name):
with open(file_name, "r") as f:
-print(f"Extracting additional words from {file_name}")
+logger.info(f"Extracting additional words from {file_name}")
for line in f.readlines():
new_words = line.strip().split(" ")
words += [word for word in new_words]
else:
-print(f"WARNING: Additional word list file at {file_name} does not exist, skipping!")
+logger.warning(f"Additional word list file at {file_name} does not exist, skipping!")
return words


@@ -80,7 +81,7 @@ def generate_word_list(transcription_file: str,
"""
json_data: List[Dict[str, str]] = load_json_file(transcription_file)

-print("Extracting word list(s)...", flush=True, file=sys.stderr)
+logger.info("Extracting word list(s)...")

# Retrieve ELAN word data
word_list = extract_word_list(json_data)
@@ -97,9 +98,9 @@
# Remove duplicates
word_list = list(set(word_list))

-print(sorted(word_list))
+logger.debug(sorted(word_list))

-print(f"Writing wordlist to file...", flush=True, file=sys.stderr)
+logger.info(f"Writing wordlist to file...")
save_word_list(word_list, output_file)


@@ -135,7 +136,7 @@ def main():
additional_corpus_txt=arguments.additional_corpus_txt
)

-print("Done.", file=sys.stderr)
+logger.info("Done.")


if __name__ == '__main__':
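The wordlist changes also choose severity levels instead of mapping every print to logger.info: progress messages become INFO, the dump of the entire sorted word list drops to DEBUG, and the missing-file branch becomes WARNING, shedding its hand-written "WARNING:" prefix since loguru supplies the level itself. The mapping in miniature, with hypothetical values:

from loguru import logger

word_list = ["dog", "cat"]     # hypothetical data
file_name = "extra_words.txt"  # hypothetical path

logger.info("Extracting word list(s)...")  # progress: was print(..., file=sys.stderr)
logger.debug(sorted(word_list))            # bulk dump, demoted to DEBUG
logger.warning(f"Additional word list file at {file_name} does not exist, skipping!")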
3 changes: 2 additions & 1 deletion elpis/engines/common/input/split_on_silence.py
@@ -8,6 +8,7 @@
Nicholas Lambourne - (The University of Queensland, 2019)
"""

+from loguru import logger
from argparse import ArgumentParser
from pathlib import Path
from pydub import AudioSegment
@@ -51,7 +52,7 @@ def split_audio_file_on_silence(file_path: str,
audio_segment = silence + segment + silence
normalised_segment = match_target_amplitude(audio_segment, -20)
export_file_name = f"_file_{file_index}-part_{segment_index}.wav"
-print(f"Exporting {export_file_name}")
+logger.info(f"Exporting {export_file_name}")
normalised_segment.export(Path(output_directory, export_file_name))


3 changes: 2 additions & 1 deletion elpis/engines/common/input/vad.py
@@ -1,6 +1,7 @@
#!/usr/bin/python3

import librosa
+from loguru import logger
import numpy
from typing import Any, Dict, List, Tuple

@@ -15,7 +16,7 @@ def get_chunks(audio_path: str, method: str, parameter: float) -> List[Tuple[flo
"""
audio_data = read_audio_path(audio_path)
threshold = find_best_threshold(audio_data, method=method, parameter=parameter)
-print(f"""Top db = {audio_data["top db"]}, chosen threshold = {threshold} (method = {method})""")
+logger.info(f"Top db = {audio_data['top db']}, chosen threshold = {threshold} (method = {method})")
time_voice_sections = get_voice_sections(audio_data, threshold)
return time_voice_sections

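The vad.py change also untangles a quoting wrinkle: the old line needed a triple-quoted f-string because audio_data["top db"] reuses double quotes inside the replacement field, which is a syntax error in an ordinary double-quoted f-string before Python 3.12. Switching the subscript to single quotes lets a plain f-string carry the whole message:

from loguru import logger

audio_data = {"top db": 42.0}  # hypothetical values
threshold, method = 30.0, "default"

# Single quotes inside, double quotes outside: no clash, no triple quoting.
logger.info(f"Top db = {audio_data['top db']}, chosen threshold = {threshold} (method = {method})")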
