Formatted common/output and common/utilities (#287)

* Formatted common/output and common/utilities
* Amending workflow file to use pyproject.toml as config
* Added LINTER_RULES_PATH
CoEDL · Jun 16, 2022 · cda63ef · cda63ef
1 parent 902bc57
commit cda63ef
Show file tree

Hide file tree

Showing 7 changed files with 106 additions and 106 deletions.
diff --git a/.github/workflows/black_python_formatter.yml b/.github/workflows/black_python_formatter.yml
@@ -36,7 +36,9 @@ jobs:
       - name: Format Python Files
         uses: github/super-linter@v4
         env:
+          LINTER_RULES_PATH: /
           VALIDATE_ALL_CODEBASE: false
           VALIDATE_PYTHON_BLACK: true
+          PYTHON_BLACK_CONFIG_FILE: pyproject.toml
           DEFAULT_BRANCH: master
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/elpis/engines/common/output/ctm_to_elan.py b/elpis/engines/common/output/ctm_to_elan.py
@@ -24,9 +24,10 @@
 # Also happened for +1 but seems to be finding the parent better with this "buffer" of 20. Weird.
 PYMPI_CHILD_ANNOTATION_OFFSET = 20
 
-def ctm_to_dictionary(ctm_file_path: str,
-                      segments_dictionary: Dict[str, str],
-                      confidence: bool) -> dict:
+
+def ctm_to_dictionary(
+    ctm_file_path: str, segments_dictionary: Dict[str, str], confidence: bool
+) -> dict:
     with codecs.open(ctm_file_path, encoding="utf8") as file:
         ctm_entries = list(reader(file, delimiter=" "))
     ctm_dictionary = dict()
@@ -39,13 +40,16 @@ def ctm_to_dictionary(ctm_file_path: str,
         absolute_end_time = absolute_start_time + float(entry[3])
         inferred_text = entry[4]
         confidence = entry[5] if confidence else None
-        utterance_segment = (str(absolute_start_time),
-                             str(absolute_end_time),
-                             inferred_text,
-                             confidence)
+        utterance_segment = (
+            str(absolute_start_time),
+            str(absolute_end_time),
+            inferred_text,
+            confidence,
+        )
         ctm_dictionary[utterance_id].append(utterance_segment)
     return ctm_dictionary
 
+
 def get_segment_dictionary(segment_file_name: str) -> Dict[str, Tuple[str, float]]:
     with open(segment_file_name, "r") as file:
         segment_entries = list(reader(file, delimiter=" "))
@@ -57,21 +61,22 @@ def get_segment_dictionary(segment_file_name: str) -> Dict[str, Tuple[str, float
         segment_dictionary[segment_id] = (utterance_id, start_time)
     return segment_dictionary
 
+
 def wav_scp_to_dictionary(scp_file_name: str) -> dict:
     wav_dictionary = dict()
     with open(scp_file_name) as file:
         wav_entries = file.read().splitlines()
         for line in wav_entries:
-            entry = line.split(" ", 1) # use 1 here in case wav filenames include spaces
+            entry = line.split(" ", 1)  # use 1 here in case wav filenames include spaces
             utterance_id = entry[0]
             wav_file_path = entry[1]
             wav_dictionary[utterance_id] = wav_file_path
     return wav_dictionary
 
-def create_eaf_and_textgrid(wav_dictionary:dict,
-               ctm_dictionary:dict,
-               confidence:bool,
-               output_directory:str):
+
+def create_eaf_and_textgrid(
+    wav_dictionary: dict, ctm_dictionary: dict, confidence: bool, output_directory: str
+):
     for index, [utterance_id, audio_filename] in enumerate(wav_dictionary.items()):
         eaf = Eaf()
         eaf.add_linked_file(audio_filename)
@@ -90,38 +95,34 @@ def create_eaf_and_textgrid(wav_dictionary:dict,
             # Add the confidence value as a reference annotation
             if conf:
                 # Add a time value to the start time so the ref falls within a parent slot
-                eaf.add_ref_annotation("confidence", "default", start_ms+PYMPI_CHILD_ANNOTATION_OFFSET, conf[0])
+                eaf.add_ref_annotation(
+                    "confidence", "default", start_ms + PYMPI_CHILD_ANNOTATION_OFFSET, conf[0]
+                )
 
         # Save as Elan eaf file
-        output_eaf = str(Path(output_directory, f'utterance-{index}.eaf'))
+        output_eaf = str(Path(output_directory, f"utterance-{index}.eaf"))
         eaf.to_file(output_eaf)
 
         # Make a Textgrid format version
-        output_textgrid = str(Path(output_directory, f'utterance-{index}.Textgrid'))
+        output_textgrid = str(Path(output_directory, f"utterance-{index}.Textgrid"))
         textgrid = eaf.to_textgrid()
         textgrid.to_file(output_textgrid)
 
 
 def main() -> None:
-    parser: ArgumentParser = ArgumentParser(description="Converts Kaldi CTM format to Elan .eaf format.")
-    parser.add_argument("-c", "--ctm",
-                        type=str,
-                        help="The input CTM format file",
-                        required=True)
-    parser.add_argument("-w", "--wav",
-                        type=str,
-                        help="The input wav.scp file",
-                        required=True)
-    parser.add_argument("-s", "--seg",
-                        type=str,
-                        help="The segment to utterance mapping",
-                        default="./segments")
-    parser.add_argument("-o", "--outdir",
-                        type=str,
-                        help="The directory path for the Elan output",
-                        default=".")
-    parser.add_argument('--confidence', dest='confidence', action='store_true')
-    parser.add_argument('--no-confidence', dest='confidence', action='store_false')
+    parser: ArgumentParser = ArgumentParser(
+        description="Converts Kaldi CTM format to Elan .eaf format."
+    )
+    parser.add_argument("-c", "--ctm", type=str, help="The input CTM format file", required=True)
+    parser.add_argument("-w", "--wav", type=str, help="The input wav.scp file", required=True)
+    parser.add_argument(
+        "-s", "--seg", type=str, help="The segment to utterance mapping", default="./segments"
+    )
+    parser.add_argument(
+        "-o", "--outdir", type=str, help="The directory path for the Elan output", default="."
+    )
+    parser.add_argument("--confidence", dest="confidence", action="store_true")
+    parser.add_argument("--no-confidence", dest="confidence", action="store_false")
     parser.set_defaults(confidence=True)
 
     arguments = parser.parse_args()
@@ -140,10 +141,8 @@ def main() -> None:
     if not output_directory.parent:
         Path.mkdir(output_directory.parent, parents=True)
 
-    create_eaf_and_textgrid(wav_dictionary,
-                            ctm_dictionary,
-                            arguments.confidence,
-                            output_directory)
+    create_eaf_and_textgrid(wav_dictionary, ctm_dictionary, arguments.confidence, output_directory)
+
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/elpis/engines/common/output/ctm_to_textgrid.py b/elpis/engines/common/output/ctm_to_textgrid.py
@@ -17,8 +17,7 @@
 from loguru import logger
 
 
-def ctm_to_dictionary(ctm_file_path: str,
-                      segments_dictionary: Dict[str, str]) -> dict:
+def ctm_to_dictionary(ctm_file_path: str, segments_dictionary: Dict[str, str]) -> dict:
     with codecs.open(ctm_file_path, encoding="utf8") as file:
         ctm_entries = list(reader(file, delimiter=" "))
     textgrid_dictionary = dict()
@@ -30,9 +29,7 @@ def ctm_to_dictionary(ctm_file_path: str,
         absolute_start_time = segment_start_time + relative_start_time
         absolute_end_time = absolute_start_time + float(entry[3])
         inferred_text = entry[4]
-        utterance_segment = (str(absolute_start_time),
-                             str(absolute_end_time),
-                             inferred_text)
+        utterance_segment = (str(absolute_start_time), str(absolute_end_time), inferred_text)
         textgrid_dictionary[utterance_id].append(utterance_segment)
     return textgrid_dictionary
 
@@ -60,42 +57,35 @@ def wav_scp_to_dictionary(scp_file_name: str) -> dict:
     return wav_dictionary
 
 
-def create_textgrid(wav_dictionary: Dict[str, str],
-                    ctm_dictionary: dict,
-                    output_directory: str) -> None:
+def create_textgrid(
+    wav_dictionary: Dict[str, str], ctm_dictionary: dict, output_directory: str
+) -> None:
     logger.info(f"create_textgrid {ctm_dictionary=}")
 
     for index, utterance_id in enumerate(wav_dictionary.keys()):
         text_grid = textgrid.Textgrid()
-        tier = textgrid.IntervalTier(name='default',
-                                     entryList=ctm_dictionary[utterance_id],
-                                     minT=0
-                                     )
+        tier = textgrid.IntervalTier(name="default", entryList=ctm_dictionary[utterance_id], minT=0)
         text_grid.addTier(tier)
         name = str(Path(output_directory, f"utterance-{index}.TextGrid"))
-        text_grid.save(fn=name,
-                       format="short_textgrid",
-                       includeBlankSpaces=False
-                       )
+        text_grid.save(fn=name, format="short_textgrid", includeBlankSpaces=False)
+
 
 def main() -> None:
-    parser: ArgumentParser = ArgumentParser(description="Converts Kaldi CTM format to Praat Textgrid Format.")
-    parser.add_argument("-c", "--ctm",
-                        type=str,
-                        help="The input CTM format file",
-                        required=True)
-    parser.add_argument("-w", "--wav",
-                        type=str,
-                        help="The input wav.scp file",
-                        required=True)
-    parser.add_argument("-s", "--seg",
-                        type=str,
-                        help="The segment to utterance mapping",
-                        default="./segments")
-    parser.add_argument("-o", "--outdir",
-                        type=str,
-                        help="The directory path for the Praat TextGrid output",
-                        default=".")
+    parser: ArgumentParser = ArgumentParser(
+        description="Converts Kaldi CTM format to Praat Textgrid Format."
+    )
+    parser.add_argument("-c", "--ctm", type=str, help="The input CTM format file", required=True)
+    parser.add_argument("-w", "--wav", type=str, help="The input wav.scp file", required=True)
+    parser.add_argument(
+        "-s", "--seg", type=str, help="The segment to utterance mapping", default="./segments"
+    )
+    parser.add_argument(
+        "-o",
+        "--outdir",
+        type=str,
+        help="The directory path for the Praat TextGrid output",
+        default=".",
+    )
     arguments = parser.parse_args()
 
     segments_dictionary = get_segment_dictionary(arguments.seg)
@@ -108,10 +98,8 @@ def main() -> None:
 
     output_directory = str(output_directory)
 
-    create_textgrid(wav_dictionary,
-                    ctm_dictionary,
-                    output_directory)
+    create_textgrid(wav_dictionary, ctm_dictionary, output_directory)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/elpis/engines/common/output/textgrid_to_elan.py b/elpis/engines/common/output/textgrid_to_elan.py
@@ -19,15 +19,25 @@ def get_first_wav(wav_scp: str) -> str:
 
 
 def main() -> None:
-    parser: ArgumentParser = ArgumentParser(description=
-                                            "Converts Praat TextGrid format to ELAN eaf Format.")
-    parser.add_argument("--tg", "--textgrid", type=str, help="The input TextGrid format file",
-                        required=True)
-    parser.add_argument("--wav", type=str,
-                        help="The relative path to the .wav file associated with the TextGrid",
-                        required=True)
-    parser.add_argument("-o", "--outfile", type=str, help="The file path for the ELAN file output",
-                        default="./inferred-aligned.eaf")
+    parser: ArgumentParser = ArgumentParser(
+        description="Converts Praat TextGrid format to ELAN eaf Format."
+    )
+    parser.add_argument(
+        "--tg", "--textgrid", type=str, help="The input TextGrid format file", required=True
+    )
+    parser.add_argument(
+        "--wav",
+        type=str,
+        help="The relative path to the .wav file associated with the TextGrid",
+        required=True,
+    )
+    parser.add_argument(
+        "-o",
+        "--outfile",
+        type=str,
+        help="The file path for the ELAN file output",
+        default="./inferred-aligned.eaf",
+    )
     arguments = parser.parse_args()
 
     textgrid_file = arguments.tg
@@ -41,13 +51,15 @@ def main() -> None:
 
     elan = textgrid.to_eaf()
 
-    elan.add_linked_file(file_path=str(wav_file.absolute()),
-                         relpath=str(wav_file),
-                         mimetype=Elan.Eaf.MIMES.get("wav", ""),
-                         time_origin=0)
+    elan.add_linked_file(
+        file_path=str(wav_file.absolute()),
+        relpath=str(wav_file),
+        mimetype=Elan.Eaf.MIMES.get("wav", ""),
+        time_origin=0,
+    )
 
     elan.to_file(output_file)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/elpis/engines/common/utilities/hasher.py b/elpis/engines/common/utilities/hasher.py
@@ -4,4 +4,4 @@
 
 def new():
     # Meant to give a unique hash. TODO: fix to ensure it does.
-    return md5(bytes(str(time()), 'utf-8')).hexdigest()
+    return md5(bytes(str(time()), "utf-8")).hexdigest()
diff --git a/elpis/engines/common/utilities/json_utilities.py b/elpis/engines/common/utilities/json_utilities.py
@@ -28,8 +28,9 @@ def load_json_file(file_name: str) -> List[Dict[str, str]]:
     return data
 
 
-def write_data_to_json_file(data: object = None,
-                            file_name: Union[str, TextIOWrapper] = None) -> None:
+def write_data_to_json_file(
+    data: object = None, file_name: Union[str, TextIOWrapper] = None
+) -> None:
     """
     Writes the given Python dictionary (or list) object to a JSON file at the given
     output location (which can either be a file - specified as a string, or
@@ -41,10 +42,7 @@ def write_data_to_json_file(data: object = None,
         data = dict()
     if not file_name:
         file_name = sys.stdout
-    json_data_string = json.dumps(data,
-                                  indent=4,
-                                  separators=(',', ': '),
-                                  sort_keys=False)
+    json_data_string = json.dumps(data, indent=4, separators=(",", ": "), sort_keys=False)
     if isinstance(file_name, str):
         with open(file_name, "w") as file:
             file.write(json_data_string)

diff --git a/elpis/engines/common/utilities/resampling.py b/elpis/engines/common/utilities/resampling.py
@@ -8,7 +8,7 @@
 from werkzeug.datastructures import FileStorage
 
 
-ORIGINAL_SOUND_FILE_DIRECTORY = Path('/tmp/origial_sound_files/')
+ORIGINAL_SOUND_FILE_DIRECTORY = Path("/tmp/origial_sound_files/")
 
 
 def load_audio(file: Path, target_sample_rate: int = None) -> Tuple[np.ndarray, int]:
@@ -25,6 +25,7 @@ def load_audio(file: Path, target_sample_rate: int = None) -> Tuple[np.ndarray,
     """
     return librosa.load(file, sr=target_sample_rate)
 
+
 def resample_audio(file: Path, destination: Path, target_sample_rate: int) -> None:
     """Writes a resampled audio file to the supplied destination, with a supplied
     sample rate.
@@ -47,21 +48,21 @@ def resample_audio(file: Path, destination: Path, target_sample_rate: int) -> No
     sf.write(destination, data, target_sample_rate)
 
 
-def resample_from_file_storage(file: FileStorage, destination: Path, target_sample_rate: int) -> Dict:
-    """ Performs audio resampling from a flask request FileStorage file, and
+def resample_from_file_storage(
+    file: FileStorage, destination: Path, target_sample_rate: int
+) -> Dict:
+    """Performs audio resampling from a flask request FileStorage file, and
     returns some information about the original file.
-    
+
     """
     # Create temporary directory if it hasn't already been created
     ORIGINAL_SOUND_FILE_DIRECTORY.mkdir(parents=True, exist_ok=True)
 
     original = ORIGINAL_SOUND_FILE_DIRECTORY / file.filename
-    with original.open(mode='wb') as fout:
+    with original.open(mode="wb") as fout:
         fout.write(file.read())
 
-    info = {
-        'duration': librosa.get_duration(filename=original)
-    }
+    info = {"duration": librosa.get_duration(filename=original)}
 
     resample_audio(original, destination, target_sample_rate)
-    return info
+    return info