Ben kaldi conf (#239)
* update praatio and nltk

* add ctm-to-elan, make gmm-decode wav.scp scripts more explicit

* add test conf sh files

* remove the temp ctm scripts

* add conf scripts to templates, rename gmm-decode template dir

* copy the ctm file to the transcription hash dir

* Add i18n for status

* show confidence in the gui as text opacity

* response to PR comments
benfoley committed Oct 28, 2021
1 parent 8b74867 commit 02142fe
Showing 34 changed files with 973 additions and 275 deletions.
12 changes: 12 additions & 0 deletions elpis/endpoints/transcription.py
@@ -68,3 +68,15 @@ def elan():
    transcription: Transcription = app.config['CURRENT_TRANSCRIPTION']
    # TODO fix this to return json wrapper
    return transcription.elan()


@bp.route("/confidence", methods=['GET'])
def confidence():
    transcription: Transcription = app.config['CURRENT_TRANSCRIPTION']
    data = {
        "confidence": transcription.get_confidence()
    }
    return jsonify({
        "status": 200,
        "data": data
    })
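
A minimal client sketch for the new route; the host and blueprint prefix below are assumptions, since only the /confidence route itself appears in this diff:

import requests

# Hypothetical URL; adjust host and prefix to wherever the blueprint is mounted.
response = requests.get("http://localhost:5000/api/transcription/confidence")
body = response.json()
print(body["status"], body["data"]["confidence"])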
7 changes: 4 additions & 3 deletions elpis/engines/common/input/textgrid_to_json.py
@@ -11,7 +11,7 @@
"""

import argparse
-from praatio import tgio
+from praatio import textgrid
from ..utilities import *


@@ -34,8 +34,9 @@ def process_textgrid(input_directory: str) -> List[Dict[str, Union[str, int]]]:
        for filename in files:
            basename, extension = os.path.splitext(filename)
            if filename.endswith(".TextGrid"):
-               textgrid: tgio.Textgrid = tgio.openTextgrid(os.path.join(root, filename))
-               speech_tier: tgio.TextgridTier = textgrid.tierDict["Speech"]
+               text_grid: textgrid.Textgrid = textgrid.openTextgrid(os.path.join(root, filename),
+                                                                    includeEmptyIntervals=False)
+               speech_tier: textgrid.TextgridTier = text_grid.tierDict["Speech"]
                for start, stop, label in speech_tier.entryList:
                    label_word: str = label.replace('"', '')
                    intervals.append({
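
For reference, a minimal sketch of the praatio 5 API used above: tgio is renamed to textgrid, and openTextgrid now takes an explicit includeEmptyIntervals flag. The file name here is illustrative:

from praatio import textgrid

tg = textgrid.openTextgrid("recording.TextGrid", includeEmptyIntervals=False)
speech_tier = tg.tierDict["Speech"]
for start, stop, label in speech_tier.entryList:
    print(start, stop, label)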
4 changes: 4 additions & 0 deletions elpis/engines/common/objects/transcription.py
@@ -89,3 +89,7 @@ def text(self):
    @abstractmethod
    def elan(self):
        pass

    @abstractmethod
    def get_confidence(self):
        pass
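
A hypothetical sketch of how an engine-specific subclass might satisfy the new abstract method by averaging per-word confidences from a CTM file; the class name, file location, and aggregation strategy are assumptions, not code from this commit:

class ExampleTranscription(Transcription):
    # (other abstract methods omitted in this sketch)
    def get_confidence(self):
        # Assumed location of the confidence-bearing CTM in the transcription hash dir
        ctm_path = self.path.joinpath("ctm_with_conf.ctm")
        if not ctm_path.exists():
            return None
        with ctm_path.open() as ctm_file:
            # CTM fields: segment-id channel start duration word confidence
            confidences = [float(line.split()[5]) for line in ctm_file if line.strip()]
        return sum(confidences) / len(confidences) if confidences else None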
147 changes: 147 additions & 0 deletions elpis/engines/common/output/ctm_to_elan.py
@@ -0,0 +1,147 @@
#!/usr/bin/python3

"""
Takes a CTM (time-aligned) file and produces an Elan file.
If the CTM has confidence values, writes them to a reference tier.
Copyright: University of Queensland, 2021
Contributors:
Ben Foley - (University of Queensland, 2021)
Nicholas Lambourne - (University of Queensland, 2018)
"""

from argparse import ArgumentParser
from csv import reader
from pathlib import Path
from typing import Dict, Tuple
import codecs
from pympi.Elan import Eaf

# The magic number 20 here helps pympi find the parent annotation.
# There may be a better way to do this, but when the exact start time was used,
# pympi sometimes matched the child annotation to the parent annotation adjacent to the intended one.
# The same happened with +1; the parent is found reliably with this buffer of 20.
PYMPI_CHILD_ANNOTATION_OFFSET = 20
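
# An illustrative CTM line with a confidence value in the final field, as consumed below:
#   utt1-seg1 1 0.32 0.20 word 0.81
# fields: segment-id, channel, start (seconds, relative to the segment), duration, word, confidence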

def ctm_to_dictionary(ctm_file_path: str,
                      segments_dictionary: Dict[str, Tuple[str, float]],
                      confidence: bool) -> dict:
    with codecs.open(ctm_file_path, encoding="utf8") as file:
        ctm_entries = list(reader(file, delimiter=" "))
    ctm_dictionary = dict()
    for entry in ctm_entries:
        utterance_id, segment_start_time = segments_dictionary[entry[0]]
        if utterance_id not in ctm_dictionary:
            ctm_dictionary[utterance_id] = []
        relative_start_time = float(entry[2])
        absolute_start_time = segment_start_time + relative_start_time
        absolute_end_time = absolute_start_time + float(entry[3])
        inferred_text = entry[4]
        # Use a separate name so the per-word value doesn't shadow the boolean flag
        confidence_value = entry[5] if confidence else None
        utterance_segment = (str(absolute_start_time),
                             str(absolute_end_time),
                             inferred_text,
                             confidence_value)
        ctm_dictionary[utterance_id].append(utterance_segment)
    return ctm_dictionary

def get_segment_dictionary(segment_file_name: str) -> Dict[str, Tuple[str, float]]:
    with open(segment_file_name, "r") as file:
        segment_entries = list(reader(file, delimiter=" "))
    segment_dictionary = dict()
    for entry in segment_entries:
        segment_id = entry[0]
        utterance_id = entry[1]
        start_time = float(entry[2])
        segment_dictionary[segment_id] = (utterance_id, start_time)
    return segment_dictionary
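
# An illustrative Kaldi segments line, as consumed above:
#   utt1-seg1 utt1 0.00 4.52
# fields: segment-id, recording-id, start (s), end (s); the end time is not needed here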

def wav_scp_to_dictionary(scp_file_name: str) -> dict:
    wav_dictionary = dict()
    with open(scp_file_name) as file:
        wav_entries = list(reader(file, delimiter=" "))
    for entry in wav_entries:
        utterance_id = entry[0]
        wav_file_path = entry[1]
        wav_dictionary[utterance_id] = wav_file_path
    return wav_dictionary
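
# An illustrative wav.scp line, as consumed above:
#   utt1 audio.wav
# fields: utterance/recording id, path to the audio file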

def create_eaf_and_textgrid(wav_dictionary: dict,
                            ctm_dictionary: dict,
                            confidence: bool,
                            output_directory: str):
    for index, (utterance_id, basename) in enumerate(wav_dictionary.items()):
        eaf = Eaf()
        eaf.add_linked_file(str(Path(output_directory, basename)))
        eaf.add_linguistic_type("conf_lt", "Symbolic_Association")
        eaf.add_tier("default")
        if confidence:
            eaf.add_tier("confidence", parent="default", ling="conf_lt")
        for annotation in ctm_dictionary[utterance_id]:
            # Annotation looks like ('0.32', '0.52', 'word', '0.81'),
            # with None in the last slot when confidence is off
            start, end, value, conf_value = annotation
            # Convert times to ms integers
            start_ms = int(float(start) * 1000)
            end_ms = int(float(end) * 1000)
            # Add the transcription annotation
            eaf.add_annotation("default", start_ms, end_ms, value)
            # Add the confidence value as a reference annotation
            if conf_value is not None:
                # Offset the start time so the ref annotation falls within a parent slot
                eaf.add_ref_annotation("confidence", "default",
                                       start_ms + PYMPI_CHILD_ANNOTATION_OFFSET, conf_value)

        # Save as an Elan .eaf file
        output_eaf = str(Path(output_directory, f'utterance-{index}.eaf'))
        eaf.to_file(output_eaf)

        # Make a TextGrid format version
        output_textgrid = str(Path(output_directory, f'utterance-{index}.Textgrid'))
        textgrid = eaf.to_textgrid()
        textgrid.to_file(output_textgrid)


def main() -> None:
    parser: ArgumentParser = ArgumentParser(description="Converts Kaldi CTM format to Elan .eaf format.")
    parser.add_argument("-c", "--ctm",
                        type=str,
                        help="The input CTM format file",
                        required=True)
    parser.add_argument("-w", "--wav",
                        type=str,
                        help="The input wav.scp file",
                        required=True)
    parser.add_argument("-s", "--seg",
                        type=str,
                        help="The segment to utterance mapping",
                        default="./segments")
    parser.add_argument("-o", "--outdir",
                        type=str,
                        help="The directory path for the Elan output",
                        default=".")
    parser.add_argument('--confidence', dest='confidence', action='store_true')
    parser.add_argument('--no-confidence', dest='confidence', action='store_false')
    parser.set_defaults(confidence=True)

    arguments = parser.parse_args()

    segments_dictionary = get_segment_dictionary(arguments.seg)
    ctm_dictionary = ctm_to_dictionary(arguments.ctm, segments_dictionary, arguments.confidence)
    wav_dictionary = wav_scp_to_dictionary(arguments.wav)
    output_directory = Path(arguments.outdir)

    print("==== CTM to Elan args")
    print("segments_dictionary", segments_dictionary)
    print("ctm_dictionary", ctm_dictionary)
    print("wav_dictionary", wav_dictionary)
    print("output_directory", output_directory)

    # Create the output directory if it does not already exist
    if not output_directory.exists():
        output_directory.mkdir(parents=True)

    create_eaf_and_textgrid(wav_dictionary,
                            ctm_dictionary,
                            arguments.confidence,
                            output_directory)


if __name__ == '__main__':
    main()
21 changes: 12 additions & 9 deletions elpis/engines/common/output/ctm_to_textgrid.py
@@ -12,7 +12,7 @@
from csv import reader
from pathlib import Path
from typing import Dict, Tuple
-from praatio import tgio
+from praatio import textgrid
import codecs


@@ -63,14 +63,17 @@ def create_textgrid(wav_dictionary: Dict[str, str],
                    ctm_dictionary: dict,
                    output_directory: str) -> None:
    for index, utterance_id in enumerate(wav_dictionary.keys()):
-       textgrid = tgio.Textgrid()
-       tier = tgio.IntervalTier(name='default',
-                                entryList=ctm_dictionary[utterance_id],
-                                minT=0,
-                                pairedWav=str(Path(wav_dictionary[utterance_id])))
-       textgrid.addTier(tier)
-       textgrid.save(str(Path(output_directory, f"utterance-{index}.TextGrid")))
+       text_grid = textgrid.Textgrid()
+       tier = textgrid.IntervalTier(name='default',
+                                    entryList=ctm_dictionary[utterance_id],
+                                    minT=0)
+       text_grid.addTier(tier)
+       name = str(Path(output_directory, f"utterance-{index}.TextGrid"))
+       text_grid.save(fn=name,
+                      format="short_textgrid",
+                      includeBlankSpaces=False)

def main() -> None:
    parser: ArgumentParser = ArgumentParser(description="Converts Kaldi CTM format to Praat Textgrid Format.")
124 changes: 124 additions & 0 deletions elpis/engines/kaldi/inference/gmm-decode-conf/gmm-decode-conf.sh
@@ -0,0 +1,124 @@
#!/bin/bash

# Copyright: University of Queensland, 2019
# Contributors:
# Joshua Meyer - (2016)
# Scott Heath - (University of Queensland, 2018)
# Nicholas Lambourne - (University of Queensland, 2018)

# USAGE:
# $ kaldi/egs/your-model/your-model-1/gmm-decode-conf.sh
#
# This script is meant to demonstrate how an existing GMM-HMM
# model and its corresponding HCLG graph, built via Kaldi,
# can be used to decode new audio files.
# Although this script takes no command line arguments, it assumes
# the existence of a directory (./transcriptions) and an scp file
# within that directory (./transcriptions/wav.scp). For more on scp
# files, consult the official Kaldi documentation.

# INPUT:
#    audio.wav
#    data/
#        infer/ <= these need to be created
#            wav.scp
#            utt2spk
#            spk2utt
#            text <= put a transcription here for quick comparison against generated one
#
#    config/
#        mfcc.conf
#
#    exp/
#        tri/
#            final.mdl
#
#        graph/
#            HCLG.fst
#            words.txt

# OUTPUT:
#    data/
#        infer/
#            feats.ark
#            feats.scp
#            delta-feats.ark
#            lattices.ark
#            one-best.tra
#            one-best-hypothesis.txt



. ./path.sh
# make sure you include the path to the gmm bin(s)
# the following two export commands are what my path.sh script contains:
# export PATH=$PWD/utils/:$PWD/../../../src/bin:$PWD/../../../tools/openfst/bin:$PWD/../../../src/fstbin/:$PWD/../../../src/gmmbin/:$PWD/../../../src/featbin/:$PWD/../../../src/lm/:$PWD/../../../src/sgmmbin/:$PWD/../../../src/fgmmbin/:$PWD/../../../src/latbin/:$PWD/../../../src/nnet2bin/:$PWD:$PATH
# export LC_ALL=C

# Make the split dir with scp, utt2spk etc files
. $PWD/make_split.sh


# AUDIO --> FEATURE VECTORS
echo "==== Extracting Feature Vectors ===="
steps/make_mfcc.sh --nj 1 data/infer exp/make_mfcc/infer mfcc

echo "==== Applying CMVN ===="
apply-cmvn --utt2spk=ark:data/infer/utt2spk \
    scp:mfcc/cmvn_test.scp \
    scp:mfcc/raw_mfcc_infer.1.scp ark:- | \
    add-deltas ark:- ark:data/infer/delta-feats.ark

# TRAINED GMM-HMM + FEATURE VECTORS --> LATTICE
echo "==== Producing Lattice ===="
gmm-latgen-faster \
    --word-symbol-table=exp/tri1/graph/words.txt \
    exp/tri1/final.mdl \
    exp/tri1/graph/HCLG.fst \
    ark:data/infer/delta-feats.ark \
    ark,t:data/infer/lattices.ark

echo "==== Lattice to Conf ===="

# Enable setting the acoustic scale via an environment variable,
# e.g. run this as `ACOUSTIC_SCALE=2 gmm-decode-conf.sh`.
# 1/10 or 1/12 seems to be a standard setting, though it's not clear why.
# TODO add a GUI setting for this
acoustic_scale="${ACOUSTIC_SCALE:-0.1}"
echo "Using acoustic scale: ${acoustic_scale}"

lattice-align-words \
    data/lang/phones/word_boundary.int \
    exp/tri1/final.mdl \
    ark:data/infer/lattices.ark \
    ark:- | \
lattice-to-ctm-conf --acoustic-scale=$acoustic_scale \
    ark:- - | \
utils/int2sym.pl -f 5 \
    exp/tri1/graph/words.txt \
    > data/infer/ctm_with_conf.ctm

# Now, wav.scp needs to be in segment form,
# e.g. <audio_id> <filename>
echo "decode audio.wav" > ./data/infer/split1/1/wav.scp

echo "==== CTM output ===="
awk -F" " 'BEGIN { ORS=" " }; {print $(NF-1)}' \
data/infer/ctm_with_conf.ctm \
> data/infer/one-best-hypothesis.txt

# Add a newline to the file
echo >> data/infer/one-best-hypothesis.txt

cat data/infer/one-best-hypothesis.txt

echo "==== Build the Elan file ===="
"${POETRY_PATH}/bin/python" /elpis/elpis/engines/common/output/ctm_to_elan.py \
--ctm data/infer/ctm_with_conf.ctm \
--wav data/infer/split1/1/wav.scp \
--seg data/infer/split1/1/segments \
--outdir data/infer \
--confidence

@@ -28,25 +28,5 @@ echo "# dummy file" > ./conf/online_cmvn.conf
exp/tri1 \
exp/tri1_online

-# Assuming we are decoding a single utterance still
-
-# Manipulate the wav.scp file in the first (and only) split
-line=$(head -n 1 ./data/infer/spk2utt)
-utt=` echo ${line} | cut -d ' ' -f 2`
-spk=` echo ${line} | cut -d ' ' -f 1` # this was seg
-audio="audio.wav"
-length=`sox --i -D ${audio}`
-recid="decode"
-
-# Prepare the split dir
-splitDir=./data/infer/split1/
-if [[ -d $splitDir ]]; then rm -r $splitDir; fi
-mkdir -p "$splitDir/1"
-
-# Argh.. the wav.scp file here should be in {utterance_id} to {audio_file} form
-# unlike other usage which requires {audio_id} to {audio_file} format
-# (such as below when we convert ctm to textgrid)
-echo "${utt} ${audio}" > ./data/infer/split1/1/wav.scp
-echo "${utt} ${spk}" > ./data/infer/split1/1/utt2spk
-echo "${spk} ${utt}" > ./data/infer/split1/1/spk2utt
-echo "${utt} ${recid} 0.00 ${length}" > ./data/infer/split1/1/segments
+# Make the split dir with scp, utt2spk etc files
+. $PWD/make_split.sh
