Ben kaldi conf (#239)
* update praatio and nltk

* add ctm-to-elan, make gmm-decode wav.scp scripts more explicit

* add test conf sh files

* remove the temp ctm scripts

* add conf scripts to templates, rename gmm-decode template dir

* copy the ctm file to the transcription hash dir

* Add i18n for status

* show confidence in the gui as text opacity

* response to PR comments
benfoley committed Oct 28, 2021
1 parent 8b74867 commit 02142fe
Showing 34 changed files with 973 additions and 275 deletions.
12 changes: 12 additions & 0 deletions elpis/endpoints/transcription.py
@@ -68,3 +68,15 @@ def elan():
    transcription: Transcription = app.config['CURRENT_TRANSCRIPTION']
    # TODO fix this to return json wrapper
    return transcription.elan()


@bp.route("/confidence", methods=['GET'])
def confidence():
    transcription: Transcription = app.config['CURRENT_TRANSCRIPTION']
    data = {
        "confidence": transcription.get_confidence()
    }
    return jsonify({
        "status": 200,
        "data": data
    })
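
A minimal client sketch for the new route; the host and blueprint prefix below are assumptions, since only the /confidence route itself appears in this diff:

import requests

# Hypothetical URL; adjust host and prefix to wherever the blueprint is mounted.
response = requests.get("http://localhost:5000/api/transcription/confidence")
body = response.json()
print(body["status"], body["data"]["confidence"])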
7 changes: 4 additions & 3 deletions elpis/engines/common/input/textgrid_to_json.py
@@ -11,7 +11,7 @@
"""

import argparse
-from praatio import tgio
+from praatio import textgrid
from ..utilities import *


@@ -34,8 +34,9 @@ def process_textgrid(input_directory: str) -> List[Dict[str, Union[str, int]]]:
        for filename in files:
            basename, extension = os.path.splitext(filename)
            if filename.endswith(".TextGrid"):
-               textgrid: tgio.Textgrid = tgio.openTextgrid(os.path.join(root, filename))
-               speech_tier: tgio.TextgridTier = textgrid.tierDict["Speech"]
+               text_grid: textgrid.Textgrid = textgrid.openTextgrid(os.path.join(root, filename),
+                                                                    includeEmptyIntervals=False)
+               speech_tier: textgrid.TextgridTier = text_grid.tierDict["Speech"]
                for start, stop, label in speech_tier.entryList:
                    label_word: str = label.replace('"', '')
                    intervals.append({
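
For reference, a minimal sketch of the praatio 5 API used above: tgio is renamed to textgrid, and openTextgrid now takes an explicit includeEmptyIntervals flag. The file name here is illustrative:

from praatio import textgrid

tg = textgrid.openTextgrid("recording.TextGrid", includeEmptyIntervals=False)
speech_tier = tg.tierDict["Speech"]
for start, stop, label in speech_tier.entryList:
    print(start, stop, label)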
4 changes: 4 additions & 0 deletions elpis/engines/common/objects/transcription.py
@@ -89,3 +89,7 @@ def text(self):
    @abstractmethod
    def elan(self):
        pass

    @abstractmethod
    def get_confidence(self):
        pass
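
A hypothetical sketch of how an engine-specific subclass might satisfy the new abstract method by averaging per-word confidences from a CTM file; the class name, file location, and aggregation strategy are assumptions, not code from this commit:

class ExampleTranscription(Transcription):
    # (other abstract methods omitted in this sketch)
    def get_confidence(self):
        # Assumed location of the confidence-bearing CTM in the transcription hash dir
        ctm_path = self.path.joinpath("ctm_with_conf.ctm")
        if not ctm_path.exists():
            return None
        with ctm_path.open() as ctm_file:
            # CTM fields: segment-id channel start duration word confidence
            confidences = [float(line.split()[5]) for line in ctm_file if line.strip()]
        return sum(confidences) / len(confidences) if confidences else None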
147 changes: 147 additions & 0 deletions elpis/engines/common/output/ctm_to_elan.py
@@ -0,0 +1,147 @@
#!/usr/bin/python3

"""
Takes a CTM (time-aligned) file and produces an Elan file.
If the CTM has confidence values, writes them to a reference tier.
Copyright: University of Queensland, 2021
Contributors:
Ben Foley - (University of Queensland, 2021)
Nicholas Lambourne - (University of Queensland, 2018)
"""

from argparse import ArgumentParser
from csv import reader
from pathlib import Path
from typing import Dict, Tuple
import codecs
from pympi.Elan import Eaf

# The magic number 20 here helps pympi find the parent annotation.
# There may be a better way to do this, but when the exact start time was used,
# pympi sometimes matched the child annotation to the parent annotation adjacent to the intended one.
# The same happened with +1; the parent is found reliably with this buffer of 20.
PYMPI_CHILD_ANNOTATION_OFFSET = 20
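
# An illustrative CTM line with a confidence value in the final field, as consumed below:
#   utt1-seg1 1 0.32 0.20 word 0.81
# fields: segment-id, channel, start (seconds, relative to the segment), duration, word, confidence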

def ctm_to_dictionary(ctm_file_path: str,
                      segments_dictionary: Dict[str, Tuple[str, float]],
                      confidence: bool) -> dict:
    with codecs.open(ctm_file_path, encoding="utf8") as file:
        ctm_entries = list(reader(file, delimiter=" "))
    ctm_dictionary = dict()
    for entry in ctm_entries:
        utterance_id, segment_start_time = segments_dictionary[entry[0]]
        if utterance_id not in ctm_dictionary:
            ctm_dictionary[utterance_id] = []
        relative_start_time = float(entry[2])
        absolute_start_time = segment_start_time + relative_start_time
        absolute_end_time = absolute_start_time + float(entry[3])
        inferred_text = entry[4]
        # Use a separate name so the per-word value doesn't shadow the boolean flag
        confidence_value = entry[5] if confidence else None
        utterance_segment = (str(absolute_start_time),
                             str(absolute_end_time),
                             inferred_text,
                             confidence_value)
        ctm_dictionary[utterance_id].append(utterance_segment)
    return ctm_dictionary

def get_segment_dictionary(segment_file_name: str) -> Dict[str, Tuple[str, float]]:
    with open(segment_file_name, "r") as file:
        segment_entries = list(reader(file, delimiter=" "))
    segment_dictionary = dict()
    for entry in segment_entries:
        segment_id = entry[0]
        utterance_id = entry[1]
        start_time = float(entry[2])
        segment_dictionary[segment_id] = (utterance_id, start_time)
    return segment_dictionary
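
# An illustrative Kaldi segments line, as consumed above:
#   utt1-seg1 utt1 0.00 4.52
# fields: segment-id, recording-id, start (s), end (s); the end time is not needed here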

def wav_scp_to_dictionary(scp_file_name: str) -> dict:
    wav_dictionary = dict()
    with open(scp_file_name) as file:
        wav_entries = list(reader(file, delimiter=" "))
    for entry in wav_entries:
        utterance_id = entry[0]
        wav_file_path = entry[1]
        wav_dictionary[utterance_id] = wav_file_path
    return wav_dictionary
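
# An illustrative wav.scp line, as consumed above:
#   utt1 audio.wav
# fields: utterance/recording id, path to the audio file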

def create_eaf_and_textgrid(wav_dictionary: dict,
                            ctm_dictionary: dict,
                            confidence: bool,
                            output_directory: str):
    for index, (utterance_id, basename) in enumerate(wav_dictionary.items()):
        eaf = Eaf()
        eaf.add_linked_file(str(Path(output_directory, basename)))
        eaf.add_linguistic_type("conf_lt", "Symbolic_Association")
        eaf.add_tier("default")
        if confidence:
            eaf.add_tier("confidence", parent="default", ling="conf_lt")
        for annotation in ctm_dictionary[utterance_id]:
            # Annotation looks like ('0.32', '0.52', 'word', '0.81'),
            # with None in the last slot when confidence is off
            start, end, value, conf_value = annotation
            # Convert times to ms integers
            start_ms = int(float(start) * 1000)
            end_ms = int(float(end) * 1000)
            # Add the transcription annotation
            eaf.add_annotation("default", start_ms, end_ms, value)
            # Add the confidence value as a reference annotation
            if conf_value is not None:
                # Offset the start time so the ref annotation falls within a parent slot
                eaf.add_ref_annotation("confidence", "default",
                                       start_ms + PYMPI_CHILD_ANNOTATION_OFFSET, conf_value)

        # Save as an Elan .eaf file
        output_eaf = str(Path(output_directory, f'utterance-{index}.eaf'))
        eaf.to_file(output_eaf)

        # Make a TextGrid format version
        output_textgrid = str(Path(output_directory, f'utterance-{index}.Textgrid'))
        textgrid = eaf.to_textgrid()
        textgrid.to_file(output_textgrid)


def main() -> None:
    parser: ArgumentParser = ArgumentParser(description="Converts Kaldi CTM format to Elan .eaf format.")
    parser.add_argument("-c", "--ctm",
                        type=str,
                        help="The input CTM format file",
                        required=True)
    parser.add_argument("-w", "--wav",
                        type=str,
                        help="The input wav.scp file",
                        required=True)
    parser.add_argument("-s", "--seg",
                        type=str,
                        help="The segment to utterance mapping",
                        default="./segments")
    parser.add_argument("-o", "--outdir",
                        type=str,
                        help="The directory path for the Elan output",
                        default=".")
    parser.add_argument('--confidence', dest='confidence', action='store_true')
    parser.add_argument('--no-confidence', dest='confidence', action='store_false')
    parser.set_defaults(confidence=True)

    arguments = parser.parse_args()

    segments_dictionary = get_segment_dictionary(arguments.seg)
    ctm_dictionary = ctm_to_dictionary(arguments.ctm, segments_dictionary, arguments.confidence)
    wav_dictionary = wav_scp_to_dictionary(arguments.wav)
    output_directory = Path(arguments.outdir)

    print("==== CTM to Elan args")
    print("segments_dictionary", segments_dictionary)
    print("ctm_dictionary", ctm_dictionary)
    print("wav_dictionary", wav_dictionary)
    print("output_directory", output_directory)

    # Create the output directory if it does not already exist
    if not output_directory.exists():
        output_directory.mkdir(parents=True)

    create_eaf_and_textgrid(wav_dictionary,
                            ctm_dictionary,
                            arguments.confidence,
                            output_directory)


if __name__ == '__main__':
    main()
21 changes: 12 additions & 9 deletions elpis/engines/common/output/ctm_to_textgrid.py
@@ -12,7 +12,7 @@
from csv import reader
from pathlib import Path
from typing import Dict, Tuple
-from praatio import tgio
+from praatio import textgrid
import codecs


@@ -63,14 +63,17 @@ def create_textgrid(wav_dictionary: Dict[str, str],
                    ctm_dictionary: dict,
                    output_directory: str) -> None:
    for index, utterance_id in enumerate(wav_dictionary.keys()):
-       textgrid = tgio.Textgrid()
-       tier = tgio.IntervalTier(name='default',
-                                entryList=ctm_dictionary[utterance_id],
-                                minT=0,
-                                pairedWav=str(Path(wav_dictionary[utterance_id])))
-       textgrid.addTier(tier)
-       textgrid.save(str(Path(output_directory, f"utterance-{index}.TextGrid")))
+       text_grid = textgrid.Textgrid()
+       tier = textgrid.IntervalTier(name='default',
+                                    entryList=ctm_dictionary[utterance_id],
+                                    minT=0)
+       text_grid.addTier(tier)
+       name = str(Path(output_directory, f"utterance-{index}.TextGrid"))
+       text_grid.save(fn=name,
+                      format="short_textgrid",
+                      includeBlankSpaces=False)

def main() -> None:
    parser: ArgumentParser = ArgumentParser(description="Converts Kaldi CTM format to Praat Textgrid Format.")
124 changes: 124 additions & 0 deletions elpis/engines/kaldi/inference/gmm-decode-conf/gmm-decode-conf.sh
@@ -0,0 +1,124 @@
#!/bin/bash

# Copyright: University of Queensland, 2019
# Contributors:
# Joshua Meyer - (2016)
# Scott Heath - (University of Queensland, 2018)
# Nicholas Lambourne - (University of Queensland, 2018)

# USAGE:
# $ kaldi/egs/your-model/your-model-1/gmm-decode-conf.sh
#
# This script is meant to demonstrate how an existing GMM-HMM
# model and its corresponding HCLG graph, built via Kaldi,
# can be used to decode new audio files.
# Although this script takes no command line arguments, it assumes
# the existence of a directory (./transcriptions) and an scp file
# within that directory (./transcriptions/wav.scp). For more on scp
# files, consult the official Kaldi documentation.

# INPUT:
#    audio.wav
#    data/
#        infer/ <= these need to be created
#            wav.scp
#            utt2spk
#            spk2utt
#            text <= put a transcription here for quick comparison against generated one
#
#    config/
#        mfcc.conf
#
#    exp/
#        tri/
#            final.mdl
#
#        graph/
#            HCLG.fst
#            words.txt

# OUTPUT:
#    data/
#        infer/
#            feats.ark
#            feats.scp
#            delta-feats.ark
#            lattices.ark
#            one-best.tra
#            one-best-hypothesis.txt



. ./path.sh
# make sure you include the path to the gmm bin(s)
# the following two export commands are what my path.sh script contains:
# export PATH=$PWD/utils/:$PWD/../../../src/bin:$PWD/../../../tools/openfst/bin:$PWD/../../../src/fstbin/:$PWD/../../../src/gmmbin/:$PWD/../../../src/featbin/:$PWD/../../../src/lm/:$PWD/../../../src/sgmmbin/:$PWD/../../../src/fgmmbin/:$PWD/../../../src/latbin/:$PWD/../../../src/nnet2bin/:$PWD:$PATH
# export LC_ALL=C

# Make the split dir with scp, utt2spk etc files
. $PWD/make_split.sh


# AUDIO --> FEATURE VECTORS
echo "==== Extracting Feature Vectors ===="
steps/make_mfcc.sh --nj 1 data/infer exp/make_mfcc/infer mfcc

echo "==== Applying CMVN ===="
apply-cmvn --utt2spk=ark:data/infer/utt2spk \
    scp:mfcc/cmvn_test.scp \
    scp:mfcc/raw_mfcc_infer.1.scp ark:- | \
    add-deltas ark:- ark:data/infer/delta-feats.ark

# TRAINED GMM-HMM + FEATURE VECTORS --> LATTICE
echo "==== Producing Lattice ===="
gmm-latgen-faster \
    --word-symbol-table=exp/tri1/graph/words.txt \
    exp/tri1/final.mdl \
    exp/tri1/graph/HCLG.fst \
    ark:data/infer/delta-feats.ark \
    ark,t:data/infer/lattices.ark

echo "==== Lattice to Conf ===="

# Enable setting the acoustic scale via an environment variable,
# e.g. run this as `ACOUSTIC_SCALE=2 gmm-decode-conf.sh`.
# 1/10 or 1/12 seems to be a standard setting, though it's not clear why.
# TODO add a GUI setting for this
acoustic_scale="${ACOUSTIC_SCALE:-0.1}"
echo "Using acoustic scale: ${acoustic_scale}"

lattice-align-words \
    data/lang/phones/word_boundary.int \
    exp/tri1/final.mdl \
    ark:data/infer/lattices.ark \
    ark:- | \
lattice-to-ctm-conf --acoustic-scale=$acoustic_scale \
    ark:- - | \
utils/int2sym.pl -f 5 \
    exp/tri1/graph/words.txt \
    > data/infer/ctm_with_conf.ctm

# Now, wav.scp needs to be in segment form,
# e.g. <audio_id> <filename>
echo "decode audio.wav" > ./data/infer/split1/1/wav.scp

echo "==== CTM output ===="
awk -F" " 'BEGIN { ORS=" " }; {print $(NF-1)}' \
data/infer/ctm_with_conf.ctm \
> data/infer/one-best-hypothesis.txt

# Add a newline to the file
echo >> data/infer/one-best-hypothesis.txt

cat data/infer/one-best-hypothesis.txt

echo "==== Build the Elan file ===="
"${POETRY_PATH}/bin/python" /elpis/elpis/engines/common/output/ctm_to_elan.py \
--ctm data/infer/ctm_with_conf.ctm \
--wav data/infer/split1/1/wav.scp \
--seg data/infer/split1/1/segments \
--outdir data/infer \
--confidence

@@ -28,25 +28,5 @@ echo "# dummy file" > ./conf/online_cmvn.conf
exp/tri1 \
exp/tri1_online

-# Assuming we are decoding a single utterance still
-
-# Manipulate the wav.scp file in the first (and only) split
-line=$(head -n 1 ./data/infer/spk2utt)
-utt=` echo ${line} | cut -d ' ' -f 2`
-spk=` echo ${line} | cut -d ' ' -f 1` # this was seg
-audio="audio.wav"
-length=`sox --i -D ${audio}`
-recid="decode"
-
-# Prepare the split dir
-splitDir=./data/infer/split1/
-if [[ -d $splitDir ]]; then rm -r $splitDir; fi
-mkdir -p "$splitDir/1"
-
-# Argh.. the wav.scp file here should be in {utterance_id} to {audio_file} form
-# unlike other usage which requires {audio_id} to {audio_file} format
-# (such as below when we convert ctm to textgrid)
-echo "${utt} ${audio}" > ./data/infer/split1/1/wav.scp
-echo "${utt} ${spk}" > ./data/infer/split1/1/utt2spk
-echo "${spk} ${utt}" > ./data/infer/split1/1/spk2utt
-echo "${utt} ${recid} 0.00 ${length}" > ./data/infer/split1/1/segments
+# Make the split dir with scp, utt2spk etc files
+. $PWD/make_split.sh
