-
Notifications
You must be signed in to change notification settings - Fork 33
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* update praatio and nltk * add ctm-to-elan, make gmm-decode wav.scp scripts more explicit * add test conf sh files * remove the temp ctm scripts * add conf scripts to templates, rename gmm-decode template dir * copy the ctm file to the transcription hash dir * Add i18n for status * show confidence in the gui as text opacity * response to PR comments
- Loading branch information
Showing
34 changed files
with
973 additions
and
275 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -89,3 +89,7 @@ def text(self): | |
@abstractmethod
def elan(self):
    """Abstract hook that concrete subclasses must override.

    NOTE(review): presumably returns the transcription in Elan format;
    confirm against the concrete implementations.
    """
    pass
|
||
@abstractmethod
def get_confidence(self):
    """Abstract hook that concrete subclasses must override.

    NOTE(review): presumably returns confidence information for the
    transcription; confirm against the concrete implementations.
    """
    pass
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,147 @@ | ||
#!/usr/bin/python3 | ||
|
||
""" | ||
Takes a CTM (time aligned) file and produces an Elan file. | ||
If the CTM has confidence values, write them as a ref tier. | ||
Copyright: University of Queensland, 2021 | ||
Contributors: | ||
Ben Foley - (University of Queensland, 2021) | ||
Nicholas Lambourne - (University of Queensland, 2018) | ||
""" | ||
|
||
from argparse import ArgumentParser | ||
from csv import reader | ||
from pathlib import Path | ||
from typing import Dict, Tuple | ||
import codecs | ||
from pympi.Elan import Eaf | ||
|
||
# Offset, in milliseconds, added to a word's start time when its confidence
# value is attached as a reference annotation. It helps pympi find the
# intended parent annotation: with the exact start time (and also with +1),
# pympi would sometimes attach the child annotation to the parent that is
# adjacent to the intended one. A "buffer" of 20 was observed to locate the
# parent more reliably. There may be a better way to do this.
PYMPI_CHILD_ANNOTATION_OFFSET = 20
|
||
def ctm_to_dictionary(ctm_file_path: str,
                      segments_dictionary: Dict[str, Tuple[str, float]],
                      confidence: bool) -> dict:
    """Read a space-delimited CTM file and group its words by utterance.

    Each CTM row is expected to hold, in order: segment id, channel, start
    time relative to the segment, duration, the inferred word and, optionally,
    a confidence value.

    Args:
        ctm_file_path: Path of the UTF-8 encoded CTM file.
        segments_dictionary: Maps a segment id to a tuple of
            ``(utterance_id, segment_start_time)``.
        confidence: When true, the sixth column of each row is kept as the
            word's confidence value; otherwise the value is ``None``.

    Returns:
        A dict mapping each utterance id to a list of
        ``(absolute_start, absolute_end, word, confidence)`` tuples. Times are
        strings; confidence is the raw column text, or ``None`` when it was
        not requested or the row has no sixth column.

    Raises:
        KeyError: If a row names a segment id missing from
            ``segments_dictionary``.
    """
    ctm_dictionary = {}
    with open(ctm_file_path, encoding="utf8") as file:
        for entry in reader(file, delimiter=" "):
            if not entry:
                # Blank lines (e.g. a trailing newline) carry no data.
                continue
            utterance_id, segment_start_time = segments_dictionary[entry[0]]
            absolute_start_time = segment_start_time + float(entry[2])
            absolute_end_time = absolute_start_time + float(entry[3])
            # Keep the per-word value in its own name: overwriting the
            # ``confidence`` flag would let one empty field switch confidence
            # off for every following row.
            word_confidence = entry[5] if confidence and len(entry) > 5 else None
            ctm_dictionary.setdefault(utterance_id, []).append(
                (str(absolute_start_time),
                 str(absolute_end_time),
                 entry[4],
                 word_confidence))
    return ctm_dictionary
|
||
def get_segment_dictionary(segment_file_name: str) -> Dict[str, Tuple[str, float]]:
    """Read a space-delimited segments file into a lookup table.

    Each row is expected to hold, in order: segment id, utterance id and the
    segment start time. Any further columns are ignored.

    Args:
        segment_file_name: Path of the UTF-8 encoded segments file.

    Returns:
        A dict mapping each segment id to ``(utterance_id, start_time)``.
    """
    segment_dictionary: Dict[str, Tuple[str, float]] = {}
    # Decode explicitly as UTF-8 so ids match those read from the CTM file
    # regardless of the platform's default locale.
    with open(segment_file_name, "r", encoding="utf8") as file:
        for entry in reader(file, delimiter=" "):
            if not entry:
                # Blank lines (e.g. a trailing newline) carry no data.
                continue
            segment_dictionary[entry[0]] = (entry[1], float(entry[2]))
    return segment_dictionary
|
||
def wav_scp_to_dictionary(scp_file_name: str) -> dict:
    """Read a space-delimited wav.scp file into a lookup table.

    Each row is expected to hold an utterance id followed by a wav file path.
    Only the first two columns are used.

    Args:
        scp_file_name: Path of the UTF-8 encoded wav.scp file.

    Returns:
        A dict mapping each utterance id to its wav file path.
    """
    wav_dictionary = {}
    # Decode explicitly as UTF-8 so ids match those read from the CTM file
    # regardless of the platform's default locale.
    with open(scp_file_name, encoding="utf8") as file:
        for entry in reader(file, delimiter=" "):
            if not entry:
                # Blank lines (e.g. a trailing newline) carry no data.
                continue
            wav_dictionary[entry[0]] = entry[1]
    return wav_dictionary
|
||
def create_eaf_and_textgrid(wav_dictionary: dict,
                            ctm_dictionary: dict,
                            confidence: bool,
                            output_directory: str):
    """Write one Elan (.eaf) and one Textgrid file per wav.scp entry.

    Files are named ``utterance-<index>.eaf`` and ``utterance-<index>.Textgrid``
    where ``index`` is the position of the entry in ``wav_dictionary``.

    Args:
        wav_dictionary: Maps an utterance id to its wav file name.
        ctm_dictionary: Maps an utterance id to a list of
            ``(start, end, word, confidence)`` tuples, with times in seconds
            given as strings, e.g. ``('0.32', '0.52', 'word', '0.81')``.
        confidence: When true, a "confidence" tier is created as a child of
            the "default" tier and each word's confidence value is stored on
            it as a reference annotation.
        output_directory: Directory the output files are written to.

    Raises:
        KeyError: If an utterance id from ``wav_dictionary`` is missing from
            ``ctm_dictionary``.
    """
    for index, (utterance_id, basename) in enumerate(wav_dictionary.items()):
        eaf = Eaf()
        eaf.add_linked_file(str(Path(output_directory, basename)))
        eaf.add_linguistic_type("conf_lt", "Symbolic_Association")
        eaf.add_tier("default")
        if confidence:
            eaf.add_tier("confidence", parent="default", ling="conf_lt")
        for annotation in ctm_dictionary[utterance_id]:
            start, end, value, *conf = annotation
            # Convert times from seconds to integer milliseconds.
            start_ms = int(float(start) * 1000)
            end_ms = int(float(end) * 1000)
            # Add the transcription annotation.
            eaf.add_annotation("default", start_ms, end_ms, value)
            # Add the confidence value as a reference annotation. The tuple
            # always carries a fourth slot (None when confidence was not
            # read), so test the flag and the value rather than the list:
            # the "confidence" tier only exists when the flag is set.
            if confidence and conf and conf[0] is not None:
                # Offset the start time so the ref falls within the parent slot.
                eaf.add_ref_annotation("confidence",
                                       "default",
                                       start_ms + PYMPI_CHILD_ANNOTATION_OFFSET,
                                       conf[0])

        # Save as an Elan eaf file.
        output_eaf = str(Path(output_directory, f'utterance-{index}.eaf'))
        eaf.to_file(output_eaf)

        # Make a Textgrid format version.
        output_textgrid = str(Path(output_directory, f'utterance-{index}.Textgrid'))
        textgrid = eaf.to_textgrid()
        textgrid.to_file(output_textgrid)
|
||
|
||
def main() -> None:
    """Parse the command line and convert a Kaldi CTM file to Elan output.

    Reads the segments, CTM and wav.scp files named on the command line,
    prints the parsed inputs, makes sure the output directory exists and then
    writes the Elan and Textgrid files into it.
    """
    parser: ArgumentParser = ArgumentParser(description="Converts Kaldi CTM format to Elan .eaf format.")
    parser.add_argument("-c", "--ctm",
                        type=str,
                        help="The input CTM format file",
                        required=True)
    parser.add_argument("-w", "--wav",
                        type=str,
                        help="The input wav.scp file",
                        required=True)
    parser.add_argument("-s", "--seg",
                        type=str,
                        help="The segment to utterance mapping",
                        default="./segments")
    parser.add_argument("-o", "--outdir",
                        type=str,
                        help="The directory path for the Elan output",
                        default=".")
    parser.add_argument('--confidence', dest='confidence', action='store_true',
                        help="Write CTM confidence values as a ref tier (default)")
    parser.add_argument('--no-confidence', dest='confidence', action='store_false',
                        help="Do not write CTM confidence values")
    parser.set_defaults(confidence=True)

    arguments = parser.parse_args()

    segments_dictionary = get_segment_dictionary(arguments.seg)
    ctm_dictionary = ctm_to_dictionary(arguments.ctm, segments_dictionary, arguments.confidence)
    wav_dictionary = wav_scp_to_dictionary(arguments.wav)
    output_directory = Path(arguments.outdir)

    print("==== CTM to Elan args")
    print("segments_dictionary", segments_dictionary)
    print("ctm_dictionary", ctm_dictionary)
    print("wav_dictionary", wav_dictionary)
    print("output_directory", output_directory)

    # Path objects are always truthy, so a ``not path`` test never fires.
    # Create the output directory itself (not its parent) and tolerate it
    # already existing.
    output_directory.mkdir(parents=True, exist_ok=True)

    create_eaf_and_textgrid(wav_dictionary,
                            ctm_dictionary,
                            arguments.confidence,
                            output_directory)


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
124 changes: 124 additions & 0 deletions
124
elpis/engines/kaldi/inference/gmm-decode-conf/gmm-decode-conf.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
#!/bin/bash

# Copyright: University of Queensland, 2019
# Contributors:
#              Joshua Meyer - (2016)
#              Scott Heath - (University of Queensland, 2018)
#              Nicholas Lambourne - (University of Queensland, 2018)

# USAGE:
#    $ kaldi/egs/your-model/your-model-1/gmm-decode-conf.sh
#
# This script is meant to demonstrate how an existing GMM-HMM
# model and its corresponding HCLG graph, built via Kaldi,
# can be used to decode new audio files and to write the result,
# with per-word confidence values, as a CTM file and an Elan file.
# Although this script takes no command line arguments, it assumes
# the existence of a directory (./transcriptions) and an scp file
# within that directory (./transcriptions/wav.scp). For more on scp
# files, consult the official Kaldi documentation.
#
# ENVIRONMENT:
#    ACOUSTIC_SCALE   optional, acoustic scale for lattice-to-ctm-conf (default 0.1)
#    POETRY_PATH      prefix of the Python environment used to run ctm_to_elan.py

# INPUT:
#    audio.wav
#    data/
#       infer/       <= these need to be created
#           wav.scp
#           utt2spk
#           spk2utt
#           text     <= put a transcription here for quick comparison against generated one
#       lang/
#           phones/
#               word_boundary.int
#
#    config/
#        mfcc.conf
#
#    exp/
#        tri1/
#            final.mdl
#
#            graph/
#                HCLG.fst
#                words.txt

# OUTPUT:
#    data/
#       infer/
#           feats.ark
#           feats.scp
#           delta-feats.ark
#           lattices.ark
#           ctm_with_conf.ctm
#           one-best-hypothesis.txt

. ./path.sh
# make sure you include the path to the gmm bin(s)
# the following two export commands are what my path.sh script contains:
# export PATH=$PWD/utils/:$PWD/../../../src/bin:$PWD/../../../tools/openfst/bin:$PWD/../../../src/fstbin/:$PWD/../../../src/gmmbin/:$PWD/../../../src/featbin/:$PWD/../../../src/lm/:$PWD/../../../src/sgmmbin/:$PWD/../../../src/fgmmbin/:$PWD/../../../src/latbin/:$PWD/../../../src/nnet2bin/:$PWD:$PATH
# export LC_ALL=C

# Make the split dir with scp, utt2spk etc files
# (quoted so a working directory containing spaces does not break the path)
. "$PWD/make_split.sh"

# AUDIO --> FEATURE VECTORS
echo "==== Extracting Feature Vectors ===="
steps/make_mfcc.sh --nj 1 data/infer exp/make_mfcc/infer mfcc

echo "==== Applying CMVN ===="
apply-cmvn --utt2spk=ark:data/infer/utt2spk \
    scp:mfcc/cmvn_test.scp \
    scp:mfcc/raw_mfcc_infer.1.scp ark:- | \
    add-deltas ark:- ark:data/infer/delta-feats.ark

# TRAINED GMM-HMM + FEATURE VECTORS --> LATTICE
echo "==== Producing Lattice ===="
gmm-latgen-faster \
    --word-symbol-table=exp/tri1/graph/words.txt \
    exp/tri1/final.mdl \
    exp/tri1/graph/HCLG.fst \
    ark:data/infer/delta-feats.ark \
    ark,t:data/infer/lattices.ark

echo "==== Lattice to Conf ===="

# Enable setting the acoustic scale via the environment,
# e.g. run this as `ACOUSTIC_SCALE=2 gmm-decode-conf.sh`.
# 1/10 or 1/12 appears to be a common setting (the reason is not known here).
# TODO add a GUI setting for this
acoustic_scale="${ACOUSTIC_SCALE:-0.1}"
echo "Using acoustic scale: ${acoustic_scale}"

# LATTICE --> word-aligned lattice --> CTM with confidence (word ids)
# --> CTM with confidence (words; field 5 is mapped through words.txt)
lattice-align-words \
    data/lang/phones/word_boundary.int \
    exp/tri1/final.mdl \
    ark:data/infer/lattices.ark \
    ark:- |
    lattice-to-ctm-conf --acoustic-scale="${acoustic_scale}" \
        ark:- - |
    utils/int2sym.pl -f 5 \
        exp/tri1/graph/words.txt \
        > data/infer/ctm_with_conf.ctm

# Now, wav.scp needs to be in segment form
# e.g. audio_id filename
echo "decode audio.wav" > ./data/infer/split1/1/wav.scp

echo "==== CTM output ===="
# The word is the second-to-last CTM field (the last one is the confidence);
# join all words on one line separated by spaces.
awk -F" " 'BEGIN { ORS=" " }; {print $(NF-1)}' \
    data/infer/ctm_with_conf.ctm \
    > data/infer/one-best-hypothesis.txt

# Add a newline to the file
echo >> data/infer/one-best-hypothesis.txt

cat data/infer/one-best-hypothesis.txt

echo "==== Build the Elan file ===="
"${POETRY_PATH}/bin/python" /elpis/elpis/engines/common/output/ctm_to_elan.py \
    --ctm data/infer/ctm_with_conf.ctm \
    --wav data/infer/split1/1/wav.scp \
    --seg data/infer/split1/1/segments \
    --outdir data/infer \
    --confidence
|
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
Oops, something went wrong.