In [1]:
import os
import textgrids
import subprocess
import shutil
from scipy.io import wavfile
from src import asr

### Define relevant paths and file locations.

In [None]:
# Root folder of the local MFA installation (usually Users/Documents/MFA or
# something similar). Replace the placeholder before running the notebook.
mfa_root_dir = "TODO_ADD"

# Corpus folder holding the audio/transcript pairs, and the folder for the results.
input_dir = os.path.join("input_files", "test_dataset", "input")
output_dir = os.path.join("input_files", "test_dataset", "results")

# These paths should exist relative to the MFA root directory.
pretrained_dir = os.path.join(mfa_root_dir, 'pretrained_models')
pronunc_dict_dir = os.path.join(pretrained_dir, 'dictionary', 'german_mfa.dict')
g2p_model_path = os.path.join(pretrained_dir, 'g2p', 'german_mfa.zip')
acoustic_model_path = os.path.join(pretrained_dir, 'acoustic', 'german_mfa.zip')


def _numeric_prefix(file_name):
    """Return the integer that precedes the first underscore of a file name."""
    return int(file_name.partition('_')[0])


# Split the directory listing into audio files and transcript files.
files = os.listdir(input_dir)
audio_files, text_files = [], []
for name in files:
    if name.endswith('.wav'):
        audio_files.append(name)
    elif name.endswith('.txt'):
        text_files.append(name)
assert len(audio_files) == len(text_files)

# Order both lists by numeric prefix so that position i in one list lines up
# with position i in the other, then prepend the input directory.
audio_files = [os.path.join(input_dir, n) for n in sorted(audio_files, key=_numeric_prefix)]
text_files = [os.path.join(input_dir, n) for n in sorted(text_files, key=_numeric_prefix)]
print(f"Found {len(audio_files)} audio files and {len(text_files)} text files.")

### Create the textgrid files that will be populated during the alignment process.

In [24]:
# For every (audio, transcript) pair, write a TextGrid into the input directory
# that contains one 'sentences' tier with a single interval covering the whole
# recording and carrying the normalized transcript text.
for idx, (audio_file_path, text_file_path) in enumerate(zip(audio_files, text_files)):

    # Compare the numeric prefixes of the *file names* only. Splitting the full
    # path on '_' would always yield the same leading segment for both files,
    # because the directory names themselves contain underscores.
    audio_prefix = os.path.basename(audio_file_path).split('_')[0]
    text_prefix = os.path.basename(text_file_path).split('_')[0]
    assert int(audio_prefix) == int(text_prefix), (
        f"Mismatched pair: {audio_file_path} vs. {text_file_path}"
    )

    # Read the transcript with an explicit encoding so the decoded text does not
    # depend on the platform's default locale.
    with open(text_file_path, 'r', encoding='utf-8') as f:
        graphemes = f.read().strip()
    # NOTE(review): normalize_sentence is a project helper; its exact rules are
    # not visible from this notebook.
    graphemes = asr.normalize_sentence(graphemes)

    fs, audio = wavfile.read(audio_file_path)
    assert audio.ndim == 1, "Audio must be mono."

    # Duration of the recording in seconds (number of samples / sample rate).
    xmax_s = len(audio) / fs

    # Create a textgrid with a single interval spanning the full recording.
    tg = textgrids.TextGrid()
    interval = textgrids.Interval(text=graphemes, xmin=0.0, xmax=xmax_s)
    tg['sentences'] = textgrids.Tier(data=[interval], xmin=0.0, xmax=xmax_s)
    tg.xmax = xmax_s

    # Place the textgrid file in the input directory.
    # NOTE(review): the file is named after the loop index, not after the
    # files' own numeric prefix - confirm the two always coincide.
    textgrid_file_path = os.path.join(input_dir, f"{idx}_textgrid.TextGrid")
    tg.write(textgrid_file_path)

### Validate the sentences.

In [None]:
# Run `mfa validate` on the input directory with the pronunciation dictionary.
# The command is passed as an argument list (no shell), so paths containing
# spaces or shell metacharacters reach MFA unchanged.
command = ["mfa", "validate", "--debug", "--ignore_acoustics", input_dir, pronunc_dict_dir]
print(f"Running command: {' '.join(command)}")
out = subprocess.run(command, capture_output=True, text=True)
print(out.stderr)
# Surface failures explicitly instead of relying on the reader to spot them in the log.
if out.returncode != 0:
    print(f"mfa validate exited with return code {out.returncode}.")

### Phonemize missing words

In [None]:
missing_words_file_stump = 'oovs_found_german_mfa' # automatically generated by MFA

# Location of the out-of-vocabulary word list under the MFA root.
# NOTE(review): the 'input' folder name presumably mirrors the basename of
# input_dir - confirm if the corpus directory is ever renamed.
oov_output_file_path = os.path.join(mfa_root_dir, 'input', f"{missing_words_file_stump}.txt")

# Copy the word list into its own folder, which is then handed to `mfa g2p`.
oov_folder_dir = os.path.join(mfa_root_dir, 'oov_corpus')
os.makedirs(oov_folder_dir, exist_ok=True)
shutil.copy(oov_output_file_path, oov_folder_dir)

# File that receives the generated pronunciations.
g2p_output_path = os.path.join(
    mfa_root_dir, missing_words_file_stump, f"{missing_words_file_stump}_g2p.txt"
)

# Argument list instead of a shell string, so paths with spaces are passed intact.
command = ["mfa", "g2p", "--clean", oov_folder_dir, "german_mfa", g2p_output_path]
out = subprocess.run(command, capture_output=True, text=True)
# Show MFA's log; previously the result was captured but never inspected.
print(out.stderr)
if out.returncode != 0:
    print(f"mfa g2p exited with return code {out.returncode}.")

### Add the missing words to the dictionary.

In [None]:
# Pronunciation file written by the g2p step (same location that step uses).
g2p_output_path = os.path.join(
    mfa_root_dir, missing_words_file_stump, f"{missing_words_file_stump}_g2p.txt"
)

# Run `mfa model add_words` for the german_mfa dictionary with that file.
# Argument list instead of a shell string, so paths with spaces are passed intact.
command = ["mfa", "model", "add_words", "german_mfa", g2p_output_path]
out = subprocess.run(command, capture_output=True, text=True)
# Print the log rather than leaving a raw CompletedProcess repr as the cell output.
print(out.stderr)
if out.returncode != 0:
    print(f"mfa model add_words exited with return code {out.returncode}.")

### Run the aligner

In [26]:
# Make sure the results folder exists before handing it to the aligner.
os.makedirs(output_dir, exist_ok=True)

# Run `mfa align` on the input directory with the pronunciation dictionary and
# the acoustic model, writing the results to output_dir. The command is an
# argument list (no shell), so paths with spaces are passed intact.
command = [
    "mfa", "align", "--clean", "--debug", "--overwrite",
    input_dir, pronunc_dict_dir, acoustic_model_path, output_dir,
]
out = subprocess.run(command, capture_output=True, text=True)
print(out.stderr)
# Surface failures explicitly instead of relying on the reader to spot them in the log.
if out.returncode != 0:
    print(f"mfa align exited with return code {out.returncode}.")