In [5]:
import os
import textgrids
import subprocess
import shutil
from scipy.io import wavfile
from src import asr

### Define relevant paths and file locations.

In [None]:
# Root folder under which this notebook expects the MFA models and corpora.
# Resolved from the MFA_ROOT_DIR environment variable when set, otherwise from
# ~/Documents/MFA, so the notebook is not tied to one user's machine.
mfa_root_dir = os.environ.get(
    'MFA_ROOT_DIR',
    os.path.join(os.path.expanduser('~'), 'Documents', 'MFA'),
)
dataset_dir = 'test_dataset'

# The corpus is expected under <mfa_root_dir>/input_files/<dataset_dir>.
os.makedirs(os.path.join(mfa_root_dir, 'input_files'), exist_ok=True)
input_dir = os.path.join(mfa_root_dir, 'input_files', dataset_dir)
# NOTE(review): output_dir is nested inside input_dir, so results written by one
# run sit inside the corpus folder on the next run -- confirm this is intended.
output_dir = os.path.join(mfa_root_dir, 'input_files', dataset_dir, 'results')
pronunc_dict_dir = os.path.join(mfa_root_dir, 'pretrained_models', 'dictionary', 'german_mfa.dict')
g2p_model_path = os.path.join(mfa_root_dir, 'pretrained_models', 'g2p', 'german_mfa.zip')
acoustic_model_path = os.path.join(mfa_root_dir, 'pretrained_models', 'acoustic', 'german_mfa.zip')

# Fail with an explicit message when the dataset has not been copied in yet.
if not os.path.isdir(input_dir):
    raise FileNotFoundError(f"Dataset directory not found: {input_dir}")

# Sort the listing so files are processed in the same order on every run.
audio_files = sorted(os.listdir(input_dir))
# Files are named '<prefix>_audio.wav'; keep only the part before the first '_'.
audio_file_prefix = [f.split('_')[0] for f in audio_files if f.endswith('.wav')]
print(f"Found {len(audio_file_prefix)} audio files in input directory")

### Create the textgrid files that will be populated during the alignment process.

In [7]:
# For every recording, build a TextGrid with a single 'sentences' tier whose one
# interval spans the whole audio file and carries the normalized transcript.
for idx in audio_file_prefix:
    text_file_path = os.path.join(input_dir, f"{idx}_graphemes.txt")
    audio_file_path = os.path.join(input_dir, f"{idx}_audio.wav")

    # Decode explicitly as UTF-8: the platform default (cp1252 on Windows) would
    # mis-decode non-ASCII characters such as umlauts in a UTF-8 transcript.
    with open(text_file_path, 'r', encoding='utf-8') as f:
        graphemes = asr.normalize_sentence(f.read().strip())

    fs, audio = wavfile.read(audio_file_path)
    assert audio.ndim == 1, "Audio must be mono."

    # Duration in seconds = number of samples / sampling rate.
    xmax_s = len(audio) / fs

    # Create a textgrid with one interval covering [0, duration].
    interval = textgrids.Interval(text=graphemes, xmin=0.0, xmax=xmax_s)
    tier = textgrids.Tier(data=[interval], xmin=0.0, xmax=xmax_s)
    tg = textgrids.TextGrid()
    tg['sentences'] = tier
    tg.xmax = xmax_s

    # Write the TextGrid next to the audio using the SAME file stem
    # ('<idx>_audio.TextGrid'): MFA pairs a sound file with its transcription
    # by identical base name, so '<idx>_textgrid.TextGrid' would not be matched
    # to '<idx>_audio.wav'.
    textgrid_file_path = os.path.splitext(audio_file_path)[0] + ".TextGrid"
    tg.write(textgrid_file_path)

### Validate the sentences.

In [None]:
# Run `mfa validate` on the corpus folder with the pronunciation dictionary and
# show what the tool wrote to stderr.
validate_args = [
    "mfa", "validate", "--debug", "--ignore_acoustics",
    input_dir, pronunc_dict_dir,
]
command = " ".join(validate_args)
print("Running command: " + command)
out = subprocess.run(
    command,
    shell=True,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    text=True,
)
print(out.stderr)

### Phonemize missing words

In [None]:
missing_words_file_stump = 'oovs_found_german_mfa' # automatically generated by MFA

# NOTE(review): the OOV list is read from <mfa_root_dir>/input, but no other path
# in this notebook uses a folder named 'input' -- confirm where the validation
# step actually writes this file.
oov_output_file_path = os.path.join(mfa_root_dir, 'input', f"{missing_words_file_stump}.txt")

# Copy the OOV word list into its own folder, which is passed to `mfa g2p`.
oov_folder_dir = os.path.join(mfa_root_dir, 'oov_corpus')
os.makedirs(oov_folder_dir, exist_ok=True)
shutil.copy(oov_output_file_path, oov_folder_dir)

# Destination of the generated pronunciations. Its parent folder is created
# here because nothing else in the notebook creates it.
g2p_output_path = os.path.join(mfa_root_dir, missing_words_file_stump, missing_words_file_stump) + "_g2p.txt"
os.makedirs(os.path.dirname(g2p_output_path), exist_ok=True)

# Paths are double-quoted so the shell keeps them intact if they contain spaces.
command = f'mfa g2p --clean "{oov_folder_dir}" german_mfa "{g2p_output_path}"'
out = subprocess.run(command, shell=True, capture_output=True, text=True)

# Surface the tool's diagnostics and stop on failure rather than continuing
# with a pronunciation file that was never written.
print(out.stderr)
out.check_returncode()

### Add the missing words to the dictionary.

In [None]:
# Pass the G2P output file to `mfa model add_words` for the 'german_mfa' dictionary.
new_pronunciations_path = (
    os.path.join(mfa_root_dir, missing_words_file_stump, missing_words_file_stump)
    + "_g2p.txt"
)
command = " ".join(["mfa", "model", "add_words", "german_mfa", new_pronunciations_path])
# Left as the cell's last expression so the CompletedProcess is displayed.
subprocess.run(
    command,
    shell=True,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    text=True,
)

### Run the aligner

In [8]:
# Ensure the output folder handed to `mfa align` exists before invoking it.
os.makedirs(output_dir, exist_ok=True)
align_args = [
    "mfa", "align", "--clean", "--debug", "--overwrite",
    input_dir, pronunc_dict_dir, acoustic_model_path, output_dir,
]
command = " ".join(align_args)
out = subprocess.run(
    command,
    shell=True,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    text=True,
)
# TODO: FIX ERROR. REINSTALL ENVIRONMENT

In [None]:
# Display the stderr text captured from the most recent subprocess run.
captured_stderr = out.stderr
print(captured_stderr)