The code for this training pipeline is derived from the ESPnet GitHub page: https://espnet.github.io/espnet/notebook/ESPnetEZ/TTS/TTS_finetune_vctk_dump.html

# Installing espnet, espnet model zoo and camel tools

Installing espnet and espnet model zoo

In [None]:
# Install the espnet and espnet_model_zoo packages from PyPI.
!pip install espnet espnet_model_zoo

Installing Camel Tools for converting the Buckwalter transcription to standard Arabic script

In [None]:
# Install camel-tools; --no-build-isolation tells pip to build against the
# packages already present in the environment instead of an isolated one.
!pip install camel-tools --no-build-isolation

# Downloading and Pre-Processing Dataset

Downloading and Extracting Arabic Speech Corpus Dataset

Link to website: https://en.arabicspeechcorpus.com/

Sometimes the download fails with an error when the dataset is requested from the URL several times in a row. The error resolves itself if you try again after some time.

In [None]:
import os
import zipfile
import urllib.request

# Remote location of the corpus archive, where it is saved locally, and the
# directory it is unpacked into.
extract_path = "/content"
zip_path = "/content/arabic-speech-corpus.zip"
url = "https://en.arabicspeechcorpus.com/arabic-speech-corpus.zip"

# Fetch the archive to local disk.
print("Downloading dataset...")
urllib.request.urlretrieve(url, filename=zip_path)
print("Download complete.")

# Unpack every member of the archive under extract_path.
print("Extracting files...")
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_path)

print("Extraction complete.")

# Last expression of the cell, so the notebook displays the directory listing.
os.listdir(extract_path)


Cloning Speaker Embeddings (X-vector) generated from the Dataset.

Link: https://github.com/Addalin-CP3445/speaker_embedding/tree/main

In [None]:
# Clone the repository that holds the speaker embeddings and the extraction script.
!git clone https://github.com/Addalin-CP3445/speaker_embedding.git
# Copy the extraction script and, recursively (-r), the train/test embedding
# directories out of the clone into the current working directory.
%cp speaker_embedding/extract_spk_embedding.py extract_spk_embedding.py
%cp speaker_embedding/train_speaker_embeddings -r train_speaker_embeddings
%cp speaker_embedding/test_speaker_embeddings -r test_speaker_embeddings

Renaming Audio files for Training and Test sets

In [None]:
import os

# Directory holding the test-set recordings and the filename prefix to strip.
wav_directory = '/content/arabic-speech-corpus/test set/wav'  # <-- update this path
prefix = "ARA NORM  "

# Pick out the files that still carry the prefix, then rename each one to the
# remainder of its name. Files without the prefix are left untouched.
prefixed_names = [name for name in os.listdir(wav_directory) if name.startswith(prefix)]
for filename in prefixed_names:
    old_filepath = os.path.join(wav_directory, filename)
    new_filepath = os.path.join(wav_directory, filename[len(prefix):])
    print(f"Renaming: {old_filepath} -> {new_filepath}")
    os.rename(old_filepath, new_filepath)

print("Renaming testing audio files complete!")


In [None]:
import os

# Directory holding the training recordings and the filename prefix to strip.
wav_directory = '/content/arabic-speech-corpus/wav'  # <-- update this path
prefix = "ARA NORM  "
prefix_length = len(prefix)

# Walk the directory listing once; anything not starting with the prefix is skipped.
for filename in os.listdir(wav_directory):
    if not filename.startswith(prefix):
        continue
    old_filepath = os.path.join(wav_directory, filename)
    new_filepath = os.path.join(wav_directory, filename[prefix_length:])
    print(f"Renaming: {old_filepath} -> {new_filepath}")
    os.rename(old_filepath, new_filepath)

print("Renaming training audio files complete!")


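Creating Kaldi-style files: wav.scp, text, and utt2spk. The Buckwalter-to-standard-Arabic conversion is set up but currently commented out, so the phonetic transcription is written to the text file as-is.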

In [None]:
import os
from camel_tools.utils.transliterate import Transliterator
from camel_tools.utils.charmap import CharMapper

# Initialize the Buckwalter transliterator. Only the disabled orthographic
# conversion inside the loop below refers to it.
bw2ar = CharMapper.builtin_mapper("bw2ar")
bt = Transliterator(bw2ar)

# Set your dataset paths (update these paths as needed)
dataset_path = '/content/arabic-speech-corpus'  # Replace with your dataset root directory
wav_dir = os.path.join(dataset_path, 'wav')
# NOTE(review): 'transcipt' is spelled differently from the test-set file name
# used elsewhere in this notebook; presumably it mirrors the file name inside
# the corpus archive -- confirm against the extracted files before "correcting" it.
transcript_file = os.path.join(dataset_path, 'phonetic-transcipt.txt')

# Output directory for the Kaldi-style training files
kaldi_data_dir = '/content/arabic-speech-corpus/kaldi_data/train'  # Update this as needed
os.makedirs(kaldi_data_dir, exist_ok=True)

# Set a default speaker ID (adjust if you have multiple speakers)
default_spk = "arabic"

# All four files are managed by a single `with` statement so every handle is
# closed (and buffered output flushed) even if processing a line raises.
with open(os.path.join(kaldi_data_dir, 'wav.scp'), 'w', encoding='utf-8') as wav_scp, \
        open(os.path.join(kaldi_data_dir, 'text'), 'w', encoding='utf-8') as text_f, \
        open(os.path.join(kaldi_data_dir, 'utt2spk'), 'w', encoding='utf-8') as utt2spk, \
        open(transcript_file, 'r', encoding='utf-8') as f:
    for line in f:
        # Remove extra quotes and split the line into fields
        # Expected format: "ARA NORM  0002.wav" "buckwalter transcription"
        parts = line.strip().split('" "')
        if len(parts) < 2:
            continue  # not a two-field transcript line
        # Clean up the fields (remove any remaining quotes)
        utt_field = parts[0].replace('"', '').strip()
        buckwalter_transcription = parts[1].replace('"', '').strip()

        # Extract the filename from utt_field. Example: "ARA NORM  0002.wav"
        utt_filename = utt_field.split()[-1]
        # Remove the file extension to create an utterance ID (e.g., "0002")
        utt_id = os.path.splitext(utt_filename)[0]

        #Has been commented out to test with phonetic transcription

        # Convert the Buckwalter transcription to standard Arabic script
        # arabic_transcription = bt.transliterate(buckwalter_transcription)

        # Write to wav.scp (assumes the wav files are in the wav/ directory)
        wav_path = os.path.join(wav_dir, utt_filename)
        wav_scp.write(f"{utt_id} {wav_path}\n")

        # Write the transcription exactly as read (the Arabic-script
        # conversion above is disabled)
        text_f.write(f"{utt_id} {buckwalter_transcription}\n")

        # Write to utt2spk (assign default speaker)
        utt2spk.write(f"{utt_id} {default_spk}\n")

print("Kaldi-style training data files have been created in:", kaldi_data_dir)


In [None]:
# Set your dataset paths (update these paths as needed)
dataset_path = '/content/arabic-speech-corpus/test set'  # Replace with your dataset root directory
wav_dir = os.path.join(dataset_path, 'wav')
transcript_file = os.path.join(dataset_path, 'phonetic-transcript.txt')

# Output directory for the Kaldi-style test files
kaldi_data_dir = '/content/arabic-speech-corpus/kaldi_data/test'  # Update this as needed
os.makedirs(kaldi_data_dir, exist_ok=True)

# Set a default speaker ID (adjust if you have multiple speakers)
default_spk = "arabic"

# All four files are managed by a single `with` statement so every handle is
# closed (and buffered output flushed) even if processing a line raises.
with open(os.path.join(kaldi_data_dir, 'wav.scp'), 'w', encoding='utf-8') as wav_scp, \
        open(os.path.join(kaldi_data_dir, 'text'), 'w', encoding='utf-8') as text_f, \
        open(os.path.join(kaldi_data_dir, 'utt2spk'), 'w', encoding='utf-8') as utt2spk, \
        open(transcript_file, 'r', encoding='utf-8') as f:
    for line in f:
        # Remove extra quotes and split the line into fields
        # Expected format: "ARA NORM  0002.wav" "buckwalter transcription"
        parts = line.strip().split('" "')
        if len(parts) < 2:
            continue  # not a two-field transcript line
        # Clean up the fields (remove any remaining quotes)
        utt_field = parts[0].replace('"', '').strip()
        buckwalter_transcription = parts[1].replace('"', '').strip()

        # Extract the filename from utt_field. Example: "ARA NORM  0002.wav"
        utt_filename = utt_field.split()[-1]
        # Remove the file extension to create an utterance ID (e.g., "0002")
        utt_id = os.path.splitext(utt_filename)[0]

        #Has been commented out to test with phonetic transcription

        # Convert the Buckwalter transcription to standard Arabic script
        # arabic_transcription = bt.transliterate(buckwalter_transcription)

        # Write to wav.scp (assumes the wav files are in the wav/ directory)
        wav_path = os.path.join(wav_dir, utt_filename)
        wav_scp.write(f"{utt_id} {wav_path}\n")

        # Write the transcription exactly as read (the Arabic-script
        # conversion above is disabled)
        text_f.write(f"{utt_id} {buckwalter_transcription}\n")

        # Write to utt2spk (assign default speaker)
        utt2spk.write(f"{utt_id} {default_spk}\n")

print("Kaldi-style testing data files have been created in:", kaldi_data_dir)

In [None]:
import os

# Input utt2spk mapping and the spk2utt file derived from it.
utt2spk_path = '/content/arabic-speech-corpus/kaldi_data/train/utt2spk'  # Update this path
spk2utt_path = '/content/arabic-speech-corpus/kaldi_data/train/spk2utt'    # Update this path

# speaker id -> utterance ids, in the order they are read from utt2spk
speaker_dict = {}

# Group utterances by speaker; lines that are not exactly "<utt> <spk>" are ignored.
with open(utt2spk_path, 'r', encoding='utf-8') as f:
    for line in f:
        fields = line.strip().split()
        if len(fields) == 2:
            utt, spk = fields
            speaker_dict.setdefault(spk, []).append(utt)

# One output line per speaker: the speaker id followed by all of its utterance ids.
with open(spk2utt_path, 'w', encoding='utf-8') as f:
    f.writelines(
        f"{spk} {' '.join(utt_list)}\n" for spk, utt_list in speaker_dict.items()
    )

print(f"train spk2utt file has been created at: {spk2utt_path}")


In [None]:
import os

# Input utt2spk mapping and the spk2utt file derived from it.
utt2spk_path = '/content/arabic-speech-corpus/kaldi_data/test/utt2spk'  # Update this path
spk2utt_path = '/content/arabic-speech-corpus/kaldi_data/test/spk2utt'    # Update this path

# speaker id -> utterance ids, in the order they are read from utt2spk
speaker_dict = {}

# Group utterances by speaker; lines that are not exactly "<utt> <spk>" are ignored.
with open(utt2spk_path, 'r', encoding='utf-8') as f:
    for line in f:
        fields = line.strip().split()
        if len(fields) == 2:
            utt, spk = fields
            speaker_dict.setdefault(spk, []).append(utt)

# One output line per speaker: the speaker id followed by all of its utterance ids.
with open(spk2utt_path, 'w', encoding='utf-8') as f:
    f.writelines(
        f"{spk} {' '.join(utt_list)}\n" for spk, utt_list in speaker_dict.items()
    )

print(f"test spk2utt file has been created at: {spk2utt_path}")


Creating Token list

In [None]:
# Training data directory; the token list is derived from its `text` file and
# written next to it.
kaldi_data_dir = "/content/arabic-speech-corpus/kaldi_data/train"
text_path = os.path.join(kaldi_data_dir, 'text')
token_list_path = os.path.join(kaldi_data_dir, 'token_list.txt')

# Collect every distinct character that occurs in a transcript. The first
# whitespace-separated field of each line (the utterance id) is not counted.
token_set = set()
with open(text_path, 'r', encoding='utf-8') as f:
    for line in f:
        fields = line.strip().split(maxsplit=1)
        if len(fields) != 2:
            continue  # id only, or empty line: no transcript to scan
        token_set.update(fields[1])

# Write token list, one token per line in sorted order
with open(token_list_path, 'w', encoding='utf-8') as f:
    f.writelines(token + "\n" for token in sorted(token_set))
print("Token list created at:", token_list_path)


NumPy version 1.26.4 needs to be installed because Trainer.train() requires `np.dtypes`. To check whether another NumPy version is suitable, use the code below.



```
import numpy as np
print(np.__version__)
print(hasattr(np, "dtypes"))
```



# Re-Installing Libraries and Model Weights for Training

In [None]:
# Install exactly NumPy 1.26.4.
!pip install numpy==1.26.4

Downloading the model checkpoint and training configuration file from HuggingFace

In [None]:
from espnet_model_zoo.downloader import ModelDownloader
# Identifier of the pretrained model to fetch.
# model_id = "espnet/kan-bayashi_libritts_xvector_vits" #originally used to train VITS model
model_id = "kan-bayashi/ljspeech_tacotron2"

d = ModelDownloader()  # <module_dir> is used as cachedir by default

# Download and unpack the model; the result is indexed by "train_config" and
# "model_file" in later cells.
model_dir = d.download_and_unpack(model_id)
print(f"Model '{model_id}' downloaded and unpacked at: {model_dir}")

Filling the variables DUMP_DIR and data_info for training configuration

In [None]:
# Root of the Kaldi-style Arabic data written by the earlier cells
# (it holds the train/ and test/ sub-directories).
arabic_data_dir = "/content/arabic-speech-corpus/kaldi_data"
DUMP_DIR = arabic_data_dir

# For every input name: [file name inside the dump directory, data type].
data_info = dict(
    speech=["wav.scp", "sound"],
    text=["text", "text"],
)

Installing Protobuf version 3.20.1, since trainer.collect_stats() requires it. The pip dependency error for Google packages can be ignored, as it does not affect the training.

In [None]:
# Install exactly protobuf 3.20.1.
!pip install protobuf==3.20.1

Importing espnetez

In [None]:
import espnetez as ez

Logging into Wandb for metric gathering. It will ask for API key from Wandb to store and display the metrics

In [None]:
# Log in to Weights & Biases; --relogin forces a fresh login even if
# credentials are already stored.
!wandb login --relogin

# Configuring for Training

Configuring the training config downloaded from HuggingFace. Depending on the task, finetune_config["tts"] accepts only certain models. The supported models can be found by browsing the files at the link below. If an unsupported model is set, the error is shown when Trainer.train() is called.

Link: https://github.com/espnet/espnet/tree/master/espnet2/tasks

In [None]:
TASK = "tts" #Depending on the model the task changes Eg: VITS works only with gan_tts task

# Load the training configuration shipped with the downloaded model and record
# the path of its checkpoint file in it.
pretrain_config = ez.config.from_yaml(TASK, model_dir["train_config"])
pretrain_config["model_file"] = model_dir["model_file"]

# Fine-tuning overrides are applied to a copy of the pretraining configuration.
finetune_config = pretrain_config.copy()
finetune_config.update({
    "tts": "tacotron2",  # Models that comply with the task
    "batch_size": 1,
    "num_workers": 1,
    "max_epoch": 100,
    "batch_bins": 500000,
    "num_iters_per_epoch": 2,
    "generator_first": True,
    "use_wandb": True,
    "wandb_project": "ESPnet Training",
    "wandb_name": "ESPnet Tacatron2 run 100 epochs",
})

# Disable distributed training: the two switches are turned off and every
# remaining distributed/pretrain setting is cleared to None.
finetune_config["distributed"] = False
finetune_config["multiprocessing_distributed"] = False
for cleared_key in (
    "dist_world_size",
    "dist_rank",
    "local_rank",
    "dist_master_addr",
    "dist_master_port",
    "dist_launcher",
    "pretrain_path",
):
    finetune_config[cleared_key] = None

The contents of the configuration are dumped as YAML so they can be verified

In [None]:
import yaml

# Render the configuration as YAML, keeping the keys in their current order,
# and show it under a heading.
config_yaml = yaml.dump(finetune_config, sort_keys=False)
print("Fine-tuning configuration:")
print(config_yaml)

Defining the experiment and stats directories, and initializing ez.Trainer

In [None]:
DATASET_NAME = "asc"
ngpu = 1
EXP_DIR = f"./exp/finetune_{TASK}_{DATASET_NAME}" ## output directory containing the trained model weights and config.yaml file
STATS_DIR = f"./exp/stats_{DATASET_NAME}"

# Build the trainer: training data comes from <DUMP_DIR>/train, validation
# data from <DUMP_DIR>/test.
trainer_kwargs = dict(
    task=TASK,
    train_config=finetune_config,
    train_dump_dir=f"{DUMP_DIR}/train",
    valid_dump_dir=f"{DUMP_DIR}/test",
    data_info=data_info,
    output_dir=EXP_DIR,
    stats_dir=STATS_DIR,
    ngpu=ngpu,
)
trainer = ez.Trainer(**trainer_kwargs)

# Add the xvector paths to the configuration: one extra [path, name, type]
# entry for the training data and one for the validation data.
train_xvector_entry = ["/content/train_speaker_embeddings/train_spk_embed.scp", "spembs", "kaldi_ark"]
valid_xvector_entry = ["/content/test_speaker_embeddings/test_spk_embed.scp", "spembs", "kaldi_ark"]
trainer.train_config.train_data_path_and_name_and_type += [train_xvector_entry]
trainer.train_config.valid_data_path_and_name_and_type += [valid_xvector_entry]

Downloading the NLTK POS tagger. Even though it is downloaded while importing espnetez, an error is thrown saying that averaged_perceptron_tagger_eng is missing

In [None]:
import nltk
# Download the 'averaged_perceptron_tagger_eng' NLTK resource.
nltk.download('averaged_perceptron_tagger_eng')

Collecting stats from the dataset

In [None]:
# Each normalizer setting paired with the stats file name that is wired in for
# it under <STATS_DIR>/train once statistics have been collected.
normalizer_stats = (
    ("normalize", "feats_stats.npz"),
    ("pitch_normalize", "pitch_stats.npz"),
    ("energy_normalize", "energy_stats.npz"),
)

# Temporarily disable normalization to collect stats
for setting, _ in normalizer_stats:
    setattr(trainer.train_config, setting, None)

# Collect statistics from the training dump
trainer.collect_stats()

# After collecting stats, restore every normalizer that finetune_config
# defines and point its <setting>_conf at the matching stats file.
trainer.train_config.write_collected_feats = False
for setting, stats_name in normalizer_stats:
    configured = finetune_config.get(setting)
    if configured is None:
        continue
    setattr(trainer.train_config, setting, configured)
    getattr(trainer.train_config, f"{setting}_conf")["stats_file"] = f"{STATS_DIR}/train/{stats_name}"


# Training

Training/fine-tuning the model. The output of this training is written to the exp folder, which will be needed for inference.

In [None]:
# Start training with the trainer configured in the earlier cells.
trainer.train()

# Inferencing

Running inference with the fine-tuned model weights. Some models only require the config.yaml file, while others will throw an error if the exp folder is missing.

In [None]:
from espnet2.bin.tts_inference import Text2Speech
import kaldiio  # This is commonly used to read Kaldi-style scp files
import soundfile as sf  # supplies sf.write, used to save the synthesized audio

# The scp file is just a mapping file - you need to get an actual embedding
# First, load the mapping
spk_dict = kaldiio.load_scp("/content/train_speaker_embeddings/train_spk_embed.scp")

# Get the first speaker embedding
spk_id = list(spk_dict.keys())[0]  # Get the first speaker ID
spembs = spk_dict[spk_id]  # Get the embedding for that speaker

# with local model
tts = Text2Speech.from_pretrained(model_file="/content/67epoch.pth")
wav = tts("sil w a r a' jj A H a tt A q r ii0' r u0 ll a * i0 < a E a' dd a h u0 m a' E h a d u0 < a b H aa' ^ i0 h A D A' b a t i0 tt i1' b t i0 f i0 l < a k aa d ii0 m ii0' y a t i0 SS II0 n ii0' y a t i0 l u0 l E u0 l uu0' m i0 sil < a' n t a s t a m i0' rr a d a r a j aa' t u0 l H a r aa' r a t i0 w a m u0 s t a w a y aa' t u0 rr U0 T UU0' b a t i0 f i0 l Ah i0 r t i0 f aa' E i0 T A' w A l a h aa' * a l q A' r n sil",spembs=spembs)["wav"]

# Save the waveform only now that both `tts` (for the sampling rate) and `wav`
# exist; writing before synthesis would raise NameError.
sf.write("output.wav", wav.numpy(), tts.fs, "PCM_16")