# **Fine-tuning Text-To-Speech model**
Please refer to the respective sections in the book for further details.


## **Step 1. Installing Libraries and Data loading**

In [None]:
!pip install transformers datasets soundfile speechbrain accelerate



In [None]:
from datasets import load_dataset, Audio
from transformers import SpeechT5Processor,SpeechT5ForTextToSpeech, Seq2SeqTrainingArguments, SpeechT5HifiGan, Seq2SeqTrainer
import os
import torch
from speechbrain.pretrained import EncoderClassifier
from dataclasses import dataclass
from typing import Any, Dict, List, Union
from functools import partial
from collections import defaultdict

In [None]:
# Log in to the Hugging Face Hub from inside the notebook; later cells push the
# fine-tuned model to the Hub (push_to_hub=True in the training arguments).
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Load the "train" split of the "cy" configuration of Common Voice 13.0.
welsh_voice_train_dataset = load_dataset("mozilla-foundation/common_voice_13_0", "cy", split="train")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


## **Step 2. Data pre-processing**

In [None]:
# Cast the "audio" column to Audio(sampling_rate=16000); 16 kHz is also the rate
# used when the generated speech is played back in the last cell.
welsh_voice_train_dataset = welsh_voice_train_dataset.cast_column("audio", Audio(sampling_rate=16000))

In [None]:
# Load the processor published with the SpeechT5 TTS checkpoint and keep a
# direct handle on its tokenizer.
model_checkpoint = "microsoft/speecht5_tts"
tts_processor = SpeechT5Processor.from_pretrained(model_checkpoint)
tts_tokenizer = tts_processor.tokenizer

preprocessor_config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/232 [00:00<?, ?B/s]

spm_char.model:   0%|          | 0.00/238k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

In [None]:
def compile_dataset_vocabulary(batch):
    """Collect the distinct characters used by a batch of sentences.

    Args:
        batch: Mapping with a "sentence" entry holding a list of strings.

    Returns:
        A dict with two single-element lists: "unique_vocab" wraps the list of
        distinct characters found in the space-joined sentences (in set
        iteration order), and "concatenated_sentences" wraps the joined text.
    """
    joined_text = " ".join(batch["sentence"])
    return {
        "unique_vocab": [list(set(joined_text))],
        "concatenated_sentences": [joined_text],
    }

# Build the character vocabulary of the Welsh training split.
# The original referenced an undefined name `dataset`; the split loaded earlier
# is `welsh_voice_train_dataset`.
# batched=True with batch_size=-1 is meant to hand the whole split to the
# function as a single batch, so the result has one row holding the full vocabulary.
extracted_vocab = welsh_voice_train_dataset.map(
    compile_dataset_vocabulary,
    batched=True,
    batch_size=-1,
    keep_in_memory=True,
    remove_columns=welsh_voice_train_dataset.column_names,
)

Map:   0%|          | 0/7810 [00:00<?, ? examples/s]

In [None]:
# Characters that occur in the dataset but have no entry in the tokenizer vocabulary.
complete_dataset_vocab = set(extracted_vocab["unique_vocab"][0])
tts_processor_vocab = set(tts_processor.tokenizer.get_vocab().keys())
missing_vocab_in_processor = complete_dataset_vocab.difference(tts_processor_vocab)
missing_vocab_in_processor

In [None]:
welsh_to_english_replacements = {
    ' ': ' ',
    '¬': '',
    'Â': 'A',
    'Ô': 'O',
    'à': 'a',
    'á': 'a',
    'â': 'a',
    'ä': 'a',
    'ë': 'e',
    'î': 'i',
    'ï': 'i',
    'ò': 'o',
    'ô': 'o',
    'ö': 'o',
    'û': 'u',
    'Ŵ': 'W',
    'ŵ': 'w',
    'ŷ': 'y',
    '–': '-',
    '‘': "'",
    '“': '"',
    '”': '"'
}

# Ordered (source, destination) pairs taken from the replacement table.
missing_vocab_replacements = list(welsh_to_english_replacements.items())

In [None]:
def normalize_sentence_characters(sentence_mapping):
    """Rewrite the "sentence" field using the replacement pairs, in order.

    Args:
        sentence_mapping: Mapping with a string under "sentence"; it is
            modified in place.

    Returns:
        The same mapping object, with each source character in
        `missing_vocab_replacements` replaced by its destination.
    """
    for source_char, target_char in missing_vocab_replacements:
        current_text = sentence_mapping["sentence"]
        sentence_mapping["sentence"] = current_text.replace(source_char, target_char)
    return sentence_mapping

# Apply the character replacements to the "sentence" field of every row.
welsh_voice_train_dataset = welsh_voice_train_dataset.map(normalize_sentence_characters)

In [None]:
# Tally how many rows each client_id contributes.
speaker_frequency = defaultdict(int)
for client_identifier in welsh_voice_train_dataset["client_id"]:
    speaker_frequency[client_identifier] = speaker_frequency[client_identifier] + 1


def is_speaker_within_range(client_id):
    """Return True when the speaker has between 100 and 400 rows, inclusive."""
    clip_count = speaker_frequency[client_id]
    return clip_count >= 100 and clip_count <= 400


# Keep only the rows whose speaker falls inside that range.
dataset_with_selected_speakers = welsh_voice_train_dataset.filter(
    is_speaker_within_range,
    input_columns=["client_id"],
)

In [None]:
# Load the pretrained SpeechBrain x-vector speaker encoder, on GPU when one is
# available and on CPU otherwise.
speaker_recognition_model_name = "speechbrain/spkrec-xvect-voxceleb"
computation_device = "cuda" if torch.cuda.is_available() else "cpu"
speaker_recognition_model = EncoderClassifier.from_hparams(
    source=speaker_recognition_model_name,
    run_opts={"device": computation_device, "timeout": 30},
    # Downloaded files are stored under /tmp/<model name>.
    savedir=os.path.join("/tmp", speaker_recognition_model_name),
)

def generate_speaker_embedding(audio_waveform):
    """Compute a normalized speaker embedding for one waveform.

    Args:
        audio_waveform: Audio samples accepted by `torch.tensor`.

    Returns:
        A NumPy array: the encoder output, L2-normalized along dim 2, with
        singleton dimensions squeezed out and moved to CPU.
    """
    waveform_tensor = torch.tensor(audio_waveform)
    # Inference only, so no gradients are tracked.
    with torch.no_grad():
        raw_embeddings = speaker_recognition_model.encode_batch(waveform_tensor)
        unit_embeddings = torch.nn.functional.normalize(raw_embeddings, dim=2)
    return unit_embeddings.squeeze().cpu().numpy()

hyperparams.yaml:   0%|          | 0.00/2.04k [00:00<?, ?B/s]

embedding_model.ckpt:   0%|          | 0.00/16.9M [00:00<?, ?B/s]

mean_var_norm_emb.ckpt:   0%|          | 0.00/3.20k [00:00<?, ?B/s]

classifier.ckpt:   0%|          | 0.00/15.9M [00:00<?, ?B/s]

label_encoder.txt:   0%|          | 0.00/129k [00:00<?, ?B/s]

In [None]:
def process_audio_data(sample):
    """Turn one dataset row into model inputs, target labels and a speaker embedding.

    Args:
        sample: Row with a "sentence" string and an "audio" mapping that
            provides "array" and "sampling_rate".

    Returns:
        The processor output with "labels" reduced to its first entry and a
        "speaker_embeddings" entry computed from the same waveform.
    """
    clip = sample["audio"]
    waveform = clip["array"]

    features = tts_processor(
        text=sample["sentence"],
        audio_target=waveform,
        sampling_rate=clip["sampling_rate"],
        return_attention_mask=False,
    )

    # Keep only the first entry of the labels (presumably a batch axis of size
    # one for a single clip - TODO confirm against the processor output).
    features["labels"] = features["labels"][0]
    features["speaker_embeddings"] = generate_speaker_embedding(waveform)
    return features


# Featurize every remaining row and drop all original columns so that only the
# fields returned by `process_audio_data` are kept.
dataset_with_selected_speakers = dataset_with_selected_speakers.map(
    process_audio_data,
    remove_columns=dataset_with_selected_speakers.column_names,
)

## **Step 3. Model training (fine-tuning)**

In [None]:
# Hold out 20% of the rows for evaluation. The seed is fixed so the split, and
# therefore the test row picked by index for inference later, is the same on
# every run.
dataset_with_selected_speakers = dataset_with_selected_speakers.train_test_split(test_size=0.2, seed=42)

In [None]:
@dataclass
class TTSDataCollatorWithSpeakerEmbedding:
    """Collate featurized TTS samples into one padded batch.

    Each sample must provide "input_ids", "labels" and "speaker_embeddings".
    Both fields are optional so the collator can still be built with no
    arguments; unset fields are resolved from the notebook globals when the
    collator is called.
    """

    # Processor whose `pad` method batches the text ids and the audio labels.
    # None falls back to the notebook-level `tts_processor`.
    tts_processor: Any = None
    # Reduction factor the label length is aligned to. None reads
    # `tts_model.config.reduction_factor` at call time (the original read an
    # undefined `model`).
    reduction_factor: Union[int, None] = None

    def __call__(
        self, samples: List[Dict[str, Union[List[int], torch.Tensor]]]
    ) -> Dict[str, torch.Tensor]:
        """Pad the samples, mask padded label frames and attach speaker embeddings.

        Args:
            samples: Featurized rows to batch together.

        Returns:
            The padded batch with "labels" masked with -100 on padded frames,
            "decoder_attention_mask" removed, and "speaker_embeddings" added.
        """
        processor = self.tts_processor if self.tts_processor is not None else tts_processor
        reduction_factor = (
            self.reduction_factor
            if self.reduction_factor is not None
            else tts_model.config.reduction_factor
        )

        text_input_ids = [{"input_ids": sample["input_ids"]} for sample in samples]
        audio_labels = [{"input_values": sample["labels"]} for sample in samples]
        speaker_embeddings_list = [sample["speaker_embeddings"] for sample in samples]

        batched_data = processor.pad(
            input_ids=text_input_ids, labels=audio_labels, return_tensors="pt"
        )

        # Overwrite padded label frames with -100 wherever the decoder mask is not 1.
        padding_mask = batched_data["decoder_attention_mask"].unsqueeze(-1).ne(1)
        batched_data["labels"] = batched_data["labels"].masked_fill(padding_mask, -100)

        # The mask is only needed for the fill above; it is not passed on.
        del batched_data["decoder_attention_mask"]

        if reduction_factor > 1:
            # Round every target length down to a multiple of the reduction
            # factor and trim the labels to the longest rounded length.
            target_lengths = torch.tensor(
                [len(sample["input_values"]) for sample in audio_labels]
            )
            adjusted_lengths = target_lengths - target_lengths % reduction_factor
            max_length = int(adjusted_lengths.max())
            batched_data["labels"] = batched_data["labels"][:, :max_length]

        batched_data["speaker_embeddings"] = torch.tensor(speaker_embeddings_list)
        return batched_data

In [None]:
# Build the batch collator and load the pretrained SpeechT5 checkpoint to fine-tune.
TTSdata_collator = TTSDataCollatorWithSpeakerEmbedding()
tts_model = SpeechT5ForTextToSpeech.from_pretrained(model_checkpoint)
# Turn the cache off on the model config (presumably because training enables
# gradient checkpointing - confirm against the transformers docs) ...
tts_model.config.use_cache = False
# ... but keep it on for generation by pre-binding use_cache=True on generate().
tts_model.generate = partial(tts_model.generate, use_cache=True)

In [None]:
tts_training_args = Seq2SeqTrainingArguments(
    # --- Output location and Hub upload ---
    output_dir="dkhublani/test_speecht5",
    hub_model_id="dkhublani/test_speecht5",
    push_to_hub=True,
    hub_strategy="every_save",
    # --- Optimisation ---
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,  # 4 * 8 = 32 samples per optimizer step per device
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=4000,
    gradient_checkpointing=True,
    fp16=False,
    # --- Evaluation and checkpointing (both every 1000 steps) ---
    evaluation_strategy="steps",
    eval_steps=1000,
    per_device_eval_batch_size=2,
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=2,
    load_best_model_at_end=True,
    greater_is_better=False,
    label_names=["labels"],
    # --- Logging ---
    logging_steps=25,
    report_to=["tensorboard"],
)

In [None]:
# Wire the model, the train/test splits, the collator and the processor into a trainer.
tts_trainer = Seq2SeqTrainer(
    model=tts_model,
    args=tts_training_args,
    data_collator=TTSdata_collator,
    tokenizer=tts_processor,
    train_dataset=dataset_with_selected_speakers["train"],
    eval_dataset=dataset_with_selected_speakers["test"],
)

In [None]:
# Run fine-tuning with the trainer configured above.
tts_trainer.train()



Step,Training Loss,Validation Loss
1000,0.6032,0.558628
2000,0.5654,0.534424
3000,0.5611,0.529091
4000,0.5437,0.527492




TrainOutput(global_step=4000, training_loss=0.607949235200882, metrics={'train_runtime': 5490.5964, 'train_samples_per_second': 23.313, 'train_steps_per_second': 0.729, 'total_flos': 7773118084424088.0, 'train_loss': 0.607949235200882, 'epoch': 34.41})

In [None]:
# Upload the trained model to the Hub. The trainer object in this notebook is
# `tts_trainer`; the original called an undefined `trainer`.
tts_trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/dkhublani/test_speecht5/commit/fe691c64a2ee5f7b90f63bebeec1547d89220349', commit_message='End of training', commit_description='', oid='fe691c64a2ee5f7b90f63bebeec1547d89220349', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Load the fine-tuned weights back from the Hub repository used as hub_model_id during training.
fine_tuned_model = SpeechT5ForTextToSpeech.from_pretrained("dkhublani/test_speecht5")

config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/578M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

## **Step 4. Model Inference**

In [None]:
# Take one held-out row (index 304 of the test split) and reuse its stored
# speaker embedding; unsqueeze(0) adds a leading dimension of size one.
sample_speaker = dataset_with_selected_speakers["test"][304]
speaker_embeddings = torch.tensor(sample_speaker["speaker_embeddings"]).unsqueeze(0)

In [None]:
# Tokenize the Welsh prompt, load the HiFi-GAN vocoder and synthesize speech in
# the voice described by `speaker_embeddings`.
tts_input_parameters = tts_processor(text="Doedd hi ddim wedi arfer gyda'r math yma o beth chwaith.", return_tensors="pt")
audio_generator = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
# The tokenized prompt is stored in `tts_input_parameters`; the original read an
# undefined `inputs`.
generated_audio = fine_tuned_model.generate_speech(
    tts_input_parameters["input_ids"], speaker_embeddings, vocoder=audio_generator
)

In [None]:
# Imported under an alias so it does not shadow `datasets.Audio`, which the
# preprocessing cell needs for `cast_column` if it is run again.
from IPython.display import Audio as NotebookAudio

# Play the synthesized waveform (`generated_audio`; the original referenced an
# undefined `speech`) at the 16 kHz rate used throughout the notebook.
NotebookAudio(generated_audio.detach().cpu().numpy(), rate=16000)