# Review Original TTS Model

In [None]:
# High-level route: build a text-to-speech pipeline for the
# facebook/mms-tts-ind checkpoint through the `pipeline` helper.
from transformers import pipeline

pipe = pipeline(task="text-to-speech", model="facebook/mms-tts-ind")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [None]:
# Low-level route: load the tokenizer and the text-to-waveform model
# explicitly so the following cells can call them directly.
from transformers import AutoModelForTextToWaveform, AutoTokenizer

tokenizer, model = (
    AutoTokenizer.from_pretrained("facebook/mms-tts-ind"),
    AutoModelForTextToWaveform.from_pretrained("facebook/mms-tts-ind"),
)

In [None]:
import torch
from transformers import set_seed

# Tokenize the sentence to synthesise into PyTorch tensors.
text = "Saya tinggal di Jakarta"
inputs = tokenizer(text, return_tensors="pt")

# The Transformers VITS documentation describes the model as
# non-deterministic; fixing the seed makes the generated audio
# reproducible across "Restart & Run All".
set_seed(555)

# Inference only: no gradients are needed.
with torch.no_grad():
    output = model(**inputs)
    waveform = output.waveform

In [None]:
from IPython.display import Audio

# Render an inline audio player for the waveform produced above, using the
# sampling rate stored in the model configuration.
Audio(data=waveform, rate=model.config.sampling_rate)

# Requirements

## Setup

In [None]:
!git clone https://github.com/ylacombe/finetune-hf-vits.git

fatal: destination path 'finetune-hf-vits' already exists and is not an empty directory.


In [None]:
import os

# Make the cloned repository the working directory for the cells below.
os.chdir(path="/content/finetune-hf-vits")

In [None]:
!pip install -r requirements.txt



## Linking HuggingFace Account

In [None]:
# Set git's global `credential.helper` option to `store`.
!git config --global credential.helper store

In [None]:
# Interactive Hugging Face Hub login: prompts for an access token and asks
# whether to also add it as a git credential.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineG

## Build the monotonic alignment search function using Cython

In [None]:
# Use the absolute path so this cell can be re-run safely; a relative
# chdir fails once the working directory is already `monotonic_align`.
os.chdir("/content/finetune-hf-vits/monotonic_align")

In [None]:
# Sanity check: print the current working directory before building.
!pwd

/content/finetune-hf-vits/monotonic_align


In [None]:
# Cython-version Monotonoic Alignment Search
!mkdir monotonic_align
!python setup.py build_ext --inplace

mkdir: cannot create directory ‘monotonic_align’: File exists


In [None]:
# Return to the repository root by absolute path; a relative ".." would
# climb one level further each time the cell is re-run.
os.chdir("/content/finetune-hf-vits")

In [None]:
# Sanity check: print the current working directory after the build step.
!pwd

/content/finetune-hf-vits


## Dataset loading

In [None]:
import datasets

# Load a small slice of the dataset: configuration "ind", last 10 rows of
# the test split.
data = datasets.load_dataset(
    path="indonesian-nlp/librivox-indonesia",
    name="ind",
    split="test[-10:]",
)

In [None]:
# Display the dataset summary (feature names and row count).
data

Dataset({
    features: ['path', 'language', 'reader', 'sentence', 'audio'],
    num_rows: 10
})

In [None]:
# Inspect a single decoded audio sample instead of dumping the whole column.
# Indexing the row first avoids decoding every audio file just to look at one
# example and keeps the cell output short.
data[0]["audio"]

[{'path': '/root/.cache/huggingface/datasets/downloads/extracted/47dd7920f5d9231fa35d9da681c367d8078ab934374eb8f60f37f2ddebcada5b/librivox-indonesia/test/indonesian/universal-declaration-of-human-rights/human_rights_un_ind_rd_0035.mp3',
  'array': array([0.0933058 , 0.17418097, 0.03711945, ..., 0.00232086, 0.00412499,
         0.00320231]),
  'sampling_rate': 44100},
 {'path': '/root/.cache/huggingface/datasets/downloads/extracted/47dd7920f5d9231fa35d9da681c367d8078ab934374eb8f60f37f2ddebcada5b/librivox-indonesia/test/indonesian/universal-declaration-of-human-rights/human_rights_un_ind_rd_0042.mp3',
  'array': array([0.00412783, 0.00586171, 0.00327811, ..., 0.00661701, 0.0081538 ,
         0.00720846]),
  'sampling_rate': 44100},
 {'path': '/root/.cache/huggingface/datasets/downloads/extracted/47dd7920f5d9231fa35d9da681c367d8078ab934374eb8f60f37f2ddebcada5b/librivox-indonesia/test/indonesian/universal-declaration-of-human-rights/human_rights_un_ind_rd_0073.mp3',
  'array': array([ 0.01

# Model Selection

In [None]:
# Run the repository's checkpoint conversion script with language code `ind`,
# dump folder `../finetuned_model` (relative to the current working directory)
# and `--push_to_hub tts-mms-ind-finetuned`.
# NOTE(review): presumably this requires the Hugging Face login above because
# of `--push_to_hub` — confirm against the script.
!python convert_original_discriminator_checkpoint.py --language_code ind --pytorch_dump_folder_path ../finetuned_model --push_to_hub tts-mms-ind-finetuned

2024-12-08 07:51:09.666746: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-08 07:51:09.686022: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-08 07:51:09.691933: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--facebook--mms-tts-ind/snapshots/50dc8c10320a328391efaab25e4c629f9bf254ec/config.json
Model config VitsConfig {
  "activation_dropout": 0.1,
  "architectures": [
    "VitsModel"
  ],
  "attention_dropout": 0.1,
  "depth_separable_channels": 2,
  "depth_separable_num_layers": 

# Fine Tuning

In [None]:
# Launch the fine-tuning script through `accelerate`, passing a JSON file as
# its only argument.
# NOTE(review): no cell visible in this notebook creates
# `../finetuned_model/finetune_mms_ind.json`; it appears to be expected to
# exist before this cell runs — confirm and document where it comes from.
!accelerate launch run_vits_finetuning.py ../finetuned_model/finetune_mms_ind.json

The following values were not passed to `accelerate launch` and had defaults used instead:
	`--num_processes` was set to a value of `1`
	`--num_machines` was set to a value of `1`
	`--mixed_precision` was set to a value of `'no'`
	`--dynamo_backend` was set to a value of `'no'`
2024-12-08 07:54:12.804501: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-08 07:54:12.823576: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-08 07:54:12.829321: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
12/08/2024 07:54:15 - INFO - __main__ - Training/evaluation parameters VITSTrainingArg

# Inference

In [None]:
from transformers import pipeline
import scipy
import torch

model_id = "fadhilamri/tts-mms-ind-finetuned"

# Run on the first CUDA GPU when one is available and fall back to the CPU
# (-1) otherwise, so the cell also works on a CPU-only runtime instead of
# failing on a hard-coded `device=0`.
device = 0 if torch.cuda.is_available() else -1
synthesiser = pipeline("text-to-speech", model_id, device=device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/2.02k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/145M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/670 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/380 [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/18.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/47.0 [00:00<?, ?B/s]

In [None]:
# Synthesise the same sentence used with the original model; later cells read
# the "audio" and "sampling_rate" entries of the result.
speech = synthesiser("Saya tinggal di Jakarta")

In [None]:
from IPython.display import Audio

# Render an inline audio player for the first entry of the synthesised audio,
# at the sampling rate reported by the pipeline.
Audio(data=speech["audio"][0], rate=speech["sampling_rate"])