### Install packages and download models

In [1]:
%%shell
git clone https://github.com/yl4579/StyleTTS2.git
cd StyleTTS2
pip install SoundFile torchaudio munch torch pydub pyyaml librosa nltk matplotlib accelerate transformers phonemizer einops einops-exts tqdm typing-extensions git+https://github.com/resemble-ai/monotonic_align.git
sudo apt-get install espeak-ng
git-lfs clone https://huggingface.co/yl4579/StyleTTS2-LibriTTS
mv StyleTTS2-LibriTTS/Models .

Cloning into 'StyleTTS2'...
remote: Enumerating objects: 372, done.[K
remote: Counting objects: 100% (144/144), done.[K
remote: Compressing objects: 100% (48/48), done.[K
remote: Total 372 (delta 102), reused 96 (delta 96), pack-reused 228 (from 1)[K
Receiving objects: 100% (372/372), 133.97 MiB | 13.56 MiB/s, done.
Resolving deltas: 100% (203/203), done.
Updating files: 100% (48/48), done.
Collecting git+https://github.com/resemble-ai/monotonic_align.git
  Cloning https://github.com/resemble-ai/monotonic_align.git to /tmp/pip-req-build-itva9ul_
  Running command git clone --filter=blob:none --quiet https://github.com/resemble-ai/monotonic_align.git /tmp/pip-req-build-itva9ul_
  Resolved https://github.com/resemble-ai/monotonic_align.git to commit c6e5e6cb19882164027eb6e35118e841eed9298e
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting munch
  Downloa



### Download dataset (LJSpeech, 200 samples, ~15 minutes of data)

You can definitely do it with fewer samples. This is just a proof of concept with 200 smaples.

In [2]:
%ls

[0m[01;34msample_data[0m/  [01;34mStyleTTS2[0m/


In [3]:
%cd StyleTTS2
!rm -rf Data

/content/StyleTTS2


In [4]:
#!gdown --id 1vqz26D3yn7OXS2vbfYxfSnpLS6m6tOFP
!gdown --id 1UO_8yjJtPXWIYk6vXa1rOKhrqjpwEqZ8
!unzip Data.zip

Downloading...
From (original): https://drive.google.com/uc?id=1UO_8yjJtPXWIYk6vXa1rOKhrqjpwEqZ8
From (redirected): https://drive.google.com/uc?id=1UO_8yjJtPXWIYk6vXa1rOKhrqjpwEqZ8&confirm=t&uuid=161c544a-c40f-426f-b98f-c435aed0e53d
To: /content/StyleTTS2/Data.zip
100% 49.1M/49.1M [00:01<00:00, 29.6MB/s]
Archive:  Data.zip
   creating: Data/
  inflating: Data/LJ001-0048.wav     
  inflating: Data/LJ001-0060.wav     
  inflating: Data/LJ001-0074.wav     
  inflating: Data/LJ001-0128.wav     
  inflating: Data/LJ001-0114.wav     
  inflating: Data/LJ001-0100.wav     
  inflating: Data/LJ001-0101.wav     
  inflating: Data/LJ001-0115.wav     
  inflating: Data/LJ001-0129.wav     
  inflating: Data/LJ001-0075.wav     
  inflating: Data/LJ001-0061.wav     
  inflating: Data/LJ001-0049.wav     
  inflating: Data/LJ001-0077.wav     
  inflating: Data/LJ001-0063.wav     
  inflating: Data/LJ001-0088.wav     
  inflating: Data/LJ001-0103.wav     
  inflating: Data/LJ001-0117.wav     
  inflatin

In [5]:
input_file_name = "200_sample.txt"
output_file_name = "phonemized_200.txt"
# load phonemizer
import phonemizer
global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True,  with_stress=True)

def text_to_phonemes(text):
    text = text.strip()
    ps = global_phonemizer.phonemize([text])
    return ps[0]

file_out = open(output_file_name, 'w')

with open(input_file_name, 'r', encoding='utf-8') as f:
    for line in f:
      #print(line)
      wave_file, text = line.split('|', 1)
      wave_file_name = wave_file + '.wav'
      #print(wave_file_name, text)
      phonemized = text_to_phonemes(text)
      #print(phonemized)
      #file_out.write(wave_file_name+ '.wav' + '|' + phonemized +'|'+'0'+'\n')
      file_out.write(f"{wave_file_name}|{phonemized}|{0}\n")
file_out.close()



In [6]:
!head -n 150 phonemized_200.txt > Data/train_list.txt
!tail -n 50 phonemized_200.txt > Data/val_list.txt
!head -n 150 phonemized_200.txt > Data/OOD_texts.txt

### Change the finetuning config

Depending on the GPU you got, you may want to change the bacth size, max audio length, epiochs and so on.

In [7]:
config_path = "Configs/config_ft.yml"

import yaml
config = yaml.safe_load(open(config_path))
!cat Configs/config_ft.yml
!head Data/train_list.txt
!head 200_sample.txt

log_dir: "Models/LJSpeech"
save_freq: 5
log_interval: 10
device: "cuda"
epochs: 50 # number of finetuning epoch (1 hour of data)
batch_size: 8
max_len: 400 # maximum number of frames
pretrained_model: "Models/LibriTTS/epochs_2nd_00020.pth"
second_stage_load_pretrained: true # set to true if the pre-trained model is for 2nd stage
load_only_params: true # set to true if do not want to load epoch numbers and optimizer parameters

F0_path: "Utils/JDC/bst.t7"
ASR_config: "Utils/ASR/config.yml"
ASR_path: "Utils/ASR/epoch_00080.pth"
PLBERT_dir: 'Utils/PLBERT/'

data_params:
  train_data: "Data/train_list.txt"
  val_data: "Data/val_list.txt"
  root_path: "/local/LJSpeech-1.1/wavs"
  OOD_data: "Data/OOD_texts.txt"
  min_length: 50 # sample until texts with this size are obtained for OOD texts

preprocess_params:
  sr: 24000
  spect_params:
    n_fft: 2048
    win_length: 1200
    hop_length: 300

model_params:
  multispeaker: true

  dim_in: 64 
  hidden_dim: 512
  max_conv_dim: 512
  n_layer: 

In [9]:
config['data_params']['root_path'] = "Data/"

config['batch_size'] = 6 # not enough RAM
config['save_freq'] = 1
config['max_len'] = 100 # not enough RAM
config['epochs'] = 15
config['loss_params']['diff_epoch'] = 6
config['loss_params']['joint_epoch'] = 110 # we do not do SLM adversarial training due to not enough RAM

with open(config_path, 'w') as outfile:
  yaml.dump(config, outfile, default_flow_style=True)

### Start finetuning


Do not forget to change the line 604 of the 'models.py' python script with:

params = torch.load(model_path, map_location='cpu', weights_only=False)['model']


In [10]:
!python train_finetune.py --config_path ./Configs/config_ft.yml

2025-07-12 05:37:43.111482: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-12 05:37:43.128690: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752298663.150293    3356 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752298663.156747    3356 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-07-12 05:37:43.178148: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

### Test the model quality

Note that this mainly serves as a proof of concept due to RAM limitation of free Colab instances. A lot of settings are suboptimal. In the future when DDP works for train_second.py, we will also add mixed precision finetuning to save time and RAM. You can also add SLM adversarial training run if you have paid Colab services (such as A100 with 40G of RAM).

In [11]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [12]:
import torch
torch.manual_seed(0)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

import random
random.seed(0)

import numpy as np
np.random.seed(0)

# load packages
import time
import random
import yaml
from munch import Munch
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
import torchaudio
import librosa
from nltk.tokenize import word_tokenize

from models import *
from utils import *
from text_utils import TextCleaner
textclenaer = TextCleaner()

%matplotlib inline

to_mel = torchaudio.transforms.MelSpectrogram(
    n_mels=80, n_fft=2048, win_length=1200, hop_length=300)
mean, std = -4, 4

def length_to_mask(lengths):
    mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
    mask = torch.gt(mask+1, lengths.unsqueeze(1))
    return mask

def preprocess(wave):
    wave_tensor = torch.from_numpy(wave).float()
    mel_tensor = to_mel(wave_tensor)
    mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std
    return mel_tensor

def compute_style(path):
    wave, sr = librosa.load(path, sr=24000)
    audio, index = librosa.effects.trim(wave, top_db=30)
    if sr != 24000:
        audio = librosa.resample(audio, sr, 24000)
    mel_tensor = preprocess(audio).to(device)

    with torch.no_grad():
        ref_s = model.style_encoder(mel_tensor.unsqueeze(1))
        ref_p = model.predictor_encoder(mel_tensor.unsqueeze(1))

    return torch.cat([ref_s, ref_p], dim=1)

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# load phonemizer
import phonemizer
global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True,  with_stress=True)

config = yaml.safe_load(open("Models/LJSpeech/config_ft.yml"))

# load pretrained ASR model
ASR_config = config.get('ASR_config', False)
ASR_path = config.get('ASR_path', False)
text_aligner = load_ASR_models(ASR_path, ASR_config)

# load pretrained F0 model
F0_path = config.get('F0_path', False)
pitch_extractor = load_F0_models(F0_path)

# load BERT model
from Utils.PLBERT.util import load_plbert
BERT_path = config.get('PLBERT_dir', False)
plbert = load_plbert(BERT_path)

model_params = recursive_munch(config['model_params'])
model = build_model(model_params, text_aligner, pitch_extractor, plbert)
_ = [model[key].eval() for key in model]
_ = [model[key].to(device) for key in model]

177


  WeightNorm.apply(module, name, dim)


In [13]:
import os

files = [f for f in os.listdir("Models/LJSpeech/") if f.endswith('.pth')]
sorted_files = sorted(files, key=lambda x: int(x.split('_')[-1].split('.')[0]))

In [14]:
params_whole = torch.load("Models/LJSpeech/" + sorted_files[-1], map_location='cpu')
params = params_whole['net']

In [15]:
for key in model:
    if key in params:
        print('%s loaded' % key)
        try:
            model[key].load_state_dict(params[key])
        except:
            from collections import OrderedDict
            state_dict = params[key]
            new_state_dict = OrderedDict()
            for k, v in state_dict.items():
                name = k[7:] # remove `module.`
                new_state_dict[name] = v
            # load params
            model[key].load_state_dict(new_state_dict, strict=False)
#             except:
#                 _load(params[key], model[key])
_ = [model[key].eval() for key in model]

bert loaded
bert_encoder loaded
predictor loaded
decoder loaded
text_encoder loaded
predictor_encoder loaded
style_encoder loaded
diffusion loaded
text_aligner loaded
pitch_extractor loaded
mpd loaded
msd loaded
wd loaded


In [16]:
!pip install Modules
!pip install diffusion


Collecting Modules
  Downloading modules-1.0.0.tar.gz (525 bytes)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: Modules
  Building wheel for Modules (setup.py) ... [?25l[?25hdone
  Created wheel for Modules: filename=modules-1.0.0-py3-none-any.whl size=1198 sha256=2ef6991f1ba4dccee97116a82e7c47fbd1504c4b407904623aa0717497c67c9e
  Stored in directory: /root/.cache/pip/wheels/88/b6/9f/1be8ae0f2710dd7fcc2be7b5c3d12a893a33ac69d581425530
Successfully built Modules
Installing collected packages: Modules
Successfully installed Modules-1.0.0
Collecting diffusion
  Downloading diffusion-6.11.6-1-py3-none-any.whl.metadata (2.8 kB)
Collecting attrs<24.0,>=21.4.0 (from diffusion)
  Downloading attrs-23.2.0-py3-none-any.whl.metadata (9.5 kB)
Collecting cbor2==5.*,>=5.1.2 (from diffusion)
  Downloading cbor2-5.6.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.0 kB)
Collecting stringcase==1.*,>=1.2.0 (from diffusion)
  Download

In [17]:
from Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule

In [18]:
sampler = DiffusionSampler(
    model.diffusion.diffusion,
    sampler=ADPM2Sampler(),
    sigma_schedule=KarrasSchedule(sigma_min=0.0001, sigma_max=3.0, rho=9.0), # empirical parameters
    clamp=False
)

In [19]:
def inference(text, ref_s, alpha = 0.3, beta = 0.7, diffusion_steps=5, embedding_scale=1):
    text = text.strip()
    ps = global_phonemizer.phonemize([text])
    #ps = word_tokenize(ps[0])
    ps = ' '.join(ps)
    print(ps)
    tokens = textclenaer(ps)
    tokens.insert(0, 0)
    tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)

    with torch.no_grad():
        input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
        text_mask = length_to_mask(input_lengths).to(device)

        t_en = model.text_encoder(tokens, input_lengths, text_mask)
        bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
        d_en = model.bert_encoder(bert_dur).transpose(-1, -2)

        s_pred = sampler(noise = torch.randn((1, 256)).unsqueeze(1).to(device),
                                          embedding=bert_dur,
                                          embedding_scale=embedding_scale,
                                            features=ref_s, # reference from the same speaker as the embedding
                                             num_steps=diffusion_steps).squeeze(1)


        s = s_pred[:, 128:]
        ref = s_pred[:, :128]

        ref = alpha * ref + (1 - alpha)  * ref_s[:, :128]
        s = beta * s + (1 - beta)  * ref_s[:, 128:]

        d = model.predictor.text_encoder(d_en,
                                         s, input_lengths, text_mask)

        x, _ = model.predictor.lstm(d)
        duration = model.predictor.duration_proj(x)

        # Fix NaN and infinite values in duration
        duration = torch.nan_to_num(duration, nan=1.0, posinf=1.0, neginf=1.0)
        duration = torch.sigmoid(duration).sum(axis=-1)
        pred_dur = torch.round(duration.squeeze()).clamp(min=1)

        pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))
        c_frame = 0
        for i in range(pred_aln_trg.size(0)):
            pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1
            c_frame += int(pred_dur[i].data)

        # encode prosody
        en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))
        if model_params.decoder.type == "hifigan":
            asr_new = torch.zeros_like(en)
            asr_new[:, :, 0] = en[:, :, 0]
            asr_new[:, :, 1:] = en[:, :, 0:-1]
            en = asr_new

        F0_pred, N_pred = model.predictor.F0Ntrain(en, s)

        asr = (t_en @ pred_aln_trg.unsqueeze(0).to(device))
        if model_params.decoder.type == "hifigan":
            asr_new = torch.zeros_like(asr)
            asr_new[:, :, 0] = asr[:, :, 0]
            asr_new[:, :, 1:] = asr[:, :, 0:-1]
            asr = asr_new

        out = model.decoder(asr,
                                F0_pred, N_pred, ref.squeeze().unsqueeze(0))


    return out.squeeze().cpu().numpy()[..., :-50] # weird pulse at the end of the model, need to be fixed later



```
# This is formatted as code
```

### Synthesize speech

In [None]:
text = '''This is found to be fun to work on text to speech.'''
text = "I am Amruth Ashok Gadag. It is an honor to work for a basic TTS model at Indian Institute of Informtaion Technology Dharwad."
#text = "Speech synthesis is the artificial production of human speech."

In [None]:
# get a random reference in the training set, note that it doesn't matter which one you use
path = "Data/LJ001-0110.wav"
# this style vector ref_s can be saved as a parameter together with the model weights
ref_s = compute_style(path)

In [None]:
start = time.time()
wav = inference(text, ref_s, alpha=0, beta=0, diffusion_steps=10, embedding_scale=1)
rtf = (time.time() - start) / (len(wav) / 24000)
print(f"RTF = {rtf:5f}")
import IPython.display as ipd
display(ipd.Audio(wav, rate=24000, normalize=False))

display(ipd.Audio(path, rate=24000, normalize=False))



aɪɐm ˈæmɹuːθ ɐʃˈɑːk ɡˈædæɡ. ɪɾ ɪz ɐn ˈɑːnɚ tə wˈɜːk fɚɹə bˈeɪsɪk tˌiːtˌiːˈɛs mˈɑːdəl æɾ ˈɪndiən ˈɪnstɪtˌuːt ʌv ɪnfˈɔːɹmteɪən tɛknˈɑːlədʒi dˈɑːɹwɑːd. 
RTF = 0.095678


In [None]:
import torch
import time
import IPython.display as ipd

def inference(text, ref_s, alpha = 0.3, beta = 0.7, diffusion_steps=5, embedding_scale=1):
    text = text.strip()
    ps = global_phonemizer.phonemize([text])
    #ps = word_tokenize(ps[0])
    ps = ' '.join(ps)
    print(ps)
    tokens = textclenaer(ps)
    tokens.insert(0, 0)
    tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)

    with torch.no_grad():
        input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
        text_mask = length_to_mask(input_lengths).to(device)

        t_en = model.text_encoder(tokens, input_lengths, text_mask)
        bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
        d_en = model.bert_encoder(bert_dur).transpose(-1, -2)

        s_pred = sampler(noise = torch.randn((1, 256)).unsqueeze(1).to(device),
                                          embedding=bert_dur,
                                          embedding_scale=embedding_scale,
                                            features=ref_s, # reference from the same speaker as the embedding
                                             num_steps=diffusion_steps).squeeze(1)


        s = s_pred[:, 128:]
        ref = s_pred[:, :128]

        ref = alpha * ref + (1 - alpha)  * ref_s[:, :128]
        s = beta * s + (1 - beta)  * ref_s[:, 128:]

        d = model.predictor.text_encoder(d_en,
                                         s, input_lengths, text_mask)

        x, _ = model.predictor.lstm(d)
        duration = model.predictor.duration_proj(x)

        # Fix NaN and infinite values in duration
        duration = torch.nan_to_num(duration, nan=1.0, posinf=1.0, neginf=1.0)
        duration = torch.sigmoid(duration).sum(axis=-1)
        pred_dur = torch.round(duration.squeeze()).clamp(min=1)

        pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))
        c_frame = 0
        for i in range(pred_aln_trg.size(0)):
            pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1
            c_frame += int(pred_dur[i].data)

        # encode prosody
        en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))
        if model_params.decoder.type == "hifigan":
            asr_new = torch.zeros_like(en)
            asr_new[:, :, 0] = en[:, :, 0]
            asr_new[:, :, 1:] = en[:, :, 0:-1]
            en = asr_new

        F0_pred, N_pred = model.predictor.F0Ntrain(en, s)

        asr = (t_en @ pred_aln_trg.unsqueeze(0).to(device))
        if model_params.decoder.type == "hifigan":
            asr_new = torch.zeros_like(asr)
            asr_new[:, :, 0] = asr[:, :, 0]
            asr_new[:, :, 1:] = asr[:, :, 0:-1]
            asr = asr_new

        out = model.decoder(asr,
                                F0_pred, N_pred, ref.squeeze().unsqueeze(0))


    return out.squeeze().cpu().numpy()[..., :-50] # weird pulse at the end of the model, need to be fixed later


# Usage example
start = time.time()
wav = inference("Hello world", "reference", alpha=0, beta=0, diffusion_steps=10, embedding_scale=1)
if wav is not None:
    rtf = (time.time() - start) / (len(wav) / 24000)
    print(f"RTF = {rtf:5f}")
    display(ipd.Audio(wav, rate=24000, normalize=False))
else:
    print("Inference failed.")

həlˈoʊ wˈɜːld 


TypeError: linear(): argument 'input' (position 1) must be Tensor, not str

#Speech Synthesis by Amruth Ashok Gadag

In [25]:
#text = '''This is found to be fun to work on text to speech.'''
text = "Hi hello this is Keerthi from NMIT Bengaluru. This is a test on text to speech synthesis of Keerti's voice trained in American accent"
#text = "Speech synthesis is the artificial production of human speech."

Try to provide the audio to imitate the voice

In [29]:
# get a random reference in the training set, note that it doesn't matter which one you use
path = "/content/Keerti_M.wav"
# this style vector ref_s can be saved as a parameter together with the model weights
ref_s = compute_style(path)

Process and synthesize the speech

In [30]:
start = time.time()
wav = inference(text, ref_s, alpha=0, beta=0, diffusion_steps=10, embedding_scale=1)
rtf = (time.time() - start) / (len(wav) / 24000)
print(f"RTF = {rtf:5f}")
import IPython.display as ipd

display(ipd.Audio(path, rate=24000, normalize=False)) # input audio fed

display(ipd.Audio(wav, rate=24000, normalize=False))  # output synthesized

hˈaɪ həlˈoʊ ðɪs ɪz kˈɪɹθaɪ fɹʌm ˈɛnmˈɪt bˈɛŋɡɐlˌʊɹɹuː. ðɪs ɪz ɐ tˈɛst ˌɔn tˈɛkst tə spˈiːtʃ sˈɪnθəsˌɪs ʌv kˈɪɹɾiz vˈɔɪs tɹˈeɪnd ɪn ɐmˈɛɹɪkən ˈæksənt 
RTF = 0.033075


I will now modify the `models.py` file to fix the `UnpicklingError`.

In [None]:
import torch
import os

models_file_path = "StyleTTS2/models.py"

try:
    with open(models_file_path, 'r') as f:
        lines = f.readlines()

    with open(models_file_path, 'w') as f:
        for line in lines:
            if "params = torch.load(model_path, map_location='cpu')" in line:
                # Modify the line to ensure weights_only=False is present and handle getattr
                f.write("        with torch.serialization.safe_globals([getattr]):\n")
                f.write("            params = torch.load(model_path, map_location='cpu', weights_only=False)['model']\n")
            else:
                f.write(line)
    print(f"Successfully modified {models_file_path}")

except FileNotFoundError:
    print(f"Error: {models_file_path} not found. Please ensure the StyleTTS2 directory is cloned correctly.")
except Exception as e:
    print(f"An error occurred while modifying the file: {e}")

Error: StyleTTS2/models.py not found. Please ensure the StyleTTS2 directory is cloned correctly.


After the modification, please re-run the cell `jIIAoDACXJL0` to load the models.