In [1]:
import torch
from transformers import SpeechT5ForTextToSpeech, SpeechT5Tokenizer

# Load the pre-trained SpeechT5 text-to-speech model and its tokenizer from the
# "microsoft/speecht5_tts" checkpoint; later cells reuse both names.
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
tokenizer = SpeechT5Tokenizer.from_pretrained("microsoft/speecht5_tts")


Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "C:\Users\ADMIN\AppData\Local\Programs\Python\Python311\Lib\site-packages\transformers\utils\import_utils.py", line 1764, in _get_module
    return importlib.import_module("." + module_name, self.__name__)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\ADMIN\AppData\Local\Programs\Python\Python311\Lib\importlib\__init__.py", line 126, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "<frozen importlib._bootstrap>", line 1206, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1178, in _find_and_load
  File "<frozen importlib._bootstrap>", line 1149, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 690, in _load_unlocked
  File "<frozen importlib._bootstrap_external>", line 940, in exec_module
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
  F

In [None]:
import torch.nn.utils.prune as prune

# Pruning configuration: the method class and the fraction of weights to zero.
pruning_method = prune.L1Unstructured
pruning_amount = 0.2  # Prune 20% of connections

# Apply the configured method to the weight of every Linear layer.
# `pruning_method.apply` is the classmethod that `prune.l1_unstructured` wraps,
# so the `pruning_method` setting above is honoured instead of being ignored.
for module in model.modules():
    if isinstance(module, torch.nn.Linear):
        pruning_method.apply(module, name='weight', amount=pruning_amount)
        # Drop the pruning re-parametrization right away so the zeroed weights
        # become the module's permanent `weight`.
        prune.remove(module, 'weight')


In [None]:
# Write the pruned model and the tokenizer to the same directory; a later cell
# reloads both from this path with from_pretrained.
model.save_pretrained("D:/lalwani/tts-finetuning/pruned-models/pruned_speecht5")
tokenizer.save_pretrained("D:/lalwani/tts-finetuning/pruned-models/pruned_speecht5")


In [None]:
# from datasets import load_dataset
# 
# dataset = load_dataset('csv', data_files=r'D:\lalwani\tts-finetuning\pythonPro\audio_data.csv', split='train')

In [None]:
# NOTE(review): the load_dataset call in the preceding cell is commented out, so
# on a fresh kernel `dataset` is not defined by any earlier visible cell.
dataset

In [2]:
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech

In [None]:
# Reload the pruned SpeechT5 model from the directory it was saved to earlier
model_path = "D:/lalwani/tts-finetuning/pruned-models/pruned_speecht5"
model = SpeechT5ForTextToSpeech.from_pretrained(model_path)

# Load the tokenizer that was saved alongside it
# NOTE(review): SpeechT5Tokenizer is imported only in the first cell, not in the
# import cell directly above this one.
tokenizer = SpeechT5Tokenizer.from_pretrained(model_path)


In [None]:
# Load the processor from the base checkpoint name (not from model_path);
# preprocessing, the data collator and the trainer all use it.
checkpoint = "microsoft/speecht5_tts"
processor = SpeechT5Processor.from_pretrained(checkpoint)

In [None]:
# `load_dataset` is imported here because the only other import of it in this
# notebook is commented out; without it this cell raises NameError.
from datasets import load_dataset

# Read the metadata CSV and print the audio file path column of the "train" split.
dataset = load_dataset('csv', data_files=r'D:\lalwani\tts-finetuning\pythonPro\final_dataset.csv')
print(dataset['train']['file_path2'])

In [None]:
# Drop the 'original_file_path' column and show what is left.
# NOTE(review): this rebinds `dataset`, so re-running the cell operates on the
# already-trimmed object — presumably it fails the second time; confirm.
dataset = dataset.remove_columns('original_file_path')
print(dataset)

In [None]:
import soundfile as sf
import os

In [None]:
def get_audio_data(folder_path):
    """Load every ``.wav`` file directly inside ``folder_path`` into a list of records.

    Args:
        folder_path: Directory to scan (sub-directories are not searched).

    Returns:
        A list of dicts, one per ``.wav`` file, in sorted filename order. Each
        dict has ``audio_id`` (the filename without extension) and ``audio``, a
        dict with ``path``, ``array`` (the samples returned by ``sf.read``) and
        ``sampling_rate`` (the rate ``sf.read`` reported for that file).
    """
    data_list = []
    # os.listdir makes no ordering guarantee; sort so repeated runs produce the
    # same record order (later cells pair records with text by position).
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.wav'):
            continue
        file_path = os.path.normpath(os.path.join(folder_path, filename))
        print(file_path)
        audio_array, samplerate = sf.read(file_path)
        data_list.append({
            'audio_id': os.path.splitext(os.path.basename(file_path))[0],
            'audio': {
                'path': file_path,
                'array': audio_array,
                # Record the file's real rate rather than assuming 16 kHz, so a
                # file that was not resampled is not silently mislabelled.
                'sampling_rate': samplerate,
            },
        })
    return data_list

# Usage: load every .wav file from the resampled mono folder.
base_path_final = r"D:\lalwani\tts-finetuning\data-finetuning\extracted_data\mono_combined\resampled" 
base_path_final = os.path.normpath(base_path_final) 
# NOTE(review): file_paths is not referenced again in the visible cells.
file_paths = dataset['train']['file_path2']

audio_data = get_audio_data(base_path_final)

In [None]:
def add_gender_to_audio_data(audio_data):
    """Tag each record in place with a ``gender`` key derived from its ``audio_id``.

    The lower-cased ``audio_id`` is searched for the substrings ``'female'`` and
    ``'male'``; a record matching neither is tagged ``'unknown'``.

    Args:
        audio_data: List of dicts, each with a string ``audio_id``.

    Returns:
        The same list object, with ``gender`` set on every record.
    """
    def _infer_gender(audio_id):
        lowered = audio_id.lower()
        # 'female' must be tested first because it contains 'male'.
        for label in ('female', 'male'):
            if label in lowered:
                return label
        return 'unknown'

    for record in audio_data:
        record['gender'] = _infer_gender(record['audio_id'])
    return audio_data


# Tag every record with a gender label; the function mutates and returns the
# same list, so audio_data_with_gender and audio_data are the same object.
audio_data_with_gender = add_gender_to_audio_data(audio_data)

In [None]:
# Display the 'englishText' column of the CSV metadata (used as transcriptions below).
dataset['train']['englishText']

In [None]:
def add_transcription_to_audio_data(audio_data_with_gender, english_text):
    """Attach transcriptions to audio records by position.

    Record ``i`` receives ``english_text[i]`` under the ``transcription`` key.
    A length mismatch is reported on stdout; records past the end of
    ``english_text`` are left without a ``transcription`` key and a warning is
    printed for each of them.

    Args:
        audio_data_with_gender: List of record dicts, modified in place.
        english_text: Indexable sequence of transcription strings.

    Returns:
        The same ``audio_data_with_gender`` list.
    """
    audio_count = len(audio_data_with_gender)
    text_count = len(english_text)

    if audio_count != text_count:
        print("Warning: The lengths of audio_data_with_gender and english_text do not match.")
        print(f"audio_data_with_gender length: {audio_count}")
        print(f"english_text length: {text_count}")

    for position, record in enumerate(audio_data_with_gender):
        if position >= text_count:
            print(f"Warning: No matching transcription for audio item at index {position}")
            continue
        record['transcription'] = english_text[position]

    return audio_data_with_gender

# Pair each audio record with the CSV's 'englishText' entry at the same index.
# NOTE(review): this assumes the audio list order matches the CSV row order.
audio_data_with_transcription = add_transcription_to_audio_data(audio_data_with_gender, dataset['train']['englishText'])

# Print the first record to verify the pairing by eye
print(audio_data_with_transcription[0])

In [None]:
from datasets import Dataset, concatenate_datasets

batch_size = 100  # Adjust this based on your available memory

# Convert the records to Datasets one slice at a time, then join the pieces with
# a single concatenate call instead of re-concatenating the growing dataset on
# every iteration.
batch_datasets = [
    Dataset.from_list(audio_data_with_transcription[start:start + batch_size])
    for start in range(0, len(audio_data_with_transcription), batch_size)
]

# As before, `dataset` stays None when there are no records at all.
dataset = concatenate_datasets(batch_datasets) if batch_datasets else None

In [None]:
# Display the dataset assembled from the audio records.
dataset

In [None]:
import os
import torch
from speechbrain.pretrained import EncoderClassifier

# Name of the pretrained speaker-embedding model to load.
spk_model_name = "speechbrain/spkrec-xvect-voxceleb"

# Run the speaker encoder on the GPU when one is available, otherwise on CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    run_opts={"device": device},
    # NOTE(review): "/tmp" is a POSIX-style path while the rest of the notebook
    # uses Windows paths — confirm where this resolves on the target machine.
    savedir=os.path.join("/tmp", spk_model_name),
)


def create_speaker_embedding(waveform):
    """Compute a normalized speaker embedding for one waveform.

    The waveform is converted to a tensor, encoded by the module-level
    ``speaker_model``, normalized along dimension 2, squeezed, and returned as a
    NumPy array on the CPU. Gradients are disabled for the whole computation.

    Args:
        waveform: Audio samples accepted by ``torch.tensor``.

    Returns:
        The squeezed, normalized embedding as a NumPy array.
    """
    with torch.no_grad():
        encoded = speaker_model.encode_batch(torch.tensor(waveform))
        unit_norm = torch.nn.functional.normalize(encoded, dim=2)
        return unit_norm.squeeze().cpu().numpy()

In [None]:
def prepare_dataset(example):
    """Turn one raw record into the features used for training.

    The module-level ``processor`` is called with the record's transcription as
    text and its audio array as the target. The first element of the returned
    ``labels`` replaces ``labels`` (removing the batch dimension), and a speaker
    embedding computed from the same audio array is added.

    Args:
        example: Mapping with ``transcription`` and ``audio`` (a mapping holding
            ``array`` and ``sampling_rate``).

    Returns:
        The processor output with ``labels`` unbatched and ``speaker_embeddings`` added.
    """
    audio = example["audio"]
    waveform = audio["array"]

    features = processor(
        text=example["transcription"],
        audio_target=waveform,
        sampling_rate=audio["sampling_rate"],
        return_attention_mask=False,
    )

    # The processor returns labels with a leading batch axis; keep the single item.
    features["labels"] = features["labels"][0]

    # x-vector computed by the SpeechBrain speaker model
    features["speaker_embeddings"] = create_speaker_embedding(waveform)

    return features

In [None]:
# Smoke-test the preprocessing on the first record and list the keys it produced.
processed_example = prepare_dataset(dataset[0])
list(processed_example.keys())

In [None]:
# Check the shape of the speaker embedding computed for the sample record.
processed_example["speaker_embeddings"].shape

In [None]:
import matplotlib.pyplot as plt

# Visual sanity check: show the sample's "labels" array, transposed, as an image.
plt.figure()
plt.imshow(processed_example["labels"].T)
plt.show()

In [None]:
# Preprocess every record; the original columns are dropped so that only the
# keys returned by prepare_dataset remain.
dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names)

In [None]:
def is_not_too_long(input_ids):
    """Return True when ``input_ids`` holds fewer than 200 elements."""
    return len(input_ids) < 200


# Keep only examples whose input_ids have fewer than 200 entries, then show how
# many examples remain.
dataset3 = dataset.filter(is_not_too_long, input_columns=["input_ids"])
len(dataset3)

In [None]:
# Hold out 10% of the examples for evaluation. The fixed seed makes the shuffle,
# and therefore the train/test membership, reproducible across kernel restarts.
dataset3 = dataset3.train_test_split(test_size=0.1, seed=42)

In [None]:
from dataclasses import dataclass
from typing import Any, Dict, List, Union


@dataclass
class TTSDataCollatorWithPadding:
    """Collate processed TTS examples into one padded training batch.

    Besides its own ``processor``, ``__call__`` reads the notebook-level
    ``model`` to obtain ``model.config.reduction_factor``.
    """

    # Object whose ``pad(input_ids=..., labels=..., return_tensors="pt")`` builds
    # the padded batch.
    processor: Any

    def __call__(
        self, features: List[Dict[str, Union[List[int], torch.Tensor]]]
    ) -> Dict[str, torch.Tensor]:
        """Pad and combine ``features`` into a single batch.

        Args:
            features: Examples, each providing ``input_ids``, ``labels`` and
                ``speaker_embeddings``.

        Returns:
            The padded batch with ``labels`` masked to -100 at padded positions,
            ``decoder_attention_mask`` removed, and ``speaker_embeddings`` added
            as a tensor.
        """
        input_ids = [{"input_ids": feature["input_ids"]} for feature in features]
        label_features = [{"input_values": feature["labels"]} for feature in features]
        speaker_features = [feature["speaker_embeddings"] for feature in features]

        # Collate inputs and targets with the processor this collator was built
        # with, not with whatever global happens to be named `processor`.
        batch = self.processor.pad(
            input_ids=input_ids, labels=label_features, return_tensors="pt"
        )

        # replace padding with -100 to ignore loss correctly
        batch["labels"] = batch["labels"].masked_fill(
            batch.decoder_attention_mask.unsqueeze(-1).ne(1), -100
        )

        # not used during fine-tuning
        del batch["decoder_attention_mask"]

        # Round each target length down to a multiple of the reduction factor and
        # trim the labels to the longest rounded length.
        reduction_factor = model.config.reduction_factor
        if reduction_factor > 1:
            max_length = max(
                length - length % reduction_factor
                for length in (len(feature["input_values"]) for feature in label_features)
            )
            batch["labels"] = batch["labels"][:, :max_length]

        # also add in the speaker embeddings
        batch["speaker_embeddings"] = torch.tensor(speaker_features)

        return batch

In [None]:
# Build the collator that the trainer uses to assemble batches.
data_collator = TTSDataCollatorWithPadding(processor=processor)

In [None]:
import os

from huggingface_hub import login

# Never commit an access token to a notebook. The token is read from the
# HF_TOKEN environment variable; when it is unset, `token=None` makes `login`
# prompt for it interactively instead.
# NOTE(review): a token was previously hard-coded in this cell and must be
# treated as leaked — revoke it in the Hugging Face account settings.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
from functools import partial

# disable cache during training since it's incompatible with gradient checkpointing
model.config.use_cache = False

# re-enable the cache for generation only: every later call to `model.generate`
# gets `use_cache=True` pre-bound through functools.partial
model.generate = partial(model.generate, use_cache=True)

In [None]:
from transformers import Seq2SeqTrainingArguments

# Fine-tuning configuration. Each optimizer step accumulates 8 batches of 4
# examples per device (32 examples per device per step).
training_args = Seq2SeqTrainingArguments(
    output_dir="speecht5_finetuned_hindi_mono",  
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=4000,
    gradient_checkpointing=True,
    fp16=True,
    # evaluate and save on the same 1000-step schedule
    eval_strategy="steps",
    per_device_eval_batch_size=2,
    save_steps=1000,
    eval_steps=1000,
    logging_steps=25,
    report_to=["tensorboard"],
    # the best checkpoint is reloaded at the end; lower metric values count as better
    load_best_model_at_end=True,
    greater_is_better=False,
    label_names=["labels"],
    push_to_hub=True,
)

In [None]:
# %pip installs into the environment of the running kernel, which a `!pip`
# shell call does not guarantee.
%pip install tf-keras

In [None]:
from transformers import Seq2SeqTrainer

# Wire the model, the train/test splits and the padding collator into a trainer.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset3["train"],
    eval_dataset=dataset3["test"],
    data_collator=data_collator,
    # the full processor is passed in the `tokenizer` slot
    tokenizer=processor,
)

In [None]:
# Run fine-tuning with the trainer configured in the previous cell.
trainer.train()

In [None]:
# Upload the training result to the Hugging Face Hub (requires the login cell to have run).
trainer.push_to_hub()

In [None]:
# NOTE(review): leftover debug print with no role in the pipeline; candidate for removal.
print("hola")