In [None]:
import os
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import StyleTTSForConditionalGeneration, StyleTTSConfig, Trainer, TrainingArguments
from transformers import Wav2Vec2Processor

In [None]:
class LJSpeechDataset(Dataset):
    def __init__(self, folder_path):
        self.folder_path = folder_path
        self.metadata = pd.read_csv(os.path.join(folder_path, "metadata.csv"), sep="|", header=None, names=["audio_path", "text"])

    def __len__(self):
        return len(self.metadata)

    def __getitem__(self, idx):
        audio_path = os.path.join(self.folder_path, "wavs", self.metadata.iloc[idx]["audio_path"])
        text = self.metadata.iloc[idx]["text"]
        return audio_path, text


In [None]:
# Initialize the dataset
dataset = LJSpeechDataset(r"C:\Users\Abhishek A\Desktop\AI&ML\NLP Projects\Style TTS\StyleTTS2-main\LJSpeech")

# Initialize the StyleTTS model
config = StyleTTSConfig.from_pretrained("tugstugi/style-speech")
model = StyleTTSForConditionalGeneration.from_pretrained("tugstugi/style-speech", config=config)

# Initialize the Wav2Vec2 processor
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")

In [2]:

# Define a function to preprocess the data
def preprocess_data(batch):
    audio_paths, texts = zip(*batch)
    inputs = processor(texts, return_tensors="pt", padding=True, truncation=True, max_length=512, return_tensors="pt")
    labels = processor(audio_paths, return_tensors="pt", padding=True, truncation=True, max_length=512, return_tensors="pt")
    return {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "labels": labels["input_values"],
        "labels_attention_mask": labels["attention_mask"],
    }

# Preprocess the data
train_dataset = dataset
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=preprocess_data)

# Define the training arguments
training_args = TrainingArguments(
    output_dir="./output",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    logging_dir="./logs",
)

SyntaxError: keyword argument repeated: return_tensors (3828263169.py, line 4)

In [None]:

# Define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=preprocess_data,
)

# Start training
trainer.train()