In [1]:
import warnings
# Silence every warning category for the rest of the session (applies
# globally, including warnings raised by the libraries imported below).
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from os.path import exists
from pydub import AudioSegment
from TTS.api import TTS
from TTS.utils.manage import ModelManager
import os
import torch
from TTS.utils.radam import RAdam
import numpy.core.multiarray
import shutil
import json

# Allow-list these two types for torch's restricted (weights_only) unpickler.
# NOTE(review): presumably required by the TTS checkpoints loaded later - confirm.
torch.serialization.add_safe_globals([RAdam, numpy.core.multiarray.scalar])

import sys
# Put ../Src/ first on the module search path; nothing in the visible cells
# imports from it, so this is only needed if project modules are used elsewhere.
sys.path.insert(0, "../Src/")

%autosave 5

Autosaving every 5 seconds


In [2]:
#dataSet = pd.read_csv('../Data/train_data.csv')
#dataSet[dataSet['speaker_id'] == 'MMDM0']

In [3]:
def readCsv(dataset):
    """Load the CSV index of one dataset split.

    Args:
        dataset: Split name used as the file-name prefix; ``'train'``
            resolves to ``../Data/train_data.csv``.

    Returns:
        pandas.DataFrame: The parsed contents of the CSV file.
    """
    csvPath = f'../Data/{dataset}_data.csv'
    indexFrame = pd.read_csv(csvPath)
    return indexFrame

In [4]:
def getSpeakers(df):
    """Return the distinct speaker IDs of a dataset index in a stable order.

    The IDs are returned in order of first appearance in ``df``. Building the
    list from a ``set`` would give an order that can change between kernel
    runs for string IDs, which in turn changes how speakers are grouped into
    batches. Missing IDs (NaN/None) are dropped because they cannot match any
    row in an equality filter.

    Args:
        df: DataFrame with a ``speaker_id`` column.

    Returns:
        list: Unique, non-missing speaker IDs, first occurrence first.
    """
    speakerIds = df['speaker_id'].dropna()
    return speakerIds.drop_duplicates().tolist()

In [5]:
def getFilesBySpeaker(df,speakerId):
    """Select the rows of ``df`` that belong to one speaker.

    Args:
        df: DataFrame with a ``speaker_id`` column.
        speakerId: Value compared for equality against ``speaker_id``.

    Returns:
        pandas.DataFrame: The matching rows with their original index labels
        (empty when no row matches).
    """
    belongsToSpeaker = df['speaker_id'] == speakerId
    return df.loc[belongsToSpeaker]

In [6]:
def concatenateAudio(speakerId, speakerDF):
    """Join all ``.wav`` recordings of one speaker into a single file.

    Each clip is read from ``../Data/data/<path_from_data_dir>`` and followed
    by 1000 ms of silence. The result is written to
    ``../Data/concatenatedInputs/<speakerId>.wav``. If that file already
    exists, the new clips are appended to its current content.

    Args:
        speakerId: Speaker identifier; used for the output file name.
        speakerDF: Rows for this speaker with ``filename`` and
            ``path_from_data_dir`` columns.

    Returns:
        None. Problems are reported with ``print``; clips that fail to load
        are skipped.
    """
    if speakerDF.empty:
        print(f"Empty DataFrame for speaker {speakerId}. Skipping.")
        return

    # Only rows whose file name ends in lowercase '.wav' are audio inputs.
    isWavRow = speakerDF['filename'].str.endswith('.wav', na=False)
    wavRows = speakerDF[isWavRow]
    if wavRows.empty:
        print(f"No .wav files for speaker {speakerId}. Skipping.")
        return

    targetPath = f'../Data/concatenatedInputs/{speakerId}.wav'
    # Continue from an earlier export when one is present.
    if exists(targetPath):
        combined = AudioSegment.from_wav(targetPath)
    else:
        combined = AudioSegment.empty()

    for relativePath in wavRows['path_from_data_dir']:
        try:
            clip = AudioSegment.from_wav(f'../Data/data/{relativePath}')
            paddedClip = clip + AudioSegment.silent(duration=1000)
            combined = combined + paddedClip
        except Exception as e:
            print(f"Failed to load {relativePath}: {e}")

    if len(combined) == 0:
        print(f"No valid audio for speaker {speakerId}, nothing exported.")
        return
    combined.export(targetPath, format='wav')
    

In [7]:
def readSentenceFromFile(sentenceFile):
    """Read a transcript file and return its text without the two leading fields.

    The file content is stripped, split on single spaces, and everything from
    the third token onward is joined back with single spaces.

    Args:
        sentenceFile: Path of the transcript relative to ``../Data/data/``.

    Returns:
        str | None: The sentence text (possibly empty when the file has fewer
        than three tokens), or ``None`` if the file could not be read.
    """
    try:
        sentencePath = f'../Data/data/{sentenceFile}'
        with open(sentencePath, 'r') as handle:
            tokens = handle.read().strip().split(" ")
        sentenceTokens = tokens[2:]
        return " ".join(sentenceTokens)
    except Exception as e:
        print(f"Failed to read sentence file {sentenceFile}: {e}")
        return None




In [8]:
def generateAndNormalizeAudio(tts, sentence, inputAudioFile, outputAudioFile):
    """Synthesize ``sentence`` in the reference voice and match its loudness.

    The audio is produced with ``tts.tts_with_vc_to_file`` using
    ``inputAudioFile`` as the speaker reference. The generated file is then
    gain-adjusted so that its dBFS equals that of the reference and written
    back to ``outputAudioFile``.

    If either file is digital silence, its dBFS is ``-inf`` and the computed
    gain is infinite or NaN. Applying such a gain would either zero out the
    generated audio or fail after the file was already written, so in that
    case normalization is skipped and the generated file is kept as is.

    Args:
        tts: Object exposing ``tts_with_vc_to_file(text, file_path, speaker_wav)``.
        sentence: Text to synthesize.
        inputAudioFile: Path of the reference ``.wav``.
        outputAudioFile: Path the generated ``.wav`` is written to.

    Returns:
        bool: ``True`` if the output file was generated (normalized when a
        finite gain could be computed), ``False`` if generation or
        normalization failed. Errors are printed, never raised.
    """
    try:
        print(f"Generating audio for sentence: {sentence}")
        tts.tts_with_vc_to_file(
            text=sentence,
            file_path=outputAudioFile,
            speaker_wav=inputAudioFile
        )
        if not exists(outputAudioFile):
            print(f"Warning: Output file {outputAudioFile} was not created.")
            return False

        # Normalize the generated audio
        original_audio = AudioSegment.from_wav(inputAudioFile)
        generated_audio = AudioSegment.from_wav(outputAudioFile)

        gain = original_audio.dBFS - generated_audio.dBFS
        if not np.isfinite(gain):
            # Silent reference or silent output: no meaningful gain exists.
            print(
                f"Warning: Cannot normalize {outputAudioFile} "
                f"(gain is {gain} dB). Leaving generated audio unchanged."
            )
            return True
        normalized_audio = generated_audio.apply_gain(gain)

        # Export the normalized audio back to the same file
        normalized_audio.export(outputAudioFile, format='wav')
        return True
    except Exception as e:
        print(f"Failed to generate or normalize audio: {e}")
        return False

In [9]:
def saveGeneratedSentences(speakerSentences, model_dir_name, modelDirectory):
    """Write the generated-sentence records of one model to a CSV file.

    The target is ``../Data/ttsOutputs/<model_dir_name>_generatedSentences.csv``.
    Nothing is written when ``speakerSentences`` is empty. If writing fails,
    the error is printed and ``modelDirectory`` is removed recursively.
    Afterwards a warning is printed whenever the CSV does not exist.

    Args:
        speakerSentences: Records to save (one mapping per generated sentence).
        model_dir_name: Model name with path separators already replaced.
        modelDirectory: Output directory of the model; deleted on write failure.

    Returns:
        None.
    """
    csvPath = f'../Data/ttsOutputs/{model_dir_name}_generatedSentences.csv'

    if speakerSentences:
        try:
            sentenceTable = pd.DataFrame(speakerSentences)
            sentenceTable.to_csv(csvPath, index=False)
        except Exception as e:
            print(f"Failed to save generated sentences: {e}")
            # Drop the model's outputs so they do not outlive their sentence log.
            shutil.rmtree(modelDirectory, ignore_errors=True)

    csvWasWritten = exists(csvPath)
    if not csvWasWritten:
        print(f"Warning: Output file {csvPath} was not created.")

In [10]:
def loadGeneratedSentencesFromJson(speakerSentencesPath):
    """Load previously saved generated-sentence records, tolerating bad files.

    The progress file is rewritten after every generated sentence, so an
    interrupted run can leave it truncated. Instead of aborting the resume
    with a decode error, an unreadable or malformed file is reported and
    treated as "no records yet", like the other helpers in this file that
    print and continue.

    Args:
        speakerSentencesPath: Path of the JSON progress file.

    Returns:
        list: The stored records, or an empty list when the file is missing,
        cannot be read or parsed, or does not contain a JSON array.
    """
    if not exists(speakerSentencesPath):
        return []

    try:
        with open(speakerSentencesPath, 'r') as f:
            speakerSentences = json.load(f)
    except (OSError, ValueError) as e:
        # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
        print(f"Failed to load generated sentences from {speakerSentencesPath}: {e}")
        return []

    if not isinstance(speakerSentences, list):
        # Callers append to the result, so anything but a list is unusable.
        print(f"Ignoring {speakerSentencesPath}: expected a JSON list of records.")
        return []
    return speakerSentences

In [11]:
def generateTTS(model):
    """Create a CPU ``TTS`` instance for ``model`` with full unpickling enabled.

    While the model is constructed, ``torch.load`` is temporarily replaced by
    a wrapper that forces ``weights_only=False``; the original function is
    restored afterwards in every case.

    The wrapper overrides any ``weights_only`` value supplied by the caller.
    Passing the keyword a second time next to ``**kwargs`` would raise
    ``TypeError`` whenever the loading code sets ``weights_only`` itself.

    SECURITY: ``weights_only=False`` lets checkpoints execute arbitrary code
    during unpickling. Only load models from sources you trust.

    Args:
        model: Model name passed to ``TTS`` as ``model_name``.

    Returns:
        The constructed ``TTS`` object, or ``None`` if loading failed (the
        error is printed).
    """
    # Captured before the try block so the finally clause can always restore it.
    original_torch_load = torch.load

    def _load_full_checkpoint(*args, **kwargs):
        kwargs['weights_only'] = False
        return original_torch_load(*args, **kwargs)

    torch.load = _load_full_checkpoint
    try:
        return TTS(model_name=model, progress_bar=False, gpu=False)
    except Exception as e:
        print(f"Failed to load TTS model {model}: {e}")
        return None
    finally:
        torch.load = original_torch_load  # Restore the original torch.load

In [12]:
def _chooseSentenceForSpeaker(speaker, trainDF):
    """Pick one random transcript sentence for ``speaker``.

    Returns the sentence text, or ``None`` (after printing the reason) when
    the speaker has no rows, no transcript paths containing ``.TXT``, or the
    chosen transcript yields no text.
    """
    speakerDF = getFilesBySpeaker(trainDF, speaker)
    if speakerDF.empty:
        print(f"No data for speaker {speaker}. Skipping.")
        return None

    # regex=False: match the literal text '.TXT'. As a regular expression the
    # dot would match any character (e.g. 'XTXT').
    isTranscript = speakerDF['path_from_data_dir'].str.contains('.TXT', regex=False, na=False)
    textFiles = speakerDF[isTranscript]
    if textFiles.empty:
        print(f"No valid sentences for speaker {speaker}. Skipping.")
        return None

    # NOTE: the choice is unseeded, so a re-run may select a different sentence.
    chosenSentenceFile = np.random.choice(textFiles['path_from_data_dir'])
    chosenSentence = readSentenceFromFile(chosenSentenceFile)
    if not chosenSentence:
        print(f"Chosen sentence for speaker {speaker} is empty. Skipping.")
        return None
    return chosenSentence


def generateAudioInBatches(speakers, trainDF, batch_size=10):
    """Generate one voice-cloned sentence per speaker for every English model.

    For each model name containing ``/en/`` the speakers are processed in
    slices of ``batch_size``. A fresh TTS instance is created per batch, but
    only when the batch still has speakers without an output file. If the
    model cannot be loaded, the remaining batches of that model are skipped
    instead of retrying the same failing load for every batch.

    Outputs go to ``../Data/ttsOutputs/<model>/<speaker>.wav``. Progress is
    written to ``<model>_generatedSentences.json`` after every generated
    sentence and to the matching CSV after every batch, so an interrupted run
    can be resumed.

    Args:
        speakers: Sliceable sequence (e.g. list) of speaker IDs.
        trainDF: Index DataFrame with ``speaker_id`` and ``path_from_data_dir``.
        batch_size: Number of speakers handled per TTS instance.

    Returns:
        None. All problems are reported with ``print``.
    """
    if trainDF.empty or not speakers:
        print("Empty dataset or no speakers provided. Exiting.")
        return

    manager = ModelManager()
    englishModels = [model for model in manager.list_models() if "/en/" in model]

    if not englishModels:
        print("No English models found. Exiting.")
        return

    for model in englishModels:
        print(f"Processing model: {model}")
        model_dir_name = model.replace("/", "_")
        modelDirectory = f'../Data/ttsOutputs/{model_dir_name}'
        os.makedirs(modelDirectory, exist_ok=True)
        speakerSentencesPath = f'../Data/ttsOutputs/{model_dir_name}_generatedSentences.json'
        speakerSentences = loadGeneratedSentencesFromJson(speakerSentencesPath)

        # Process speakers in batches
        for i in range(0, len(speakers), batch_size):
            batch = speakers[i:i + batch_size]
            print(f"Processing batch {i // batch_size + 1}: {batch}")

            # Find the speakers that still need audio before paying for a model load.
            pendingSpeakers = []
            for speaker in batch:
                outputFilePath = f'{modelDirectory}/{speaker}.wav'
                if exists(outputFilePath):
                    print(f"Output file {outputFilePath} already exists. Skipping.")
                else:
                    pendingSpeakers.append(speaker)

            if pendingSpeakers:
                # Generate a fresh TTS instance for each batch
                tts = generateTTS(model)
                if tts is None:
                    print(f"Failed to generate TTS for model {model}. Skipping.")
                    break  # the same load would fail for every later batch

                for speaker in pendingSpeakers:
                    chosenSentence = _chooseSentenceForSpeaker(speaker, trainDF)
                    if chosenSentence is None:
                        continue

                    audioFile = f'../Data/concatenatedInputs/{speaker}.wav'
                    if not exists(audioFile):
                        print(f"Audio file for {speaker} does not exist. Skipping.")
                        continue

                    outputFilePath = f'{modelDirectory}/{speaker}.wav'
                    if generateAndNormalizeAudio(tts, chosenSentence, audioFile, outputFilePath):
                        speakerSentences.append({'speakerId': speaker, 'generatedSentence': chosenSentence})
                        # Persist after every success so an interrupted run can resume.
                        with open(speakerSentencesPath, 'w') as f:
                            json.dump(speakerSentences, f)

            saveGeneratedSentences(speakerSentences, model_dir_name, modelDirectory)


In [13]:
def main():
    """Run the whole pipeline: build reference audio, then generate TTS output.

    1. Read the ``train`` index and collect its speakers.
    2. Make sure every speaker has a concatenated reference file under
       ``../Data/concatenatedInputs``. The check is done per speaker file
       rather than once for the directory, so a run that was interrupted
       while building references is completed on the next call. Existing
       files are left untouched, because ``concatenateAudio`` would append
       to them again.
    3. Generate audio for all speakers in batches of 10.

    Returns:
        None.
    """
    trainDF = readCsv('train')
    speakers = getSpeakers(trainDF)

    os.makedirs('../Data/concatenatedInputs', exist_ok=True)
    for speaker in speakers:
        if exists(f'../Data/concatenatedInputs/{speaker}.wav'):
            continue  # reference already built in an earlier run
        speakerDF = getFilesBySpeaker(trainDF, speaker)
        concatenateAudio(speaker, speakerDF)

    os.makedirs('../Data/ttsOutputs/', exist_ok=True)

    # Process speakers in batches
    generateAudioInBatches(speakers, trainDF, batch_size=10)

    print("done")
    
    

In [None]:
# Entry point: start the full pipeline only when this code is executed as the
# top-level module, not when it is imported from elsewhere.
if __name__ == '__main__':
    main()


 Name format: type/language/dataset/model
 1: tts_models/multilingual/multi-dataset/xtts_v2
 2: tts_models/multilingual/multi-dataset/xtts_v1.1
 3: tts_models/multilingual/multi-dataset/your_tts
 4: tts_models/multilingual/multi-dataset/bark
 5: tts_models/bg/cv/vits
 6: tts_models/cs/cv/vits
 7: tts_models/da/cv/vits
 8: tts_models/et/cv/vits
 9: tts_models/ga/cv/vits
 10: tts_models/en/ek1/tacotron2 [already downloaded]
 11: tts_models/en/ljspeech/tacotron2-DDC [already downloaded]
 12: tts_models/en/ljspeech/tacotron2-DDC_ph [already downloaded]
 13: tts_models/en/ljspeech/glow-tts [already downloaded]
 14: tts_models/en/ljspeech/speedy-speech [already downloaded]
 15: tts_models/en/ljspeech/tacotron2-DCA [already downloaded]
 16: tts_models/en/ljspeech/vits [already downloaded]
 17: tts_models/en/ljspeech/vits--neon [already downloaded]
 18: tts_models/en/ljspeech/fast_pitch [already downloaded]
 19: tts_models/en/ljspeech/overflow [already downloaded]
 20: tts_models/en/ljspeech/