In [1]:
import sys
sys.path.insert(0, "../Src/")
import generateVoices as gv
import evaluateVoices as ev
import pandas as pd
import os
from os.path import exists
import json

%autosave 5

Autosaving every 5 seconds


Autosaving every 5 seconds


Autosaving every 5 seconds


In [2]:
# df = pd.read_csv("../Data/test_data.csv")
# df

In [3]:
def getBestModelName():
    """Return the name of the model with the lowest average word error rate.

    Reads ``../Data/wordErrorRateResults.csv`` (relative to the current
    working directory), orders its rows by ascending ``averageWER`` and
    returns the ``model`` value of the first row.

    Raises:
        IndexError: if the results file contains no rows.
    """
    werTable = pd.read_csv("../Data/wordErrorRateResults.csv")
    rankedByWer = werTable.sort_values(by="averageWER")
    # After the ascending sort the first row holds the smallest averageWER.
    return rankedByWer["model"].iloc[0]

In [4]:
def getSourceVoice(speaker,speakerDf,datasetName="train"):
    """Return the path of the speaker's concatenated source-voice WAV file.

    The file is looked up under ``../Data/concatenatedInputs`` when
    ``datasetName`` is ``"train"`` and under ``../Data/concatenatedTestInputs``
    for any other value. If it is missing, ``gv.concatenateAudio`` is called
    once (presumably to build it -- confirm against ``generateVoices``) and the
    path is checked again.

    Args:
        speaker: speaker identifier used as the WAV file stem.
        speakerDf: per-speaker data forwarded unchanged to ``gv.concatenateAudio``.
        datasetName: dataset selector; defaults to ``"train"``.

    Returns:
        The file path as a string, or ``None`` when the file still does not
        exist after the concatenation attempt.
    """
    voicePath = (
        f"../Data/concatenatedInputs/{speaker}.wav"
        if datasetName == "train"
        else f"../Data/concatenatedTestInputs/{speaker}.wav"
    )
    if not exists(voicePath):
        # Not cached yet: ask the generator module to produce it, then re-check.
        gv.concatenateAudio(speaker, speakerDf,datasetName)
        if not exists(voicePath):
            print(f"Source voice file for {speaker} not found after concatenation.")
            return None
    return voicePath

In [5]:
def getFADJson(datasetName):
    """Load the fake-audio manifest stored for ``datasetName``.

    Args:
        datasetName: dataset label used in the file name
            ``../Data/fad_{datasetName}.json``.

    Returns:
        The decoded JSON content of that file, or an empty list when the file
        does not exist.
    """
    manifestPath = f"../Data/fad_{datasetName}.json"
    if not exists(manifestPath):
        return []
    with open(manifestPath, "r") as handle:
        return json.load(handle)

In [6]:
def saveFADJson(datasetName, fadJson):
    """Write the fake-audio manifest for ``datasetName`` to disk as JSON.

    Args:
        datasetName: dataset label used in the file name
            ``../Data/fad_{datasetName}.json``.
        fadJson: JSON-serialisable object to store; any existing file at that
            path is overwritten.
    """
    manifestPath = f"../Data/fad_{datasetName}.json"
    with open(manifestPath, "w") as handle:
        json.dump(fadJson, handle)

In [7]:
def createFakeAudio(speakerDf, sourceVoice, modelName, datasetName):
    """Generate a synthetic clip for each transcript of one speaker.

    For every row of ``speakerDf`` whose ``path_from_data_dir`` contains the
    literal text ``.TXT``, a fake WAV is written under ``../Data/fakeAudio/``
    and two records (fake and real) are appended to the dataset manifest,
    which is saved after each generated clip. Transcripts whose fake WAV
    already exists are skipped.

    Args:
        speakerDf: DataFrame with a ``path_from_data_dir`` column.
        sourceVoice: path of the reference voice passed to
            ``gv.generateAndNormalizeAudio``.
        modelName: model identifier passed to ``gv.generateTTS``.
        datasetName: dataset label selecting the manifest file.

    Raises:
        FileNotFoundError: if the matching real recording is missing, or if
            the fake audio file was not produced by the generator.
    """
    fadJson = getFADJson(datasetName)
    # regex=False: '.TXT' must be matched literally; as a regular expression
    # the leading dot would match any character (e.g. 'ATXT').
    isTextFile = speakerDf['path_from_data_dir'].str.contains('.TXT', regex=False, na=False)
    textFileNames = speakerDf.loc[isTextFile, 'path_from_data_dir'].tolist()

    # Built on first use and then reused: modelName does not change inside the
    # loop, and a run where every clip already exists never builds it at all.
    model = None

    for textFileName in textFileNames:
        realFile = f"../Data/data/{textFileName.replace('.TXT', '.WAV.wav')}"
        fakeAudioFile = f"../Data/fakeAudio/{textFileName.replace('.TXT', '.wav')}"
        if exists(fakeAudioFile):
            print(f"Fake audio file {fakeAudioFile} already exists.")
            continue
        # Check the real recording BEFORE generating anything. Otherwise a
        # missing real file would leave a fake WAV on disk that is not in the
        # manifest, and later runs would skip it as "already exists".
        if not exists(realFile):
            raise FileNotFoundError(f"Failed to find real audio file: {realFile}")
        os.makedirs(os.path.dirname(fakeAudioFile), exist_ok=True)
        if model is None:
            model = gv.generateTTS(modelName)
        sentence = gv.readSentenceFromFile(textFileName)
        gv.generateAndNormalizeAudio(model, sentence, sourceVoice, fakeAudioFile)
        if not exists(fakeAudioFile):
            raise FileNotFoundError(f"Failed to create fake audio file: {fakeAudioFile}")
        fadJson.append({
            "file": fakeAudioFile,
            "text": sentence,
            "isFake": True,
        })
        fadJson.append({
            "file": realFile,
            "text": sentence,
            "isFake": False,
        })
        # Persist after every clip so an interrupted run keeps its progress.
        saveFADJson(datasetName,fadJson)



In [8]:
def prepareFADData(modelName,datasetName):
    """Build the fake/real audio manifest for one dataset and export it as CSV.

    Reads the dataset index through ``gv.readCsv``, and for every speaker
    returned by ``gv.getSpeakers`` resolves a source voice and calls
    ``createFakeAudio``. Speakers without a source voice are skipped. The
    resulting manifest is de-duplicated, written to
    ``../Data/fad_{datasetName}.csv`` and returned.

    Args:
        modelName: model identifier forwarded to ``createFakeAudio``.
        datasetName: dataset label (``main`` passes ``'train'`` and ``'test'``).

    Returns:
        pandas.DataFrame with one row per distinct manifest record.
    """
    print(f"Creating voices for {modelName} on {datasetName} dataset")

    df = gv.readCsv(datasetName)
    speakers = gv.getSpeakers(df)
    for speaker in speakers:
        speakerDf = gv.getFilesBySpeaker(df, speaker)
        sourceVoice = getSourceVoice(speaker, speakerDf,datasetName)
        if sourceVoice is None:
            print(f"Skipping {speaker} due to missing source voice.")
            continue
        createFakeAudio(speakerDf, sourceVoice, modelName, datasetName)
    # The manifest is a list of dicts. Dicts are unhashable, so de-duplicating
    # through set() raises TypeError as soon as the list is non-empty.
    # drop_duplicates() removes identical rows and keeps the original order.
    voiceDF = pd.DataFrame(getFADJson(datasetName)).drop_duplicates().reset_index(drop=True)
    voiceDF.to_csv(f"../Data/fad_{datasetName}.csv", index=False)
    return voiceDF



In [9]:
def main():
    """Prepare the train and test manifests with the best model and preview them.

    Both datasets are processed first (train, then test); afterwards the head
    of each resulting DataFrame is displayed in the same order.
    """
    modelName = getBestModelName()
    manifests = [prepareFADData(modelName, label) for label in ('train', 'test')]
    for manifest in manifests:
        display(manifest.head())
    print('done')

In [None]:
# Entry point: run the whole pipeline only when this code executes as the
# top-level program (__name__ is '__main__'), not when it is imported.
if __name__ == '__main__':
    main()

Creating voices for tts_models/en/ljspeech/vits--neon on train dataset
 > tts_models/en/ljspeech/vits--neon is already downloaded.
 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
Generating audio for sentence: So rules we made, in unabashed collusion.
 > Text splitted to sentences.