In [None]:
#### Installing Necessary Models And Tools.  Other Libraries and Dependencies may be imported elsewhere as needed.
#### This needs to be run before everything else, and it does not need to be run more than once.  Consider this
#### the equivalent of booting up a desktop / laptop.
!pip install TTS
!pip install pydub
import IPython
from pydub import AudioSegment
# Sometimes the TTS Install throws weird errors about NumPY.  They're largely irrelevant.  Rerunning this panel will
# clear them.

In [None]:
#### This needs to be run second.  It will only ever need to be rerun if the model is changed.
#### While this specific notebook is written to use the TTS model, if you have a different model
#### the code to use it should be close to this.  Probably.  Feel free to see what happens.
import torch
from TTS.api import TTS

# Choose the compute device: "cuda" when torch reports a usable GPU, otherwise "cpu".
if torch.cuda.is_available():
  device = "cuda"
else:
  device = "cpu"

# Uncomment to print the list of available 🐸TTS models.
#print(TTS().list_models())

# Load the XTTS v2 model and move it onto the selected device.
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)


In [None]:
#### This needs to be run once at startup, and then again any time variables are changed.
#### Variables are cAsE sEnSiTiVe.
#### It is assumed that you have mounted a Google Drive, with a directory structure matching all the folders below.  You can use whatever
#### directory structure you want, but you'll have to change it here.  Google Drives are NOT mounted by default.  If you are unsure how to
#### do that, a quick search on your favorite engine will easily find you a very simple tutorial.

# Holds the base path to the folder where all bambi files are found.
# It must end with a slash: file names are appended to it directly.
# The voice-sample and output directories below are built from this value, so
# moving the whole project only requires changing this one line.
BambiPath ="/content/drive/MyDrive/bambi/"
# Holds the name of the file with the script.
BambiScript = "edited.txt"
# Holds the voice sample subdirectory (override with a full path if it lives elsewhere).
BambiVoiceDir = BambiPath + "Voice/Finished/"
# Holds the file name of the voice sample.
BambiVoice = "voice.wav"
# Holds the directory all generated files will be saved to (override with a full path if needed).
BambiCloned = BambiPath + "Voice/Cloned/"
# Holds the timecode file name.
BambiTimes = "timecode.txt"

In [None]:
###
# Checks for script file, and if it is not found it breaks.
# If the script has been changed or a new file has been uploaded, this must be
# run to load the new script into memory.
#
# Reads the script file into `script`, a list with one entry per line of the file.
# If the file cannot be opened or read, a hint is printed and the error is re-raised,
# so the notebook stops here instead of carrying on with a missing or stale `script`.
try:
  # The `with` block closes the file even when reading fails part-way through.
  with open(str(BambiPath)+str(BambiScript), "r") as file:
    # Puts the raw contents of the file in a variable.
    content = file.read()
  # Splits the contents into a list, using the line break as a delimiter.
  script = content.splitlines()
except OSError:
  # Only file-system errors are reported as "not found"; anything else surfaces unchanged.
  print("File Not Found.  BambiGenerator Going Offline.  Check All Paths And File Names, As Well As Ensuring They Exisit.")
  raise

In [None]:
###
import pydub
# Builds the voice-and-silence track from `script` and writes it to output1.mp3.
#
# Each script line is handled according to its first character:
#   $N    appends N milliseconds of silence to the track.
#   !N    records a snap/moan trigger positioned N milliseconds before the current end of the track.
#   other the line is sent to the TTS model and the generated speech is appended.
# If there is no script, this will fail.
#
# The arrays and counters below are reinitialised on every run of this cell.  Later cells depend on
# them and will fail if they are not populated.
#
# In-memory track that speech and silences are appended to.
bambispeech = AudioSegment.empty()
# Running length of the track in seconds, updated on every iteration.
total = 0
# One entry per scripted trigger: the silence (in seconds) to insert before that trigger on the trigger track.
triggers = []
# Script line numbers for which a voice clip was generated (line i is saved as <i>.wav).  Only used by the
# optional cell that logs clip lengths; it has no effect on the generated audio.
talkings = []
# Value of `total` at the most recent trigger.  The gap stored for the next trigger is measured from here.
lastrig = 0

for i, line in enumerate(script):
  # Prints the line of the script currently being generated.  Commented out by default.
  #print(line,i)
  if line.startswith("$"):
    # Silence: the number after the "$" is a duration in milliseconds.  Parsed once, used twice.
    pause_ms = int(line.split("$")[1].rstrip())
    bambispeech += AudioSegment.silent(duration=pause_ms)
    # Updates the expected length of the audio file, in seconds.
    total += pause_ms/1000
  elif line.startswith("!"):
    # Trigger: the number after the "!" is an offset in milliseconds.
    t = int(line.split("!")[1])
    # CURrentPOSition: where the trigger should start, i.e. the current track length less the offset.
    curpos = total - float(t/1000)
    # Stores the gap between the previous reference point and this trigger.
    triggers.append(curpos-lastrig)
    # NOTE(review): the reference point is set to `total`, not `curpos`.  Presumably the offset in the
    # script is meant to equal the length of the snap clip, so the clip ends at `total` - confirm.
    lastrig = total
  else:
    # Speech: periods are removed before the text is handed to the TTS model.
    deers = str(line).replace(".","")
    # Nothing left to speak (blank line, whitespace only, or only periods): skip the line.
    if not deers.strip():
      continue
    # Clips are named after the script line number: the first line corresponds to 0.wav, etc.
    # The file is saved wherever BambiCloned is set to, and any existing file with the same
    # name WILL BE OVERWRITTEN without checking.
    clip_path = str(BambiCloned)+str(i)+".wav"
    tts.tts_to_file(text=deers, speaker_wav=str(BambiVoiceDir)+str(BambiVoice), language="en", file_path=clip_path)
    # Load the generated clip once and reuse it for both the track and the running total.
    clip = AudioSegment.from_wav(clip_path)
    bambispeech += clip
    # Adds the time of the generated voice to total.
    total += clip.duration_seconds
    # Logs that an audio file was created.  Debugging purposes only.
    talkings.append(i)
# Writes the audio file of voice and silence to disk.
bambispeech.export(BambiCloned+"output1.mp3", format="mp3")
# Audio player loads output1.mp3 for manual review.  Can be commented out.  IPython is imported at initialization.  If it is not,
# it can be imported here by uncommenting the line below.
#import IPython
#IPython.display.Audio(BambiCloned+"output1.mp3")
###

In [None]:
###
# Commented out, prints values of the triggers array.  It is useful for debugging, but it does nothing.
#print(triggers)
#
# Builds the snap/moan trigger track and writes it to output2.mp3.  There is nothing else on this track.
# If snaps are appearing in the wrong places, adjusting the time in the script will move them.
#
# Creates an empty object in memory to build the trigger track in.
snaps = AudioSegment.empty()
if triggers:
  # Decode the trigger sound once and reuse it for every trigger.  When there are no triggers
  # the file is never opened.
  snap_clip = AudioSegment.from_mp3(BambiPath+"snapmoan.mp3")
  for gap in triggers:
    # Each entry of `triggers` is a gap in seconds; silent() takes milliseconds.
    snaps += AudioSegment.silent(duration=(gap*1000))
    snaps += snap_clip
# Pads the track with silence up to the length recorded for the voice track (`total`, in seconds).
snaps += AudioSegment.silent(duration=(total-snaps.duration_seconds)*1000)
# Writes the track to disk.
snaps.export(BambiCloned+"output2.mp3", format="mp3")
# If the track needs to be reviewed for any reason, uncommenting the next line will enable
# IPython to load an audio player with the file.
#IPython.display.Audio(BambiCloned+"output2.mp3")

In [None]:
###
# Commented out, prints values of the triggers array.  It is useful for debugging, but it does nothing.
#print(triggers)
#
# Loads the vocal track (output1.mp3) and the trigger/snap track (output2.mp3) from disk.
sound1 = AudioSegment.from_mp3(BambiCloned+"output1.mp3")
sound2 = AudioSegment.from_mp3(BambiCloned+"output2.mp3")
#
# Volume adjustment, in decibels.  Sometimes TTS can make unreasonably loud speech for no reason;
# if one track is too loud or too quiet, change the gain values here.
# NOTE(review): the trigger track is currently attenuated by 100 dB, not left at zero - confirm this is intended.
AdjustedSpeech = sound1.apply_gain(0)
AdjustedTriggers = sound2.apply_gain(-100)
#
# Lays the trigger track over the speech track, aligned at zero milliseconds.  The result always has
# the length of the track overlay() is called on (the speech track here); anything on the trigger
# track beyond that point is cut off.  The two tracks are expected to be the same length.
BlendedOne = AdjustedSpeech.overlay(AdjustedTriggers, position=0)
#
# Writes the blended result to disk.
file_handle = BlendedOne.export(BambiCloned+"output3.mp3", format="mp3")
# It is suggested to listen to the result, and make sure each trigger is more or less where it should be in relation to speech.
# IPython will load the result and play it, and printing the triggers array will provide the times between triggers for easier seeking.
# Both of these lines can be commented out without any effects on the program.  They are only here for debugging.
#print(triggers)
#IPython.display.Audio(BambiCloned+"output3.mp3")

In [None]:
###
#
# Mixes the speech/trigger track with the looping drone track and writes FinalTrack.mp3.
#
# Loads the previous track and the prerendered drone track in objects.
TriggerEdit = AudioSegment.from_file(BambiCloned+"output3.mp3", format="mp3")
Background = AudioSegment.from_file(BambiPath+"drone.mp3", format="mp3")
#
# Lengths of the fade in and fade out, in milliseconds.  Set to 1ms by default.
FadeInMs = 1
FadeOutMs = 1
# Each fade call returns a new object and leaves the one it was called on untouched, so the
# fade out has to be applied to the result of the fade in.  Calling both on the original
# object and assigning to the same name keeps only the second fade.
FadedTriggerEdit = TriggerEdit.fade_in(FadeInMs).fade_out(FadeOutMs)
# The drone is faded before it is looped below, so every repetition of the loop carries the fade.
BackgroundFade = Background.fade_in(FadeInMs).fade_out(FadeOutMs)
#
# Volume adjustment in decibels for each track before they are overlaid.  Set to zero by default.
FinalTriggerEdit = FadedTriggerEdit - 0
FinalBackground = BackgroundFade - 0
#
# The background track is set to loop by default, and fill the entire track regardless of the track
# length.  This can be changed, but it is very convenient right now.
BambiSleepTrack = FinalTriggerEdit.overlay(FinalBackground, 0, loop=True)
#
# Writes to an MP3 file on disk.  Metadata can be written here through the `tags` dictionary.
#
##################################################################################
# THIS WILL OVERWRITE ANY FILE WITH THE SAME NAME IN THE  SAME PATH WITHOUT
# PROMPTING.  IT WILL NOT BE POSSIBLE TO GET IT BACK.  BE SURE TO HAVE A BACKUP
# OF ANY FILES THAT ARE NOT MEANT TO BE DELETED!!!!
##################################################################################
BambiSleepTrack.export(BambiPath+"FinalTrack.mp3", format="mp3", tags={'artist': 'Bambi'})
#
# Loads the final product in the IPython player.  This can be saved by right clicking
# the player.  There is no copy protection.
IPython.display.Audio(BambiPath+"FinalTrack.mp3")
#
# Enjoy.
#

In [None]:
###
# Writes a timecode log: one line per generated voice clip.  It WILL OVERWRITE the file if it already exists.
# This can be skipped, but it is a very handy reference and it is suggested to make the file.
#
# Each line has three fields separated by colons:
#   <script line number, which is also the clip file name without the extension>:<clip length in seconds>:<script text>
# Note that the script text itself may contain colons, so only the first two colons are field separators.
#
# If there are no voice files, this will fail.
#
# The `with` block closes (and flushes) the file even if loading one of the clips raises.
with open(str(BambiPath)+str(BambiTimes), "w") as f:
  for i in talkings:
    # Loads a cloned/generated clip.
    clip = AudioSegment.from_file(str(BambiCloned)+str(i)+".wav")
    # If the length is wanted in another unit, convert clip.duration_seconds here.
    vals = str(i)+":"+str(clip.duration_seconds)+":"+str(script[i])
    # Prints the string to be written to file.  Disabled by default.
    #print(vals)
    # Writes the entry to the file, followed by a new line.
    f.write(str(vals)+"\n")
    print(i,str(clip.duration_seconds),script[i])