# The following code is the faster-inference implementation, but it supports only a single voice.

##Implementation workflow
=============================

###| Malayalam audio in | --> | Whisper | --> | TTS-engine | --> | English audio out |

In [6]:
!pip install git+https://github.com/openai/whisper.git
!sudo apt update && sudo apt install ffmpeg
!pip install ttsmms soundfile

!curl https://dl.fbaipublicfiles.com/mms/tts/eng.tar.gz --output eng.tar.gz #update lang
!mkdir -p data && tar -xzf eng.tar.gz -C data/ #update langcode

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-lc3u9skb
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-lc3u9skb
  Resolved https://github.com/openai/whisper.git to commit b91c907694f96a3fb9da03d4bbdc83fbcd3a40a4
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Hit:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:3 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
Get:5 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [108 kB]
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [1,085 kB]
Hit:7 https://ppa.launchp

In [7]:
import os
import re
import subprocess
import soundfile as sf
from ttsmms import TTS
from IPython.display import Audio


############################################
#   Audio translation with transcription   #
############################################
def run_whisper(audio_file_path, model="medium", language="Malayalam", task="translate"):
    """Run the ``whisper`` command-line tool on an audio file and return its output.

    Args:
        audio_file_path: Path of the audio file handed to the ``whisper`` executable.
        model: Value passed to ``--model``.
        language: Value passed to ``--language``.
        task: Value passed to ``--task``.

    Returns:
        The captured standard output with surrounding whitespace stripped, or
        ``None`` when the command could not be started or exited with a
        non-zero status (the reason is printed).
    """
    # Construct the Whisper command as an argument list, so no shell quoting is involved
    whisper_command = [
        "whisper",
        audio_file_path,
        "--model", model,
        "--language", language,
        "--task", task
    ]
    try:
        # Execute the Whisper command using subprocess
        completed_process = subprocess.run(whisper_command, capture_output=True, text=True, check=True)

    except FileNotFoundError as e:
        # Raised when the `whisper` executable itself cannot be found (e.g. it is not installed).
        print(f"Error occurred while executing Whisper: {e}")
        return None

    except subprocess.CalledProcessError as e:
        print(f"Error occurred while executing Whisper: {e}")
        # capture_output=True hides stderr from the notebook, so show it explicitly;
        # it usually holds the actual reason for the failure.
        if e.stderr:
            print(e.stderr.strip())
        return None

    # Extract and return the output from the completed process
    return completed_process.stdout.strip()

def extract_text_from_whisper_output(whisper_output):
    """Remove the ``[start --> end]`` timestamp prefixes from Whisper's printed output.

    Args:
        whisper_output: Text printed by Whisper, one ``[start --> end]  text``
            entry per segment. Must be a string; passing ``None`` raises
            ``TypeError`` from ``re.sub``.

    Returns:
        The text with every timestamp removed and surrounding whitespace
        stripped. Line breaks between segments are kept.
    """
    # Timestamps look like [00:00.000 --> 00:07.000], and gain an hours field
    # ([01:00:00.000 --> 01:00:07.000]) for long recordings, so accept one or more
    # "digits:" groups before the seconds. The trailing \s* also removes a
    # timestamp that has no text after it.
    timestamp_pattern = r"\[(?:\d+:)+\d+\.\d+\s+-->\s+(?:\d+:)+\d+\.\d+\]\s*"

    # Use the re.sub() function to replace all occurrences of the timestamp pattern with an empty string
    text_only = re.sub(timestamp_pattern, "", whisper_output)

    # Strip any leading/trailing whitespace and return the text
    return text_only.strip()


#########################################
#   main function pipeline via ttsmms   #
#########################################
def clone_with_ttsmms(input_audio_path:str='/content/audio.wav', input_language:str='Malayalam', output_audio_path:str='/content/synthesized_speech.wav', output_language:str='eng'):
    """Translate speech with Whisper and synthesize the result with ttsmms.

    Args:
        input_audio_path: Audio file passed to Whisper.
        input_language: Language name passed to Whisper's ``--language``.
        output_audio_path: Where the synthesized waveform is written.
        output_language: Language code; the TTS model is loaded from
            ``data/<output_language>``.

    Returns:
        ``output_audio_path`` on success, or ``None`` when any step failed
        (the reason is printed).
    """
    try:
        tts=TTS(f"data/{output_language}") #update lang code

        result = run_whisper(input_audio_path, model="medium", language=input_language, task="translate")
        if result is None:
            # run_whisper has already printed why it failed.
            return None

        # Extract text from the Whisper output
        text = extract_text_from_whisper_output(result)
        if not text:
            print('An exception occured ', 'Whisper returned no text to synthesize')
            return None

        wav=tts.synthesis(text)

        sf.write(output_audio_path, wav["x"], wav["sampling_rate"])

        return output_audio_path
    except Exception as e:
        # Report the actual error; `except Exception` also lets a kernel interrupt through.
        print('An exception occured ', e)
        return None

In [8]:
# Run the Whisper + ttsmms pipeline on the sample clip. `path` is the output file
# path, or None if the pipeline printed an error instead of finishing.
path = clone_with_ttsmms(input_audio_path='/content/audio_imax.wav',
                         output_audio_path='/content/synthesized_speech.wav')

In [9]:
# Show an audio player for the synthesized file as this cell's output.
Audio(path)

# This is the slow-inference version: translation and voice cloning using Whisper and the Tortoise-TTS voice-cloning model. Tortoise can clone a voice clearly but is very slow.

##Implementation workflow
=============================

###| Malayalam audio in | --> | Whisper | --> | Segmentation | --> | TTS-engine | -->  | Segment combination | --> | English audio out |


The transcription is split into a list of lines, and audio is generated for each segment separately, because the model can only handle short pieces of text. The per-segment audio files are then combined to produce the complete audio.

In [3]:
# pydub provides AudioSegment, used later to join the generated audio segments.
!pip install pydub
!git clone https://github.com/neonbjb/tortoise-tts.git
# NOTE(review): the clone and %cd are relative to the current directory, so re-running this
# cell nests a second checkout (the recorded output shows /content/tortoise-tts/tortoise-tts).
%cd tortoise-tts
!python -m pip install -r ./requirements.txt
# NOTE(review): the recorded output warns that running setup.py directly is deprecated.
!python setup.py install

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1
Cloning into 'tortoise-tts'...
remote: Enumerating objects: 1673, done.[K
remote: Counting objects: 100% (596/596), done.[K
remote: Compressing objects: 100% (150/150), done.[K
remote: Total 1673 (delta 493), reused 486 (delta 446), pack-reused 1077[K
Receiving objects: 100% (1673/1673), 54.08 MiB | 34.08 MiB/s, done.
Resolving deltas: 100% (724/724), done.
/content/tortoise-tts/tortoise-tts
running install
!!

        ********************************************************************************
        Please avoid running ``setup.py`` directly.
        Instead, use pypa/build, pypa/installer, pypa/build or
        other standards-based tools.

        See https://blog.ganssle.io/articles/2021/10/setup-py-deprecated.html for details.
        ********************************************************************************

!!
  self.in

In [None]:
import os
import re
import subprocess

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
from IPython.display import Audio
from pydub import AudioSegment
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_audio, load_voice, load_voices

# This will download all the models used by Tortoise from the HuggingFace hub.
# `tts` is a notebook-level global; create_audio_segments() calls tts.tts_with_preset() on it.
tts = TextToSpeech()


############################################
#   Audio translation with transcription   #
############################################
def run_whisper(audio_file_path, model="medium", language="Malayalam", task="translate"):
    """Run the ``whisper`` command-line tool on an audio file and return its output.

    Args:
        audio_file_path: Path of the audio file handed to the ``whisper`` executable.
        model: Value passed to ``--model``.
        language: Value passed to ``--language``.
        task: Value passed to ``--task``.

    Returns:
        The captured standard output with surrounding whitespace stripped, or
        ``None`` when the command could not be started or exited with a
        non-zero status (the reason is printed).
    """
    # Construct the Whisper command as an argument list, so no shell quoting is involved
    whisper_command = [
        "whisper",
        audio_file_path,
        "--model", model,
        "--language", language,
        "--task", task
    ]
    try:
        # Execute the Whisper command using subprocess
        completed_process = subprocess.run(whisper_command, capture_output=True, text=True, check=True)

    except FileNotFoundError as e:
        # Raised when the `whisper` executable itself cannot be found (e.g. it is not installed).
        print(f"Error occurred while executing Whisper: {e}")
        return None

    except subprocess.CalledProcessError as e:
        print(f"Error occurred while executing Whisper: {e}")
        # capture_output=True hides stderr from the notebook, so show it explicitly;
        # it usually holds the actual reason for the failure.
        if e.stderr:
            print(e.stderr.strip())
        return None

    # Extract and return the output from the completed process
    return completed_process.stdout.strip()

def extract_text_from_whisper_output(whisper_output):
    """Remove the ``[start --> end]`` timestamp prefixes from Whisper's printed output.

    Args:
        whisper_output: Text printed by Whisper, one ``[start --> end]  text``
            entry per segment. Must be a string; passing ``None`` raises
            ``TypeError`` from ``re.sub``.

    Returns:
        The text with every timestamp removed and surrounding whitespace
        stripped. Line breaks between segments are kept.
    """
    # Timestamps look like [00:00.000 --> 00:07.000], and gain an hours field
    # ([01:00:00.000 --> 01:00:07.000]) for long recordings, so accept one or more
    # "digits:" groups before the seconds. The trailing \s* also removes a
    # timestamp that has no text after it.
    timestamp_pattern = r"\[(?:\d+:)+\d+\.\d+\s+-->\s+(?:\d+:)+\d+\.\d+\]\s*"

    # Use the re.sub() function to replace all occurrences of the timestamp pattern with an empty string
    text_only = re.sub(timestamp_pattern, "", whisper_output)

    # Strip any leading/trailing whitespace and return the text
    return text_only.strip()


##################################################################
#   transcription text to audio segment generation and merging   #
##################################################################
def create_audio_segments(text:str, voice:str='tom', preset='fast', folder_path:str='/content/tts_files'):
    """Synthesize each non-blank line of ``text`` into its own WAV file.

    Files are written as ``<folder_path>/tts_<index>.wav``, with the index
    counting the non-blank lines from 0.

    Args:
        text: Text to speak; it is split on newlines and each line becomes one segment.
        voice: Voice name handed to ``load_voice``.
        preset: Preset name handed to ``tts.tts_with_preset``.
        folder_path: Directory that receives the segment files; created if missing.

    Note:
        Segment files left in ``folder_path`` by an earlier run are not removed.
    """
    # Drop blank lines: there is nothing to synthesize for them.
    text_list = [line.strip() for line in text.split('\n') if line.strip()]
    if not text_list:
        return

    # torchaudio.save needs the target directory to exist.
    os.makedirs(folder_path, exist_ok=True)

    # The voice does not change between segments, so load it once instead of per line.
    voice_samples, conditioning_latents = load_voice(voice)

    for i, segment_text in enumerate(text_list):
        output_wav_file = f"{folder_path}/tts_{i}.wav"
        gen = tts.tts_with_preset(segment_text,
                                  voice_samples=voice_samples,
                                  conditioning_latents=conditioning_latents,
                                  preset=preset)
        # 24000 is the sample rate written to the file (presumably Tortoise's output rate - confirm).
        torchaudio.save(output_wav_file, gen.squeeze(0).cpu(), 24000)

def combine_tts_audio_segments(folder:str='/content/tts_files', output_audio_path:str='output_combined_audio.wav'):
    """Concatenate the ``tts_*`` audio files in ``folder`` into one WAV file.

    Files named ``tts_<number>.<ext>`` are joined in numeric order, so
    ``tts_2.wav`` comes before ``tts_10.wav``. Any other ``tts_`` file is
    appended after them in alphabetical order.

    Args:
        folder: Directory holding the segment files.
        output_audio_path: Path of the combined WAV file to write.
    """
    def segment_order(file_name):
        # A plain string sort would give tts_0, tts_1, tts_10, tts_2, ... and scramble
        # the speech, so order by the integer index embedded in the file name.
        index_text = os.path.splitext(file_name)[0][len('tts_'):]
        if index_text.isdecimal():
            return (0, int(index_text), file_name)
        return (1, 0, file_name)

    # List all the TTS audio files in the folder, in playback order
    tts_files = sorted((f for f in os.listdir(folder) if f.startswith('tts_')), key=segment_order)

    # Start from an empty audio segment and append each TTS clip to it
    full_audio = AudioSegment.empty()
    for tts_file in tts_files:
        full_audio += AudioSegment.from_file(os.path.join(folder, tts_file))

    # Export the concatenated audio to the output file
    full_audio.export(output_audio_path, format="wav")


###########################################
#   main function pipeline via tortoise   #
###########################################
def clone_with_tortoise(voice:str='tom', segment_storage_path:str='/content/tts_files', input_audio_path:str='/content/audio.wav', input_language:str='Malayalam', output_audio_path:str='/content/output_combined_audio.wav', preset:str='fast'):
    """Translate speech with Whisper and re-speak it with Tortoise in the given voice.

    Steps: run Whisper, strip its timestamps, synthesize one audio file per
    line of text, then join those files into a single WAV file.

    Args:
        voice: Voice name forwarded to ``create_audio_segments``.
        segment_storage_path: Directory used for the per-line audio files.
        input_audio_path: Audio file passed to Whisper.
        input_language: Language name passed to Whisper's ``--language``.
        output_audio_path: Path of the combined WAV file.
        preset: Preset name forwarded to ``create_audio_segments``.

    Returns:
        ``output_audio_path`` on success, or ``None`` when any step failed
        (the reason is printed).
    """
    try:
        result = run_whisper(input_audio_path, model="medium", language=input_language, task="translate")
        if result is None:
            # run_whisper has already printed why it failed.
            return None

        # Extract text from the Whisper output
        text = extract_text_from_whisper_output(result)
        if not text:
            print('An exception occured ', 'Whisper returned no text to synthesize')
            return None

        create_audio_segments(text, voice, preset=preset, folder_path=segment_storage_path)

        combine_tts_audio_segments(segment_storage_path, output_audio_path)
        return output_audio_path
    except Exception as e:
        # Report the actual error; `except Exception` also lets a kernel interrupt through.
        print('An exception occured ', e)
        return None

###You can add your own custom voice as a sub-folder (named after the voice) of /content/tortoise-tts/tortoise/voices. Add at least 3 sample WAV files, each about 10 seconds long.
Name the WAV samples 1.wav, 2.wav, 3.wav, and so on.

In [None]:
# Voice names to choose from for the `voice` argument of clone_with_tortoise()
# (presumably folder names under tortoise/voices - verify against the checkout).
available_voice_list = [
    'mol', 'tom', 'emma', 'angie',
    'halle', 'geralt', 'daniel', 'deniro',
    'william', 'freeman', 'applejack', 'tim_reynolds',
]

In [None]:
# Run the Whisper + Tortoise pipeline with the 'freeman' voice. `path` is the combined
# output file path, or None if the pipeline printed an error instead of finishing.
path = clone_with_tortoise(voice='freeman',
                           input_audio_path='/content/audio.wav',
                           output_audio_path='/content/output_combined_audio.wav')

In [None]:
# Show an audio player for the combined output file as this cell's output.
Audio(path)