<a href="https://colab.research.google.com/github/Abhi-3026/AI_ML_AVEN/blob/main/TTS%20Translation%20Demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import sys
import os
import csv
import time
from TTS.api import TTS # Ensure Mozilla TTS is installed and available
from pydub import AudioSegment  # For converting the output to mp3


# Ensure you have installed dependencies: pip install tts pydub

def get_model_info(model_name):
    """Return the short label used in output filenames for a TTS model.

    Args:
        model_name: Full model identifier, e.g.
            "tts_models/en/ljspeech/glow-tts".

    Returns:
        The label mapped to ``model_name``, or "Unknown-Model" when the
        identifier is not in the table.
    """
    # Extend this table when new models are used; the label ends up in the
    # generated WAV/MP3 filenames.
    model_info = {
        "tts_models/en/ljspeech/fastspeech2": "FastSpeech2-Base",
        # The entry point of this script runs glow-tts, so it needs a label
        # too; otherwise its output files are named "...-Unknown-Model".
        "tts_models/en/ljspeech/glow-tts": "Glow-TTS",
    }
    return model_info.get(model_name, "Unknown-Model")


def generate_speech(model_name, input_text, output_audio_file):
    """Synthesize ``input_text`` with the named TTS model into an audio file.

    Args:
        model_name: Model identifier passed to the ``TTS`` constructor.
        input_text: Text to synthesize.
        output_audio_file: Path handed to ``tts_to_file`` as the output file.
    """
    # A fresh model object is constructed on every call and used once.
    TTS(model_name).tts_to_file(
        file_path=output_audio_file,
        text=input_text,
    )


def convert_to_mp3(wav_filename, mp3_filename):
    """Re-encode a WAV file as MP3 using pydub.

    Args:
        wav_filename: Path of the WAV file to read.
        mp3_filename: Path of the MP3 file to write.

    Returns:
        ``mp3_filename``, unchanged, for the caller's convenience.
    """
    AudioSegment.from_wav(wav_filename).export(mp3_filename, format="mp3")
    return mp3_filename


def transcribe_to_speech(input_file, model_name, output_csv):
    """Turn a text file into speech (WAV + MP3) and log the run to a CSV file.

    The output files are written to the current working directory and are
    named ``<input basename>-<model label>.wav`` / ``.mp3``. The intermediate
    WAV file is kept.

    Args:
        input_file: Path of the UTF-8 text file to read.
        model_name: TTS model identifier passed to ``generate_speech`` and
            ``get_model_info``.
        output_csv: Path of the CSV log. It is opened in append mode; a
            header row is written only when the file is empty.
    """
    # Strip only the real trailing extension. A plain str.replace('.txt', '')
    # would also remove ".txt" occurring in the middle of the name and would
    # leave any other extension in place.
    basename = os.path.splitext(os.path.basename(input_file))[0]

    # Read the input text file
    with open(input_file, 'r', encoding='utf-8') as f:
        input_text = f.read()

    # Prepare output filenames (look the model label up once)
    model_label = get_model_info(model_name)
    wav_output_file = f"{basename}-{model_label}.wav"
    mp3_output_file = f"{basename}-{model_label}.mp3"

    # Time the generate_speech + convert_to_mp3 calls with a monotonic clock
    # so the measurement is not affected by wall-clock adjustments.
    start_time = time.perf_counter()

    # Generate speech (WAV format)
    generate_speech(model_name, input_text, wav_output_file)

    # Convert WAV to MP3
    convert_to_mp3(wav_output_file, mp3_output_file)

    # Elapsed seconds, rounded to two decimals
    duration = round(time.perf_counter() - start_time, 2)

    # Record the result to CSV
    with open(output_csv, mode='a', newline='', encoding='utf-8') as csv_file:
        fieldnames = ['filename', 'output_filename', 'model_name', 'duration']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

        # In append mode the position starts at the end of the file, so a
        # position of 0 means the file is empty and still needs its header.
        if csv_file.tell() == 0:
            writer.writeheader()

        writer.writerow({
            'filename': input_file,
            'output_filename': mp3_output_file,
            'model_name': model_name,
            'duration': duration
        })

    print(f"Transcription complete. Output saved as {mp3_output_file}")
    print(f"Details logged in {output_csv}")


if __name__ == "__main__":
    # Bail out with a usage hint unless at least one argument was supplied.
    if not len(sys.argv) >= 2:
        print("Usage: python script.py <input_text_file> [model_name] [output_csv]")
        sys.exit(1)

    # NOTE(review): the input file and the model are hard-coded here;
    # sys.argv[1] and sys.argv[2] are not read even though the usage text
    # advertises them. Only the optional third argument (CSV path) is used.
    model_name = "tts_models/en/ljspeech/glow-tts"
    input_text_file = '/content/input_text.txt'
    if len(sys.argv) > 3:
        output_csv_file = sys.argv[3]
    else:
        output_csv_file = "output_analysis.csv"

    # Run the text-to-speech pipeline
    transcribe_to_speech(input_text_file, model_name, output_csv_file)


 > tts_models/en/ljspeech/glow-tts is already downloaded.
 > vocoder_models/en/ljspeech/multiband-melgan is already downloaded.
 > Using model: glow_tts
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:0
 | > fft_size:1024
 | > power:1.1
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:50.0
 | > mel_fmax:7600.0
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024


  return torch.load(f, map_location=map_location, **kwargs)


 > Vocoder Model: multiband_melgan
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:0
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:True
 | > symmetric_norm:True
 | > mel_fmin:50.0
 | > mel_fmax:7600.0
 | > pitch_fmin:0.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:/root/.local/share/tts/vocoder_models--en--ljspeech--multiband-melgan/scale_stats.npy
 | > base:10
 | > hop_length:256
 | > win_length:1024
 > Generator Model: multiband_melgan_generator
 > Discriminator Model: melgan_multiscale_discriminator
 > Text splitted to sentences.
['All

In [1]:
import sys
import os
import csv
import time
import torch
from TTS.api import TTS  # Ensure Mozilla TTS with CUDA support is installed
from pydub import AudioSegment  # For converting the output to mp3

# Ensure you have installed dependencies: pip install tts pydub torch

def get_model_info(model_name):
    """Map a TTS model identifier to the short label used in output filenames.

    Args:
        model_name: Full model identifier, e.g.
            "tts_models/en/ljspeech/glow-tts".

    Returns:
        The label registered for ``model_name``; "Unknown-Model" when the
        identifier has no entry in the table.
    """
    known_labels = {
        "tts_models/en/ljspeech/fastspeech2": "FastSpeech2-Base",
        "tts_models/en/ljspeech/glow-tts": "Glow-TTS",
    }
    if model_name in known_labels:
        return known_labels[model_name]
    return "Unknown-Model"


def generate_speech(model_name, input_text, output_audio_file):
    """Synthesize ``input_text`` with the named TTS model into an audio file.

    The model is created with ``gpu=True`` when ``torch.cuda.is_available()``
    reports a CUDA device, and with ``gpu=False`` otherwise.

    Args:
        model_name: Model identifier passed to the ``TTS`` constructor.
        input_text: Text to synthesize.
        output_audio_file: Path handed to ``tts_to_file`` as the output file.
    """
    synthesizer = TTS(model_name, gpu=torch.cuda.is_available())
    synthesizer.tts_to_file(file_path=output_audio_file, text=input_text)


def convert_to_mp3(wav_filename, mp3_filename):
    """Load a WAV file with pydub and export it again in MP3 format.

    Args:
        wav_filename: Path of the source WAV file.
        mp3_filename: Destination path for the MP3 file.

    Returns:
        The ``mp3_filename`` argument, unchanged.
    """
    segment = AudioSegment.from_wav(wav_filename)
    segment.export(mp3_filename, format="mp3")
    return mp3_filename


def transcribe_to_speech(input_file, model_name, output_csv):
    """Turn a text file into speech (WAV + MP3) and log the run to a CSV file.

    The output files are written to the current working directory and are
    named ``<input basename>-<model label>.wav`` / ``.mp3``. The intermediate
    WAV file is kept.

    Args:
        input_file: Path of the UTF-8 text file to read.
        model_name: TTS model identifier passed to ``generate_speech`` and
            ``get_model_info``.
        output_csv: Path of the CSV log. It is opened in append mode; a
            header row is written only when the file is empty.
    """
    # Strip only the real trailing extension. A plain str.replace('.txt', '')
    # would also remove ".txt" occurring in the middle of the name and would
    # leave any other extension in place.
    basename = os.path.splitext(os.path.basename(input_file))[0]

    # Read the input text file
    with open(input_file, 'r', encoding='utf-8') as f:
        input_text = f.read()

    # Prepare output filenames (look the model label up once)
    model_label = get_model_info(model_name)
    wav_output_file = f"{basename}-{model_label}.wav"
    mp3_output_file = f"{basename}-{model_label}.mp3"

    # Time the generate_speech + convert_to_mp3 calls with a monotonic clock
    # so the measurement is not affected by wall-clock adjustments.
    start_time = time.perf_counter()

    # Generate speech (WAV format); generate_speech decides about CUDA use
    generate_speech(model_name, input_text, wav_output_file)

    # Convert WAV to MP3
    convert_to_mp3(wav_output_file, mp3_output_file)

    # Elapsed seconds, rounded to two decimals
    duration = round(time.perf_counter() - start_time, 2)

    # Record the result to CSV
    with open(output_csv, mode='a', newline='', encoding='utf-8') as csv_file:
        fieldnames = ['filename', 'output_filename', 'model_name', 'duration']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

        # In append mode the position starts at the end of the file, so a
        # position of 0 means the file is empty and still needs its header.
        if csv_file.tell() == 0:
            writer.writeheader()

        writer.writerow({
            'filename': input_file,
            'output_filename': mp3_output_file,
            'model_name': model_name,
            'duration': duration
        })

    print(f"Transcription complete. Output saved as {mp3_output_file}")
    print(f"Details logged in {output_csv}")


if __name__ == "__main__":
    # Fixed demo configuration: nothing is read from the command line here.
    model_name = "tts_models/en/ljspeech/glow-tts"  # TTS model identifier
    input_text_file = '/content/input_text.txt'
    output_csv_file = "output_analysis.csv"

    # Run the text-to-speech pipeline with the settings above
    transcribe_to_speech(
        input_file=input_text_file,
        model_name=model_name,
        output_csv=output_csv_file,
    )


RuntimeError: module was compiled against NumPy C-API version 0x10 (NumPy 1.23) but the running NumPy has C-API version 0xf. Check the section C-API incompatibility at the Troubleshooting ImportError section at https://numpy.org/devdocs/user/troubleshooting-importerror.html#c-api-incompatibility for indications on how to solve this problem.



 > Downloading model to /root/.local/share/tts/tts_models--en--ljspeech--glow-tts


 98%|█████████▊| 337M/344M [00:04<00:00, 84.6MiB/s]

 > Model's license - MPL
 > Check https://www.mozilla.org/en-US/MPL/2.0/ for more info.
 > Downloading model to /root/.local/share/tts/vocoder_models--en--ljspeech--multiband-melgan



100%|██████████| 344M/344M [00:09<00:00, 36.1MiB/s]

  4%|▎         | 3.04M/82.8M [00:00<00:02, 30.4MiB/s][A
 15%|█▍        | 12.2M/82.8M [00:00<00:01, 66.7MiB/s][A
 25%|██▍       | 20.7M/82.8M [00:00<00:00, 74.8MiB/s][A
 36%|███▌      | 29.5M/82.8M [00:00<00:00, 80.1MiB/s][A
 45%|████▌     | 37.5M/82.8M [00:00<00:00, 78.5MiB/s][A
 56%|█████▋    | 46.6M/82.8M [00:00<00:00, 82.7MiB/s][A
 68%|██████▊   | 56.0M/82.8M [00:00<00:00, 86.1MiB/s][A
 79%|███████▊  | 65.0M/82.8M [00:00<00:00, 87.6MiB/s][A
 89%|████████▉ | 74.1M/82.8M [00:00<00:00, 88.4MiB/s][A

 > Model's license - MPL
 > Check https://www.mozilla.org/en-US/MPL/2.0/ for more info.
 > Using model: glow_tts
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:0
 | > fft_size:1024
 | > power:1.1
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:50.0
 | > mel_fmax:7600.0
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024


  return torch.load(f, map_location=map_location, **kwargs)


 > Vocoder Model: multiband_melgan
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:0
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:True
 | > symmetric_norm:True
 | > mel_fmin:50.0
 | > mel_fmax:7600.0
 | > pitch_fmin:0.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:/root/.local/share/tts/vocoder_models--en--ljspeech--multiband-melgan/scale_stats.npy
 | > base:10
 | > hop_length:256
 | > win_length:1024
 > Generator Model: multiband_melgan_generator
 > Discriminator Model: melgan_multiscale_discriminator
 > Text splitted to sentences.
['All


100%|██████████| 82.8M/82.8M [00:20<00:00, 88.4MiB/s][A

 > Processing time: 19.37826633453369
 > Real-time factor: 0.02811964133728319
Transcription complete. Output saved as input_text-Glow-TTS.mp3
Details logged in output_analysis.csv


In [2]:
!pip install pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [3]:
!pip install TTS

Collecting TTS
  Downloading TTS-0.22.0-cp310-cp310-manylinux1_x86_64.whl.metadata (21 kB)
Collecting anyascii>=0.3.0 (from TTS)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pysbd>=0.3.4 (from TTS)
  Downloading pysbd-0.3.4-py3-none-any.whl.metadata (6.1 kB)
Collecting umap-learn>=0.5.1 (from TTS)
  Downloading umap_learn-0.5.6-py3-none-any.whl.metadata (21 kB)
Collecting pandas<2.0,>=1.4 (from TTS)
  Downloading pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting trainer>=0.0.32 (from TTS)
  Downloading trainer-0.0.36-py3-none-any.whl.metadata (8.1 kB)
Collecting coqpit>=0.0.16 (from TTS)
  Downloading coqpit-0.0.17-py3-none-any.whl.metadata (11 kB)
Collecting pypinyin (from TTS)
  Downloading pypinyin-0.53.0-py2.py3-none-any.whl.metadata (12 kB)
Collecting hangul-romanize (from TTS)
  Downloading hangul_romanize-0.1.0-py3-none-any.whl.metadata (1.2 kB)
Collecting gruut==2.2.3 (from gruut[de,es,fr]==2.2.3->T

In [4]:
!pip install gruut
!apt-get install -qq libportaudio2
!pip install pyaudio


Selecting previously unselected package libportaudio2:amd64.
(Reading database ... 123622 files and directories currently installed.)
Preparing to unpack .../libportaudio2_19.6.0-1.1_amd64.deb ...
Unpacking libportaudio2:amd64 (19.6.0-1.1) ...
Setting up libportaudio2:amd64 (19.6.0-1.1) ...
Processing triggers for libc-bin (2.35-0ubuntu3.4) ...
/sbin/ldconfig.real: /usr/local/lib/libtbbbind_2_0.so.3 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libur_adapter_opencl.so.0 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtbbbind.so.3 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtbbmalloc.so.2 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libur_loader.so.0 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtbbbind_2_5.so.3 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtbbmalloc_proxy.so.2 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libur_adapter_level_zero.so.0 is not a symbolic link

/

In [3]:
from pydub import AudioSegment

def convert_to_male_voice(wav_filename, male_output_filename_base, pitch_shift=-4):
    """Lower (or raise) the pitch of a WAV file and save it as WAV and MP3.

    Args:
        wav_filename: Path of the WAV file to load.
        male_output_filename_base: Output path without extension; ".wav" and
            ".mp3" are appended to it.
        pitch_shift: Shift amount, turned into a frame-rate ratio of
            ``2 ** (pitch_shift / 12)`` (i.e. treated as semitones).
            Negative values lower the rate.
    """
    source = AudioSegment.from_wav(wav_filename)

    # Re-label the same raw samples with a scaled frame rate, then convert
    # back to the original frame rate.
    # NOTE(review): no time-stretching is applied, so the playback speed
    # presumably changes together with the pitch - confirm if that matters.
    rate_ratio = 2.0 ** (pitch_shift / 12.0)
    shifted_rate = int(source.frame_rate * rate_ratio)
    relabelled = source._spawn(source.raw_data, overrides={"frame_rate": shifted_rate})
    shifted = relabelled.set_frame_rate(source.frame_rate)

    # Write both output formats next to each other
    wav_output_filename = f"{male_output_filename_base}.wav"
    mp3_output_filename = f"{male_output_filename_base}.mp3"
    shifted.export(wav_output_filename, format="wav")
    shifted.export(mp3_output_filename, format="mp3")

    print(f"Files saved as: {wav_output_filename} and {mp3_output_filename}")

# Example run: pitch-shift the WAV file with the default shift and write
# output_male.wav / output_male.mp3 to the working directory.
convert_to_male_voice(
    wav_filename="/content/input_text-Glow-TTS.wav",
    male_output_filename_base="output_male",
)


Files saved as: output_male.wav and output_male.mp3
