# Author Aqib Aziz
# Project: Voice Translation and Conversion
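This program converts English speech to Urdu speech: it transcribes an English recording, translates the text into Urdu, synthesizes Urdu speech from the translation, and finally applies voice conversion to the synthesized speech.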

In [None]:
import torch, torchaudio
import requests
import IPython.display as display

Download the HuBERT content encoder (hubert_soft):

In [None]:
# Fetch the pretrained "hubert_soft" content encoder from the bshall/hubert
# repository through torch.hub, then move it onto the GPU.
hubert = torch.hub.load(
    "bshall/hubert:main",
    "hubert_soft",
    trust_repo=True,
)
hubert = hubert.cuda()

Downloading: "https://github.com/bshall/hubert/zipball/main" to /root/.cache/torch/hub/main.zip
Downloading: "https://github.com/bshall/hubert/releases/download/v0.2/hubert-soft-35d9f29f.pt" to /root/.cache/torch/hub/checkpoints/hubert-soft-35d9f29f.pt
100%|██████████| 361M/361M [00:05<00:00, 72.3MB/s]


In [1]:
# Mount Google Drive so that the generated audio files can be copied to it
# by the "copy ... to google drive" cells further down.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


Download the acoustic model (hubert_soft)

In [None]:
# Fetch the pretrained "hubert_soft" acoustic model from the
# bshall/acoustic-model repository through torch.hub, then move it onto the GPU.
acoustic = torch.hub.load(
    "bshall/acoustic-model:main",
    "hubert_soft",
    trust_repo=True,
)
acoustic = acoustic.cuda()

Downloading: "https://github.com/bshall/acoustic-model/zipball/main" to /root/.cache/torch/hub/main.zip
Downloading: "https://github.com/bshall/acoustic-model/releases/download/v0.1/hubert-soft-0321fd7e.pt" to /root/.cache/torch/hub/checkpoints/hubert-soft-0321fd7e.pt
100%|██████████| 71.8M/71.8M [00:01<00:00, 72.2MB/s]


Download the vocoder (hifigan_hubert_soft)

In [None]:
# Fetch the pretrained "hifigan_hubert_soft" vocoder from the bshall/hifigan
# repository through torch.hub, then move it onto the GPU.
hifigan = torch.hub.load(
    "bshall/hifigan:main",
    "hifigan_hubert_soft",
    trust_repo=True,
)
hifigan = hifigan.cuda()

Downloading: "https://github.com/bshall/hifigan/zipball/main" to /root/.cache/torch/hub/main.zip
Downloading: "https://github.com/bshall/hifigan/releases/download/v0.1/hifigan-hubert-soft-65f03469.pt" to /root/.cache/torch/hub/checkpoints/hifigan-hubert-soft-65f03469.pt
100%|██████████| 54.9M/54.9M [00:00<00:00, 73.6MB/s]


Download an example utterance:

In [None]:
# Disabled: downloads a sample utterance and writes it to example.wav.
# The speech-to-text cell reads "aqibsvoice.wav" (the uploaded recording), so
# to use this sample instead, uncomment the lines below and point `audio_file`
# at "example.wav".
# with open("example.wav", "wb") as file:
#   response = requests.get("https://drive.google.com/uc?export=preview&id=1Y3KuPAhB5VcsmIaokBVKu3LUEZOfhSu8")
#   file.write(response.content)

Or upload your own:

In [2]:
# Open Colab's file picker and keep what it returns in `uploaded`.
# The speech-to-text cell expects the recording to be named "aqibsvoice.wav".
from google.colab import files

uploaded = files.upload()

Saving aqibsvoice.wav to aqibsvoice.wav


The uploaded recording is loaded in the speech-to-text cell below; resampling to 16 kHz happens later, in the voice-conversion section.

## Download SpeechRecognition to recognize audio

In [None]:
# Installs the package that provides the `speech_recognition` module used by
# the speech-to-text cell.
# NOTE(review): the version is not pinned and no install log was recorded for
# this cell, so the release that was actually used is unknown.
!pip install SpeechRecognition



## Install googletrans (imported below, although the translation step itself uses the `translate` package)

In [None]:
# NOTE(review): the version is not pinned; the recorded run resolved
# googletrans 3.0.0, which in turn pins httpx==0.13.3.
# googletrans' `Translator` is imported later but is then replaced by
# `from translate import Translator` before it is ever used, so this
# dependency looks unnecessary -- confirm before removing.
!pip install googletrans

Collecting googletrans
  Downloading googletrans-3.0.0.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting httpx==0.13.3 (from googletrans)
  Downloading httpx-0.13.3-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.1/55.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting hstspreload (from httpx==0.13.3->googletrans)
  Downloading hstspreload-2023.1.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m40.1 MB/s[0m eta [36m0:00:00[0m
Collecting chardet==3.* (from httpx==0.13.3->googletrans)
  Downloading chardet-3.0.4-py2.py3-none-any.whl (133 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.4/133.4 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting idna==2.* (from httpx==0.13.3->googletrans)
  Downloading idna-2.10-py2.py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.8/58.8

## Download gtts for text-to-speech

In [None]:
# Installs gTTS, used by the text-to-speech cell.
# NOTE(review): the version is not pinned; the recorded run installed gTTS 2.3.2.
!pip install gtts

Collecting gtts
  Downloading gTTS-2.3.2-py3-none-any.whl (28 kB)
Installing collected packages: gtts
Successfully installed gtts-2.3.2


In [None]:
import speech_recognition as sr
from googletrans import Translator
from gtts import gTTS
import os

## Convert Speech to Text

In [None]:
# Transcribe the uploaded recording with the Google Web Speech recognizer.
recognizer = sr.Recognizer()

# Recording to transcribe (the file uploaded earlier in the notebook).
audio_file = "aqibsvoice.wav"

# Read the whole file into memory.
# adjust_for_ambient_noise() is deliberately NOT called here: on an AudioFile
# it consumes the first second of the stream to calibrate the energy threshold,
# so record() would start one second in and lose the beginning of the speech.
# That threshold only matters for listen(), which this cell does not use.
with sr.AudioFile(audio_file) as source:
    audio_data = recognizer.record(source)

# The file is already closed here; only the network request remains.
# NOTE: if recognition fails, `text` is not assigned and the translation cell
# that reads it will fail in turn.
try:
    text = recognizer.recognize_google(audio_data)  # You can choose a different recognizer if needed
    print("Recognized text:")
    print(text)
except sr.UnknownValueError:
    # The service answered but could not produce a transcription.
    print("Google Speech Recognition could not understand the audio")
except sr.RequestError as e:
    # The service could not be reached or rejected the request.
    print("Could not request results from Google Speech Recognition service; {0}".format(e))


Recognized text:
my name is Aqib Aziz I am from Amazon


In [None]:
# Installs the `translate` package, whose `Translator` performs the
# English-to-Urdu translation in the next code cell.
# NOTE(review): the version is not pinned; the recorded run installed translate 3.6.1.
!pip install translate

Collecting translate
  Downloading translate-3.6.1-py2.py3-none-any.whl (12 kB)
Collecting libretranslatepy==2.1.1 (from translate)
  Downloading libretranslatepy-2.1.1-py3-none-any.whl (3.2 kB)
Installing collected packages: libretranslatepy, translate
Successfully installed libretranslatepy-2.1.1 translate-3.6.1


## Translate the recognized text into Urdu

In [None]:
# NOTE: this import rebinds `Translator`; from here on the name refers to the
# `translate` package's class, not the googletrans one imported earlier.
from translate import Translator

# Only the target language is given ("ur" = Urdu); the source language is left
# at the library default.
translator = Translator(to_lang="ur")

# Input is the transcription produced by the speech-to-text cell.
english_text = text
translated_text = translator.translate(english_text)

# Show the heading and the Urdu translation on separate lines.
print("Translated text in Urdu:", translated_text, sep="\n")


Translated text in Urdu:
میرا نام عاقب عزیز ہے میں ایمیزون سے ہوں


## Convert the translated Urdu text to speech

In [None]:
from gtts import gTTS

# Urdu text produced by the translation cell.
urdu_text = translated_text

# Build the text-to-speech request for Urdu.
tts = gTTS(text=urdu_text, lang="ur")

# gTTS writes MP3 data regardless of the file extension. Saving it directly as
# "translated_to_urdu.wav" would produce an MP3 with a misleading name, so the
# MP3 is saved under an honest name first...
mp3_file = "translated_to_urdu.mp3"
tts.save(mp3_file)

# ...and then transcoded into a genuine 16-bit PCM WAV at the path the
# following cells (playback, Drive copy, voice conversion) read from.
output_file = "translated_to_urdu.wav"
tts_waveform, tts_sample_rate = torchaudio.load(mp3_file)
torchaudio.save(
    output_file,
    tts_waveform,
    tts_sample_rate,
    encoding="PCM_S",
    bits_per_sample=16,
)

# Playback is handled by the next cell.


## Play the voice

In [None]:
from IPython.display import Audio

# File written by the text-to-speech cell; change this if it was saved elsewhere.
audio_file_path = "translated_to_urdu.wav"

# Leaving the Audio object as the cell's last expression renders an inline player.
Audio(audio_file_path)


In [None]:
# Copy the synthesized Urdu speech into the root of the mounted Google Drive.
# Requires the drive.mount('/content/drive') cell to have been run first.
!cp translated_to_urdu.wav "/content/drive/My Drive/"

## Voice Conversion
Use the translated Urdu speech as the source for voice conversion.

In [None]:
# Down-mix the synthesized Urdu speech to one channel and save it separately.
# The sample rate is bound to `sample_rate`, not `sr`: `sr` is the alias of the
# speech_recognition module, and rebinding it to an int would make the
# speech-to-text cell fail if it were run again.
source, sample_rate = torchaudio.load("translated_to_urdu.wav")

# Average over dim 0 (the channel axis in torchaudio's default channels-first
# layout); keepdim=True preserves the 2-D shape that torchaudio.save expects.
mono_audio = source.mean(dim=0, keepdim=True)
torchaudio.save("translated_to_urdu_mono.wav", mono_audio, sample_rate)

In [None]:
# Load the mono recording and prepare it as input for the content encoder.
# `sample_rate` is used instead of `sr` so that the speech_recognition module
# alias `sr` is not overwritten. Each stage gets its own name so the cell can
# be re-run without feeding its own output back in.
mono_waveform, sample_rate = torchaudio.load("translated_to_urdu_mono.wav")

# Resample to the 16 kHz rate used for playback and conversion below.
mono_16k = torchaudio.functional.resample(mono_waveform, sample_rate, 16000)

# Add a leading dimension and move the tensor to the GPU, where the models live.
source = mono_16k.unsqueeze(0).cuda()

Convert to the target speaker:

In [None]:
# Run the three stages back to back with autograd disabled.
with torch.inference_mode():
    # 1. Content encoder: waveform -> speech units.
    units = hubert.units(source)
    # 2. Acoustic model: speech units -> spectrogram, with dims 1 and 2 swapped
    #    before it is handed to the vocoder.
    spectrogram = acoustic.generate(units)
    mel = spectrogram.transpose(1, 2)
    # 3. Vocoder: spectrogram -> audio waveform.
    target = hifigan(mel)

## Checking result
source voice:

In [None]:
# Drop the singleton dimensions and bring the input waveform back to the CPU,
# then render an inline player at 16 kHz.
source_waveform = source.squeeze().cpu()
display.Audio(source_waveform, rate=16000)

converted voice:

In [None]:
# Drop the singleton dimensions and bring the converted waveform back to the
# CPU, then render an inline player at 16 kHz.
target_waveform = target.squeeze().cpu()
display.Audio(target_waveform, rate=16000)

## Saving the converted Voice

In [None]:
import torchaudio.transforms as T
import soundfile as sf

In [None]:
# Convert the PyTorch tensor to a NumPy array
audio_np = target.squeeze().cpu().detach().numpy()

# Define the path where you want to save the audio file
output_path = "converted_voice.wav"

# Save the audio as a WAV file using soundfile library
sf.write(output_path, audio_np, 22050)  # Adjust the sample rate (22050) if needed. sample rate= samples/second

# The sample rate determines the quality and frequency range of the audio. higher sample rate reduce audio time

In [None]:
# Copy the converted voice into the root of the mounted Google Drive.
# Requires the drive.mount('/content/drive') cell to have been run first.
!cp converted_voice.wav "/content/drive/My Drive/"