In [None]:
!pip install torch gradio sentencepiece sacremoses

In [1]:
# Speech to Speech Translation (English to French)

import torch
from transformers import pipeline

# Use the first CUDA device when one is visible to torch, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Speech-recognition pipeline backed by the openai/whisper-base checkpoint.
pipe = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-base",
    device=device,
)

  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
# Stream the English ("en") validation split of VoxPopuli and keep its first
# example as the working sample for the rest of the notebook.

from datasets import load_dataset
from IPython.display import Audio

dataset = load_dataset(
    "facebook/voxpopuli",
    "en",
    split="validation",
    streaming=True,
    trust_remote_code=True,
)
sample = next(iter(dataset))

# Audio player for the sample, at the sampling rate stored alongside it.

Audio(sample["audio"]["array"], rate=sample["audio"]["sampling_rate"])

In [3]:
# Function that turns input audio into French text

def translate(audio):
    """Run the Whisper pipeline on ``audio`` and return the generated text.

    The pipeline is called with the "transcribe" task, the language set to
    ``"fr"`` and at most 256 new tokens.

    Args:
        audio: Any input the module-level ``pipe`` accepts. In this notebook
            that is either a dataset audio dict or the file path handed over
            by the Gradio ``type="filepath"`` inputs.

    Returns:
        The ``"text"`` entry of the pipeline output.
    """
    # Give the pipeline its own shallow copy of dict inputs. The ASR pipeline
    # consumes entries of the dict it receives (which is why callers otherwise
    # have to remember to pass ``.copy()``); copying here keeps the caller's
    # dict intact no matter how the function is invoked.
    if isinstance(audio, dict):
        audio = dict(audio)
    outputs = pipe(
        audio,
        max_new_tokens=256,
        generate_kwargs={"task": "transcribe", "language": "fr"},
    )
    return outputs["text"]

In [4]:
# Sanity-check translate() on the streamed sample. A copy of the audio dict is
# passed so the entry stored in `sample` is not the object given to the pipeline.

translate(sample["audio"].copy())

" Il y a des mesures qui ont été décourées, pas en septembre, mais aussi en marches. Et bien, nous verrons des mesures, peut-être pas encore, mais il y a des mesures qui ont été décourées. Et la situation pourrait être en worse si nous n'avons pas été décourées."

In [5]:
# Show the sample's `raw_text` field so it can be compared by eye with the
# text returned by translate() in the previous cell.

sample["raw_text"]

'Many measures have been taken, not only in September but also in March, and of course we see some effects of those measures perhaps not enough, but there are effects of those measures, and the situation could have been worse if we did not have taken those measures.'

In [6]:
# Load model for TTS

from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

# Processor that turns text into model inputs for SpeechT5.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

# Fine-tuned SpeechT5 text-to-speech model and the HiFi-GAN vocoder. Both are
# moved to `device` so they sit on the same device as the tensors that
# synthesise() passes to them; modules left on the CPU while the inputs are on
# "cuda:0" fail with a device-mismatch error.
model = SpeechT5ForTextToSpeech.from_pretrained("ccourc23/fine_tuned_SpeechT5").to(device)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)

In [7]:
# Speaker embedding used to condition the TTS model: the x-vector stored in
# row 7306 of the validation split, with a leading dimension of size 1 added.

embeddings_dataset = load_dataset(
    "Matthijs/cmu-arctic-xvectors",
    split="validation",
)
speaker_embeddings = torch.unsqueeze(
    torch.tensor(embeddings_dataset[7306]["xvector"]),
    dim=0,
)

In [8]:
# Function that takes text and returns speech

def synthesise(text):
    """Turn ``text`` into a speech waveform with the SpeechT5 model and vocoder.

    Args:
        text: The string to be spoken.

    Returns:
        The tensor returned by ``model.generate_speech``, moved to the CPU.
    """
    inputs = processor(text=text, return_tensors="pt")
    # Send the inputs to wherever the model actually lives instead of assuming
    # it was moved to the global `device`; when the two differ, generate_speech
    # fails with a device-mismatch error.
    model_device = model.device
    speech = model.generate_speech(
        inputs["input_ids"].to(model_device),
        speaker_embeddings.to(model_device),
        vocoder=vocoder,
    )
    return speech.cpu()

In [9]:
# Sanity-check synthesise() on a short test sentence and listen to the result.

speech = synthesise("Hey there! This is a test!")

# Played back at 16000 Hz, the same rate speech_to_speech_translation()
# reports for synthesised audio.
Audio(speech, rate=16000)

In [10]:
# Concatenate the two functions

import numpy as np

# Integer sample format of the returned audio and its largest representable value.
target_dtype = np.int16
max_range = np.iinfo(target_dtype).max


def speech_to_speech_translation(audio):
    """Translate spoken audio to text, then speak that text.

    Chains ``translate`` and ``synthesise`` and converts the synthesised
    waveform to ``target_dtype`` integer samples.

    Args:
        audio: Input forwarded unchanged to ``translate``.

    Returns:
        A ``(sampling_rate, samples)`` tuple: the sampling rate is always
        16000 and ``samples`` is a NumPy array of ``target_dtype``.
    """
    translated_text = translate(audio)
    synthesised_speech = synthesise(translated_text)
    # The scaling below assumes a float waveform nominally in [-1, 1]. Samples
    # outside that range would overflow the integer type and wrap around, so
    # clamp them first.
    waveform = np.clip(synthesised_speech.numpy(), -1.0, 1.0)
    # Cast with `target_dtype` so the dtype and `max_range` cannot drift apart.
    synthesised_speech = (waveform * max_range).astype(target_dtype)
    return 16000, synthesised_speech

In [11]:
# Run the full pipeline on the streamed sample and listen to the result.
# A copy of the audio dict is passed, as in the earlier translate() check, so
# `sample` stays intact and this cell can be re-run.

sampling_rate, synthesised_speech = speech_to_speech_translation(sample["audio"].copy())

Audio(synthesised_speech, rate=sampling_rate)

In [12]:
# Gradio demo: one tab records from the microphone, the other takes an
# uploaded audio file; both run speech_to_speech_translation.

import gradio as gr

demo = gr.Blocks()

# The two interfaces differ only in their audio source, so they are built from
# one expression, in the order (microphone, upload).
mic_translate, file_translate = (
    gr.Interface(
        fn=speech_to_speech_translation,
        inputs=gr.Audio(sources=source, type="filepath"),
        outputs=gr.Audio(label="Generated Speech", type="numpy"),
    )
    for source in ("microphone", "upload")
)

tab_interfaces = [mic_translate, file_translate]
tab_titles = ["Microphone", "Audio File"]

with demo:
    gr.TabbedInterface(tab_interfaces, tab_titles)

# debug=True and share=True are passed through to launch() unchanged.
demo.launch(debug=True, share=True)

Running on local URL:  http://127.0.0.1:7861
Running on public URL: https://14d8157e8d746fa09c.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7861 <> https://14d8157e8d746fa09c.gradio.live


