In [1]:
import os
import numpy as np

import torch
import pandas as pd
import whisper

In [2]:
from TTS.api import TTS

In [3]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [4]:
def generate_audio(text, speaker_wav_path, language, output_file_path):
    tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", progress_bar=True).to(DEVICE)
    # Generar el archivo de audio a partir del texto
    tts.tts_to_file(text=text, speaker_wav=speaker_wav_path, language=language, file_path=output_file_path)


In [5]:
def supported_lang(iso_code):
    languages = {
        'en': 'English',
        'fr': 'French',
        'de': 'German',
        'it': 'Italian',
        'ar': 'Arabic'
    }
    return iso_code in languages

In [6]:
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
def whisper_transcribe_audio(model, audio_data_path):
    audio = whisper.load_audio(audio_data_path)
    audio = whisper.pad_or_trim(audio)
   # Representación útil para el procesamiento de audio que enfatiza las frecuencias importantes
    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # Detectar el idioma hablado
    _, probs = model.detect_language(mel)
    print(model.detect_language(mel))
    print(f"Detected language: {max(probs, key=probs.get)}")
    detected_language = max(probs, key=probs.get)

    # Ayudar al modelo con el idioma origen, aunque no es necesario
    options = whisper.DecodingOptions(language=detected_language, task="transcribe", without_timestamps=True)
    result = whisper.decode(model, mel, options)
    # Devolver la transcripción
    return result.text  

In [None]:
def whisper_detect_lang_and_transcribe_audio(model, audio_data_path):
    audio = whisper.load_audio(audio_data_path)
    audio = whisper.pad_or_trim(audio)
   # Representación útil para el procesamiento de audio que enfatiza las frecuencias importantes
    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # Detectar el idioma hablado
    _, probs = model.detect_language(mel)
    print(model.detect_language(mel))
    print(f"Detected language: {max(probs, key=probs.get)}")
    detected_language = max(probs, key=probs.get)

    # Ayudar al modelo con el idioma origen, aunque no es necesario
    options = whisper.DecodingOptions(language=detected_language, task="transcribe", without_timestamps=True)
    result = whisper.decode(model, mel, options)
    # Devolver la transcripción
    return detected_language, result.text  

In [8]:
def translation_from_esp_to_lang(lang, transcription):
    if supported_lang(lang):
        try:
            translator = pipeline("translation", model=f"Helsinki-NLP/opus-mt-es-{lang}")
            result = translator(transcription) 
            return result[0]['translation_text']
        except Exception as e:
            print("Error durante la traducción :", e)
    else:
        return ""

In [9]:
def translation_from_lang_to_esp(lang, transcription):
    if supported_lang(lang):
        try:
            translator = pipeline("translation", model=f"Helsinki-NLP/opus-mt-{lang}-es")
            result = translator(transcription) 
            return result[0]['translation_text']
        except Exception as e:
            print("Error durante la traducción :", e)
    else:
        return ""

In [10]:
model = whisper.load_model("base", device=DEVICE)
print(
    f"Model is {'multilingual' if model.is_multilingual else 'English-only'} "
)

Model is multilingual 


In [None]:
import gradio as gr
import os

def process_audio(audio_data_mic_path, audio_data_file_path, target_lang="en"):

    if audio_data_mic_path is None or os.path.getsize(audio_data_mic_path) == 0:
        audio_data = audio_data_file_path
    else:
        audio_data = audio_data_mic_path

    # Transcripción del Audio

    transcription = whisper_transcribe_audio(model, audio_data)
    print(transcription)

    # Traducción del audio al idioma objetivo
    translation = translation_from_esp_to_lang(target_lang, transcription)

    output_file_path = "output/output.wav"

    generate_audio(translation, audio_data, target_lang, output_file_path)

    return transcription, translation, output_file_path

# Crear la interfaz de Gradio
iface = gr.Interface(
    fn = process_audio,
    inputs = 
        [
        gr.Audio(sources="upload", type="filepath", label="Archivo de audio"),
        gr.Audio(sources="microphone", type="filepath", label="Graba tu audio"),
        gr.Dropdown(
                    ['en','fr', 'de', 'it', 'ar'], 
                    label="Idioma a Traducir", info="Info adicional"
        ),
        ],
    outputs=
        [
        gr.Textbox(label="Transcripción"), 
        gr.Textbox(label="Traducción"), 
        gr.Audio(type="filepath", label="Audio generado")
        ],
    title="Audio y Transcripcion",
    description="Graba tu voz y transcribela a texto.",
    allow_flagging = "never"
)

# Ejecutar la interfaz
iface.launch(debug=True)


In [12]:
def process_audio(audio_data_mic_path, audio_data_file_path):

    if audio_data_mic_path is None or os.path.getsize(audio_data_mic_path) == 0:
        audio_data = audio_data_file_path
    else:
        audio_data = audio_data_mic_path

    # Transcripción del Audio

    detected_lang, transcription = whisper_detect_lang_and_transcribe_audio(model, audio_data)
    print(detected_lang)
    print(transcription)

    # Traducción del audio al idioma objetivo
    translation = translation_from_lang_to_esp(detected_lang, transcription)

    output_file_path = "output/output.wav"

    generate_audio(translation, audio_data, "es", output_file_path)

    return transcription, translation, output_file_path

# Crear la interfaz de Gradio
iface = gr.Interface(
    fn = process_audio,
    inputs = 
        [
        gr.Audio(sources="upload", type="filepath", label="Audio File"),
        gr.Audio(sources="microphone", type="filepath", label="Record your audio"),
        ],
    outputs=
        [
        gr.Textbox(label="Transcripción"), 
        gr.Textbox(label="Translation"), 
        gr.Audio(type="filepath", label="Audio generated")
        ],
    title="Audio and Transcription",
    description="Record your voice and transcribe it to text.",
    allow_flagging = "never"
)

# Ejecutar la interfaz
iface.launch(debug=True)


Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


(tensor(50272), {'fa': 5.316548140399391e-06, 'sd': 1.3984866598093504e-07, 'tk': 5.117600787585275e-10, 'sa': 2.1366770397435175e-06, 'ta': 2.1097046101203887e-06, 'hr': 4.445303503075593e-08, 'jw': 1.7620612197788432e-05, 'si': 1.578731030349445e-06, 'oc': 1.5647387954231817e-06, 'nl': 1.585925383551512e-05, 'ps': 1.3039128816672019e-06, 'kk': 1.6700958838100632e-07, 'mk': 3.944379756148919e-08, 'hi': 6.9065849856997374e-06, 'yo': 1.5734492535557365e-06, 'haw': 1.0623280104482546e-05, 'mt': 1.672275288910896e-06, 'ht': 7.91949958056648e-07, 'bn': 2.448391569487285e-07, 'ln': 6.307045197218031e-08, 'bg': 1.4368102085882128e-07, 'yi': 6.737678859281004e-07, 'en': 0.00028171049780212343, 'kn': 8.622353675491468e-08, 'cy': 1.1375354915799107e-05, 'gu': 1.1640166164283983e-08, 'th': 7.942106776681612e-07, 'ca': 1.2773765547535731e-06, 'sn': 3.159991138090845e-06, 'da': 1.0267229981764103e-06, 'nn': 1.8076047126669437e-05, 'br': 9.232729098584969e-06, 'ur': 5.070365295978263e-05, 'el': 8.9

Traceback (most recent call last):
  File "/home/bmontes/miniconda3/envs/whisper-coqui/lib/python3.9/site-packages/gradio/queueing.py", line 528, in process_events
    response = await route_utils.call_process_api(
  File "/home/bmontes/miniconda3/envs/whisper-coqui/lib/python3.9/site-packages/gradio/route_utils.py", line 270, in call_process_api
    output = await app.get_blocks().process_api(
  File "/home/bmontes/miniconda3/envs/whisper-coqui/lib/python3.9/site-packages/gradio/blocks.py", line 1908, in process_api
    result = await self.call_function(
  File "/home/bmontes/miniconda3/envs/whisper-coqui/lib/python3.9/site-packages/gradio/blocks.py", line 1485, in call_function
    prediction = await anyio.to_thread.run_sync(
  File "/home/bmontes/miniconda3/envs/whisper-coqui/lib/python3.9/site-packages/anyio/to_thread.py", line 56, in run_sync
    return await get_async_backend().run_sync_in_worker_thread(
  File "/home/bmontes/miniconda3/envs/whisper-coqui/lib/python3.9/site-packa

Keyboard interruption in main thread... closing server.


