In [1]:
import os
import numpy as np

import torch
import pandas as pd
import whisper

In [2]:
from TTS.api import TTS

In [3]:
# Run the models on the GPU when CUDA is usable, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

In [4]:
_TTS_MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"
# Loaded TTS models keyed by (model name, device). Keeping the instance around
# avoids re-instantiating the model on every synthesis request; the trade-off is
# that the model stays resident in memory for the life of the kernel.
_tts_model_cache = {}


def _get_tts_model(model_name=_TTS_MODEL_NAME):
    """Return the TTS model moved to DEVICE, constructing it only on first use."""
    key = (model_name, DEVICE)
    if key not in _tts_model_cache:
        _tts_model_cache[key] = TTS(model_name=model_name, progress_bar=True).to(DEVICE)
    return _tts_model_cache[key]


def generate_audio(text, speaker_wav_path, language, output_file_path):
    """Synthesize ``text`` as speech and write it to ``output_file_path``.

    Args:
        text: Text to speak. Must be non-empty.
        speaker_wav_path: Path to the audio file passed to the model as
            ``speaker_wav`` (the reference voice).
        language: Language code passed to the model as ``language``.
        output_file_path: Destination path of the generated audio file. Missing
            parent directories are created.

    Raises:
        ValueError: If ``text`` is empty, ``None`` or only whitespace.
    """
    if not text or not str(text).strip():
        raise ValueError("generate_audio() needs non-empty text to synthesize")

    # tts_to_file is given a plain path; make sure its directory exists first.
    output_dir = os.path.dirname(output_file_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    tts = _get_tts_model()
    # Generate the audio file from the text.
    tts.tts_to_file(text=text, speaker_wav=speaker_wav_path, language=language, file_path=output_file_path)


In [5]:
def supported_lang(iso_code):
    """Tell whether ``iso_code`` is one of the target languages handled here.

    Returns ``True`` for the lowercase codes 'en', 'fr', 'de', 'it' and 'ar',
    and ``False`` for anything else.
    """
    names_by_code = dict(
        en='English',
        fr='French',
        de='German',
        it='Italian',
        ar='Arabic',
    )
    is_known = iso_code in names_by_code
    return is_known

In [6]:
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
def whisper_transcribe_audio(model, audio_data_path):
    """Transcribe an audio file with a loaded Whisper model.

    The spoken language is detected first and then passed to the decoder.

    Args:
        model: A loaded Whisper model (as returned by ``whisper.load_model``).
        audio_data_path: Path of the audio file to transcribe.

    Returns:
        The transcribed text.

    Note:
        The audio goes through ``whisper.pad_or_trim``, which per the Whisper
        documentation fits it to a single 30-second window, so longer
        recordings are only transcribed up to that point.
    """
    audio = whisper.load_audio(audio_data_path)
    audio = whisper.pad_or_trim(audio)
    # Log-mel spectrogram: the audio representation the model consumes.
    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # Detect the spoken language once; each call runs the model on the
    # spectrogram, so the result is reused instead of being recomputed.
    _, probs = model.detect_language(mel)
    detected_language = max(probs, key=probs.get)
    print(f"Detected language: {detected_language}")

    # DecodingOptions defaults to fp16=True. Half precision is only requested
    # when the model lives on a CUDA device, since fp16 decoding on CPU is a
    # known source of errors.
    use_fp16 = str(model.device).startswith("cuda")

    # Give the decoder the detected source language (optional, but helps it).
    options = whisper.DecodingOptions(
        language=detected_language,
        task="transcribe",
        without_timestamps=True,
        fp16=use_fp16,
    )
    result = whisper.decode(model, mel, options)
    # Return the transcription.
    return result.text

In [8]:
# Translation pipelines keyed by target language code, so each
# Helsinki-NLP/opus-mt-es-<lang> model is loaded once instead of on every call.
_translator_cache = {}


def _get_translator(lang):
    """Return the Spanish-to-``lang`` translation pipeline, building it on first use."""
    if lang not in _translator_cache:
        _translator_cache[lang] = pipeline("translation", model=f"Helsinki-NLP/opus-mt-es-{lang}")
    return _translator_cache[lang]


def translation_from_esp_to_lang(lang, transcription):
    """Translate Spanish text into ``lang``.

    Args:
        lang: Target language code; must be accepted by ``supported_lang``.
        transcription: Spanish text to translate.

    Returns:
        The translated text, or an empty string when the language is not
        supported, there is nothing to translate, or the translation fails
        (the failure is printed). The function always returns a ``str``.
    """
    if not transcription or not supported_lang(lang):
        return ""
    try:
        translator = _get_translator(lang)
        result = translator(transcription)
        return result[0]['translation_text']
    except Exception as e:
        # Best effort: report the problem and hand back an empty translation
        # rather than None, so callers get the same type on every path.
        print("Error durante la traducción :", e)
        return ""

In [9]:
# Load the "base" Whisper checkpoint onto the selected device and report
# whether it is a multilingual or an English-only model.
model = whisper.load_model("base", device=DEVICE)
print("Model is {} ".format("multilingual" if model.is_multilingual else "English-only"))

Model is multilingual 


In [13]:
import gradio as gr
import os

def process_audio(audio_data_mic_path, audio_data_file_path, target_lang="en"):
    """Transcribe an audio clip, translate the text and speak the translation.

    Args:
        audio_data_mic_path: Path of the microphone recording, or ``None``.
        audio_data_file_path: Path of the uploaded audio file, or ``None``.
        target_lang: Target language code. ``None`` or an empty value falls
            back to ``"en"``.

    Returns:
        A tuple ``(transcription, translation, output_file_path)``. When no
        translation could be produced, ``translation`` is ``""`` and
        ``output_file_path`` is ``None`` (no audio is generated).

    Raises:
        gr.Error: If neither path points to a non-empty audio file.
    """
    # Use the first input that is an existing, non-empty file. Checking both
    # candidates the same way keeps the function correct regardless of which
    # UI component each argument is wired to.
    audio_data = None
    for candidate in (audio_data_mic_path, audio_data_file_path):
        if candidate and os.path.isfile(candidate) and os.path.getsize(candidate) > 0:
            audio_data = candidate
            break
    if audio_data is None:
        raise gr.Error("No se ha proporcionado ningún audio.")

    # A UI component may pass None explicitly, which bypasses the default above.
    target_lang = target_lang or "en"

    # Transcribe the audio.
    transcription = whisper_transcribe_audio(model, audio_data)
    print(transcription)

    # Translate the transcription into the target language.
    translation = translation_from_esp_to_lang(target_lang, transcription)
    if not translation:
        # Nothing to synthesize (unsupported language, empty transcription or
        # a translation failure): return the text results without audio.
        return transcription, "", None

    output_file_path = "output/output.wav"
    os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

    # The input clip doubles as the reference voice for the generated speech.
    generate_audio(translation, audio_data, target_lang, output_file_path)

    return transcription, translation, output_file_path

# Build the Gradio interface.
# The input components are listed in the same order as process_audio's
# positional parameters (microphone recording, uploaded file, target language),
# since the values are handed to `fn` in component order.
iface = gr.Interface(
    fn=process_audio,
    inputs=[
        gr.Audio(sources="microphone", type="filepath", label="Graba tu audio"),
        gr.Audio(sources="upload", type="filepath", label="Archivo de audio"),
        gr.Dropdown(
            ['en', 'fr', 'de', 'it', 'ar'],
            # Preselect a language: with no initial value the function would
            # receive None instead of its "en" default.
            value="en",
            label="Idioma a Traducir", info="Info adicional"
        ),
    ],
    outputs=[
        gr.Textbox(label="Transcripción"),
        gr.Textbox(label="Traducción"),
        gr.Audio(type="filepath", label="Audio generado")
    ],
    title="Audio y Transcripcion",
    description="Graba tu voz y transcribela a texto.",
    allow_flagging="never"
)

# Launch the interface.
iface.launch(debug=True)


TypeError: __init__() got an unexpected keyword argument 'enable_queue'

In [None]:
import gradio as gr
import os

def process_audio(audio_data_mic_path, audio_data_file_path, target_lang="en"):
    """Transcribe an audio clip, translate the text and speak the translation.

    Note: this cell defines ``process_audio`` a second time; when both cells
    are run, this later definition replaces the earlier one. It writes the
    generated audio to ``output.wav`` in the working directory.

    Args:
        audio_data_mic_path: Path of the microphone recording, or ``None``.
        audio_data_file_path: Path of the uploaded audio file, or ``None``.
        target_lang: Target language code. ``None`` or an empty value falls
            back to ``"en"``.

    Returns:
        A tuple ``(transcription, translation, output_file_path)``. When no
        translation could be produced, ``translation`` is ``""`` and
        ``output_file_path`` is ``None`` (no audio is generated).

    Raises:
        gr.Error: If neither path points to a non-empty audio file.
    """
    # Use the first input that is an existing, non-empty file. Checking both
    # candidates the same way keeps the function correct regardless of which
    # UI component each argument is wired to.
    audio_data = None
    for candidate in (audio_data_mic_path, audio_data_file_path):
        if candidate and os.path.isfile(candidate) and os.path.getsize(candidate) > 0:
            audio_data = candidate
            break
    if audio_data is None:
        raise gr.Error("No se ha proporcionado ningún audio.")

    # A UI component may pass None explicitly, which bypasses the default above.
    target_lang = target_lang or "en"

    # Transcribe the audio.
    transcription = whisper_transcribe_audio(model, audio_data)
    print(transcription)

    # Translate the transcription into the target language.
    translation = translation_from_esp_to_lang(target_lang, transcription)
    if not translation:
        # Nothing to synthesize (unsupported language, empty transcription or
        # a translation failure): return the text results without audio.
        return transcription, "", None

    output_file_path = "output.wav"

    # The input clip doubles as the reference voice for the generated speech.
    generate_audio(translation, audio_data, target_lang, output_file_path)

    return transcription, translation, output_file_path

# Build the Gradio interface.
# The input components are listed in the same order as process_audio's
# positional parameters (microphone recording, uploaded file, target language),
# since the values are handed to the function in component order.
iface = gr.Interface(
    process_audio,
    [
      gr.Audio(sources="microphone", type="filepath", label="Graba tu audio"),
      gr.Audio(sources="upload", type="filepath", label="Archivo de audio"),
      gr.Dropdown(
              ['en', 'fr', 'de', 'it', 'ar'],
              # Preselect a language: with no initial value the function would
              # receive None instead of its "en" default.
              value="en",
              label="Idioma a Traducir", info="Info adicional"
      ),
    ],
    [
        "text", "text", "audio"
    ],
    title="Audio y Transcripcion",
    description="Graba tu voz y transcribela a texto."
)

# Launch the interface.
iface.launch(debug=True)
