<a href="https://colab.research.google.com/github/Andrescob/Audio_video-a-Texto/blob/main/Audio_a_Texto_Vosk.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install vosk
import subprocess  # For ffmpeg
import vosk
import os

# 0. Download the model if it doesn't exist
model_name = "vosk-model-en-us-0.22"
model_dir = model_name
if not os.path.exists(model_dir):
    print(f"Downloading model '{model_name}'...")
    !wget https://alphacephei.com/vosk/models/vosk-model-en-us-0.22.zip
    !unzip vosk-model-en-us-0.22.zip
    print("Model downloaded and extracted.")
else:
    print(f"Model '{model_name}' already exists.")

# 1. Audio Extraction (using ffmpeg)
try:
    # Fail early with a clear message when the source video is missing.
    if not os.path.exists('input.mp4'):
        raise FileNotFoundError("Error: input.mp4 not found in the current directory.")

    # -y      overwrite an audio.wav left by a previous run instead of aborting on it
    # -vn     drop the video stream
    # -ac 1   downmix to mono, which the recognizer below expects
    # -ar 16000 / pcm_s16le   16 kHz, 16-bit PCM
    subprocess.run(
        ['ffmpeg', '-y', '-i', 'input.mp4', '-vn', '-acodec', 'pcm_s16le',
         '-ac', '1', '-ar', '16000', 'audio.wav'],
        check=True,
        capture_output=True,
    )

    # Defensive check: ffmpeg returned success, so the output file should exist.
    if not os.path.exists('audio.wav'):
        raise FileNotFoundError("Error: audio.wav was not created. Please check the ffmpeg output for errors.")

except FileNotFoundError as e:
    # Also reached when the ffmpeg executable itself cannot be found.
    print(e)
except subprocess.CalledProcessError as e:
    print(f"FFmpeg Error: {e}")
    # errors='replace' keeps the report readable even if stderr is not valid UTF-8.
    print(f"FFmpeg Stderr: {e.stderr.decode(errors='replace')}")
    print("Please make sure input.mp4 exists and is a valid video file. You may need to install additional codecs or packages for ffmpeg.")


# 2. (Preprocessing, if necessary)

# 3. Speech-to-Text (using Vosk)
import wave  # standard library; parses the WAV container so header bytes are not fed to the recognizer

model = vosk.Model(model_dir)  # load the model from the extracted directory
try:
    with wave.open("audio.wav", "rb") as wf:
        if wf.getnchannels() != 1:
            print("Warning: audio.wav is not mono; recognition quality may suffer.")
        # Configure the recognizer with the sample rate stored in the file.
        rec = vosk.KaldiRecognizer(model, wf.getframerate())
        while True:
            # 2000 frames of 16-bit mono audio = 4000 bytes per chunk.
            data = wf.readframes(2000)
            if len(data) == 0:
                break
            # AcceptWaveform returns True when a full utterance has been decoded.
            if rec.AcceptWaveform(data):
                print(rec.Result())
        # Flush whatever is still buffered after the last chunk.
        print(rec.FinalResult())
except FileNotFoundError as e:
    print(f"Error: {e}")
    print("Please make sure 'audio.wav' was created in the step above.  Check for ffmpeg errors above")

Model 'vosk-model-en-us-0.22' already exists.
Error: input.mp4 not found in the current directory.


In [None]:
!pip install ipywidgets
from ipywidgets import FileUpload
from IPython.display import display
import os

def handle_upload(change):
    """Save the file delivered by the FileUpload widget to the working directory.

    Works with both value layouts of the widget:
    ipywidgets 7.x gives ``{filename: {'metadata': {'name': ...}, 'content': bytes}}``
    and ipywidgets 8.x gives a tuple of ``{'name': ..., 'content': memoryview}``.

    Args:
        change: The traitlets change notification; ``change['new']`` is the
            widget's new ``value``.

    Returns:
        The name of the file that was written, or ``None`` when the new value
        holds no file (for example after the widget was reset).
    """
    value = change['new']
    if isinstance(value, dict):
        entries = list(value.values())
    else:
        entries = list(value or ())
    if not entries:
        return None

    uploaded_file = entries[0]
    content = uploaded_file['content']
    if 'name' in uploaded_file:
        file_name = uploaded_file['name']
    else:
        file_name = uploaded_file['metadata']['name']
    # Keep only the final path component so the file always lands in the working directory.
    file_name = os.path.basename(file_name)

    with open(file_name, 'wb') as f:
        f.write(content)
    print(f'Archivo "{file_name}" cargado exitosamente.')

    # Optional: return the file name so it can be used later.
    return file_name

# Create the upload widget (one .mp4/.wav file at a time) and run handle_upload
# every time its value changes.
uploader = FileUpload(multiple=False, accept='.mp4,.wav')
uploader.observe(handle_upload, names='value')
display(uploader)

In [None]:
!pip install vosk
!pip install python-docx
!pip install langdetect
from ipywidgets import FileUpload
from IPython.display import display
import os
import subprocess
import vosk
from docx import Document
from docx.shared import Inches
from langdetect import detect


# 0. Download the model if it doesn't exist (using english as default model)
model_name = "vosk-model-en-us-0.22"
model_dir = model_name
if not os.path.exists(model_dir):
    print(f"Downloading model '{model_name}'...")
    !wget https://alphacephei.com/vosk/models/vosk-model-en-us-0.22.zip
    !unzip vosk-model-en-us-0.22.zip
    print("Model downloaded and extracted.")
else:
    print(f"Model '{model_name}' already exists.")

def get_vosk_model(language):
    """Return a loaded Vosk model for ``language``, downloading it when missing.

    Args:
        language: Language code such as "es" or "fr". Codes without an entry
            in the table below use the English model.

    Returns:
        A ``vosk.Model`` loaded from the model directory in the current
        working directory.

    The download runs through ``subprocess`` with ``check=True`` so a failed
    ``wget``/``unzip`` is actually detected; when that happens the English
    model is used instead.
    """
    default_model = "vosk-model-en-us-0.22"
    models_by_language = {
        "es": "vosk-model-es-0.42",
        "fr": "vosk-model-fr-0.22",
        "de": "vosk-model-de-0.22",
        "pt": "vosk-model-pt-fb-v0.1.1-20200508",
        "it": "vosk-model-it-0.22",
        "ru": "vosk-model-ru-0.42",
        "zh": "vosk-model-cn-0.22",
    }

    def _ensure_model(name):
        """Make sure directory ``name`` exists; return True on success."""
        if os.path.exists(name):
            print(f"Model '{name}' already exists.")
            return True
        print(f"Downloading model '{name}'...")
        archive = f"{name}.zip"
        try:
            # -O pins the output name so a leftover partial archive is replaced.
            subprocess.run(
                ["wget", "-q", "-O", archive, f"https://alphacephei.com/vosk/models/{archive}"],
                check=True,
            )
            # -o overwrites without prompting, which would otherwise block the cell.
            subprocess.run(["unzip", "-q", "-o", archive], check=True)
        except (subprocess.CalledProcessError, OSError):
            return False
        if not os.path.exists(name):
            return False
        print("Model downloaded and extracted.")
        return True

    model_name = models_by_language.get(language, default_model)
    if not _ensure_model(model_name) and model_name != default_model:
        print(f'Error downloading model: {model_name}, using english model instead')
        model_name = default_model
        _ensure_model(model_name)

    # If even the English model is unavailable, vosk.Model raises here.
    return vosk.Model(model_name)

def create_word_document(text, filename):
    """Write ``text`` as one paragraph into a new .docx file and show a link to it.

    Args:
        text: Text placed in a single paragraph.
        filename: Path the document is saved to.
    """
    docx_file = Document()
    docx_file.add_paragraph(text)
    docx_file.save(filename)

    # Show a link to the saved file in the cell output.
    from IPython.display import FileLink
    download_link = FileLink(filename)
    display(download_link)


def transcribe_audio(audio_file, language):
    """Transcribe an audio or video file to text with Vosk.

    Args:
        audio_file: Path to the input. ``.mp4`` files are first converted to
            ``audio.wav`` with ffmpeg; any other file is read as-is.
        language: Language code passed on to ``get_vosk_model``.

    Returns:
        The recognized text, or ``None`` when the input is missing, the ffmpeg
        conversion fails, or the audio file cannot be opened.
    """
    # json is used below but is not part of this cell's import block.
    import json

    temp_wav = 'audio.wav'
    converted = False  # True only when this call created temp_wav

    # 1. Extract audio (using ffmpeg)
    try:
        if not os.path.exists(audio_file):
            raise FileNotFoundError(f"Error: {audio_file} not found in the current directory.")

        if audio_file.lower().endswith('.mp4'):
            # -y overwrites a leftover audio.wav; -ac 1 downmixes to mono for Vosk.
            subprocess.run(
                ['ffmpeg', '-y', '-i', audio_file, '-vn', '-acodec', 'pcm_s16le',
                 '-ac', '1', '-ar', '16000', temp_wav],
                check=True,
                capture_output=True,
            )
            converted = True
            audio_for_transcription = temp_wav
        else:
            audio_for_transcription = audio_file

        if converted and not os.path.exists(temp_wav):
            raise FileNotFoundError("Error: audio.wav was not created. Please check the ffmpeg output for errors.")

    except FileNotFoundError as e:
        print(e)
        return None
    except subprocess.CalledProcessError as e:
        print(f"FFmpeg Error: {e}")
        print(f"FFmpeg Stderr: {e.stderr.decode(errors='replace')}")
        print("Please make sure input.mp4 exists and is a valid video file. You may need to install additional codecs or packages for ffmpeg.")
        return None

    # 2. Speech-to-Text (using Vosk)
    try:
        model = get_vosk_model(language)
        rec = vosk.KaldiRecognizer(model, 16000)
        try:
            text_output = ""
            with open(audio_for_transcription, "rb") as wf:
                while True:
                    data = wf.read(4000)
                    if len(data) == 0:
                        break
                    # True means a complete utterance is ready to be collected.
                    if rec.AcceptWaveform(data):
                        text_output += json.loads(rec.Result())['text'] + " "
                text_output += json.loads(rec.FinalResult())['text']
            return text_output
        except FileNotFoundError as e:
            print(f"Error: {e}")
            print("Please make sure 'audio.wav' was created in the step above.  Check for ffmpeg errors above")
            return None
    finally:
        # Remove only the temporary file this call produced, on success or failure.
        if converted and os.path.exists(temp_wav):
            os.remove(temp_wav)


def handle_upload(change):
    """Widget callback: save the upload, transcribe it and export a .docx.

    Accepts both value layouts of the FileUpload widget:
    ipywidgets 7.x gives ``{filename: {'metadata': {'name': ...}, 'content': bytes}}``
    and ipywidgets 8.x gives a tuple of ``{'name': ..., 'content': memoryview}``.

    Args:
        change: The traitlets change notification; ``change['new']`` is the
            widget's new ``value``.

    The uploaded file is written to the working directory and always removed
    again once processing ends, whether it succeeded or not.
    """
    value = change['new']
    if isinstance(value, dict):
        entries = list(value.values())
    else:
        entries = list(value or ())
    if not entries:
        return  # value was cleared; nothing to process

    uploaded_file = entries[0]
    content = uploaded_file['content']
    if 'name' in uploaded_file:
        file_name = uploaded_file['name']
    else:
        file_name = uploaded_file['metadata']['name']
    # Keep only the final path component so the file lands in the working directory.
    file_name = os.path.basename(file_name)

    with open(file_name, 'wb') as f:
        f.write(content)
    print(f'Archivo "{file_name}" cargado exitosamente.')

    try:
        # NOTE(review): langdetect is a text classifier; here it only sees the first
        # 1000 raw bytes of a media file, so the guess is unreliable. Confirm whether
        # the language should instead be chosen by the user.
        try:
            with open(file_name, 'rb') as media:
                sample = media.read(1000).decode('utf-8', 'ignore')
            language = detect(sample)
        except Exception as detect_error:
            print(f'Language detection failed ({detect_error}); defaulting to English.')
            language = 'en'
        print(f'Detected language: {language}')
        text = transcribe_audio(file_name, language)

        if text:
            doc_filename = os.path.splitext(file_name)[0] + ".docx"
            create_word_document(text, doc_filename)
            print(f'Transcription saved to {doc_filename}')
        else:
            print('Transcription failed')
    except Exception as e:
        print('Error during transcription', e)
    finally:
        # Remove the temporary upload; guarded so cleanup itself cannot raise.
        if os.path.exists(file_name):
            os.remove(file_name)


# Create the upload widget (one .mp4/.wav file at a time) and run handle_upload
# every time its value changes.
uploader = FileUpload(multiple=False, accept='.mp4,.wav')
uploader.observe(handle_upload, names='value')
display(uploader)

Model 'vosk-model-en-us-0.22' already exists.


FileUpload(value={}, accept='.mp4,.wav', description='Upload')

In [None]:
!pip install vosk
!pip install python-docx
!pip install langdetect
!pip install librosa
import os
import subprocess
import vosk
import json
import librosa
from docx import Document
from IPython.display import display, FileLink
from ipywidgets import FileUpload
from langdetect import detect
import asyncio

# Global dictionary to store loaded models
loaded_models = {}

# 0. Download the model if it doesn't exist (using english as default model)
model_name = "vosk-model-en-us-0.22"
model_dir = model_name
if not os.path.exists(model_dir):
    print(f"Downloading model '{model_name}'...")
    !wget https://alphacephei.com/vosk/models/vosk-model-en-us-0.22.zip
    !unzip vosk-model-en-us-0.22.zip
    print("Model downloaded and extracted.")
else:
    print(f"Model '{model_name}' already exists.")


async def get_vosk_model(language):
    """Return a loaded Vosk model for ``language``, downloading it when missing.

    Loaded models are cached in the global ``loaded_models`` dictionary, keyed
    by the requested language code.

    Args:
        language: Language code such as "es" or "fr". Codes without an entry
            in the table below use the English model.

    Returns:
        A ``vosk.Model`` loaded from the model directory in the current
        working directory.

    The download runs through ``subprocess`` with ``check=True`` so a failed
    ``wget``/``unzip`` is actually detected; when that happens the English
    model is used instead.
    """
    global loaded_models
    if language in loaded_models:
        return loaded_models[language]

    default_model = "vosk-model-en-us-0.22"
    models_by_language = {
        "es": "vosk-model-es-0.42",
        "fr": "vosk-model-fr-0.22",
        "de": "vosk-model-de-0.22",
        "pt": "vosk-model-pt-fb-v0.1.1-20200508",
        "it": "vosk-model-it-0.22",
        "ru": "vosk-model-ru-0.42",
        "zh": "vosk-model-cn-0.22",
    }

    def _ensure_model(name):
        """Make sure directory ``name`` exists; return None on success, else the error."""
        if os.path.exists(name):
            print(f"Model '{name}' already exists.")
            return None
        print(f"Downloading model '{name}'...")
        archive = f"{name}.zip"
        try:
            # -O pins the output name so a leftover partial archive is replaced.
            subprocess.run(
                ["wget", "-q", "-O", archive, f"https://alphacephei.com/vosk/models/{archive}"],
                check=True,
            )
            # -o overwrites without prompting, which would otherwise block the cell.
            subprocess.run(["unzip", "-q", "-o", archive], check=True)
        except (subprocess.CalledProcessError, OSError) as download_error:
            return download_error
        if not os.path.exists(name):
            return FileNotFoundError(f"{name} missing after unzip")
        print("Model downloaded and extracted.")
        return None

    model_name = models_by_language.get(language, default_model)
    e = _ensure_model(model_name)
    if e is not None and model_name != default_model:
        print(f'Error downloading model: {model_name}, using english model instead, {e}')
        model_name = default_model
        _ensure_model(model_name)

    # If even the English model is unavailable, vosk.Model raises here.
    model = vosk.Model(model_name)
    loaded_models[language] = model  # cache the loaded model
    return model


def create_word_document(text, filename):
    """Write ``text`` as one paragraph into a new .docx file and show a link to it.

    Args:
        text: Text placed in a single paragraph.
        filename: Path the document is saved to.
    """
    docx_file = Document()
    docx_file.add_paragraph(text)
    docx_file.save(filename)

    # Show a link to the saved file in the cell output.
    download_link = FileLink(filename)
    display(download_link)



async def transcribe_audio(audio_file, language):
    """Transcribe an audio or video file to text with Vosk.

    Args:
        audio_file: Path to the input. Anything that is not a ``.wav`` file is
            first converted to ``audio.wav`` with ffmpeg.
        language: Language code passed on to ``get_vosk_model``.

    Returns:
        The recognized text, or ``None`` when the input is missing, the ffmpeg
        conversion fails, or the audio file cannot be opened.
    """
    temp_wav = 'audio.wav'
    converted = False  # True only when this call created temp_wav

    # 1. Extract audio (using ffmpeg)
    try:
        if not os.path.exists(audio_file):
            raise FileNotFoundError(f"Error: {audio_file} not found in the current directory.")

        audio_for_transcription = audio_file  # default: transcribe the input directly
        if not audio_file.lower().endswith('.wav'):
            # Always re-extract (-y): reusing an audio.wav left by another upload
            # would transcribe the wrong recording. -ac 1 downmixes to mono.
            subprocess.run(
                ['ffmpeg', '-y', '-i', audio_file, '-vn', '-acodec', 'pcm_s16le',
                 '-ac', '1', '-ar', '16000', temp_wav],
                check=True,
                capture_output=True,
            )
            converted = True
            audio_for_transcription = temp_wav

        if converted and not os.path.exists(temp_wav):
            raise FileNotFoundError("Error: audio.wav was not created. Please check the ffmpeg output for errors.")

    except FileNotFoundError as e:
        print(e)
        return None
    except subprocess.CalledProcessError as e:
        print(f"FFmpeg Error: {e}")
        print(f"FFmpeg Stderr: {e.stderr.decode(errors='replace')}")
        print("Please make sure input.mp4 exists and is a valid video file. You may need to install additional codecs or packages for ffmpeg.")
        return None

    # 2. Speech-to-Text (using Vosk)
    try:
        model = await get_vosk_model(language)
        rec = vosk.KaldiRecognizer(model, 16000)
        try:
            # Load the samples at 16 kHz to match the recognizer configuration.
            y, _ = librosa.load(audio_for_transcription, sr=16000)
            # Clip before the cast: a sample of +1.0 scales to 32768, which wraps in int16.
            pcm = (y * 32768).clip(-32768, 32767).astype("int16").tobytes()

            # Feed the audio in chunks so finished utterances are collected as they
            # complete instead of pushing the whole file through a single call.
            chunk_bytes = 8000  # 4000 16-bit samples
            text_output = ""
            for start in range(0, len(pcm), chunk_bytes):
                if rec.AcceptWaveform(pcm[start:start + chunk_bytes]):
                    text_output += json.loads(rec.Result())['text'] + " "
            text_output += json.loads(rec.FinalResult())['text']
            return text_output
        except FileNotFoundError as e:
            print(f"Error: {e}")
            print("Please make sure 'audio.wav' was created in the step above.  Check for ffmpeg errors above")
            return None
    finally:
        # Remove only the temporary file this call produced, on success or failure.
        if converted and os.path.exists(temp_wav):
            os.remove(temp_wav)


async def handle_upload(change):
    """Widget callback: save the upload, transcribe it and export a .docx.

    Accepts both value layouts of the FileUpload widget:
    ipywidgets 7.x gives ``{filename: {'metadata': {'name': ...}, 'content': bytes}}``
    and ipywidgets 8.x gives a tuple of ``{'name': ..., 'content': memoryview}``.

    Args:
        change: The traitlets change notification; ``change['new']`` is the
            widget's new ``value``.

    The uploaded file is written to the working directory and always removed
    again once processing ends, whether it succeeded or not.
    """
    value = change['new']
    if isinstance(value, dict):
        entries = list(value.values())
    else:
        entries = list(value or ())
    if not entries:
        return  # value was cleared; nothing to process

    uploaded_file = entries[0]
    content = uploaded_file['content']
    if 'name' in uploaded_file:
        file_name = uploaded_file['name']
    else:
        file_name = uploaded_file['metadata']['name']
    # Keep only the final path component so the file lands in the working directory.
    file_name = os.path.basename(file_name)

    with open(file_name, 'wb') as f:
        f.write(content)
    print(f'Archivo "{file_name}" cargado exitosamente.')

    try:
        # NOTE(review): langdetect is a text classifier; here it only sees the first
        # 1000 raw bytes of a media file, so the guess is unreliable. Confirm whether
        # the language should instead be chosen by the user.
        try:
            with open(file_name, 'rb') as media:
                sample = media.read(1000).decode('utf-8', 'ignore')
            language = detect(sample)
        except Exception as detect_error:
            print(f'Language detection failed ({detect_error}); defaulting to English.')
            language = 'en'
        print(f'Detected language: {language}')
        text = await transcribe_audio(file_name, language)

        if text:
            doc_filename = os.path.splitext(file_name)[0] + ".docx"
            create_word_document(text, doc_filename)
            print(f'Transcription saved to {doc_filename}')
        else:
            print('Transcription failed')
    except Exception as e:
        print('Error during transcription', e)
    finally:
        # Remove the temporary upload; guarded so cleanup itself cannot raise.
        if os.path.exists(file_name):
            os.remove(file_name)


# Tasks started from the widget callback; referenced here so they are not
# garbage-collected before they finish.
_upload_tasks = set()


def _on_upload(change):
    """Synchronous widget callback that starts the async ``handle_upload``.

    ``asyncio.run`` raises RuntimeError when an event loop is already running,
    so in that case the coroutine is scheduled on the running loop instead.
    """
    try:
        loop = asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running: drive the coroutine to completion here.
        asyncio.run(handle_upload(change))
        return
    task = loop.create_task(handle_upload(change))
    _upload_tasks.add(task)
    task.add_done_callback(_upload_tasks.discard)


uploader = FileUpload(accept='.mp4,.wav', multiple=False)
uploader.observe(_on_upload, names='value')
display(uploader)

In [5]:
import os
import vosk
import json
import librosa
import ffmpeg
import numpy as np
from docx import Document
from langdetect import detect
from google.colab import files

# Global cache of loaded Vosk models, keyed by language code
loaded_models = {}

# Return the Vosk model for a language without downloading or loading it twice
def get_vosk_model(language):
    """Return the Vosk model for ``language``, downloading it on first use.

    Args:
        language: Language code such as "es"; codes missing from the table
            below use the English model.

    Returns:
        The ``vosk.Model`` cached in ``loaded_models`` under ``language``.

    Raises:
        RuntimeError: If the download or extraction fails, so the caller sees
            the real cause instead of a model-loading error on a missing folder.
    """
    if language in loaded_models:
        return loaded_models[language]

    models = {
        "es": "vosk-model-es-0.42",
        "fr": "vosk-model-fr-0.22",
        "de": "vosk-model-de-0.22",
        "pt": "vosk-model-pt-fb-v0.1.1-20200508",
        "it": "vosk-model-it-0.22",
        "ru": "vosk-model-ru-0.42",
        "zh": "vosk-model-cn-0.22",
        "default": "vosk-model-en-us-0.22"
    }

    model_name = models.get(language, models["default"])
    model_path = f"/content/{model_name}"

    if not os.path.exists(model_path):
        print(f"Descargando modelo {model_name}...")
        archive = f"{model_name}.zip"
        # -O pins the output name so a leftover partial archive is replaced
        # instead of wget saving the new download under a ".1" suffix.
        status = os.system(
            f"wget -q -O {archive} https://alphacephei.com/vosk/models/{archive} "
            f"&& unzip -q {archive} -d /content"
        )
        # The archive is not needed once extracted (or once the attempt failed).
        if os.path.exists(archive):
            os.remove(archive)
        if status != 0 or not os.path.exists(model_path):
            raise RuntimeError(
                f"No se pudo descargar o descomprimir el modelo {model_name} (estado {status})."
            )

    loaded_models[language] = vosk.Model(model_path)
    return loaded_models[language]

# Transcribe an audio file with Vosk
def transcribe_audio(audio_file, language="es"):
    """Transcribe ``audio_file`` and return the recognized text.

    Args:
        audio_file: Path of the file to load with librosa.
        language: Language code used to pick the Vosk model (default "es").

    Returns:
        The recognized text with surrounding whitespace stripped.
    """
    model = get_vosk_model(language)
    rec = vosk.KaldiRecognizer(model, 16000)

    # Load the samples at 16 kHz to match the recognizer configuration.
    y, _ = librosa.load(audio_file, sr=16000)
    # Clip before the cast: a sample of +1.0 scales to 32768, which wraps in int16.
    audio_data = np.clip(y * 32768, -32768, 32767).astype(np.int16).tobytes()

    # Feed the recognizer in fixed-size chunks.
    chunk_size = 8000  # bytes: 4000 16-bit samples, i.e. 0.25 s at 16 kHz
    text_output = ""
    for i in range(0, len(audio_data), chunk_size):
        # True means a complete utterance is ready to be collected.
        if rec.AcceptWaveform(audio_data[i:i+chunk_size]):
            text_output += json.loads(rec.Result())["text"] + " "
    text_output += json.loads(rec.FinalResult())["text"]

    return text_output.strip()

# Save the transcription to a Word file and offer it for download
def create_word_document(text, filename):
    """Write ``text`` as one paragraph into ``filename`` and start a browser download.

    Args:
        text: Text placed in a single paragraph.
        filename: Path the .docx document is saved to.
    """
    docx_file = Document()
    docx_file.add_paragraph(text)
    docx_file.save(filename)
    files.download(filename)

# Main flow: upload files and transcribe each one
def handle_upload():
    """Ask for file uploads and turn every uploaded file into a .docx transcript.

    Each file is passed to ``transcribe_audio``; a non-empty result is saved via
    ``create_word_document`` under the same name with its last extension
    replaced by ``.docx``. A failure on one file is printed and the loop moves
    on to the next file.
    """
    uploaded = files.upload()
    for filename in uploaded:
        print(f"Procesando {filename}...")
        try:
            text = transcribe_audio(filename)
            if not text:
                print("Error: No se pudo obtener la transcripción.")
                continue
            # Drop only the last extension ("a.b.mp4" -> "a.b"); names without a dot stay whole.
            stem, dot, _ = filename.rpartition('.')
            doc_filename = (stem if dot else filename) + ".docx"
            create_word_document(text, doc_filename)
            print(f"Transcripción guardada en {doc_filename}")
        except Exception as e:
            print(f"Error procesando {filename}: {e}")

# Entry point: show the prompt, then start the upload-and-transcribe flow
upload_prompt = "Sube un archivo .mp4 o .wav para transcribir:"
display(upload_prompt)
handle_upload()


ModuleNotFoundError: No module named 'ffmpeg'

In [6]:
!pip install ffmpeg-python
import os
import vosk
import json
import librosa
import ffmpeg as ff # Changed import
import numpy as np
from docx import Document
from langdetect import detect
from google.colab import files

# Global cache of loaded Vosk models, keyed by language code
loaded_models = {}

# Return the Vosk model for a language without downloading or loading it twice
def get_vosk_model(language):
    """Return the Vosk model for ``language``, downloading it on first use.

    Args:
        language: Language code such as "es"; codes missing from the table
            below use the English model.

    Returns:
        The ``vosk.Model`` cached in ``loaded_models`` under ``language``.

    Raises:
        RuntimeError: If the download or extraction fails, so the caller sees
            the real cause instead of a model-loading error on a missing folder.
    """
    if language in loaded_models:
        return loaded_models[language]

    models = {
        "es": "vosk-model-es-0.42",
        "fr": "vosk-model-fr-0.22",
        "de": "vosk-model-de-0.22",
        "pt": "vosk-model-pt-fb-v0.1.1-20200508",
        "it": "vosk-model-it-0.22",
        "ru": "vosk-model-ru-0.42",
        "zh": "vosk-model-cn-0.22",
        "default": "vosk-model-en-us-0.22"
    }

    model_name = models.get(language, models["default"])
    model_path = f"/content/{model_name}"

    if not os.path.exists(model_path):
        print(f"Descargando modelo {model_name}...")
        archive = f"{model_name}.zip"
        # -O pins the output name so a leftover partial archive is replaced
        # instead of wget saving the new download under a ".1" suffix.
        status = os.system(
            f"wget -q -O {archive} https://alphacephei.com/vosk/models/{archive} "
            f"&& unzip -q {archive} -d /content"
        )
        # The archive is not needed once extracted (or once the attempt failed).
        if os.path.exists(archive):
            os.remove(archive)
        if status != 0 or not os.path.exists(model_path):
            raise RuntimeError(
                f"No se pudo descargar o descomprimir el modelo {model_name} (estado {status})."
            )

    loaded_models[language] = vosk.Model(model_path)
    return loaded_models[language]

# Transcribe an audio file with Vosk
def transcribe_audio(audio_file, language="es"):
    """Transcribe ``audio_file`` and return the recognized text.

    Args:
        audio_file: Path of the file to load with librosa.
        language: Language code used to pick the Vosk model (default "es").

    Returns:
        The recognized text with surrounding whitespace stripped.
    """
    model = get_vosk_model(language)
    rec = vosk.KaldiRecognizer(model, 16000)

    # Load the samples at 16 kHz to match the recognizer configuration.
    y, _ = librosa.load(audio_file, sr=16000)
    # Clip before the cast: a sample of +1.0 scales to 32768, which wraps in int16.
    audio_data = np.clip(y * 32768, -32768, 32767).astype(np.int16).tobytes()

    # Feed the recognizer in fixed-size chunks.
    chunk_size = 8000  # bytes: 4000 16-bit samples, i.e. 0.25 s at 16 kHz
    text_output = ""
    for i in range(0, len(audio_data), chunk_size):
        # True means a complete utterance is ready to be collected.
        if rec.AcceptWaveform(audio_data[i:i+chunk_size]):
            text_output += json.loads(rec.Result())["text"] + " "
    text_output += json.loads(rec.FinalResult())["text"]

    return text_output.strip()

# Save the transcription to a Word file and offer it for download
def create_word_document(text, filename):
    """Write ``text`` as one paragraph into ``filename`` and start a browser download.

    Args:
        text: Text placed in a single paragraph.
        filename: Path the .docx document is saved to.
    """
    docx_file = Document()
    docx_file.add_paragraph(text)
    docx_file.save(filename)
    files.download(filename)

# Main flow: upload files and transcribe each one
def handle_upload():
    """Ask for file uploads and turn every uploaded file into a .docx transcript.

    Each file is passed to ``transcribe_audio``; a non-empty result is saved via
    ``create_word_document`` under the same name with its last extension
    replaced by ``.docx``. A failure on one file is printed and the loop moves
    on to the next file.
    """
    uploaded = files.upload()
    for filename in uploaded:
        print(f"Procesando {filename}...")
        try:
            text = transcribe_audio(filename)
            if not text:
                print("Error: No se pudo obtener la transcripción.")
                continue
            # Drop only the last extension ("a.b.mp4" -> "a.b"); names without a dot stay whole.
            stem, dot, _ = filename.rpartition('.')
            doc_filename = (stem if dot else filename) + ".docx"
            create_word_document(text, doc_filename)
            print(f"Transcripción guardada en {doc_filename}")
        except Exception as e:
            print(f"Error procesando {filename}: {e}")

# Entry point: show the prompt, then start the upload-and-transcribe flow
upload_prompt = "Sube un archivo .mp4 o .wav para transcribir:"
display(upload_prompt)
handle_upload()

Collecting ffmpeg-python
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Downloading ffmpeg_python-0.2.0-py3-none-any.whl (25 kB)
Installing collected packages: ffmpeg-python
Successfully installed ffmpeg-python-0.2.0


'Sube un archivo .mp4 o .wav para transcribir:'

Saving Vídeo sin título ‐ Hecho con Clipchamp (2).srt to Vídeo sin título ‐ Hecho con Clipchamp (2).srt
Procesando Vídeo sin título ‐ Hecho con Clipchamp (2).srt...
Descargando modelo vosk-model-es-0.42...
Error procesando Vídeo sin título ‐ Hecho con Clipchamp (2).srt: 


  y, sr = librosa.load(audio_file, sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


In [7]:
import os
import vosk
import json
import ffmpeg
import numpy as np
import soundfile as sf
from docx import Document
from langdetect import detect
from google.colab import files

# Global cache of loaded Vosk models, keyed by language code
loaded_models = {}

# Return the Vosk model for a language without downloading or loading it twice
def get_vosk_model(language):
    """Return the Vosk model for ``language``, downloading it on first use.

    Args:
        language: Language code such as "es"; codes missing from the table
            below use the English model.

    Returns:
        The ``vosk.Model`` cached in ``loaded_models`` under ``language``.

    Raises:
        RuntimeError: If the download or extraction fails, so the caller sees
            the real cause instead of a model-loading error on a missing folder.
    """
    if language in loaded_models:
        return loaded_models[language]

    models = {
        "es": "vosk-model-es-0.42",
        "fr": "vosk-model-fr-0.22",
        "de": "vosk-model-de-0.22",
        "pt": "vosk-model-pt-fb-v0.1.1-20200508",
        "it": "vosk-model-it-0.22",
        "ru": "vosk-model-ru-0.42",
        "zh": "vosk-model-cn-0.22",
        "default": "vosk-model-en-us-0.22"
    }

    model_name = models.get(language, models["default"])
    model_path = f"/content/{model_name}"

    if not os.path.exists(model_path):
        print(f"Descargando modelo {model_name}...")
        archive = f"{model_name}.zip"
        # -O pins the output name so a leftover partial archive is replaced
        # instead of wget saving the new download under a ".1" suffix.
        status = os.system(
            f"wget -q -O {archive} https://alphacephei.com/vosk/models/{archive} "
            f"&& unzip -q {archive} -d /content"
        )
        # The archive is not needed once extracted (or once the attempt failed).
        if os.path.exists(archive):
            os.remove(archive)
        if status != 0 or not os.path.exists(model_path):
            raise RuntimeError(
                f"No se pudo descargar o descomprimir el modelo {model_name} (estado {status})."
            )

    loaded_models[language] = vosk.Model(model_path)
    return loaded_models[language]

# Convert the input to WAV when it is not a WAV file already
def convert_to_wav(audio_file):
    """Return the path of a WAV version of ``audio_file``.

    Files whose name ends in ``.wav`` (any letter case) are returned unchanged.
    Anything else is converted with ffmpeg to 16 kHz mono 16-bit PCM, written
    next to the input with the extension replaced by ``.wav``.

    Args:
        audio_file: Path of the uploaded audio or video file.

    Returns:
        Path of the WAV file to transcribe.
    """
    if audio_file.lower().endswith(".wav"):
        return audio_file

    # splitext only looks at the final path component, so a dot in a directory
    # name ("takes.v2/clip") cannot truncate the path.
    wav_file = os.path.splitext(audio_file)[0] + ".wav"
    print(f"Convirtiendo {audio_file} a formato WAV...")
    ffmpeg.input(audio_file).output(wav_file, format='wav', acodec='pcm_s16le', ac=1, ar='16000').run(overwrite_output=True)
    return wav_file

# Transcribe an audio/video file with Vosk
def transcribe_audio(audio_file, language="es"):
    """Transcribe ``audio_file`` and return the recognized text.

    The input is first passed through ``convert_to_wav``. The recognizer is
    created with the sample rate stored in the WAV file, and multi-channel
    audio is averaged down to one channel, so WAV uploads that are not already
    16 kHz mono are handled as well.

    Args:
        audio_file: Path of the uploaded audio or video file.
        language: Language code used to pick the Vosk model (default "es").

    Returns:
        The recognized text with surrounding whitespace stripped.
    """
    model = get_vosk_model(language)
    audio_file = convert_to_wav(audio_file)

    with sf.SoundFile(audio_file) as sf_file:
        sample_rate = sf_file.samplerate
        # always_2d gives shape (frames, channels) for mono and stereo alike.
        frames = sf_file.read(dtype='int16', always_2d=True)

    frames = np.asarray(frames, dtype=np.int16)
    if frames.shape[1] > 1:
        # Average in a wider type so the sum of the channels cannot overflow int16.
        frames = frames.astype(np.int32).mean(axis=1).astype(np.int16)
    audio_data = frames.reshape(-1).tobytes()

    rec = vosk.KaldiRecognizer(model, sample_rate)

    # Feed the recognizer in fixed-size chunks.
    chunk_size = 8000  # bytes: 4000 16-bit samples, i.e. 0.25 s at 16 kHz
    text_output = ""
    for i in range(0, len(audio_data), chunk_size):
        # True means a complete utterance is ready to be collected.
        if rec.AcceptWaveform(audio_data[i:i+chunk_size]):
            text_output += json.loads(rec.Result())["text"] + " "
    text_output += json.loads(rec.FinalResult())["text"]

    return text_output.strip()

# Save the transcription to a Word file and offer it for download
def create_word_document(text, filename):
    """Write ``text`` as one paragraph into ``filename`` and start a browser download.

    Args:
        text: Text placed in a single paragraph.
        filename: Path the .docx document is saved to.
    """
    docx_file = Document()
    docx_file.add_paragraph(text)
    docx_file.save(filename)
    files.download(filename)

# Main flow: upload files and transcribe each one
def handle_upload():
    """Ask for file uploads and turn every uploaded file into a .docx transcript.

    Each file is passed to ``transcribe_audio``; a non-empty result is saved via
    ``create_word_document`` under the same name with its last extension
    replaced by ``.docx``. A failure on one file is printed and the loop moves
    on to the next file.
    """
    uploaded = files.upload()
    for filename in uploaded:
        print(f"Procesando {filename}...")
        try:
            text = transcribe_audio(filename)
            if not text:
                print("Error: No se pudo obtener la transcripción.")
                continue
            # Drop only the last extension ("a.b.mp4" -> "a.b"); names without a dot stay whole.
            stem, dot, _ = filename.rpartition('.')
            doc_filename = (stem if dot else filename) + ".docx"
            create_word_document(text, doc_filename)
            print(f"Transcripción guardada en {doc_filename}")
        except Exception as e:
            print(f"Error procesando {filename}: {e}")

# Entry point: show the prompt, then start the upload-and-transcribe flow
upload_prompt = "Sube un archivo .mp4 o .wav para transcribir:"
display(upload_prompt)
handle_upload()


'Sube un archivo .mp4 o .wav para transcribir:'

Saving Vídeo sin título ‐ Hecho con Clipchamp (3).mp4 to Vídeo sin título ‐ Hecho con Clipchamp (3).mp4
Procesando Vídeo sin título ‐ Hecho con Clipchamp (3).mp4...
Convirtiendo Vídeo sin título ‐ Hecho con Clipchamp (3).mp4 a formato WAV...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Transcripción guardada en Vídeo sin título ‐ Hecho con Clipchamp (3).docx


In [8]:
import os
import vosk
import json
import ffmpeg
import numpy as np
import soundfile as sf
from docx import Document
from langdetect import detect
from google.colab import files

# Global cache of loaded Vosk models, keyed by language code
loaded_models = {}

# Return the Vosk model for a language without downloading or loading it twice
def get_vosk_model(language):
    """Return the Vosk model for ``language``, downloading it on first use.

    Args:
        language: Language code such as "es"; codes missing from the table
            below use the English model.

    Returns:
        The ``vosk.Model`` cached in ``loaded_models`` under ``language``.

    Raises:
        RuntimeError: If the download or extraction fails, so the caller sees
            the real cause instead of a model-loading error on a missing folder.
    """
    if language in loaded_models:
        return loaded_models[language]

    models = {
        "es": "vosk-model-es-0.42",
        "fr": "vosk-model-fr-0.22",
        "de": "vosk-model-de-0.22",
        "pt": "vosk-model-pt-fb-v0.1.1-20200508",
        "it": "vosk-model-it-0.22",
        "ru": "vosk-model-ru-0.42",
        "zh": "vosk-model-cn-0.22",
        "default": "vosk-model-en-us-0.22"
    }

    model_name = models.get(language, models["default"])
    model_path = f"/content/{model_name}"

    if not os.path.exists(model_path):
        print(f"Descargando modelo {model_name}...")
        archive = f"{model_name}.zip"
        # -O pins the output name so a leftover partial archive is replaced
        # instead of wget saving the new download under a ".1" suffix.
        status = os.system(
            f"wget -q -O {archive} https://alphacephei.com/vosk/models/{archive} "
            f"&& unzip -q {archive} -d /content"
        )
        # The archive is not needed once extracted (or once the attempt failed).
        if os.path.exists(archive):
            os.remove(archive)
        if status != 0 or not os.path.exists(model_path):
            raise RuntimeError(
                f"No se pudo descargar o descomprimir el modelo {model_name} (estado {status})."
            )

    loaded_models[language] = vosk.Model(model_path)
    return loaded_models[language]

# Convert the input to WAV when it is not a WAV file already
def convert_to_wav(audio_file):
    """Return the path of a WAV version of ``audio_file``.

    Files whose name ends in ``.wav`` (any letter case) are returned unchanged.
    Anything else is converted with ffmpeg to 16 kHz mono 16-bit PCM, written
    next to the input with the extension replaced by ``.wav``.

    Args:
        audio_file: Path of the uploaded audio or video file.

    Returns:
        Path of the WAV file to transcribe.
    """
    if audio_file.lower().endswith(".wav"):
        return audio_file

    # splitext only looks at the final path component, so a dot in a directory
    # name ("takes.v2/clip") cannot truncate the path.
    wav_file = os.path.splitext(audio_file)[0] + ".wav"
    print(f"Convirtiendo {audio_file} a formato WAV...")
    ffmpeg.input(audio_file).output(wav_file, format='wav', acodec='pcm_s16le', ac=1, ar='16000').run(overwrite_output=True)
    return wav_file

# Transcribe an audio/video file with Vosk
def transcribe_audio(audio_file, language="es"):
    """Transcribe ``audio_file`` and return the recognized text.

    The input is first passed through ``convert_to_wav``. The recognizer is
    created with the sample rate stored in the WAV file, and multi-channel
    audio is averaged down to one channel, so WAV uploads that are not already
    16 kHz mono are handled as well.

    Args:
        audio_file: Path of the uploaded audio or video file.
        language: Language code used to pick the Vosk model (default "es").

    Returns:
        The recognized text with surrounding whitespace stripped.
    """
    model = get_vosk_model(language)
    audio_file = convert_to_wav(audio_file)

    with sf.SoundFile(audio_file) as sf_file:
        sample_rate = sf_file.samplerate
        # always_2d gives shape (frames, channels) for mono and stereo alike.
        frames = sf_file.read(dtype='int16', always_2d=True)

    frames = np.asarray(frames, dtype=np.int16)
    if frames.shape[1] > 1:
        # Average in a wider type so the sum of the channels cannot overflow int16.
        frames = frames.astype(np.int32).mean(axis=1).astype(np.int16)
    audio_data = frames.reshape(-1).tobytes()

    rec = vosk.KaldiRecognizer(model, sample_rate)

    # Feed the recognizer in fixed-size chunks.
    chunk_size = 8000  # bytes: 4000 16-bit samples, i.e. 0.25 s at 16 kHz
    text_output = ""
    for i in range(0, len(audio_data), chunk_size):
        # True means a complete utterance is ready to be collected.
        if rec.AcceptWaveform(audio_data[i:i+chunk_size]):
            text_output += json.loads(rec.Result())["text"] + " "
    text_output += json.loads(rec.FinalResult())["text"]

    return text_output.strip()

# Save the transcription to a Word file and offer it for download
def create_word_document(text, filename):
    """Write ``text`` as one paragraph into ``filename`` and start a browser download.

    Args:
        text: Text placed in a single paragraph.
        filename: Path the .docx document is saved to.
    """
    docx_file = Document()
    docx_file.add_paragraph(text)
    docx_file.save(filename)
    files.download(filename)

# Main flow: upload files and transcribe each one
def handle_upload():
    """Ask for file uploads and turn every uploaded file into a .docx transcript.

    Each file is passed to ``transcribe_audio``; a non-empty result is saved via
    ``create_word_document`` under the same name with its last extension
    replaced by ``.docx``. A failure on one file is printed and the loop moves
    on to the next file.
    """
    uploaded = files.upload()
    for filename in uploaded:
        print(f"Procesando {filename}...")
        try:
            text = transcribe_audio(filename)
            if not text:
                print("Error: No se pudo obtener la transcripción.")
                continue
            # Drop only the last extension ("a.b.mp4" -> "a.b"); names without a dot stay whole.
            stem, dot, _ = filename.rpartition('.')
            doc_filename = (stem if dot else filename) + ".docx"
            create_word_document(text, doc_filename)
            print(f"Transcripción guardada en {doc_filename}")
        except Exception as e:
            print(f"Error procesando {filename}: {e}")

# Entry point: show the prompt, then start the upload-and-transcribe flow
upload_prompt = "Sube un archivo .mp4 o .wav para transcribir:"
display(upload_prompt)
handle_upload()


'Sube un archivo .mp4 o .wav para transcribir:'

Saving Vídeo sin título ‐ Hecho con Clipchamp (3).mp4 to Vídeo sin título ‐ Hecho con Clipchamp (3) (1).mp4
Procesando Vídeo sin título ‐ Hecho con Clipchamp (3) (1).mp4...
Convirtiendo Vídeo sin título ‐ Hecho con Clipchamp (3) (1).mp4 a formato WAV...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Transcripción guardada en Vídeo sin título ‐ Hecho con Clipchamp (3) (1).docx
