In [3]:
from faster_whisper import WhisperModel
import wave
import pyaudio

  from .autonotebook import tqdm as notebook_tqdm


In [36]:
##try - except - finally
def record_chunk(p, stream, file_path, chunk_length=15):
    frames = []
    print('Generating audio file')
    for _ in range(0, int(16000 / 1024 * chunk_length)):
        data = stream.read(1024)
        frames.append(data)
    wf = wave.open(file_path, 'wb')
    wf.setnchannels(1)
    wf.setsampwidth(2)#p.get_sample_size(pyaudio.paInt16))
    wf.setframerate(16000)
    wf.writeframes(b''.join(frames))
    wf.close()

In [56]:
def main():
    model_size = "medium"
    model = WhisperModel(model_size, device="cpu",compute_type="int8")

    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=44100, input=True, frames_per_buffer=1024)

    accumulated_transcription= ""
    print('Intializing')
    i = 0
    try:
        while True:
            i +=1
            chunk_file=f"temp/temp_chunk_{i}.wav"
            record_chunk(p, stream, chunk_file)
            results, info = model.transcribe(chunk_file)
            for transcription in results:
                accumulated_transcription += transcription.text + " "
                print(f"Transcription: {transcription.text}")

    except KeyboardInterrupt:
        print("Stopping...")
        with open("log.txt", "w") as log_file:
            log_file.write(f"{accumulated_transcription}")
    finally:
        print("Record stopped")
        stream.stop_stream()
        stream.close()
        p.terminate()


In [38]:
main()

Intializing
Generating audio file
Transcription:  Thank you.
Generating audio file
Stopping...
Record stopped


In [124]:
import sounddevice as sd
import numpy as np
from pynput import keyboard
from scipy.io.wavfile import write
import tempfile
import os
from faster_whisper import WhisperModel
import os
os.environ["HF_HOME"] = "./huggingface_cache"

In [125]:
class WhisperTranscriber:
    def __init__(self, model_size="large-v2", sample_rate=44100):
        self.model_size= model_size
        self.sample_rate = sample_rate
        self.model = WhisperModel(model_size, device="cpu", compute_type="int8")
        self.is_recording = False
    
    def on_press(self, key):
        if key == keyboard.Key.space:
            if not self.is_recording:
                self.is_recording = True
                print("Recording started")
    
    def on_release(self, key):
        if key == keyboard.Key.space:
            if self.is_recording:
                self.is_recording = False
                print("Recording stopped")
                return False
    def record_audio(self):
        recording = np.array([], dtype='float64').reshape(0, 2)
        frames_per_buffer =int(self.sample_rate * 0.5)

        with keyboard.Listener(on_press = self.on_press, on_release = self.on_release) as listener:
            while True:
                if self.is_recording:
                    chunk = sd.rec(frames_per_buffer, samplerate=self.sample_rate, channels=2, dtype='float32')
                    sd.wait()
                    recording = np.vstack([recording, chunk])
                if not self.is_recording and len(recording) > 0:
                    break
            listener.join()
        return recording
    
    def save_temp_audio(self, recording, temp_path = 'temp/'):
        os.makedirs(temp_path, exist_ok=True) 
        print("Estoy guardando el audio")
        temp_file = tempfile.NamedTemporaryFile(dir=temp_path,delete=False, suffix='.wav')
        write( temp_file.name, self.sample_rate, recording)
        int16_recording = np.int16(recording * 32767)
        temp_file_int = tempfile.NamedTemporaryFile(dir=temp_path,delete=False, suffix='.wav')
        write( temp_file_int.name, self.sample_rate, int16_recording)
        return temp_file.name
    
    def transcribe_audio(self, file_path):
        print(file_path)
        segments, info = self.model.transcribe(file_path, beam_size=5)
        print(f"Detected language '{info.language}' with probability {info.language_probability}")
        full_transcription = ""
        for segment in segments:
            full_transcription += segment.text + " "
        # os.remove(file_path)
        return full_transcription
        
    def run(self):
        print("Hold the spacebar to start recording...")
        while True:
            recording = self.record_audio()
            file_path = self.save_temp_audio(recording)
            self.transcribe_audio(file_path)
            print("Press the spacebar to start recording again, or press Ctl+C to exit.")


In [126]:
transcriber = WhisperTranscriber()

In [127]:
transcriber.run()

Hold the spacebar to start recording...
Recording started
Recording stopped
Estoy guardando el audio
c:\Users\Enrique\OneDrive\Documentos\ai_voice_recognition\temp\tmp3llvej6_.wav
Detected language 'en' with probability 0.41731569170951843
Press the spacebar to start recording again, or press Ctl+C to exit.


KeyboardInterrupt: 

In [5]:
from faster_whisper import WhisperModel

# Ruta al archivo generado
audio_path = "temp/output_normalized.wav"

# Carga el modelo
model = WhisperModel("large-v2", device="cpu", compute_type="int8")

# Transcribe el audio
segments, info = model.transcribe(audio_path, language="es", beam_size=5, no_speech_threshold=0.9)

# Muestra los resultados
print(f"Detected language: {info.language} with probability {info.language_probability}")
for segment in segments:
    print(f"[{segment.start:.2f}s - {segment.end:.2f}s]: {segment.text}")
    print(f"Segment start: {segment.start}, end: {segment.end}")
    print(f"Text: {segment.text}")
    print(f"Tokens: {segment.tokens}")
    print(f"Avg logprob: {segment.avg_logprob}")
    print(f"Compression ratio: {segment.compression_ratio}")
    print(f"No speech prob: {segment.no_speech_prob}")

Detected language: es with probability 1
[0.00s - 3.00s]:  Subtítulos realizados por la comunidad de Amara.org
Segment start: 0.0, end: 3.0
Text:  Subtítulos realizados por la comunidad de Amara.org
Tokens: [50364, 8511, 83, 6712, 28348, 22828, 4181, 1515, 635, 35695, 368, 2012, 2419, 13, 4646, 50514]
Avg logprob: -0.3144052168902229
Compression ratio: 0.8813559322033898
No speech prob: 0.8903958201408386


In [17]:
import speech_recognition as sr

recognizer = sr.Recognizer()

# Usar Sphinx para reconocer desde un archivo de audio
with sr.AudioFile("temp/output.wav") as source:
    audio = recognizer.record(source)



# Usar Google Speech Recognition directamente
try:
    text = recognizer.recognize_google(audio, language="es-ES")
    print(f"Texto transcrito: {text}")
except sr.UnknownValueError:
    print("No se pudo entender el audio")
except sr.RequestError as e:
    print(f"No se pudo conectar al servicio de Google Speech Recognition; {e}")

No se pudo entender el audio


In [5]:
import whisper

# Cargar el modelo Whisper
model = whisper.load_model("base")

# Ruta del archivo de audio
audio_path = "temp/output_normalized.wav"

# Cargar el audio y obtener el espectrograma de Mel
audio = whisper.load_audio(audio_path)
mel = whisper.log_mel_spectrogram(audio).to(model.device)

# Detectar el lenguaje
# Whisper tiene un método de detección de lenguaje interno
lang_probs = model.detect_language(mel)
language = max(lang_probs, key=lang_probs.get)  # Detecta el idioma con la mayor probabilidad
print(f"Detected language: {language} with probability: {lang_probs[language]:.2f}")

# Crear las opciones para la decodificación (ya que el idioma está detectado, lo pasamos aquí)
options = whisper.DecodingOptions(fp16=False, language=language, beam_size=5)

# Decodificar el audio
result = whisper.decode(model, mel, options)

# Mostrar el resultado de la transcripción
print(f"Transcription: {result.text}")


AssertionError: incorrect audio shape

In [12]:
import whisper

model = whisper.load_model("large-v2")

# load the entire audio file
audio = whisper.load_audio(audio_path)

options = {
    "language": "es", # input language, if omitted is auto detected
    "task": "translate" # or "transcribe" if you just want transcription
}
result = whisper.transcribe(model, audio, **options)
print(result["text"])

100%|█████████████████████████████████████| 2.87G/2.87G [14:08<00:00, 3.64MiB/s]
  checkpoint = torch.load(fp, map_location=device)





In [17]:
from pydub import AudioSegment
import numpy as np
import torch

model="medium"
audio_model = whisper.load_model(model, device="cpu")
audio_segment = AudioSegment.from_wav(audio_path)
#audio_segment = audio_segment.low_pass_filter(1000)
# get sample rate
if audio_segment.frame_rate != 16000: # 16 kHz
    print('Frecuencia')
    audio_segment = audio_segment.set_frame_rate(16000)
if audio_segment.sample_width != 2:   # int16
    print('canales')
    audio_segment = audio_segment.set_sample_width(2)
if audio_segment.channels != 1:       # mono
    print('mono')
    audio_segment = audio_segment.set_channels(1)        
arr = np.array(audio_segment.get_array_of_samples())
arr = arr.astype(np.float32)/32768.0
# to device
print(f"Transcribing...")
result = audio_model.transcribe(arr, language="es", fp16=torch.cuda.is_available())
text = result['text'].strip()
print(text)

Frecuencia
mono
Transcribing...



In [None]:
import sounddevice as sd
import numpy as np
import whisper
import tempfile
import os
from pynput import keyboard


class WhisperTranscriber:
    def __init__(self, model_size="medium", sample_rate=16000):
        self.sample_rate = sample_rate
        self.model_size = model_size
        self.model = whisper.load_model(self.model_size)
        self.is_recording = False
        self.recording = []  # Para almacenar los datos grabados

    def on_press(self, key):
        if key == keyboard.Key.space and not self.is_recording:
            self.is_recording = True
            self.recording = []  # Resetea la grabación
            print("Recording started...")

    def on_release(self, key):
        if key == keyboard.Key.space and self.is_recording:
            self.is_recording = False
            print("Recording stopped.")
            return False  # Salir del listener

    def record_audio(self, duration=15):
        print("Hold the spacebar to start recording...")
        with keyboard.Listener(on_press=self.on_press, on_release=self.on_release) as listener:
            while True:
                if self.is_recording:
                    # Grabar en chunks pequeños (streaming)
                    chunk = sd.rec(int(self.sample_rate * duration), samplerate=self.sample_rate, channels=1, dtype="int16")
                    sd.wait()  # Esperar a que el chunk termine
                    self.recording.append(chunk)
                else:
                    if len(self.recording) > 0:
                        break
            listener.join()

        # Combinar todos los chunks grabados
        full_recording = np.concatenate(self.recording, axis=0)
        return full_recording

    def save_temp_audio(self, recording, temp_path="temp/"):
        os.makedirs(temp_path, exist_ok=True)
        with tempfile.NamedTemporaryFile(dir=temp_path, delete=False, suffix=".wav") as temp_file:
            # Guardar la grabación directamente como WAV
            from scipy.io.wavfile import write
            write(temp_file.name, self.sample_rate, recording)
            return temp_file.name

    def transcribe_audio(self, file_path):
        print(f"Transcribing file: {file_path}")
        result = self.model.transcribe(file_path, language="es")
        print(f"Transcription: {result['text']}")
        os.remove(file_path)  # Limpiar archivo temporal
        return result['text']

    def run(self):
        try:
            while True:
                audio_data = self.record_audio()
                temp_audio_path = self.save_temp_audio(audio_data)
                transcription = self.transcribe_audio(temp_audio_path)
                print("\nPress the spacebar to record again, or Ctrl+C to exit.")
        except KeyboardInterrupt:
            print("Exiting...")
            return


if __name__ == "__main__":
    transcriber = WhisperTranscriber()
    transcriber.run()


  checkpoint = torch.load(fp, map_location=device)


Hold the spacebar to start recording...
Recording started...
Recording stopped.
Transcribing file: c:\Users\Enrique\OneDrive\Documentos\ai_voice_recognition\temp\tmptn7pv15i.wav




Transcription:  Hola, mi nombre es Enrique Delgado Aznar y mi DNI es 778-416-99Q.

Press the spacebar to record again, or Ctrl+C to exit.
Hold the spacebar to start recording...


In [3]:
import sounddevice as sd
from scipy.io.wavfile import write
import numpy as np
import wave

def record_and_playback(sample_rate=44100, duration=5, output_file="test_recording.wav"):
    print("Grabando...")
    recording = sd.rec(int(sample_rate * duration), samplerate=sample_rate, channels=1, dtype='int16')
    sd.wait()  # Esperar a que termine la grabación
    print("Grabación completada. Guardando archivo...")

    # Guardar archivo WAV
    write(output_file, sample_rate, recording)
    print(f"Grabación guardada como {output_file}")
    
    # Reproducir el archivo grabado para verificar la calidad
    print("Reproduciendo la grabación...")
    with wave.open(output_file, 'rb') as wf:
        sample_rate = wf.getframerate()
        data = wf.readframes(wf.getnframes())
        audio = np.frombuffer(data, dtype='int16')
        sd.play(audio, samplerate=sample_rate)
        sd.wait()  # Esperar a que termine la reproducción

record_and_playback()

Grabando...
Grabación completada. Guardando archivo...
Grabación guardada como test_recording.wav
Reproduciendo la grabación...
