In [1]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import sounddevice as sd
import numpy as np

# Load model and processor
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
forced_decoder_ids = processor.get_decoder_prompt_ids(language="italian", task="transcribe")

def record_audio(duration=5, samplerate=16000):
    """Record audio from microphone"""
    recording = sd.rec(int(duration * samplerate), samplerate=samplerate, channels=1)
    sd.wait()
    return recording.flatten()

def transcribe_audio(audio_chunk):
    """Transcribe a single audio chunk"""
    input_features = processor(
        audio_chunk, 
        sampling_rate=16000, 
        return_tensors="pt"
    ).input_features
    
    predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    return transcription[0]

def continuous_transcription(chunk_duration=5):
    """Continuously record and transcribe audio in chunks"""
    print("Recording... Press Ctrl+C to stop")
    try:
        while True:
            print("\nRecording new chunk...")
            audio_chunk = record_audio(duration=chunk_duration)
            text = transcribe_audio(audio_chunk)
            print(f"Transcription: {text}")
    except KeyboardInterrupt:
        print("\nStopping transcription")

# Start transcription
continuous_transcription()


  from .autonotebook import tqdm as notebook_tqdm


Recording... Press Ctrl+C to stop

Recording new chunk...


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Transcription:  ...

Recording new chunk...
Transcription:  Sì, non ho fatto.

Recording new chunk...
Transcription:  Sì, non ho fatto.

Recording new chunk...

Stopping transcription
