In [1]:
import time
import numpy as np
import torch
from datetime import datetime
from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
import scipy.io.wavfile as wavfile

  from .autonotebook import tqdm as notebook_tqdm


Imports for microphone input

In [5]:
import matplotlib.pyplot as plt
from scipy.signal import resample
import pyaudiowpatch as pyaudio # or pyaudio if no loopback needed

Set device

In [4]:
# Choose the compute device once: use the GPU when PyTorch reports CUDA
# support, otherwise fall back to the CPU. `device` is consumed by later cells.
if torch.cuda.is_available():
    device = 'cuda'
    print("CUDA is available!")
else:
    device = 'cpu'
    print("CUDA is not available. Using CPU.")

CUDA is available!


Model Setup

In [None]:
# Build the speech-recognition pipeline used by the live transcription loop.
# The checkpoint is fetched from the Hugging Face Hub on first use.
model_id = "openai/whisper-small"
asr_pipe = pipeline(
    task="automatic-speech-recognition",
    model=model_id,
    device=device,  # 'cuda' or 'cpu', chosen in the device cell
    # Chunking settings for long inputs (values are in seconds).
    chunk_length_s=30,
    stride_length_s=5,
    return_timestamps=True,
    # Forwarded to generation: transcribe (not translate) English speech.
    generate_kwargs={"language": "English", "task": "transcribe"},
)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Device set to use cuda


Microphone input: list the available devices, then record and transcribe

In [None]:
# Print every audio device that exposes at least one input channel so that an
# index can be picked for `input_device_index` in the recording cell.
p = pyaudio.PyAudio()
try:
    print("Available audio input devices:\n")

    for i in range(p.get_device_count()):
        info = p.get_device_info_by_index(i)
        # Output-only devices report zero input channels and are skipped.
        if info["maxInputChannels"] > 0:
            # 'isLoopbackDevice' is read with a False default, so a device
            # info dict without that key is reported as a non-loopback device.
            print(f"Index {i}: {info['name']} (Loopback: {info.get('isLoopbackDevice', False)})")
finally:
    # Release this PortAudio handle; the recording cell creates its own
    # PyAudio instance, so leaving this one open would leak it.
    p.terminate()


Available audio input devices:

Index 0: Microsoft Sound Mapper - Input (Loopback: False)
Index 1: Headset (ACCENTUM) (Loopback: False)
Index 2: Microphone Array (Realtek(R) Au (Loopback: False)
Index 7: Primary Sound Capture Driver (Loopback: False)
Index 8: Headset (ACCENTUM) (Loopback: False)
Index 9: Microphone Array (Realtek(R) Audio) (Loopback: False)
Index 17: Microphone Array (Realtek(R) Audio) (Loopback: False)
Index 18: Headset (ACCENTUM) (Loopback: False)
Index 19: Speakers (Realtek(R) Audio) [Loopback] (Loopback: True)
Index 20: C27R50x (NVIDIA High Definition Audio) [Loopback] (Loopback: True)
Index 21: Headphones (ACCENTUM) [Loopback] (Loopback: True)


In [18]:
# Live transcription: record fixed-length chunks from the selected input
# device, resample each chunk to Whisper's rate, transcribe it, and append the
# timestamped text to a log file until the kernel is interrupted.

# --- Configuration ---------------------------------------------------------
input_device_index = 1  # pick an index from the device listing cell
FORMAT = pyaudio.paInt16
CHANNELS = 1
TRANSCRIPTION_INTERVAL = 10  # in seconds
WHISPER_RATE = 16000  # Whisper expects 16kHz
file_path = "live_transcriptions.txt"

p = pyaudio.PyAudio()
stream = None
recorded_chunks = []  # resampled chunks; joined once at the end (avoids O(n^2) np.append)
audio_frames = np.array([], dtype=np.float32)

try:
    mic = p.get_device_info_by_index(input_device_index)
    RATE = int(mic["defaultSampleRate"])  # capture at the device's native rate
    frames_per_read = int(RATE * TRANSCRIPTION_INTERVAL)

    stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, input_device_index=input_device_index)
    print(f"Recording from: {mic['name']}")
    print("Listening... Press Ctrl+C to stop.")

    # Explicit UTF-8: the platform default (e.g. cp1252 on Windows) can raise
    # UnicodeEncodeError when the transcript contains non-ASCII characters.
    with open(file_path, "w", encoding="utf-8") as f:
        while True:
            # Blocks until a full interval of audio has been captured.
            data = stream.read(frames_per_read, exception_on_overflow=False)
            # int16 PCM -> float in [-1, 1)
            audio_chunk = np.frombuffer(data, np.int16).astype(np.float32) / 32768.0
            audio_chunk = resample(audio_chunk, int(len(audio_chunk) * WHISPER_RATE / RATE))
            recorded_chunks.append(audio_chunk)

            result = asr_pipe({"array": audio_chunk, "sampling_rate": WHISPER_RATE})

            timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            f.write(f"[{timestamp}] {result['text']}\n")
            f.flush()  # keep the file current in case the kernel is killed
            print(f"[{timestamp}] {result['text']}")
except KeyboardInterrupt:
    print("Recording stopped.")
finally:
    # Always release the audio device, not only on Ctrl+C; otherwise any other
    # error (bad device index, model failure, ...) leaves the stream open.
    try:
        if stream is not None:
            stream.stop_stream()
            stream.close()
    finally:
        p.terminate()

    # Expose the full recording to later cells, as before.
    if recorded_chunks:
        audio_frames = np.concatenate(recorded_chunks)
    # Resampling can overshoot slightly past [-1, 1]; clip so the int16 cast
    # cannot wrap around.
    audio_int16 = (np.clip(audio_frames, -1.0, 1.0) * 32767).astype(np.int16)

Recording from: Headset (ACCENTUM)
Listening... Press Ctrl+C to stop.
[2025-06-20 18:52:03]  Hi, my name is Rohit Karthik and I am a junior MSNG here working at New Boy Techs. So I have been given a task to transcribe this video.
[2025-06-20 18:52:16]  So, I think it's working pretty good and I'm doing a good job.
Recording stopped.
