In [1]:
import sounddevice as sd
import numpy as np
import torch
from collections import deque
from speechbrain.inference.VAD import VAD
import time
import whisper
import tempfile
import soundfile as sf

# Load models
vad = VAD.from_hparams(source="speechbrain/vad-crdnn-libriparty", savedir="tmp_vad_model")
whisper_model = whisper.load_model("base")

# Parameters
SAMPLE_RATE = 16000
CHUNK_DURATION = 0.1
CHUNK_SIZE = int(SAMPLE_RATE * CHUNK_DURATION)
BUFFER_SIZE = 5 * CHUNK_SIZE
PRE_SPEECH_DURATION = 0.5
PRE_SPEECH_SIZE = int(SAMPLE_RATE * PRE_SPEECH_DURATION)
NUM_LABEL = 1

# Buffers & states
audio_buffer = deque(maxlen=BUFFER_SIZE)
pre_speech_buffer = deque(maxlen=PRE_SPEECH_SIZE)
recorded_audio_buffer = []

is_recording = False
quiet_count = 0
speech_count = 0
recording_triggered = False

def audio_callback(indata, frames, time_info, status):
    if status:
        print("⚠️", status)
    samples = indata[:, 0]
    audio_buffer.extend(samples)
    pre_speech_buffer.extend(samples)


  wrapped_fwd = torch.cuda.amp.custom_fwd(fwd, cast_inputs=cast_inputs)


In [None]:
import torch.nn as nn
from transformers.modeling_outputs import SequenceClassifierOutput

class DistilBERTClassifier(nn.Module):
    def __init__(self, base_model, num_labels):
        super().__init__()
        self.encoder = base_model
        self.classifier = nn.Sequential(
            nn.Linear(self.encoder.config.hidden_size, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, num_labels)
        )

    def forward(self, input_ids, attention_mask=None, labels=None):
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        pooled = outputs.last_hidden_state[:,0]
        logits = self.classifier(pooled)
        
        loss = None
        if labels is not None:
            loss = nn.CrossEntropyLoss()(logits, labels)
        
        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
        )


In [None]:
import os
import torch
import torch.nn as nn
import evaluate
import numpy as np
from datasets import load_dataset, Audio
from transformers import (
    AutoFeatureExtractor,
    AutoModel,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    set_seed
)
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers import AutoModel
MODEL_NAME = "distilbert-base-uncased"
encoder = AutoModel.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

test_model = "models/SemanticVAD_2.pt"
Semantic_VAD_model = DistilBERTClassifier(base_model=encoder, num_labels=2)
Semantic_VAD_model.load_state_dict(torch.load(test_model, map_location="mps"))
Semantic_VAD_model.eval()

DistilBERTClassifier(
  (encoder): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1)

In [None]:
# Start mic
stream = sd.InputStream(callback=audio_callback, channels=1, samplerate=SAMPLE_RATE)
stream.start()
print("🎙️ Listening... (press Ctrl+C to stop)")

try:
    while True:
        if len(audio_buffer) >= CHUNK_SIZE:
            chunk_np = np.array(list(audio_buffer)[-CHUNK_SIZE:])
            chunk_tensor = torch.from_numpy(chunk_np).unsqueeze(0)
            start = time.time()
            speech_prob = vad.get_speech_prob_chunk(chunk_tensor).max().item()
            end = time.time()

            if speech_prob > 0.5:
                speech_count += 1
                quiet_count = 0
                # print(f"Speaking chunk {speech_count}/2 (prob={speech_prob:.2f}) | detection time consumption {end - start:.3f}s")

                if not is_recording and speech_count >= 2:
                    print("⏺️ Start recording...")
                    is_recording = True
                    recorded_audio_buffer = list(pre_speech_buffer) + list(chunk_np)
                    recording_triggered = True
                elif is_recording:
                    recorded_audio_buffer.extend(chunk_np)

            else:
                if is_recording:
                    recorded_audio_buffer.extend(chunk_np)
                    quiet_count += 1
                    # print(f"Quiet chunk {quiet_count}/5")

                    if quiet_count >= 5:
                        print("_"*50)
                        print("🛑 Stop recording. Running Whisper...")
                        is_recording = False
                        speech_count = 0
                        quiet_count = 0
                        recording_triggered = False

                        
                        # Save audio and transcribe
                        a = time.time()
                        audio_np = np.array(recorded_audio_buffer)
                        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
                            sf.write(tmp_wav.name, audio_np, SAMPLE_RATE)
                            result = whisper_model.transcribe(tmp_wav.name)
                            print("📝 Transcription:", result["text"])
                        b = time.time()
                        print(f"⏱️ Whisper took {b - a:.3f} seconds")

                        a = time.time()
                        with torch.no_grad():
                            inputs = tokenizer(
                                result["text"],
                                truncation=True,
                                padding="max_length",
                                max_length=128,
                                return_tensors="pt"  # make sure it returns tensor
                            )
                            # feed input_ids and attention_mask separately
                            outputs = Semantic_VAD_model(
                                input_ids=inputs["input_ids"],
                                attention_mask=inputs["attention_mask"]
                            )
                            logits = outputs.logits
                            # Softmax to get probability distribution
                            probs = torch.softmax(logits, dim=-1)

                            # Get predicted class
                            pred = torch.argmax(probs, dim=-1).item()
                            print("_"*50)
                            print(f"Semantic VAD result: {pred}\n{logits=}")

                            if pred == 0:
                                logiif_interrupt = 0
                                print("Stop AI talking!!!!!!")
                            else:
                                logiif_interrupt = 1
                                print("Backchannel ignored!!!!!!")
                        b = time.time()
                        print(f"⏱️ SemanticVAD took {b - a:.3f} seconds")
                else:
                    speech_count = 0
                    quiet_count = 0

        sd.sleep(50)

except KeyboardInterrupt:
    print("🔚 Stop listening.")
    stream.stop()
    stream.close()

🎙️ Listening... (press Ctrl+C to stop)
⏺️ Start recording...
__________________________________________________
🛑 Stop recording. Running Whisper...




📝 Transcription:  하루가 두려 dari 리허는
⏱️ Whisper took 1.020 seconds
__________________________________________________
Semantic VAD result: 1
logits=tensor([[-1.9042,  1.8073]])
Backchannel ignored!!!!!!
⏱️ SemanticVAD took 0.030 seconds
⏺️ Start recording...
__________________________________________________
🛑 Stop recording. Running Whisper...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


📝 Transcription:  Hi
⏱️ Whisper took 0.284 seconds
__________________________________________________
Semantic VAD result: 1
logits=tensor([[-2.9100,  2.8706]])
Backchannel ignored!!!!!!
⏱️ SemanticVAD took 0.020 seconds
⏺️ Start recording...
__________________________________________________
🛑 Stop recording. Running Whisper...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


📝 Transcription:  Excuse me?
⏱️ Whisper took 0.284 seconds
__________________________________________________
Semantic VAD result: 0
logits=tensor([[ 5.3915, -5.1514]])
Stop AI talking!!!!!!
⏱️ SemanticVAD took 0.019 seconds
⏺️ Start recording...
__________________________________________________
🛑 Stop recording. Running Whisper...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


📝 Transcription:  I don't think so.
⏱️ Whisper took 0.293 seconds
__________________________________________________
Semantic VAD result: 1
logits=tensor([[-2.9348,  2.8994]])
Backchannel ignored!!!!!!
⏱️ SemanticVAD took 0.019 seconds
⏺️ Start recording...
__________________________________________________
🛑 Stop recording. Running Whisper...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


📝 Transcription:  Well, you're right.
⏱️ Whisper took 0.277 seconds
__________________________________________________
Semantic VAD result: 1
logits=tensor([[-2.9307,  2.8941]])
Backchannel ignored!!!!!!
⏱️ SemanticVAD took 0.018 seconds
⏺️ Start recording...
__________________________________________________
🛑 Stop recording. Running Whisper...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


📝 Transcription:  No.
⏱️ Whisper took 0.268 seconds
__________________________________________________
Semantic VAD result: 1
logits=tensor([[-2.9126,  2.8719]])
Backchannel ignored!!!!!!
⏱️ SemanticVAD took 0.017 seconds
⏺️ Start recording...
__________________________________________________
🛑 Stop recording. Running Whisper...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


📝 Transcription:  listen to
⏱️ Whisper took 0.928 seconds
__________________________________________________
Semantic VAD result: 0
logits=tensor([[ 5.2388, -5.0202]])
Stop AI talking!!!!!!
⏱️ SemanticVAD took 0.020 seconds
