Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion main_llama.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
response = john.generate_response(user_input)
print(response)

mouth.say_multiple(response.replace('[USER]', '').replace('[END]', '').replace('[START]', ''))
# mouth.say_multiple(response.replace('[USER]', '').replace('[END]', '').replace('[START]', ''))
mouth.say_interruption(response.replace('[USER]', '').replace('[END]', '').replace('[START]', ''), ear.interrupt_listen)
if response.find('[END]') != -1:
break
28 changes: 25 additions & 3 deletions stt.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@
import torchaudio
import torchaudio.functional as F
import torch
from utils import record
from utils import record_user, record_interruption
from vad import VoiceActivityDetection
import re
print(); print()


Expand All @@ -16,6 +17,7 @@ def __init__(self, model_id='openai/whisper-base.en', device='cpu', silence_seco
self.model.to(device)
self.vad = VoiceActivityDetection()
self.silence_seconds = silence_seconds
self.not_interrupt_words = ['you', 'yes', 'yeah', 'hmm']

@torch.no_grad()
def transcribe(self, audio):
Expand All @@ -26,10 +28,30 @@ def transcribe(self, audio):


def listen(self):
audio = record(self.silence_seconds, self.vad)
audio = record_user(self.silence_seconds, self.vad)
text = self.transcribe(audio)
return text



def interrupt_listen(self, record_seconds=100):
    """Listen for a genuine user interruption for up to ``record_seconds`` seconds.

    Records audio until the VAD detects speech, transcribes it, and decides
    whether the utterance is a real interruption or merely a filler word
    (e.g. "hmm", "yeah"). Filler utterances consume their duration from the
    time budget and listening continues.

    Args:
        record_seconds: maximum total listening time, in seconds.

    Returns:
        True if a real (non-filler) interruption was heard, False if no
        speech arrived or the time budget was exhausted by fillers.
    """
    while record_seconds > 0:
        interruption_audio = record_interruption(self.vad, record_seconds)
        if interruption_audio is None:
            # No speech detected within the remaining budget.
            return False
        # Audio is 16 kHz samples, so length / rate = seconds consumed.
        duration = len(interruption_audio) / 16_000
        text = self.transcribe(interruption_audio)
        # Normalize the transcript: strip punctuation, lowercase, trim.
        text = re.sub(r'[^\w\s]', '', text)
        text = text.lower()
        text = text.strip()
        print(text)
        if text in self.not_interrupt_words:
            # Filler word — not a real interruption; charge its duration
            # against the budget and keep listening.
            record_seconds -= duration
        else:
            return True
    # Bug fix: the loop previously fell through and returned None on
    # timeout. Callers treat the result as a boolean, so return an
    # explicit False (same truthiness, honest contract).
    return False


if __name__ == "__main__":
device = 'cuda' if torch.cuda.is_available() else 'cpu'
Expand Down
13 changes: 13 additions & 0 deletions tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,24 @@ def run_tts(self, text):

def say(self, text):
    """Synthesize ``text`` to speech and play it, blocking until playback finishes."""
    output = self.run_tts(text)
    # sd.play is asynchronous — playback starts and control returns immediately.
    sd.play(output, samplerate=self.model.config.sampling_rate)
    if self.visualize:
        # Drive the visualizer while the audio is playing.
        self.visualizer.visualize(output, text)
    # Block until the queued audio has finished playing.
    sd.wait()

def say_interruption(self, text, listen_interruption_func):
    """Speak ``text``, cutting playback short if the user interrupts.

    Args:
        text: the text to synthesize and play.
        listen_interruption_func: callable invoked with the clip duration
            (seconds); it should listen for that long and return truthy
            when a genuine interruption is heard.
    """
    output = self.run_tts(text)
    # Duration of the synthesized clip in seconds (samples / sample rate).
    duration = len(output) / self.model.config.sampling_rate
    # Playback is asynchronous; listen for an interruption while it plays.
    sd.play(output, samplerate=self.model.config.sampling_rate)
    interruption = listen_interruption_func(duration)
    if interruption:
        # User spoke over the TTS output — stop playback immediately.
        sd.stop()
    else:
        # No interruption: block until playback completes normally.
        sd.wait()


def say_multiple(self, text):
pattern = r'[.?!]'
sentences = re.split(pattern, text)
Expand Down
86 changes: 50 additions & 36 deletions utils.py
Original file line number Diff line number Diff line change
@@ -1,54 +1,68 @@
import numpy as np
import pyaudio
import audioop
def record(silence_seconds, vad=None):
seconds_silence = silence_seconds # changing this might make the convo more natural
CHUNK = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 1 # make sure this is 1
RATE = 16000
RECORD_SECONDS = 100
WAVE_OUTPUT_FILENAME = "user.wav"

CHUNK = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000


def make_stream():
    """Open and return a new pyaudio input stream (16 kHz, mono, int16).

    NOTE(review): the PyAudio instance created here is never terminated
    (callers only close the stream), so the handle leaks — confirm whether
    this matters for long-running sessions.
    """
    p = pyaudio.PyAudio()
    return p.open(format=FORMAT,
                  channels=CHANNELS,
                  rate=RATE,
                  input=True,
                  frames_per_buffer=CHUNK)

stream = p.open(format=FORMAT,
channels=CHANNELS,
rate=RATE,
input=True,
frames_per_buffer=CHUNK)

print("* recording")
# def record_audio(record_seconds=100):
# # yield audio frames
#
# RECORD_SECONDS = record_seconds
#
# stream = make_stream()
#
# for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
# data = stream.read(CHUNK)
# yield data
#

def record_interruption(vad, recond_seconds=100):
    """Record from the microphone until speech is detected or time runs out.

    Args:
        vad: voice-activity detector exposing ``contains_speech(frames)``.
        recond_seconds: maximum listening time in seconds. (Name is likely
            a typo for "record_seconds" but is kept for backward
            compatibility with keyword callers.)

    Returns:
        The captured audio as float32 samples scaled to [-1, 1) as soon as
        speech is detected, or None if the time budget elapses in silence.
    """
    print("* recording for interruption")
    frames = []
    stream = make_stream()
    try:
        for _ in range(0, int(RATE / CHUNK * recond_seconds)):
            data = stream.read(CHUNK)
            frames.append(data)
            # Only inspect roughly the last two seconds of audio for speech.
            contains_speech = vad.contains_speech(frames[int(RATE / CHUNK) * -2:])
            if contains_speech:
                samples = np.frombuffer(b''.join(frames), dtype=np.int16)
                # Scale int16 PCM into float range [-1, 1).
                samples = samples / (1 << 15)
                return samples.astype(np.float32)
        return None
    finally:
        # Robustness fix: the stream was previously left open if
        # stream.read or the VAD raised; always close it.
        stream.close()


def record_user(silence_seconds, vad):
frames = []

started = False
one_second_iters = int(RATE / CHUNK)
silent_iters = 0
stream = make_stream()
print("* recording")

for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
while True:
data = stream.read(CHUNK)
frames.append(data)
if vad is None:
rms = audioop.rms(data, p.get_sample_size(FORMAT))
decibel = 20 * np.log10(rms)
if not started and decibel > 50:
started = True

if started and decibel < 50:
silent_iters += 1

if started and decibel > 50:
silent_iters = 0

if silent_iters >= one_second_iters * seconds_silence:
break
else:
contains_speech = vad.contains_speech(frames[-one_second_iters*silence_seconds:])
if not started and contains_speech:
started = True
if started and contains_speech is False:
break
contains_speech = vad.contains_speech(frames[-one_second_iters * silence_seconds:])
if not started and contains_speech:
started = True
if started and contains_speech is False:
break
stream.close()

print("* done recording")

Expand Down
4 changes: 2 additions & 2 deletions vad.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from pprint import pprint
import time
import numpy as np
from utils import record
from utils import record_user

class VoiceActivityDetection:
def __init__(self, sampling_rate=16000):
Expand Down Expand Up @@ -33,7 +33,7 @@ def contains_speech(self, audio):

if __name__ == "__main__":
vad = VoiceActivityDetection()
audio = record(3, vad)
audio = record_user(3, vad)