Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion main_llama.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
response = john.generate_response(user_input)
print(response)

mouth.say_multiple(response.replace('[USER]', '').replace('[END]', '').replace('[START]', ''))
# mouth.say_multiple(response.replace('[USER]', '').replace('[END]', '').replace('[START]', ''))
mouth.say_interruption(response.replace('[USER]', '').replace('[END]', '').replace('[START]', ''), ear.interrupt_listen)
if response.find('[END]') != -1:
break
28 changes: 25 additions & 3 deletions stt.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@
import torchaudio
import torchaudio.functional as F
import torch
from utils import record
from utils import record_user, record_interruption
from vad import VoiceActivityDetection
import re
print(); print()


Expand All @@ -16,6 +17,7 @@ def __init__(self, model_id='openai/whisper-base.en', device='cpu', silence_seco
self.model.to(device)
self.vad = VoiceActivityDetection()
self.silence_seconds = silence_seconds
self.not_interrupt_words = ['you', 'yes', 'yeah', 'hmm']

@torch.no_grad()
def transcribe(self, audio):
Expand All @@ -26,10 +28,30 @@ def transcribe(self, audio):


def listen(self):
audio = record(self.silence_seconds, self.vad)
audio = record_user(self.silence_seconds, self.vad)
text = self.transcribe(audio)
return text



def interrupt_listen(self, record_seconds=100):
    """Listen for a genuine user interruption for up to ``record_seconds`` seconds.

    Records audio until the VAD detects speech, transcribes it, and decides
    whether the utterance is a real interruption or merely a filler word
    (e.g. "hmm", "yeah"). Filler utterances consume their duration from the
    time budget and listening continues.

    Args:
        record_seconds: maximum total listening time, in seconds.

    Returns:
        True if a real (non-filler) interruption was heard, False if no
        speech arrived or the time budget was exhausted by fillers.
    """
    while record_seconds > 0:
        interruption_audio = record_interruption(self.vad, record_seconds)
        if interruption_audio is None:
            # No speech detected within the remaining budget.
            return False
        # Audio is 16 kHz samples, so length / rate = seconds consumed.
        duration = len(interruption_audio) / 16_000
        text = self.transcribe(interruption_audio)
        # Normalize the transcript: strip punctuation, lowercase, trim.
        text = re.sub(r'[^\w\s]', '', text)
        text = text.lower()
        text = text.strip()
        print(text)
        if text in self.not_interrupt_words:
            # Filler word — not a real interruption; charge its duration
            # against the budget and keep listening.
            record_seconds -= duration
        else:
            return True
    # Bug fix: the loop previously fell through and returned None on
    # timeout. Callers treat the result as a boolean, so return an
    # explicit False (same truthiness, honest contract).
    return False


if __name__ == "__main__":
device = 'cuda' if torch.cuda.is_available() else 'cpu'
Expand Down
13 changes: 13 additions & 0 deletions tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,24 @@ def run_tts(self, text):

def say(self, text):
    """Synthesize ``text`` to speech and play it, blocking until playback finishes."""
    output = self.run_tts(text)
    # sd.play is asynchronous — playback starts and control returns immediately.
    sd.play(output, samplerate=self.model.config.sampling_rate)
    if self.visualize:
        # Drive the visualizer while the audio is playing.
        self.visualizer.visualize(output, text)
    # Block until the queued audio has finished playing.
    sd.wait()

def say_interruption(self, text, listen_interruption_func):
    """Speak ``text``, cutting playback short if the user interrupts.

    Args:
        text: the text to synthesize and play.
        listen_interruption_func: callable invoked with the clip duration
            (seconds); it should listen for that long and return truthy
            when a genuine interruption is heard.
    """
    output = self.run_tts(text)
    # Duration of the synthesized clip in seconds (samples / sample rate).
    duration = len(output) / self.model.config.sampling_rate
    # Playback is asynchronous; listen for an interruption while it plays.
    sd.play(output, samplerate=self.model.config.sampling_rate)
    interruption = listen_interruption_func(duration)
    if interruption:
        # User spoke over the TTS output — stop playback immediately.
        sd.stop()
    else:
        # No interruption: block until playback completes normally.
        sd.wait()


def say_multiple(self, text):
pattern = r'[.?!]'
sentences = re.split(pattern, text)
Expand Down
86 changes: 50 additions & 36 deletions utils.py
Original file line number Diff line number Diff line change
@@ -1,54 +1,68 @@
import numpy as np
import pyaudio
import audioop
def record(silence_seconds, vad=None):
seconds_silence = silence_seconds # changing this might make the convo more natural
CHUNK = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 1 # make sure this is 1
RATE = 16000
RECORD_SECONDS = 100
WAVE_OUTPUT_FILENAME = "user.wav"

CHUNK = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000


def make_stream():
    """Open and return a new pyaudio input stream (16 kHz, mono, int16).

    NOTE(review): the PyAudio instance created here is never terminated
    (callers only close the stream), so the handle leaks — confirm whether
    this matters for long-running sessions.
    """
    p = pyaudio.PyAudio()
    return p.open(format=FORMAT,
                  channels=CHANNELS,
                  rate=RATE,
                  input=True,
                  frames_per_buffer=CHUNK)

stream = p.open(format=FORMAT,
channels=CHANNELS,
rate=RATE,
input=True,
frames_per_buffer=CHUNK)

print("* recording")
# def record_audio(record_seconds=100):
# # yield audio frames
#
# RECORD_SECONDS = record_seconds
#
# stream = make_stream()
#
# for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
# data = stream.read(CHUNK)
# yield data
#

def record_interruption(vad, recond_seconds=100):
    """Record from the microphone until speech is detected or time runs out.

    Args:
        vad: voice-activity detector exposing ``contains_speech(frames)``.
        recond_seconds: maximum listening time in seconds. (Name is likely
            a typo for "record_seconds" but is kept for backward
            compatibility with keyword callers.)

    Returns:
        The captured audio as float32 samples scaled to [-1, 1) as soon as
        speech is detected, or None if the time budget elapses in silence.
    """
    print("* recording for interruption")
    frames = []
    stream = make_stream()
    try:
        for _ in range(0, int(RATE / CHUNK * recond_seconds)):
            data = stream.read(CHUNK)
            frames.append(data)
            # Only inspect roughly the last two seconds of audio for speech.
            contains_speech = vad.contains_speech(frames[int(RATE / CHUNK) * -2:])
            if contains_speech:
                samples = np.frombuffer(b''.join(frames), dtype=np.int16)
                # Scale int16 PCM into float range [-1, 1).
                samples = samples / (1 << 15)
                return samples.astype(np.float32)
        return None
    finally:
        # Robustness fix: the stream was previously left open if
        # stream.read or the VAD raised; always close it.
        stream.close()


def record_user(silence_seconds, vad):
frames = []

started = False
one_second_iters = int(RATE / CHUNK)
silent_iters = 0
stream = make_stream()
print("* recording")

for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
while True:
data = stream.read(CHUNK)
frames.append(data)
if vad is None:
rms = audioop.rms(data, p.get_sample_size(FORMAT))
decibel = 20 * np.log10(rms)
if not started and decibel > 50:
started = True

if started and decibel < 50:
silent_iters += 1

if started and decibel > 50:
silent_iters = 0

if silent_iters >= one_second_iters * seconds_silence:
break
else:
contains_speech = vad.contains_speech(frames[-one_second_iters*silence_seconds:])
if not started and contains_speech:
started = True
if started and contains_speech is False:
break
contains_speech = vad.contains_speech(frames[-one_second_iters * silence_seconds:])
if not started and contains_speech:
started = True
if started and contains_speech is False:
break
stream.close()

print("* done recording")

Expand Down
4 changes: 2 additions & 2 deletions vad.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from pprint import pprint
import time
import numpy as np
from utils import record
from utils import record_user

class VoiceActivityDetection:
def __init__(self, sampling_rate=16000):
Expand Down Expand Up @@ -33,7 +33,7 @@ def contains_speech(self, audio):

if __name__ == "__main__":
vad = VoiceActivityDetection()
audio = record(3, vad)
audio = record_user(3, vad)