voice.py
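
"""Voice-assistant pipeline.

Records microphone audio, transcribes it with WhisperX, sends the text to an
OpenAI-compatible LLM endpoint (Groq, llama3-8b-8192), synthesizes the reply
with a GPT-SoVITS-style TTS API, and optionally forwards device commands to
an IR-remote web API. Configuration is read from `.voice.env`.
"""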
import os
from openai import OpenAI
import pyaudio
import wave
import whisperx
import torch
import requests
import gc
import json
from dotenv import load_dotenv

def fun_play_wav(filename):
    """Play a WAV file from the assets directory."""
    print("Playing audio file")
    chunk = 1024
    audio_path = os.path.join(assets_directory, filename)
    wf = wave.open(audio_path, 'rb')
    p = pyaudio.PyAudio()
    stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
                    channels=wf.getnchannels(),
                    rate=wf.getframerate(),
                    output=True)
    data = wf.readframes(chunk)
    while data:
        stream.write(data)
        data = wf.readframes(chunk)
    stream.stop_stream()
    stream.close()
    p.terminate()
    wf.close()

def fun_record(sec: int):
    """Record `sec` seconds of microphone audio to assets/whisperX.wav."""
    chunk = 1024                      # frames per buffer
    sample_format = pyaudio.paInt16   # sample format; alternatives: paFloat32, paInt32, paInt24, paInt8, paUInt8, paCustomFormat
    channels = 1                      # number of channels
    fs = 44100                        # sample rate; common values: 44100 (CD), 48000 (DVD), 22050, 24000, 12000, 11025
    seconds = sec                     # recording duration in seconds
    filename = "whisperX.wav"         # output file name
    audio_path = os.path.join(assets_directory, filename)
    p = pyaudio.PyAudio()             # create the PyAudio object
    print("Recording started...")
    stream = p.open(format=sample_format, channels=channels, rate=fs,
                    frames_per_buffer=chunk, input=True)
    frames = []                       # buffer for the recorded chunks
    for _ in range(0, int(fs / chunk * seconds)):
        data = stream.read(chunk)
        frames.append(data)           # append each chunk to the buffer
    stream.stop_stream()              # stop recording
    stream.close()                    # close the stream
    p.terminate()                     # release PortAudio resources
    print("Recording finished...")
    wf = wave.open(audio_path, 'wb')  # open the output WAV file
    wf.setnchannels(channels)         # set channel count
    wf.setsampwidth(p.get_sample_size(sample_format))  # set sample width
    wf.setframerate(fs)               # set sample rate
    wf.writeframes(b''.join(frames))  # write the recorded frames
    wf.close()

def fun_whisperX():
    """Transcribe assets/whisperX.wav and return the first segment, lowercased."""
    print("Running WhisperX")
    print("Transcribing...")
    audio_path = os.path.join(assets_directory, "whisperX.wav")
    result = modelx.transcribe(audio_path)
    print(f"Transcription: \n {result['segments'][0]['text']}")
    # Lowercase the transcription so keyword matching is case-insensitive
    return result['segments'][0]['text'].lower()

def fun_llm(messages):
    """Send the transcribed text to the LLM and return its reply."""
    print("Running LLM")
    # Read the system prompt from prompt.txt
    with open(os.path.join(assets_directory, "prompt.txt"), "r", encoding="utf-8") as f:
        prompt = f.read()
    history = [
        {
            "role": "system",
            "content": prompt
        }, {
            "role": "user",
            "content": messages
        }
    ]
    completion = client.chat.completions.create(
        model="llama3-8b-8192", messages=history, temperature=0.7)
    print(completion.choices[0].message.content)
    return completion.choices[0].message.content

def fun_tts(text):
    """Synthesize `text` to assets/SoVITS_LLM.wav via the TTS API."""
    print("Running TTS")
    # Passing query parameters via `params` is equivalent to the raw query
    # string but lets requests URL-encode the Chinese text safely
    params = {
        "refer_wav_path": tts_path,
        "prompt_text": tts_text,
        "prompt_language": "中文",
        "text": text,
        "text_language": "中文",
    }
    response = requests.get(f"{tts_api}/", params=params)
    # Write the response body (a WAV file) to disk
    with open(os.path.join(assets_directory, "SoVITS_LLM.wav"), "wb") as f:
        f.write(response.content)
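
# The GET interface above follows a GPT-SoVITS-style inference server: the
# reference wav (TTS_PATH) and its transcript (TTS_TEXT) come from .voice.env,
# and the server is assumed to return raw WAV bytes in the response body.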

def fun_irremote(value):
    """POST the control payload to the IR remote and log endpoints."""
    print("Running IR Remote")
    irremote_url = f"{web_api}/irremote"
    log_url = f"{web_api}/log"
    body = json.dumps(value)
    headers = {"Content-Type": "application/json"}
    irremote_res = requests.post(irremote_url, headers=headers, data=body)
    log_res = requests.post(log_url, headers=headers, data=body)
    print(irremote_res.text)
    print(log_res.text)
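
# Example payload shape, as built in the main loop below (values illustrative):
#   {"devices": "aircon", "name": "大金冷氣", "signal": "on"}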

if __name__ == '__main__':
    try:
        # --- Initialization ---
        # Load environment variables from .voice.env
        load_dotenv(".voice.env")
        groq_api_url = os.getenv("GROQ_API_URL")
        groq_api_key = os.getenv("GROQ_API_KEY")
        whisperx_model = os.getenv("WHISPERX_MODEL")
        web_api = os.getenv("WEB_API")
        tts_api = os.getenv("TTS_API")
        tts_path = os.getenv("TTS_PATH")
        tts_text = os.getenv("TTS_TEXT")
        if not all([groq_api_url, groq_api_key, whisperx_model, web_api, tts_api, tts_path, tts_text]):
            raise Exception("Environment variables are not fully configured")
        else:
            print("Environment variables loaded")
        # Resolve the assets directory next to this script
        assets_directory = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets")
        print("Assets directory: ", assets_directory)
        # Initialize the LLM client (OpenAI-compatible endpoint)
        client = OpenAI(base_url=groq_api_url, api_key=groq_api_key)
        # Load the WhisperX model
        device = "cuda" if torch.cuda.is_available() else "cpu"
        modelx = whisperx.load_model(whisperx_model, device,
                                     compute_type="int8" if device == "cpu" else "float32")
        while True:
            try:
                # Record 3 seconds and transcribe, listening for the wake word
                fun_record(3)
                detect = fun_whisperX()
                # Wake on "hey" or "ok" together with "whisper"
                if ('hey' in detect or 'ok' in detect) and ('whisper' in detect):
                    fun_play_wav("hello.wav")
                    print('Hello, how can I help you?')
                    # Play the prompt tone
                    fun_play_wav("提示音.wav")
                    # Record 5 seconds for the actual command and transcribe it
                    fun_record(5)
                    text = fun_whisperX()
                    # Match "冷氣" (AC), "電風扇" (fan), "電視" (TV) or "查詢" (query)
                    if '冷氣' in text or '電風扇' in text or '電視' in text or '查詢' in text:
                        # Generate a reply and synthesize it to a wav file
                        llm_res = fun_llm(text)
                        fun_tts(llm_res)
                        value = {}
                        if "查詢" in text:  # query
                            # NOTE: `ans` is prepared here but not used later in this version
                            if "控制" in text:  # control history
                                ans = "好的,最近一次的控制紀錄是2023/09/25"
                            elif "心率" in text or "心律" in text:  # heart rate
                                ans = "好的,最近一次的心率紀錄為70"
                            elif "睡眠" in text:  # sleep
                                ans = "好的,最近一次的睡眠時間為6小時30分鐘"
                            elif "步數" in text:  # step count
                                ans = "好的,最近一天的步數為7000步"
                        if '冷氣' in text:  # air conditioner
                            value['devices'] = 'aircon'
                            value['name'] = '大金冷氣'  # Daikin AC
                            if '上下' in text and ('擺動' in text or '搖擺' in text):  # up-down swing
                                value['signal'] = 'H-swing'
                            elif '左右' in text and ('擺動' in text or '搖擺' in text):  # left-right swing
                                value['signal'] = 'V-swing'
                            elif '開' in text and '冷氣' in text:  # turn on
                                value['signal'] = 'on'
                            elif '關' in text and '冷氣' in text:  # turn off
                                value['signal'] = 'off'
                        if '電風扇' in text:  # fan
                            value['devices'] = 'fan'
                            value['name'] = '電風扇'
                            if '弱風' in text:  # low speed
                                value['signal'] = 'L-wind'
                            elif '強風' in text:  # high speed
                                value['signal'] = 'H-wind'
                            if '上下' in text and ('擺動' in text or '搖擺' in text):  # up-down swing
                                value['signal'] = 'H-swing'
                            elif '左右' in text and ('擺動' in text or '搖擺' in text):  # left-right swing
                                value['signal'] = 'V-swing'
                            elif '開' in text and '風扇' in text:  # turn on
                                value['signal'] = 'on'
                            elif '關' in text and '風扇' in text:  # turn off
                                value['signal'] = 'off'
                        if '電視' in text:  # TV
                            value['devices'] = 'tv'
                            value['name'] = '電視'
                            if '開' in text and '電視' in text:  # turn on
                                value['signal'] = 'on'
                            elif '關' in text and '電視' in text:  # turn off
                                value['signal'] = 'off'
                            elif '轉' in text and '台' in text:  # change channel
                                # Assumes the utterance ends with "...X台"; take the
                                # two characters before the trailing "台" as the channel
                                value['signal'] = text[-3:-1]
                            elif '確定' in text:  # confirm
                                value['signal'] = 'ok'
                        # Send the payload to the IR remote endpoint
                        # fun_irremote(value)
                        print(value)
                        print(text)
                        # Play the synthesized reply
                        fun_play_wav("SoVITS_LLM.wav")
                    else:
                        print('Sorry, I could not understand your request')
                        fun_play_wav("sorry.wav")
            except KeyboardInterrupt:
                print("Ctrl+C")
                break
            except IndexError:
                print("No speech detected")
    except Exception as e:
        print(f"Error during initialization: {e}")
    finally:
        # Release the model and free memory; modelx may not exist if init failed
        if 'modelx' in globals():
            del modelx
        gc.collect()
        print("Program finished")