In [1]:
# Mount Google Drive into the Colab runtime at /content/drive.
# The FAQ JSON read later in this notebook is stored under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install -q gradio transformers sentence-transformers torchaudio datasets soundfile pydub

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.9/46.9 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m322.6/322.6 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m44.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m36.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m37.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
!pip install edge-tts

Collecting edge-tts
  Downloading edge_tts-7.0.1-py3-none-any.whl.metadata (5.5 kB)
Collecting srt<4.0.0,>=3.4.1 (from edge-tts)
  Downloading srt-3.5.3.tar.gz (28 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading edge_tts-7.0.1-py3-none-any.whl (26 kB)
Building wheels for collected packages: srt
  Building wheel for srt (setup.py) ... [?25l[?25hdone
  Created wheel for srt: filename=srt-3.5.3-py3-none-any.whl size=22427 sha256=df65991e668d4a63eb64589983eb0a46f166b0d98531350fca7d23ebb805df03
  Stored in directory: /root/.cache/pip/wheels/1f/43/f1/23ee9119497fcb57d9f7046fbf34c6d9027c46a1fa7824cf08
Successfully built srt
Installing collected packages: srt, edge-tts
Successfully installed edge-tts-7.0.1 srt-3.5.3


In [4]:
import asyncio
import datetime
import io
import json
import os
import tempfile

import edge_tts
import gradio as gr
import numpy as np
import soundfile as sf
import torch
import torchaudio
from pydub import AudioSegment
from pydub.playback import play
from sentence_transformers import SentenceTransformer, util
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    WhisperForConditionalGeneration,
    WhisperProcessor,
    pipeline,
)

# 1. Load the FAQ data.
# JSON text is UTF-8 by specification, so decode it explicitly instead of relying
# on the platform's default encoding.
FAQ_PATH = '/content/drive/MyDrive/6104/group project/hsu_faq.json'
with open(FAQ_PATH, encoding='utf-8') as f:
    faq_data = json.load(f)

# 2. Initialise the models.
# Speech recognition model (Whisper large v3).
whisper_processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3")
whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3")

# Sentence-embedding model used to match user questions against FAQ questions.
embedding_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

# Local LLM (the comparatively small Phi-2 model), loaded in half precision with
# automatic device placement.
llm_model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2",
                                               torch_dtype=torch.float16,
                                               device_map="auto")
llm_tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")

# 3. Pre-process the FAQ data.
# Each FAQ entry is expected to provide a 'question' and an 'answer' key; the
# question embeddings are computed once here and reused for every query.
questions = [item['question'] for item in faq_data]
answers = [item['answer'] for item in faq_data]
question_embeddings = embedding_model.encode(questions, convert_to_tensor=True)

# 4. RAG question-answering function
def rag_qa(user_question, threshold=0.7):
    """Answer a question from the FAQ, falling back to the local LLM.

    The question is embedded and compared by cosine similarity with every FAQ
    question embedding. When the best score is strictly greater than
    ``threshold`` the stored FAQ answer is returned unchanged; otherwise the
    LLM is prompted and the first non-blank line it generates is returned.

    Args:
        user_question: The question text to answer.
        threshold: Cosine-similarity score the best FAQ match must exceed to be
            used as the answer.

    Returns:
        str: The matching FAQ answer, the first non-blank line generated by the
        LLM, or a fixed apology sentence when the LLM generated nothing usable.
    """
    # With no FAQ entries there is nothing to match against (argmax over an
    # empty tensor would raise), so go straight to the LLM.
    if len(answers) > 0:
        # Embed the user question.
        user_embedding = embedding_model.encode(user_question, convert_to_tensor=True)

        # Similarity against every FAQ question; keep the best match.
        cos_scores = util.cos_sim(user_embedding, question_embeddings)[0]
        max_score_idx = torch.argmax(cos_scores).item()
        max_score = cos_scores[max_score_idx].item()

        if max_score > threshold:
            return answers[max_score_idx]

    # Fall back to LLM generation.
    # NOTE: the leading spaces on the continuation lines are part of the prompt
    # text and are kept exactly as they were.
    prompt = f"""你是一个恒生大学的智能助手。根据以下上下文回答问题。
        如果问题与恒生大学无关，请礼貌拒绝回答。

        问题: {user_question}
        回答: """

    # Keep the attention mask: the tokenizer's pad token equals its EOS token, so
    # generate() cannot infer the mask and warns about unreliable results.
    inputs = llm_tokenizer(prompt, return_tensors="pt").to(llm_model.device)
    outputs = llm_model.generate(
        **inputs,
        max_new_tokens=100,
        pad_token_id=llm_tokenizer.eos_token_id,
    )

    # Decode only the newly generated tokens. Stripping the prompt by string
    # replacement is fragile (the decoded text need not reproduce the prompt
    # byte-for-byte) and would leave special tokens in the answer.
    prompt_length = inputs["input_ids"].shape[1]
    generated = llm_tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # First non-blank line, so a leading newline does not yield an empty answer.
    for line in generated.splitlines():
        if line.strip():
            return line.strip()
    return "抱歉，我暂时无法回答这个问题。"

# 5. Speech helpers
async def text_to_speech(text, output_file="output.mp3", voice="zh-CN-YunjianNeural"):
    """Synthesise ``text`` to an audio file with edge-tts.

    Args:
        text: The text to speak; must contain at least one non-whitespace
            character.
        output_file: Path the audio is written to.
        voice: edge-tts voice name. Defaults to the Chinese voice this notebook
            has always used.

    Returns:
        str: ``output_file``, once the audio has been saved.

    Raises:
        ValueError: If ``text`` is empty or only whitespace.
    """
    # Fail fast with a clear message instead of sending an empty request.
    if not text or not text.strip():
        raise ValueError("text_to_speech requires non-empty text")

    # Create the Communicate object and write the complete audio file.
    communicate = edge_tts.Communicate(text, voice)
    await communicate.save(output_file)
    return output_file

def transcribe_audio(audio):
    """Transcribe one audio clip to text with the Whisper model.

    Args:
        audio: Indexable pair where ``audio[0]`` is the sample rate in Hz and
            ``audio[1]`` is the sample data (array-like). Integer PCM and 2-D
            multi-channel data are accepted and converted to mono float32.

    Returns:
        str: The decoded transcription of the clip.

    Raises:
        ValueError: If the clip contains no samples.
    """
    sample_rate = int(audio[0])
    samples = np.asarray(audio[1])
    if samples.size == 0:
        raise ValueError("音频数据为空")

    # Bring the samples into the [-1, 1] float range before feature extraction.
    if np.issubdtype(samples.dtype, np.integer):
        # Integer PCM: scale by the full-scale value of its dtype.
        samples = samples.astype(np.float32) / float(np.iinfo(samples.dtype).max)
    else:
        samples = samples.astype(np.float32)
        # Float data outside [-1, 1] is presumably un-normalised PCM (e.g. an
        # averaged integer signal); peak-normalise it.
        peak = float(np.abs(samples).max())
        if peak > 1.0:
            samples = samples / peak

    # Down-mix multi-channel audio to mono.
    # Assumes a (samples, channels) layout — TODO confirm for non-Gradio callers.
    if samples.ndim > 1:
        samples = samples.mean(axis=1)

    # Resample to the 16 kHz rate passed to the processor below.
    waveform = torch.from_numpy(np.ascontiguousarray(samples, dtype=np.float32))
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)

    # Extract the model's input features from the waveform.
    audio_input = whisper_processor(
        waveform.numpy(), sampling_rate=16000, return_tensors="pt"
    ).input_features
    # Match the model's device and dtype so generation works wherever the model
    # has been placed.
    audio_input = audio_input.to(device=whisper_model.device, dtype=whisper_model.dtype)

    # Generate token ids without tracking gradients.
    with torch.no_grad():
        predicted_ids = whisper_model.generate(audio_input)

    # Decode the generated tokens to text.
    transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    return transcription

def process_audio(audio_file):
    """Handle a spoken question: transcribe it, answer it, and voice the answer.

    Args:
        audio_file: ``None``, a ``(sample_rate, samples)`` tuple/list, or
            anything ``AudioSegment.from_file`` accepts (e.g. a file path).

    Returns:
        tuple: ``(recognized_text, answer_text, answer_audio_path)``. The audio
        path is ``None`` when no speech output could be produced. Failures are
        reported in the text fields instead of being raised.
    """
    # No input at all.
    if audio_file is None:
        return "", "请提供语音输入或使用文本输入", None

    # --- Speech recognition -------------------------------------------------
    try:
        if isinstance(audio_file, (tuple, list)):
            sample_rate, audio_data = audio_file
            audio_input = (sample_rate, audio_data)
        else:
            # Otherwise treat the input as an audio file and decode it.
            audio_segment = AudioSegment.from_file(audio_file)
            audio_data = np.array(audio_segment.get_array_of_samples())
            if audio_segment.channels > 1:
                # Samples are interleaved per channel; average them to mono.
                audio_data = audio_data.reshape((-1, audio_segment.channels)).mean(axis=1)
            audio_input = (audio_segment.frame_rate, audio_data)

        text = transcribe_audio(audio_input)
    except Exception as e:
        print(f"处理音频时出错: {str(e)}")
        return "[语音识别失败]", f"处理语音输入时出错: {str(e)}", None

    # Nothing was recognised: do not send an empty question to the QA step.
    if not text or not text.strip():
        return "", "未能识别到语音内容，请重试", None

    # --- Question answering -------------------------------------------------
    try:
        answer = rag_qa(text)
    except Exception as e:
        # Recognition succeeded, so keep showing the recognised text.
        print(f"处理音频时出错: {str(e)}")
        return text, f"处理语音输入时出错: {str(e)}", None

    # --- Speech synthesis ---------------------------------------------------
    # A unique file per request, so concurrent requests cannot overwrite each
    # other's audio.
    output_audio = None
    try:
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp:
            output_audio = tmp.name
        asyncio.run(text_to_speech(answer, output_audio))
    except Exception as e:
        # A synthesis failure must not discard the text answer.
        print(f"语音合成失败: {str(e)}")
        if output_audio is not None and os.path.exists(output_audio):
            os.remove(output_audio)
        return text, answer, None

    return text, answer, output_audio

# 6. Text handling function
def process_text(text):
    """Handle a typed question: answer it and voice the answer.

    Args:
        text: The question text. ``None``, non-string and blank values are
            rejected with a prompt to enter a valid question.

    Returns:
        tuple: ``("", answer_text, answer_audio_path)``. The first element is
        always empty because nothing was speech-recognised; the audio path is
        ``None`` when no speech output could be produced. Failures are reported
        in the answer field instead of being raised.
    """
    if not isinstance(text, str) or not text.strip():
        return "", "请输入有效的问题", None

    try:
        answer = rag_qa(text)
    except Exception as e:
        print(f"生成回答时出错: {str(e)}")
        return "", f"生成回答时出错: {str(e)}", None

    # Speech synthesis, into a unique file per request so concurrent requests
    # cannot overwrite each other's audio.
    output_audio = None
    try:
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp:
            output_audio = tmp.name
        asyncio.run(text_to_speech(answer, output_audio))

        return "", answer, output_audio
    except Exception as e:
        # Still return the text answer when synthesis fails.
        print(f"语音合成失败: {str(e)}")
        if output_audio is not None and os.path.exists(output_audio):
            os.remove(output_audio)
        return "", answer, None



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.07k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.90k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.89k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [5]:
# 7. Gradio interface
with gr.Blocks(title="HSUHK智能助手") as demo:
    gr.Markdown("# 🎤 HSUHK智能语音助手")
    gr.Markdown("上传语音提问或使用麦克风录制")

    # Input tabs: one for speech, one for typed text.
    with gr.Tab("语音输入"):
        audio_input = gr.Audio(sources=["microphone", "upload"], type="numpy", label="语音输入")
        audio_submit = gr.Button("提交语音问题")

    with gr.Tab("文本输入"):
        text_input = gr.Textbox(label="输入文字问题")
        text_submit = gr.Button("提交文字问题")

    # Shared result area, filled by both input modes.
    with gr.Column():
        recognized_text = gr.Textbox(label="识别到的提问")
        answer_text = gr.Textbox(label="回答文本")
        answer_audio = gr.Audio(label="语音回答", autoplay=True)

    # Clear button
    clear_btn = gr.Button("清除所有")

    # Every handler returns (recognized text, answer text, answer audio).
    result_outputs = [recognized_text, answer_text, answer_audio]

    # Event wiring
    audio_submit.click(
        fn=process_audio,
        inputs=audio_input,
        outputs=result_outputs
    )

    text_submit.click(
        fn=process_text,
        inputs=text_input,
        outputs=result_outputs
    )

    # Pressing Enter in the textbox behaves like the submit button.
    text_input.submit(
        fn=process_text,
        inputs=text_input,
        outputs=result_outputs
    )

    # "Clear all" resets both inputs (audio and typed text) and every result field.
    clear_btn.click(
        fn=lambda: [None, "", "", "", None],
        outputs=[audio_input, text_input, recognized_text, answer_text, answer_audio]
    )

# Launch the app; debug=True keeps the cell running so errors and logs are shown.
demo.launch(debug=True)

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://66483d325774d662a6.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://66483d325774d662a6.gradio.live


