In [1]:
import gradio as gr
import torch
import librosa
import numpy as np
import os
from transformers import (
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    pipeline,
    VitsModel,
    AutoTokenizer
)



In [2]:
# --- 1. CONFIGURATION & HARDWARE CHECK ---
# Choose the inference device and the tensor precision that goes with it.
# Preference order: NVIDIA CUDA, then Apple MPS, then plain CPU.
if torch.cuda.is_available():
    # Half precision is only selected on the CUDA path.
    DEVICE, TORCH_DTYPE = "cuda:0", torch.float16
    print("🚀 Hardware: NVIDIA GPU detected (Fast)")
elif torch.backends.mps.is_available():
    # MPS stays on float32.
    DEVICE, TORCH_DTYPE = "mps", torch.float32
    print("🚀 Hardware: Mac M1/M2/M3 detected")
else:
    DEVICE, TORCH_DTYPE = "cpu", torch.float32
    print("⚠️ Hardware: CPU only (Will be slow)")

# --- 2. LOAD MODELS ---
# Three checkpoints are loaded in a fixed order, each announced by a progress
# line: Mandarin ASR -> Hakka ASR -> Hakka TTS.
# NOTE: `torch_dtype` still works, but recent transformers releases warn that
# it is deprecated in favour of `dtype`.

print("⏳ 1/3: Loading Mandarin ASR (Doctor's Ear)...")
# Multilingual Whisper "base" checkpoint wrapped in an ASR pipeline, so it can
# be called directly on an audio file path.
mandarin_asr = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-base",
    torch_dtype=TORCH_DTYPE,
    device=DEVICE,
)

print("⏳ 2/3: Loading Hakka ASR (Patient's Ear)...")
# Processor and seq2seq model are loaded separately (no pipeline) because the
# patient path calls `generate` itself to read per-token scores.
hakka_asr_id = "formospeech/whisper-large-v3-taiwanese-hakka"
hakka_processor = AutoProcessor.from_pretrained(hakka_asr_id)
hakka_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    hakka_asr_id,
    use_safetensors=True,
    torch_dtype=TORCH_DTYPE,
)
hakka_model = hakka_model.to(DEVICE)

print("⏳ 3/3: Loading MMS Hakka TTS (Patient's Voice)...")
# MMS (Massively Multilingual Speech) VITS checkpoint for Hakka; loaded with
# its default dtype and then moved to the selected device.
tts_model_id = "facebook/mms-tts-hak"
tts_tokenizer = AutoTokenizer.from_pretrained(tts_model_id)
tts_model = VitsModel.from_pretrained(tts_model_id)
tts_model = tts_model.to(DEVICE)


# --- 3. THE MEDICAL KNOWLEDGE BASE (Corpus) ---
# Maps Mandarin triggers -> Hakka Romanization (for MMS TTS)
# Note: MMS reads Romanization (Pha̍k-fa-sṳ) much better than Hanzi.
#
# Entry schema:
#   category           - "Diagnosis" or "Order"; shown in the status log.
#   mandarin           - trigger phrases, matched as substrings of the doctor's
#                        transcript. The FIRST item is the display label used
#                        by the patient-side symptom report, so keep it first.
#   hakka_display      - text shown to the user (Hanzi + romanization).
#   hakka_romanization - text fed to the TTS tokenizer.
#   symptoms           - phrases matched as substrings of the patient's Hakka
#                        transcript (may be empty).
#
# The ASR transcript may come back in Simplified OR Traditional characters, so
# every trigger whose two written forms differ is listed in both scripts.
medical_corpus = [
    # -- DIAGNOSIS --
    {
        "category": "Diagnosis",
        "mandarin": ["高血壓", "血壓高", "高血压", "血压高"],
        "hakka_display": "高血壓 (Go-hiet-ap)",
        "hakka_romanization": "go hiet ap",  # Text for MMS TTS
        "symptoms": ["頭那暈", "暈暈", "血壓高"]
    },
    {
        "category": "Diagnosis",
        "mandarin": ["糖尿病", "血糖"],
        "hakka_display": "糖尿病 (Tng-ngieu-phiang)",
        "hakka_romanization": "tng ngieu phiang",
        "symptoms": ["嘴渴", "食多", "尿有甜"]
    },
    # -- DOCTOR ORDERS / SUGGESTIONS --
    {
        "category": "Order",
        "mandarin": ["吃藥", "吃药", "服用"],
        "hakka_display": "愛食藥 (Oi siit yok)",
        "hakka_romanization": "oi siit yok",
        "symptoms": []
    },
    {
        "category": "Order",
        "mandarin": ["喝水", "多喝水"],
        "hakka_display": "多飲水 (To lim sui)",
        "hakka_romanization": "to lim sui",
        "symptoms": []
    },
    {
        "category": "Order",
        # NOTE(review): the single-character trigger "啊" is very broad and will
        # match any otherwise-unmatched sentence containing it.
        "mandarin": ["張開嘴", "張嘴", "啊", "张开嘴", "张嘴"],
        "hakka_display": "嘴阿開 (Zui a hoi)",
        "hakka_romanization": "zui a hoi",
        "symptoms": []
    },
    {
        "category": "Order",
        "mandarin": ["深呼吸"],
        "hakka_display": "大氣透 (Tai hi teu)",
        "hakka_romanization": "tai hi teu",
        "symptoms": []
    }
]

# --- 4. PROCESSING FUNCTIONS ---

def process_doctor_command(audio_path):
    """
    Doctor pipeline: Mandarin speech in, Hakka speech out.

    Steps:
      1. Transcribe the recording with the Mandarin ASR pipeline.
      2. Pick the first corpus entry that has a trigger phrase contained in
         the transcript.
      3. Synthesize that entry's romanized Hakka text with the MMS TTS model.

    Args:
        audio_path: Path of the recorded audio file, or None when nothing
            was recorded.

    Returns:
        A 3-tuple (status_log, hakka_display, audio):
          - no recording: ("No Audio", None, None)
          - no corpus hit: (log, "N/A", None)
          - hit: (log, display text, (sampling_rate, waveform ndarray))
    """
    if audio_path is None:
        return "No Audio", None, None

    # A. Mandarin transcription; the language is pinned through generate_kwargs
    asr_result = mandarin_asr(audio_path, generate_kwargs={"language": "chinese"})
    text_mandarin = asr_result["text"]
    status_log = f"👨‍⚕️ Doctor said: '{text_mandarin}'"

    # B. First corpus entry with any trigger occurring as a substring
    match = next(
        (
            entry
            for entry in medical_corpus
            if any(trigger in text_mandarin for trigger in entry["mandarin"])
        ),
        None,
    )

    if not match:
        status_log += "\n⚠️ Concept not found in Corpus. (Try: '吃藥', '高血壓', '深呼吸')"
        return status_log, "N/A", None

    status_log += f"\n✅ MATCHED: {match['category']} - {match['hakka_display']}"

    # C. Hakka speech synthesis from the romanized string
    tts_inputs = tts_tokenizer(match["hakka_romanization"], return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        waveform = tts_model(**tts_inputs).waveform

    # Gradio audio output takes a (sample_rate, numpy array) pair
    audio_np = waveform.cpu().numpy().squeeze()
    sample_rate = tts_model.config.sampling_rate
    return status_log, match["hakka_display"], (sample_rate, audio_np)


def process_patient_response(audio_path):
    """
    Patient pipeline: Hakka speech in, transcript + confidence + symptom flags out.

    Steps:
      1. Load the recording at 16 kHz and transcribe it with the Hakka
         Whisper model (the original notes target the Sixian dialect).
      2. Derive a confidence score: the mean per-token probability of the
         generated tokens, expressed as a percentage.
      3. If confidence is at least 40, scan the transcript for any symptom
         phrase listed in ``medical_corpus``.

    Args:
        audio_path: Path of the recorded audio file, or None when nothing
            was recorded.

    Returns:
        A 2-tuple (analysis_text, confidence):
          - no recording: ("No Audio", 0)
          - otherwise: (multi-line report, confidence formatted like "87.3%")
    """
    if audio_path is None:
        return "No Audio", 0

    # Load and resample audio to the 16 kHz rate passed to the processor
    y, _ = librosa.load(audio_path, sr=16000)

    # Prepare inputs for FormoSpeech Model.
    # The processor emits float32 features while the model is loaded in
    # TORCH_DTYPE (float16 on CUDA). Cast to the model's own dtype, otherwise
    # generate() fails with a float/half mismatch on GPU.
    inputs = hakka_processor(y, sampling_rate=16000, return_tensors="pt").to(
        DEVICE, dtype=hakka_model.dtype
    )
    # Force the model to transcribe Chinese characters
    forced_ids = hakka_processor.get_decoder_prompt_ids(language="zh", task="transcribe")

    # Generate, keeping per-step scores so a confidence can be computed
    outputs = hakka_model.generate(
        **inputs,
        forced_decoder_ids=forced_ids,
        return_dict_in_generate=True,
        output_scores=True
    )

    # Decode text
    transcription = hakka_processor.batch_decode(outputs.sequences, skip_special_tokens=True)[0]

    # Calculate Confidence: normalized log-probs -> probabilities -> mean -> percent
    transition_scores = hakka_model.compute_transition_scores(
        outputs.sequences, outputs.scores, normalize_logits=True
    )
    confidence = torch.exp(transition_scores).mean().item() * 100

    # Logic
    res_text = f"👴 Patient said: {transcription}"

    if confidence < 40:
        # Below the threshold the transcript is not trusted for symptom matching
        res_text += "\n⚠️ Low Confidence: Speech unclear or different dialect."
    else:
        # Check for symptoms in corpus; every matching entry is reported
        found_symptom = False
        for item in medical_corpus:
            if any(s in transcription for s in item["symptoms"]):
                res_text += f"\n🚨 SYMPTOM DETECTED: {item['category']} ({item['mandarin'][0]})"
                found_symptom = True
        if not found_symptom:
            res_text += "\nℹ️ No specific symptoms detected."

    return res_text, f"{confidence:.1f}%"

# --- 5. LAUNCH WEB INTERFACE ---
# Two-tab Gradio app: one tab per speaker role. Each tab records from the
# microphone to a temp file and hands the file path to its handler on click.

with gr.Blocks(title="Hakka Medical Bridge") as demo:
    gr.Markdown("# 🏥 Hakka-Mandarin Medical Translator")
    gr.Markdown("Auto-translation for Doctors (Mandarin) and Patients (Hakka)")

    # Doctor tab: Mandarin speech -> corpus lookup -> synthesized Hakka audio
    with gr.Tab("👨‍⚕️ Doctor Mode (Order & Diagnose)"):
        gr.Markdown("Say commands like: **'Remember to take medicine'**, **'Deep breath'**, **'Do you have high blood pressure?'**")
        with gr.Row():
            doc_input = gr.Audio(
                sources=["microphone"],
                type="filepath",
                label="Doctor Input (Mandarin)",
            )
            doc_output_log = gr.Textbox(label="System Log")

        with gr.Row():
            doc_hakka_text = gr.Textbox(label="Hakka Script", lines=2)
            # autoplay so the synthesized phrase is spoken as soon as it arrives
            doc_hakka_audio = gr.Audio(label="AI Spoken Hakka (MMS)", autoplay=True)

        btn_translate = gr.Button("Translate to Hakka", variant="primary")
        btn_translate.click(
            fn=process_doctor_command,
            inputs=[doc_input],
            outputs=[doc_output_log, doc_hakka_text, doc_hakka_audio],
        )

    # Patient tab: Hakka speech -> transcript, confidence and symptom flags
    with gr.Tab("👴 Patient Mode (Symptom Check)"):
        gr.Markdown("Patient speaks Hakka. AI checks for **Safety** and **Symptoms**.")
        with gr.Row():
            pat_input = gr.Audio(
                sources=["microphone"],
                type="filepath",
                label="Patient Input",
            )
            pat_log = gr.Textbox(label="Analysis", lines=5)
            pat_conf = gr.Textbox(label="Accent Confidence")

        btn_analyze = gr.Button("Analyze Patient Speech", variant="primary")
        btn_analyze.click(
            fn=process_patient_response,
            inputs=[pat_input],
            outputs=[pat_log, pat_conf],
        )

if __name__ == "__main__":
    print("🌐 Starting Server...")
    demo.launch(inbrowser=True)

⚠️ Hardware: CPU only (Will be slow)
⏳ 1/3: Loading Mandarin ASR (Doctor's Ear)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/290M [00:00<?, ?B/s]

generation_config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

preprocessor_config.json: 0.00B [00:00, ?B/s]

Device set to use cpu


⏳ 2/3: Loading Hakka ASR (Patient's Ear)...


preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json: 0.00B [00:00, ?B/s]

⏳ 3/3: Loading MMS Hakka TTS (Patient's Voice)...


tokenizer_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/487 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/47.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/145M [00:00<?, ?B/s]

🌐 Starting Server...
It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://565ceb280eabf9eae3.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
