In [None]:
import os
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"

import numpy as np
import librosa, joblib
import tensorflow as tf

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Hugging Face checkpoint shared by the tokenizer and the causal LM below.
_LLM_CHECKPOINT = "Qwen/Qwen2.5-3B-Instruct"

# Keras classifier and its companion label encoder, both read from the
# current working directory.
# NOTE(review): joblib.load unpickles the file; only load trusted artifacts.
model = tf.keras.models.load_model("mfcc_cnn_model_cat.h5")
le = joblib.load("label_encoder_cat.pkl")

# NOTE(review): trust_remote_code=True permits running code shipped with the
# checkpoint; confirm it is actually required for this model.
tokenizer = AutoTokenizer.from_pretrained(_LLM_CHECKPOINT, trust_remote_code=True)

# Full-precision weights; device_map="auto" delegates device placement to the
# library.
llm_model = AutoModelForCausalLM.from_pretrained(
    _LLM_CHECKPOINT,
    device_map="auto",
    torch_dtype=torch.float32,
)

def extract_mfcc_from_file(file_path, n_mfcc=40, max_len=173):
    """Load an audio file and return fixed-length MFCC features as a batch of one.

    The file is loaded with librosa resampled to 22050 Hz, ``n_mfcc``
    coefficients are computed, and the second axis of the MFCC matrix is
    either zero-padded or truncated so it holds exactly ``max_len`` columns.

    Args:
        file_path: Path of the audio file, passed straight to ``librosa.load``.
        n_mfcc: Number of MFCC coefficients to compute.
        max_len: Target length of the MFCC matrix's second axis.

    Returns:
        The transposed MFCC matrix with a new leading axis of size 1
        (presumably ``(1, max_len, n_mfcc)`` for mono audio — confirm against
        the classifier's expected input shape).
    """
    signal, sample_rate = librosa.load(file_path, sr=22050)
    features = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=n_mfcc)

    # Positive when the clip is too short and needs trailing zero columns.
    missing_frames = max_len - features.shape[1]
    if missing_frames > 0:
        features = np.pad(
            features,
            ((0, 0), (0, missing_frames)),
            mode="constant",
        )
    else:
        features = features[:, :max_len]

    # Transpose, then prepend the batch axis.
    return np.expand_dims(features.T, axis=0)


def analyze_predictions(preds, labels):
    """Select the most relevant class probabilities from a prediction vector.

    Labels are paired with their scores, ranked from highest to lowest, and
    the top of the ranking is kept according to how confident it is:

    * top score >= 0.9               -> only the top class
    * top two scores sum to >= 0.95  -> the top two classes
    * otherwise                      -> up to the top three classes

    Args:
        preds: Indexable sequence of per-class scores; each is cast to float.
        labels: Indexable sequence where ``labels[i]`` names ``preds[i]``.

    Returns:
        A dict mapping the selected labels to their float scores, in
        descending score order. Empty when ``preds`` is empty.

    Raises:
        IndexError: If ``labels`` has fewer entries than ``preds``.
    """
    scores = {labels[i]: float(p) for i, p in enumerate(preds)}
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)

    # Nothing to rank; avoid indexing into an empty list.
    if not ranked:
        return {}

    best_score = ranked[0][1]
    # With a single class there is no runner-up; treat its score as zero so
    # the two-class rule cannot trigger.
    runner_up_score = ranked[1][1] if len(ranked) > 1 else 0.0

    if best_score >= 0.9:
        keep = 1
    elif (best_score + runner_up_score) >= 0.95:
        keep = 2
    else:
        keep = 3

    # Slicing tolerates rankings shorter than ``keep``.
    return dict(ranked[:keep])


def llm_chat(prompt: str) -> str:
    """Send ``prompt`` to the chat LLM and return only the model's reply.

    The prompt is wrapped in a two-message conversation (a fixed system
    instruction plus the user prompt), rendered with the tokenizer's chat
    template, and completed with sampled generation.

    Args:
        prompt: User message content.

    Returns:
        The decoded text of the newly generated tokens, with special tokens
        removed. The rendered system/user prompt is not included.
    """
    messages = [
        {
            "role": "system",
            "content": (
                "You are an empathetic interpreter translating a pet’s vocal expressions into clear, "
                "natural language for humans. Your explanations should be descriptive, non-clinical, "
                "and focused on instincts, needs, or intentions rather than emotions."
            )
        },
        {
            "role": "user",
            "content": prompt
        }
    ]

    # Render the conversation to plain text, ending with the assistant turn
    # header so generation continues as the assistant.
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    inputs = tokenizer(
        text,
        return_tensors="pt"
    ).to(llm_model.device)

    with torch.no_grad():
        outputs = llm_model.generate(
            **inputs,
            max_new_tokens=160,
            temperature=0.7,
            top_p=0.9,
            do_sample=True
        )

    # generate() returns prompt tokens followed by the completion for a
    # decoder-only model; drop the prompt so callers get just the answer
    # instead of the system/user text echoed back.
    prompt_length = inputs["input_ids"].shape[1]
    reply_ids = outputs[0][prompt_length:]

    return tokenizer.decode(reply_ids, skip_special_tokens=True)


if __name__ == "__main__":
    # Script entry point: audio file -> MFCC features -> CNN class scores ->
    # most relevant classes -> natural-language interpretation from the LLM.

    AUDIO_FILE_PATH = "pet.mp3"  # example cat voice

    print("▶ Processing pet audio...")
    mfcc = extract_mfcc_from_file(AUDIO_FILE_PATH)

    print("▶ Running CNN inference...")
    # predict() returns one row per input; the batch holds a single clip.
    preds = model.predict(mfcc)[0]
    focus = analyze_predictions(preds, le.classes_)

    # The selected label -> score dict is interpolated verbatim into the prompt.
    prompt = f"""
    Here are some signals extracted from a pet’s vocal expressions:
    {focus}

    Interpret these signals for a human listener:
    - Do not mention sound, audio, or probability values.
    - Focus on instincts, intentions, or possible needs.
    - Avoid raw emotion labels or diagnostic language.
    - Write 2–4 natural sentences describing what the pet might be expressing.
    - You may suggest possibilities such as hunger, alertness, discomfort, curiosity, or a desire for interaction.
    - Maintain an empathetic, calm, and explanatory tone.
    """

    print("▶ Generating interpretation...")
    analysis_text = llm_chat(prompt)

    print("\n🐾 PET INTERPRETATION RESULT:\n")
    print(analysis_text)
