In [1]:
# Install the notebook's dependencies: piper-tts (the `piper` CLI used for speech
# synthesis), soundfile and librosa (WAV reading / resampling), and datasets and
# transformers (dataset loading and the Qwen2-Audio model).
# NOTE(review): accelerate is presumably required for `device_map="auto"` - confirm.
# NOTE(review): pathvalidate is installed but not referenced in the visible cells,
# and none of the versions are pinned.
!pip install -q piper-tts pathvalidate soundfile librosa datasets transformers accelerate


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m65.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.1/17.1 MB[0m [31m71.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:

# Download the en_US-lessac-medium Piper voice.
# NOTE(review): the config cell points PIPER_MODEL at
# /content/piper_models/en_US-lessac-medium.onnx - confirm the download lands there.
!python -m piper.download_voices en_US-lessac-medium


INFO:__main__:Downloaded: en_US-lessac-medium


In [None]:
import os
import json
import random
import subprocess
import torch
from tqdm import tqdm
import soundfile as sf
import librosa

from datasets import load_dataset
from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration

# ---------------- CONFIG -------------------
# Dataset on the Hugging Face Hub and the split that is sampled from.
HF_DATASET = "frostymelonade/SemEval2017-task7-pun-detection"
HF_SPLIT = "test"

# Sampling: pun types to keep, number of examples per type, shuffle seed.
TYPES = {"heterographic", "homographic"}
PER_TYPE = 250
SEED = 42

# Audio-language model and the per-example generation budget.
MODEL_ID = "Qwen/Qwen2-Audio-7B-Instruct"
MAX_NEW_TOKENS = 120

# Piper voice model plus the location/extension of the synthesized clips.
PIPER_MODEL = "/content/piper_models/en_US-lessac-medium.onnx"
AUDIO_DIR = "cache/tts"
AUDIO_EXT = ".wav"

# Result files: one combined JSONL and one per pun type, all sharing OUT_BASE.
OUT_BASE = "cache/phase2_text_audio_raw"
OUT_ALL = f"{OUT_BASE}.jsonl"
OUT_HET = f"{OUT_BASE}.heterographic.jsonl"
OUT_HOM = f"{OUT_BASE}.homographic.jsonl"

# Create the cache root first, then the audio directory beneath it.
os.makedirs("cache", exist_ok=True)
os.makedirs(AUDIO_DIR, exist_ok=True)

# ---------------- PROMPT ----------------

def build_messages(text):
    """Build the chat messages asking whether ``text`` contains a pun.

    Args:
        text: The sentence to analyse; it is interpolated at the end of the
            user prompt.

    Returns:
        A two-element list of ``{"role", "content"}`` dicts: a fixed system
        message followed by the user message. The user prompt contains a
        single ``<|AUDIO|>`` placeholder and ends with ``text`` plus a newline.
    """
    prompt_lines = [
        "Explain whether the following text contains a pun.",
        "",
        "You are given the written text and its spoken audio.",
        "",
        "<Audio>",
        "<|AUDIO|>",
        "</Audio>",
        "",
        "Instructions:",
        "- Do NOT explain your analysis process.",
        "- Do NOT define what a pun is.",
        "- Focus ONLY on the linguistic mechanism.",
        "- If the text is a pun, clearly state:",
        "  • the word or phrase involved",
        "  • the two meanings or sound-based ambiguity",
        "- If it is not a pun, clearly state that no wordplay or ambiguity is present.",
        "",
        "Write a concise paragraph (3–6 sentences).",
        "",
        "Text:",
        f"{text}",
        "",  # trailing empty entry yields the final newline after the text
    ]

    system_turn = {"role": "system", "content": "You are an expert linguist."}
    user_turn = {"role": "user", "content": "\n".join(prompt_lines)}
    return [system_turn, user_turn]


# ---------------- HELPERS ----------------
def normalize_id(x):
    """Return ``x`` as a whitespace-stripped string, or ``None`` if there is no id.

    Only ``None`` and values whose string form is empty after stripping count
    as "no id". A falsy but meaningful id such as ``0`` is kept (it becomes
    ``"0"``) instead of being discarded by a truthiness check.

    Args:
        x: Raw identifier of any type.

    Returns:
        The stripped string form of ``x``, or ``None`` when ``x`` is ``None``
        or blank.
    """
    if x is None:
        return None
    uid = str(x).strip()
    return uid or None

def load_audio(path, target_sr=16000):
    """Read an audio file and return a mono waveform at ``target_sr`` Hz.

    Args:
        path: Path of the audio file to read with ``soundfile``.
        target_sr: Desired sample rate; the signal is resampled with
            ``librosa.resample`` only when the file's rate differs.

    Returns:
        A 1-D array of samples at ``target_sr``.
    """
    wav, sr = sf.read(path)
    # soundfile lays multi-channel audio out as (frames, channels), while
    # librosa.resample works along the last axis by default. Averaging the
    # channels first keeps the resampling on the time axis and yields the
    # single-channel signal the caller passes to the processor.
    if wav.ndim > 1:
        wav = wav.mean(axis=1)
    if sr != target_sr:
        wav = librosa.resample(wav, orig_sr=sr, target_sr=target_sr)
    return wav



#------------------PHASE A - OFFLINE TTS----------------
print("=== Phase A: Generating TTS ===")

ds = load_dataset(HF_DATASET, split=HF_SPLIT)

# Bucket the rows of the wanted pun types, keeping only the fields used later.
grouped = {}
for row in ds:
    pun_type = row["type"]
    if pun_type not in TYPES:
        continue
    record = {
        "id": normalize_id(row["id"]),
        "text": row["text"],
        "type": pun_type,
        "label": row["label"],
    }
    if pun_type not in grouped:
        grouped[pun_type] = []
    grouped[pun_type].append(record)

# One seeded RNG shuffles every bucket in turn; the first PER_TYPE records of
# each bucket form the final sample.
rng = random.Random(SEED)
items = []
for bucket in grouped.values():
    rng.shuffle(bucket)
    items += bucket[:PER_TYPE]

def generate_tts(text, uid):
    """Synthesize ``text`` with the Piper CLI into ``AUDIO_DIR/<uid><AUDIO_EXT>``.

    An existing output file larger than 1000 bytes is reused. Otherwise Piper
    writes to a temporary sibling file that is moved to the final path only
    after Piper exits successfully and the file passes the size check. A
    failed or interrupted run therefore never leaves a partial file at the
    final path, where the cache check could later mistake it for a finished
    clip.

    Args:
        text: Sentence to synthesize; sent to Piper on stdin.
        uid: Item id used as the output file stem.

    Returns:
        ``True`` if a usable audio file exists at the final path afterwards,
        ``False`` if Piper failed or produced a missing/too-small file.
    """
    out_wav = os.path.join(AUDIO_DIR, uid + AUDIO_EXT)

    if os.path.exists(out_wav) and os.path.getsize(out_wav) > 1000:
        return True

    # Keep the audio extension on the temporary name so Piper sees the same
    # kind of output path as before.
    tmp_wav = os.path.join(AUDIO_DIR, uid + ".part" + AUDIO_EXT)

    try:
        p = subprocess.run(
            [
                "piper",
                "--model", PIPER_MODEL,
                "--output_file", tmp_wav,
            ],
            input=text + "\n",
            text=True,
            capture_output=True,
        )

        if p.returncode != 0:
            print(f"[PIPER ERROR] {uid}")
            print(p.stderr)
            return False

        if not os.path.exists(tmp_wav) or os.path.getsize(tmp_wav) < 1000:
            return False

        # Publish the finished file in a single step.
        os.replace(tmp_wav, out_wav)
        return True
    finally:
        # After a successful move the temporary file is gone; on any failure
        # path this removes whatever partial output was left behind.
        if os.path.exists(tmp_wav):
            os.remove(tmp_wav)

# Synthesize audio for every selected item and count the successes.
ok = 0
for it in tqdm(items, desc="TTS"):
    if generate_tts(it["text"], it["id"]):
        ok += 1

print(f"TTS generated for {ok}/{len(items)} items")

# ---------------VERIFY WAVS---------------
# Inspect only files carrying the audio extension, so unrelated directory
# entries are not reported as broken audio. Each failure is printed with its
# reason. `except Exception` (rather than a bare `except`) lets
# KeyboardInterrupt stop the cell.
bad = []
for fn in sorted(os.listdir(AUDIO_DIR)):
    if not fn.endswith(AUDIO_EXT):
        continue
    try:
        info = sf.info(os.path.join(AUDIO_DIR, fn))
    except Exception as e:
        print(f"[BAD WAV] {fn}: {e}")
        bad.append(fn)
        continue
    if info.frames == 0:
        print(f"[BAD WAV] {fn}: zero frames")
        bad.append(fn)

print("Bad wav files:", len(bad))
assert len(bad) == 0, f"Some WAV files are invalid: {bad[:10]}"


#-------------------PHASE B — QWEN2-AUDIO-------------------
print("=== Phase B: Qwen2-Audio inference ===")

# Inference only: switch autograd off globally before loading anything.
torch.set_grad_enabled(False)
device = "cuda"

processor = AutoProcessor.from_pretrained(MODEL_ID)

# Load the weights in float16 with automatic device placement, then put the
# model in eval mode.
model = Qwen2AudioForConditionalGeneration.from_pretrained(
    MODEL_ID, torch_dtype=torch.float16, device_map="auto"
)
model = model.eval()

def generate_reason(text, uid):
    """Ask the model to explain whether ``text`` is a pun, given text and audio.

    Args:
        text: The sentence under analysis; it is placed in the chat prompt.
        uid: Item id; the audio is read from ``AUDIO_DIR/<uid><AUDIO_EXT>``.

    Returns:
        The decoded completion (prompt tokens removed, special tokens
        skipped), stripped of surrounding whitespace.
    """
    chat_prompt = processor.tokenizer.apply_chat_template(
        build_messages(text), tokenize=False, add_generation_prompt=True
    )

    wav_path = os.path.join(AUDIO_DIR, uid + AUDIO_EXT)
    waveform = load_audio(wav_path)

    model_inputs = processor(
        text=chat_prompt,
        audio=waveform,
        sampling_rate=16000,
        return_tensors="pt",
        padding=True,
    )
    model_inputs = model_inputs.to(device)

    # Sampling is disabled; the completion length is bounded on both sides.
    generation_kwargs = {
        "max_new_tokens": MAX_NEW_TOKENS,
        "min_new_tokens": 40,
        "do_sample": False,
        "pad_token_id": processor.tokenizer.eos_token_id,
    }
    sequences = model.generate(**model_inputs, **generation_kwargs)

    # generate() returns prompt + completion; drop the prompt-length prefix.
    prompt_len = model_inputs["input_ids"].shape[1]
    completion_ids = sequences[0][prompt_len:]

    decoded = processor.tokenizer.decode(
        completion_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    return decoded.strip()

# Make sure the directory that holds the result files exists.
os.makedirs(os.path.dirname(OUT_ALL), exist_ok=True)

# Ids of items skipped because their audio file is missing. They are reported
# after the loop so that gaps in the output are visible.
skipped = []

with open(OUT_ALL, "w", encoding="utf-8") as fa, \
     open(OUT_HET, "w", encoding="utf-8") as fh, \
     open(OUT_HOM, "w", encoding="utf-8") as fm:

    for it in tqdm(items, desc="Inference"):
        uid = it["id"]
        wav = os.path.join(AUDIO_DIR, uid + AUDIO_EXT)
        if not os.path.exists(wav):
            skipped.append(uid)
            continue

        reason = generate_reason(it["text"], uid)

        obj = {
            "id": uid,
            "Text": it["text"],
            "RawReason": reason,
            "Label": it["label"],
            "Type": it["type"],
        }

        line = json.dumps(obj, ensure_ascii=False) + "\n"

        # Every record goes to the combined file and to its per-type file
        # (anything that is not heterographic goes to the homographic file).
        per_type = fh if it["type"] == "heterographic" else fm
        fa.write(line)
        per_type.write(line)

        # Flush after each record so an interrupted run keeps the results
        # produced so far.
        fa.flush()
        per_type.flush()

if skipped:
    print(f"[WARN] Skipped {len(skipped)} item(s) without audio: {skipped[:10]}")

print("=== DONE: Text + Audio experiment complete ===")


=== Phase A: Generating TTS ===


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/101 [00:00<?, ?B/s]

heterographic-and-homographic-test.json: 0.00B [00:00, ?B/s]

Generating test split:   0%|          | 0/4030 [00:00<?, ? examples/s]

TTS: 100%|██████████| 500/500 [18:38<00:00,  2.24s/it]


TTS generated for 500/500 items
Bad wav files: 0
=== Phase B: Qwen2-Audio inference ===




preprocessor_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/853 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/876 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]


Inference:   0%|          | 0/500 [00:00<?, ?it/s][AThe following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.

Inference:   0%|          | 1/500 [00:16<2:19:42, 16.80s/it][A
Inference:   0%|          | 2/500 [00:22<1:24:34, 10.19s/it][A
Inference:   1%|          | 3/500 [00:25<57:54,  6.99s/it]  [A
Inference:   1%|          | 4/500 [00:30<50:03,  6.06s/it][A
Inference:   1%|          | 5/500 [00:34<44:37,  5.41s/it][A
Inference:   1%|          | 6/500 [00:38<41:40,  5.06s/it][A
Inference:   1%|▏         | 7/500 [00:44<43:57,  5.35s/it][A
Inference:   2%|▏         | 8/500 [00:47<38:19,  4.67s/it][A
Inference:   2%|▏         | 9/500 [00:51<36:12,  4.42s/it][A
Inference:   2%|▏         | 10/500 [00:55<33:24,  4.09s/it][A
Inference:   2%|▏         | 11/500 [00:58<30:23,  3.73s/it][A
Inference:   2%|▏         | 12/500 [01:01<29:03,  3.57s/it][A
Inference:   3%|▎         | 13/500 [01:0

=== DONE: Text + Audio experiment complete ===





In [None]:
from google.colab import files

# Pass the combined result file and both per-type files to files.download,
# one at a time.
for result_path in (
    "cache/phase2_text_audio_raw.jsonl",
    "cache/phase2_text_audio_raw.heterographic.jsonl",
    "cache/phase2_text_audio_raw.homographic.jsonl",
):
    files.download(result_path)

In [None]:
# Debugging aid: show which `piper` executable is found on PATH, then print its
# command-line help.
!which piper
!piper --help
