In [None]:
import os
import json
import torch
from tqdm import tqdm
from datasets import load_dataset
from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration

# ================= CONFIG =================

# Input dataset. main() picks the "csv" loader when this path ends with ".csv"
# and falls back to the "json" loader for any other suffix.
DATASET_PATH = "context_situated_pun.csv"   # local file
# Split name handed to load_dataset(..., split=...).
DATASET_SPLIT = "train"

# Destination JSONL file: one {"id", "Explanation"} object per line.
OUT_PATH = "cache/pun_explanations_qwen.jsonl"

# Hugging Face model identifier used for both the processor and the model weights.
MODEL_ID = "Qwen/Qwen2-Audio-7B-Instruct"
# Upper bound on generated tokens per explanation (passed as max_new_tokens).
MAX_NEW_TOKENS = 120
# Stop after this many explanations have been written, regardless of dataset size.
MAX_ITEMS = 500

# ================= PROMPT =================

def build_messages(text):
    """Build the two-turn chat (system + user) that asks for a pun analysis.

    Args:
        text: The text to analyse. It is interpolated verbatim on the line
            after ``Text:`` at the end of the user prompt.

    Returns:
        A list of two ``{"role": ..., "content": ...}`` dicts: a fixed system
        turn followed by the user turn carrying the instructions and ``text``.
    """
    # One entry per prompt line; the trailing empty entry yields the final
    # newline after the interpolated text once everything is joined.
    prompt_lines = (
        "Explain whether the following text contains a pun.",
        "",
        "Instructions:",
        "- Do NOT explain your analysis process.",
        "- Do NOT define what a pun is.",
        "- Focus ONLY on the linguistic mechanism.",
        "- If the text is a pun, clearly state:",
        "  • the word or phrase involved",
        "  • the two meanings or sound-based ambiguity",
        "- If it is not a pun, clearly state that no wordplay or ambiguity is present.",
        "",
        "Write a concise paragraph (3–6 sentences).",
        "",
        "Text:",
        f"{text}",
        "",
    )
    user_prompt = "\n".join(prompt_lines)

    system_turn = {"role": "system", "content": "You are an expert linguist."}
    user_turn = {"role": "user", "content": user_prompt}
    return [system_turn, user_turn]

# ================= MAIN =================

def main():
    """Generate pun explanations for the local dataset and write them as JSONL.

    Loads the processor and model named by ``MODEL_ID`` and the dataset at
    ``DATASET_PATH``. For every row whose ``user_pun`` value is usable, it
    generates an explanation and writes ``{"id": <row index>,
    "Explanation": <text>}`` as one line of ``OUT_PATH``. The output file is
    opened in write mode, so an existing file is overwritten. Processing
    stops once ``MAX_ITEMS`` explanations have been written.

    Raises:
        SystemExit: if no CUDA device is available, or if the loaded dataset
            has no ``user_pun`` column.
    """
    if not torch.cuda.is_available():
        raise SystemExit("NO GPU DETECTED")

    device = "cuda"
    # Inference only: autograd is switched off globally for the process.
    torch.set_grad_enabled(False)

    print(f"Loading model: {MODEL_ID}")
    processor = AutoProcessor.from_pretrained(MODEL_ID)
    model = Qwen2AudioForConditionalGeneration.from_pretrained(
        MODEL_ID,
        device_map="auto",
        torch_dtype=torch.float16,
    ).eval()

    print(f"Loading local dataset: {DATASET_PATH}")
    ds = load_dataset(
        "csv" if DATASET_PATH.endswith(".csv") else "json",
        data_files=DATASET_PATH,
        split=DATASET_SPLIT,
    )

    # Fail loudly on a missing column instead of silently skipping every row
    # and reporting "Generated 0 explanations".
    text_column = "user_pun"
    if text_column not in ds.column_names:
        raise SystemExit(
            f"Column '{text_column}' not found in {DATASET_PATH}; "
            f"available columns: {ds.column_names}"
        )

    # os.path.dirname() returns "" for a bare filename and os.makedirs("")
    # raises FileNotFoundError, so only create the directory when there is one.
    out_dir = os.path.dirname(OUT_PATH)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    def generate(text: str) -> str:
        """Return the model's explanation for ``text`` (prompt tokens stripped)."""
        messages = build_messages(text)

        # Render the chat turns into a single prompt string (no tokenization yet).
        prompt = processor.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )

        inputs = processor(
            text=prompt,
            return_tensors="pt",
            padding=True,
        ).to(device)

        with torch.no_grad():
            out = model.generate(
                **inputs,
                max_new_tokens=MAX_NEW_TOKENS,
                min_new_tokens=40,
                do_sample=False,  # greedy decoding
                pad_token_id=processor.tokenizer.eos_token_id,
            )

        # Keep only the tokens produced after the prompt.
        gen_tokens = out[0][inputs["input_ids"].shape[1]:]
        return processor.tokenizer.decode(
            gen_tokens,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        ).strip()

    count = 0
    # The bar tracks written explanations, not dataset rows, so it reaches
    # 100% when the MAX_ITEMS cap ends the run. Skipped rows do not advance it.
    target = min(MAX_ITEMS, len(ds))

    with open(OUT_PATH, "w", encoding="utf-8") as f:
        with tqdm(total=target, desc="Explaining texts") as progress:
            for idx, item in enumerate(ds):
                if count >= MAX_ITEMS:
                    break

                raw_text = item.get(text_column)

                if raw_text is None:
                    continue

                text = str(raw_text).strip()

                # Skip placeholders / empty-like entries
                if not text or text in {"{}"}:
                    continue

                explanation = generate(text)

                # "id" is the row's position in the dataset, not a running counter.
                out_obj = {
                    "id": idx,
                    "Explanation": explanation,
                }

                # Flush per record so partial results survive an interrupted run.
                f.write(json.dumps(out_obj, ensure_ascii=False) + "\n")
                f.flush()

                count += 1
                progress.update(1)
                torch.cuda.empty_cache()

    print("Done.")
    print(f"Generated {count} explanations")
    print(f"Output -> {OUT_PATH}")

# ================= RUN =================

# Entry-point guard: run the pipeline only when this code executes as the
# top-level module. The recorded cell output shows it also fires when the
# cell is run inside the notebook.
if __name__ == "__main__":
    main()


Loading model: Qwen/Qwen2-Audio-7B-Instruct


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/876 [00:00<?, ?it/s]

Loading local dataset: context_situated_pun.csv


Explaining texts:   0%|          | 0/4551 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Explaining texts:  19%|█▉        | 886/4551 [36:07<2:29:24,  2.45s/it]

Done.
Generated 500 explanations
Output → cache/pun_explanations_qwen.jsonl





In [None]:
from google.colab import files
import os, sys


In [None]:
# ---- Download output file ----
# Hand the generated JSONL to the browser via Colab's download helper.
# Relies on OUT_PATH and `files` from earlier cells in this session.
print("Downloading output file...")
files.download(OUT_PATH)

# ---- Shutdown runtime ----
# Send signal 9 to the current process; nothing after this line runs.
# NOTE(review): confirm the download has completed before the process is killed.
print("Shutting down runtime...")
current_pid = os.getpid()
os.kill(current_pid, 9)


Downloading output file...


<IPython.core.display.Javascript object>