In [3]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# In Colab, change the working directory below to the cloned repo; when running locally (e.g. VS Code), leave it commented out or point it at the repo root.
# os.chdir("/content/MWAHAHA_Competition")

# Model identifier passed to AutoModelForCausalLM.from_pretrained for the base weights.
BASE_MODEL_ID = "Qwen/Qwen2.5-3B-Instruct"
# Relative path used for both the tokenizer and the LoRA adapter (PeftModel.from_pretrained).
ADAPTER_PATH = "../models/qwen_lora_jokes" 

In [4]:
# System message placed in front of every user prompt by build_chat_prompt.
# The fragments below are joined by implicit string concatenation, so each one
# must end with ". " to keep the sentences separated.
# The apostrophe in "user's" is plain ASCII so the text survives encoding round-trips.
# NOTE(review): if the adapter was fine-tuned with a specific system prompt,
# keep this text in sync with it -- confirm against the training script.
SYSTEM_PROMPT = (
    "You are a multilingual stand-up comedian. "
    "You write short, original jokes in English. "
    "You ALWAYS obey the user's constraints exactly (word inclusion, topic, language). "
    "You prefer concise setups and strong punchlines."
)

# The tokenizer comes from the adapter folder so it uses the same special tokens.
tokenizer = AutoTokenizer.from_pretrained(ADAPTER_PATH, trust_remote_code=True)
if tokenizer.pad_token is None:
    # No dedicated padding token configured: reuse the end-of-sequence token.
    tokenizer.pad_token = tokenizer.eos_token

# Choose precision and placement for the base model: half precision with
# automatic device mapping when CUDA is available, full precision otherwise.
use_cuda = torch.cuda.is_available()
dtype = torch.float16 if use_cuda else torch.float32
device_map = "auto" if use_cuda else None
print("Using CUDA (float16)" if use_cuda else "Using CPU (float32)")

# Base model weights.
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    torch_dtype=dtype,
    device_map=device_map,
)

# Wrap the base model with the LoRA adapter and switch to evaluation mode.
model = PeftModel.from_pretrained(base_model, ADAPTER_PATH)
model.eval()

# (Optional) merge LoRA into base weights for slightly faster inference
# model = model.merge_and_unload()
# model.eval()


In [None]:
def build_chat_prompt(user_prompt: str) -> str:
    """Render a two-turn conversation (system + user) as a single prompt string.

    Args:
        user_prompt: Text placed in the user turn.

    Returns:
        Whatever ``tokenizer.apply_chat_template`` returns for ``tokenize=False``,
        i.e. the untokenized prompt text.
    """
    system_turn = {"role": "system", "content": SYSTEM_PROMPT}
    user_turn = {"role": "user", "content": user_prompt}
    # add_generation_prompt=True appends the assistant role so the model
    # continues with its own reply.
    chat_text = tokenizer.apply_chat_template(
        [system_turn, user_turn],
        tokenize=False,
        add_generation_prompt=True,
    )
    return chat_text

@torch.inference_mode()
def generate_joke(
    prompt: str,
    max_new_tokens: int = 96,
    temperature: float = 0.8,
    top_p: float = 0.95,
    do_sample: bool = True,
):
    """Generate the model's reply to a single user prompt.

    The prompt is wrapped with ``build_chat_prompt``, tokenized, moved to
    ``model.device`` and passed to ``model.generate``. Only the newly
    generated tokens (everything after the prompt) are decoded.

    Args:
        prompt: User request, e.g. a joke topic or constraint.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature; forwarded only when ``do_sample`` is True.
        top_p: Nucleus-sampling threshold; forwarded only when ``do_sample`` is True.
        do_sample: Sample when True; otherwise the sampling knobs are not forwarded.

    Returns:
        The decoded continuation with special tokens skipped and surrounding
        whitespace stripped.
    """
    text = build_chat_prompt(prompt)
    # The chat template has already rendered its special tokens into `text`,
    # so the tokenizer must not add another set on top.
    inputs = tokenizer(text, return_tensors="pt", add_special_tokens=False)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Prefer the tokenizer's padding token; fall back to EOS when none is set.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "do_sample": do_sample,
        "pad_token_id": pad_token_id,
    }
    if do_sample:
        # temperature / top_p only matter when sampling; transformers flags
        # them when they are passed alongside do_sample=False.
        generation_kwargs["temperature"] = temperature
        generation_kwargs["top_p"] = top_p

    output_ids = model.generate(**inputs, **generation_kwargs)

    # The output starts with the prompt tokens; keep only what follows them.
    prompt_length = inputs["input_ids"].shape[1]
    generated_ids = output_ids[0][prompt_length:]
    response = tokenizer.decode(generated_ids, skip_special_tokens=True)
    return response.strip()


In [7]:
# A few sample requests to eyeball the fine-tuned model's output.
test_prompts = [
    "How do you collect lots of stars?",
    "Tell me a short joke about programmers and coffee.",
    "Make a pun about databases in one sentence.",
]

divider = "=" * 80
for user_prompt in test_prompts:
    # Print the request first so it is visible while generation runs.
    print(divider)
    print("USER:", user_prompt)
    ans = generate_joke(user_prompt)
    print("MODEL:", ans)


In [8]:
# Quick non-sampling check with a very small token budget.
ans = generate_joke(
    "How do you collect lots of stars?",
    do_sample=False,
    max_new_tokens=8,  # tiny
)
print(ans)


In [13]:
import pandas as pd


# Read the tab-separated predictions file into a DataFrame and display it.
predictions_path = "../results/task-a-title_predictions.tsv"
sft_result = pd.read_csv(predictions_path, sep="\t")

sft_result

In [14]:
# Count missing values per column of the loaded predictions.
missing_per_column = sft_result.isna().sum()
print(missing_per_column)
