In [29]:
import json
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from tqdm import tqdm

# === Load model and tokenizer ===
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")

# Set padding token
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# === Helper function ===
def clean_text(text):
    return text.replace('\n', ' ').replace('\r', ' ').replace('"', "'").strip()

# === Load jokes ===
with open("reddit_jokes.json", "r", encoding="utf-8") as f:
    jokes_data = json.load(f)

# === Output storage ===
joke_pairs = []

# === Batch processing ===
BATCH_SIZE = 16
MAX_JOKES = 7000

for i in tqdm(range(0, len(jokes_data[:MAX_JOKES]), BATCH_SIZE)):
    batch = jokes_data[i:i + BATCH_SIZE]
    prompts = []
    joke_texts = []

    for joke in batch:
        title = joke.get("title", "").strip()
        body = joke.get("body", "").strip()
        full_joke = clean_text(f"{title} {body}")
        if len(full_joke) > 140 or not full_joke:
            continue
        joke_texts.append(full_joke)
        prompts.append(
            f"Convert the following joke into a neutral, non-humorous statement. "
            f"Remove any humor, exaggeration, or playful language. The output should be serious.\n\n"
            f"Joke: {full_joke}\nNon-humorous statement:"
        )

    if not prompts:
        continue

    try:
        inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).to(device)
        with torch.no_grad():
            outputs = model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_length=inputs["input_ids"].shape[1] + 30,
                temperature=0.7,
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id
            )
    except Exception as e:
        print(f"Batch failed: {str(e)}")
        continue

    for j, output in enumerate(outputs):
        decoded = tokenizer.decode(output, skip_special_tokens=True)
        try:
            # Try to extract the generated non-humorous statement
            response = decoded.split("Non-humorous statement:")[1].split("\n")[0].strip()
        except IndexError:
            response = ""
        joke_pairs.append({
            "joke": joke_texts[j],
            "unfunny": response
        })

# === Save to JSON ===
with open("joke_unfunny_pairs.json", "w", encoding="utf-8") as f_out:
    json.dump(joke_pairs, f_out, indent=2, ensure_ascii=False)

print(f"✅ Saved {len(joke_pairs)} joke-unfunny pairs to joke_unfunny_pairs.json")


  0%|                                                                                                              | 0/438 [00:00<?, ?it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  0%|▏                                                                                                     | 1/438 [00:06<45:21,  6.23s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  0%|▍                                                                                                     | 2/438 [00:12<44:15,  6.09s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  0%|▍                                                          

KeyboardInterrupt: 