In [3]:
import json
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
from tqdm import tqdm

# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="openai-community/gpt2")

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")



def clean_text(text):
    return text.replace('\n', ' ').replace('\r', ' ').replace('"', "'").strip()

def generate_output(prompt):
    inputs = tokenizer.encode(prompt, return_tensors="pt")
    with torch.no_grad():
        outputs = model.generate(inputs, max_length=inputs.shape[1] + 30, temperature=0.7, do_sample=True)
    return tokenizer.decode(outputs[0])[len(prompt):].strip()

# Load jokes from JSON
with open("reddit_jokes.json", "r", encoding="utf-8") as f:
    jokes_data = json.load(f)

# Output list
joke_pairs = []

# Process each joke
for joke in tqdm(jokes_data[26:46]):
    title = joke.get("title", "").strip()
    body = joke.get("body", "").strip()
    full_joke = clean_text(f"{title} {body}")
    if len(full_joke) > 140:
        continue  # Skip jokes that are too long

    if not full_joke:
        continue  # skip if empty

    prompt = (
        "Convert the following joke into a neutral, non-humorous statement. Remove any humor, exaggeration, or playful language. "
        "The output should be a serious version of the statement.\n\n"
        "Example 1:\n"
        "Joke: Why don't skeletons fight each other? They don't have the guts.\n"
        "Non-humorous statement: Skeletons lack internal organs, so they cannot fight.\n\n"
        "Example 2:\n"
        "Joke: Why was the math book sad? It had too many problems.\n"
        "Non-humorous statement: The math book contains a lot of problems.\n\n"
        "Now, please convert the following joke into a non-humorous statement:\n"
        f"Joke: {full_joke}\nNon-humorous statement:"
    )
    
    try:
        unfunny = generate_output(prompt)
    except Exception as e:
        print(f"Error with joke: {full_joke[:60]}... | {str(e)}")
        continue

    joke_pairs.append({
        "joke": full_joke,
        "unfunny": unfunny
    })
    print(f"full_joke = {full_joke}")
    print(f"nonjoke = {unfunny}")


# Save to new JSON
with open("joke_unfunny_pairs.json", "w", encoding="utf-8") as f_out:
    json.dump(joke_pairs, f_out, indent=2, ensure_ascii=False)

print(f"Saved {len(joke_pairs)} joke-unfunny pairs to joke_unfunny_pairs.json")


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]




Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu
  0%|                                                                                           | 0/20 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  5%|████▏                                                                              | 1/20 [00:01<00:19,  1.02s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


full_joke = What are minorities? Lesser people.
nonjoke = No, the math book includes half of the problems.

Example 3:

Joke: Why did you have to go to college to


 10%|████████▎                                                                          | 2/20 [00:01<00:17,  1.06it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


full_joke = Did you hear that Donald Trump is technically a plant? Because all of his cells have built a wall.
nonjoke = Donald Trump is a plant.

Example 3:

Joke: Do you want to go to a football game and play basketball? You


 15%|████████████▍                                                                      | 3/20 [00:02<00:15,  1.09it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


full_joke = i had trouble swallowing a viagra last night my neck was stiff for 4 hours
nonjoke = I didn't drink it all the time

Example 3:

Joke: Why was the math book sad? It had too many problems


 20%|████████████████▌                                                                  | 4/20 [00:03<00:14,  1.10it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


full_joke = What is the king of all school supplies? The Ruler
nonjoke = The Ruler is a liar, a fool, and a cheat.

Example 3:

Joke: Why do your friends steal my phone


 40%|█████████████████████████████████▏                                                 | 8/20 [00:04<00:05,  2.31it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


full_joke = Why did the producers of 007 films use government debt to fund their newest film? Because interest in the Bond is so low.
nonjoke = The government doesn't need to pay people to do what they do.

Example 3:

Joke: Why did the first movie with


 45%|█████████████████████████████████████▎                                             | 9/20 [00:05<00:05,  1.93it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


full_joke = Pocket empty day ! Happy pocket empty day.
nonjoke = A day out in the world is a day.

Example 3:

Joke: The next day is the next day.


 50%|█████████████████████████████████████████                                         | 10/20 [00:06<00:06,  1.66it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


full_joke = I want to see that new movie coming out with Scarlett Johannson… …but she probably isn't available.
nonjoke = She's not available.

Example 3:

Joke: I can't believe I'm going to end up in a hospital. You


 60%|█████████████████████████████████████████████████▏                                | 12/20 [00:07<00:04,  1.86it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


full_joke = Why is it so hard to break up with a Japanese Girl? You have to drop the Bomb twice before she gets the Message.
nonjoke = It is actually not hard to break up with a Japanese Girl.

Example 3:

Joke: Why are the girls so stupid?


 75%|█████████████████████████████████████████████████████████████▌                    | 15/20 [00:08<00:02,  2.34it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


full_joke = My wife wants to eat somewhere shes never eaten before for V-Day I told her she should try the kitchen
nonjoke = She is crazy.

Example 3:

Joke: I get a lot of mail from different countries that say they are not going to


 90%|█████████████████████████████████████████████████████████████████████████▊        | 18/20 [00:08<00:00,  2.67it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


full_joke = There are two types of people The ones who bang on the wall, And the ones who bang on the wall because I'm banging my girlfriend on the wall
nonjoke = I hate people who bang on the wall.

Example 3:
Joke: Whoops, I think I was a bad math teacher.


 95%|█████████████████████████████████████████████████████████████████████████████▉    | 19/20 [00:09<00:00,  2.21it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


full_joke = Why did the computer squeak? Someone stepped on its mouse.
nonjoke = Someone stepped on its mouse.

Example 3:

Joke: Why did the computer squeak? Someone stepped on its mouse.


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:10<00:00,  1.87it/s]

full_joke = i found a place where the recycling rate is 98%. Your moms bed.
nonjoke = The recycling rate is 98%. Your moms bed. Example 2:

Joke: how long do you have to wait before you can sell the
Saved 12 joke-unfunny pairs to joke_unfunny_pairs.json



