In [31]:
import torch
import torch.nn.functional as F
from torch import nn
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model
from datasets import Dataset
import pandas as pd

In [32]:
filepath = '../datasets/rJokes/preprocessed.csv'

df = pd.read_csv(filepath)
df.head(5)
cleaned_df = df.dropna()
cleaned_df.head(5)
data = pd.DataFrame()
data['caption'] = cleaned_df['body'].copy()
data['joke'] = cleaned_df['joke'].copy()

In [33]:
data = data[:100]

In [29]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

df = pd.DataFrame(data)
dataset = Dataset.from_pandas(df)

def tokenize_function(examples):
    max_length = 100
    inputs = tokenizer(
        examples["caption"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )
    targets = tokenizer(
        examples["joke"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )
    return {
        "input_ids": inputs["input_ids"].squeeze(),
        "attention_mask": inputs["attention_mask"].squeeze(),
        "labels": targets["input_ids"].squeeze()
    }

tokenized_dataset = dataset.map(tokenize_function, batched=True)

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["c_attn", "c_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        shift_logits = logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()

        min_seq_length = min(shift_logits.size(1), shift_labels.size(1))
        shift_logits = shift_logits[:, :min_seq_length, :].contiguous()
        shift_labels = shift_labels[:, :min_seq_length].contiguous()

        vocab_size = shift_logits.size(-1)
        batch_size, seq_length = shift_labels.size()
        shift_labels_one_hot = F.one_hot(shift_labels, num_classes=vocab_size).float()

        shift_logits = shift_logits.view(-1, vocab_size)
        shift_labels_one_hot = shift_labels_one_hot.view(-1, vocab_size)


        loss_fct = nn.MSELoss()
        loss = loss_fct(shift_logits, shift_labels_one_hot)

        return (loss, outputs) if return_outputs else loss

training_args = TrainingArguments(
    output_dir="./joke_generator",
    num_train_epochs=10,
    per_device_train_batch_size=4,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    learning_rate=2e-5,
    weight_decay=0.01,
)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

trainer.train()

model.save_pretrained("./joke_generator_model")
tokenizer.save_pretrained("./joke_generator_model")

def generate_joke(caption):
    model.eval()
    inputs = tokenizer(caption, return_tensors="pt", padding=True, truncation=True, max_length=100).to(device)
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=400,
        num_beams=5,
        no_repeat_ngram_size=2,
        early_stopping=True
    )
    joke = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return joke

caption = "A cat in a hat"
joke = generate_joke(caption)
print(f"Caption: {caption}")
print(f"Joke: {joke}")

Map: 100%|██████████| 100/100 [00:00<00:00, 2164.52 examples/s]


Step,Training Loss


KeyboardInterrupt: 

In [None]:
import torch
import torch.nn as nn
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, 
    Trainer, TrainingArguments, 
    DataCollatorForLanguageModeling,
    GenerationConfig
)
from peft import LoraConfig, get_peft_model
from datasets import Dataset
import pandas as pd
from typing import Dict, List


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

special_tokens = {"additional_special_tokens": ["[CAPTION]", "[JOKE]"]}
tokenizer.add_special_tokens(special_tokens)
model.resize_token_embeddings(len(tokenizer))

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def format_prompt(caption: str, joke: str = None) -> str:
    """Format the input for causal language modeling"""
    prompt = f"[CAPTION] {caption} [JOKE]"
    if joke:
        prompt += f" {joke}{tokenizer.eos_token}"
    return prompt

def tokenize_function(examples: Dict) -> Dict:
    """Improved tokenization with proper formatting"""
    
    formatted_texts = [format_prompt(caption, joke) 
                      for caption, joke in zip(examples["caption"], examples["joke"])]
    
    tokenized = tokenizer(
        formatted_texts,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )
    
    tokenized["labels"] = tokenized["input_ids"].clone()
    
    return tokenized

df = pd.DataFrame(data)
dataset = Dataset.from_pandas(df)
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)


data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    pad_to_multiple_of=8
)


lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["c_attn", "c_proj", "c_fc"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    modules_to_save=["lm_head"]
)

model = get_peft_model(model, lora_config)

print("Trainable parameters:")
model.print_trainable_parameters()


class ImprovedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Use standard cross-entropy loss for language modeling"""
        outputs = model(**inputs)
        loss = outputs.loss
        
        return (loss, outputs) if return_outputs else loss


training_args = TrainingArguments(
    output_dir="./joke_generator",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    warmup_steps=100,
    logging_steps=50,
    save_steps=500,
    learning_rate=1e-4,
    weight_decay=0.01,
    adam_epsilon=1e-8,
    max_grad_norm=1.0,
    prediction_loss_only=True,
    remove_unused_columns=False,
    load_best_model_at_end=False,
    fp16=torch.cuda.is_available(),
    dataloader_pin_memory=False,
)


trainer = ImprovedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

print("Starting training...")
trainer.train()

trainer.save_model("./joke_generator_model")
tokenizer.save_pretrained("./joke_generator_model")


def generate_joke(caption: str, max_joke_length: int = 100) -> str:
    """Improved generation with better parameters"""
    model.eval()
    
    prompt = format_prompt(caption)
    
    inputs = tokenizer(prompt, return_tensors="pt", max_length=256, truncation=True).to(device)
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=len(inputs["input_ids"][0]) + max_joke_length,
            num_beams=5,
            early_stopping=True,
            no_repeat_ngram_size=2,
            temperature=0.7,
            do_sample=True,
            top_k=50,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)
    
    if "[JOKE]" in generated_text:
        joke_start = generated_text.find("[JOKE]") + 6
        joke = generated_text[joke_start:].split(tokenizer.eos_token)[0].strip()
    else:
        joke = generated_text[len(prompt):].split(tokenizer.eos_token)[0].strip()
    
    return joke


def test_joke_generation():
    """Test the model with various captions"""
    test_captions = [
        "A cat in a hat",
        "A programmer at a coffee shop", 
        "A banana wearing sunglasses",
        "Two robots in love"
    ]
    
    print("Testing Joke Generation:\n")
    for caption in test_captions:
        try:
            joke = generate_joke(caption)
            print(f"Caption: {caption}")
            print(f"Generated Joke: {joke}\n")
            print("-" * 50)
        except Exception as e:
            print(f"Error generating joke for '{caption}': {e}")

test_joke_generation()

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 543407fe-08f5-4674-989f-9527c8fc5f7e)')' thrown while requesting HEAD https://huggingface.co/gpt2/resolve/main/tokenizer_config.json
Retrying in 1s [Retry 1/5].
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
Map: 100%|██████████| 100/100 [00:00<00:00, 2514.19 examples/s]
  trainer = ImprovedTrainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.


Trainable parameters:
trainable params: 40,958,208 || all params: 165,399,552 || trainable%: 24.7632
Starting training...


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
50,6.51




Testing Joke Generation:

Caption: A cat in a hat
Generated Joke: 𐌈����𠌀𓌐𡌂𙌑𒌓𔌒𑌔𚌙𛌛𝌝𜌞𞌜🌠𤌟𥌡𣌤𦌥𪌨𩌬𨌫𬌪𭌭�

--------------------------------------------------
Caption: A programmer at a coffee shop
Generated Joke: 𐌀𒌂𙌐𤌑𠌓𡌗𓌘𣌙�𦌛𛌔𝌝�𗐼𜌡�𔐞𑌠𕌥𘌭𭐐青𚌟🌰𢌣�

--------------------------------------------------
Caption: A banana wearing sunglasses
Generated Joke: 𐌀𒌂𑌃𓌁𔐐��𠑓𡌫𙌬�𤐭𣌨�𛐹�𦐺��𜌱��𿐴𱱗𭐼𴐾�

--------------------------------------------------
Caption: Two robots in love
Generated Joke: One of them is a woman 𐐌�𒌷𙌱𠌐𡌬𘌭𦌲𓌴�𔌘𖌨𑌮𛌳𜌼𞌽🌾𣌹𤌿𥌺𨌸�𪌰𩍦�

--------------------------------------------------


In [41]:
caption = data['caption'].values[3]
joke = generate_joke(caption)
print(f"Caption: {caption}")
print("-" * 50)
for i in range(0, len(joke), 100):
    print(joke[i:i+100])

Caption: Two gay guys are lying on their bed.  The first guy asks the second:

"Do you hate me?"

Second Guy replies , "No..."

"Do you find me annoying?" - "No"

"Do you think I'm ugly?" - "No"

"Is there some one else?" - "No"

"Do you want me to leave? " 

"Don't you want to have sex with me?"
Second guy says "I do! But why are you asking me these stupid questions?"

First Guy: "Then why are you facing towards me!?"

--------------------------------------------------
Second Guy : "Because you're a gay guy!"  Kenyan guy replies "You're not gay! You're just a guy who 
likes to lie on his bed!   " Do you like me?  I hate you! No, I don't! I find you annoying!" Second 
guy : ( "Why do you ask me this stupid question?" ) Kenyan dude replies :  Nigerian guy:  He says he
 likes me because he's gay, but he doesn't like to talk about
