In [1]:
# !pip install wget
!pip install torch -q
!pip install transformers -q
!pip install datasets -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [45]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [2]:
import torch

if torch.cuda.is_available():
  device = torch.device("cuda")
  device_count = torch.cuda.device_count()
  device_name = torch.cuda.get_device_name(0)

  print(f"There are {device_count} GPU(s) available.")
  print(f"We will use the GPU: {device_name}")


else:
  print("No GPU available, using the CPU instead.")
  device = torch.device("cpu")

No GPU available, using the CPU instead.


In [6]:
import torch
from transformers import DistilBertTokenizer
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split


In [46]:
class PoemDataset(Dataset):
    def __init__(self, sentences, poems, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.inputs = []

        for sentence, poem in zip(sentences, poems):
            self.inputs.append(f"{sentence} {poem}")

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        input_text = self.inputs[idx]
        encodings = self.tokenizer(input_text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        return {
            'input_ids': encodings['input_ids'].squeeze(),
            'attention_mask': encodings['attention_mask'].squeeze(),
            'labels': encodings['input_ids'].squeeze()
        }


In [47]:
def prepare_poem_dataset(angry_sentences, funny_poems, model_name='gpt2', max_length=128, batch_size=4):
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token  # GPT-2 doesn't have a padding token by default
    train_sentences, test_sentences, train_poems, test_poems = train_test_split(angry_sentences, funny_poems, test_size=0.2, random_state=42)

    train_dataset = PoemDataset(train_sentences, train_poems, tokenizer, max_length)
    test_dataset = PoemDataset(test_sentences, test_poems, tokenizer, max_length)

    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

    return train_dataloader, test_dataloader, tokenizer

In [28]:
angry_sentences = [
    "I can't believe they forgot my birthday!",
    "This traffic is driving me crazy!",
    "Why is the WiFi so slow today?",
    "I'm so tired of eating the same thing every day!",
    "My phone battery always dies when I need it most!",
    "Why do I always lose my keys right when I'm late?",
    "I hate it when people chew with their mouth open!",
    "How come the line is always longest when I'm in a hurry?",
    "Why does it always rain when I forget my umbrella?",
    "I can't stand it when people don't use their turn signals!"
]

funny_poems = [
    "Forgotten day, oh what a blight! / But who needs cake at midnight? / Perhaps they plan a grand surprise / Or simply can't read calendar's guise.",
    "Cars crawl like snails on hot concrete / A turtle race can't be beat / In this jam, I'll grow a beard / Road rage? Nah, I'm just weird.",
    "Internet crawls, my patience thins / Loading bar becomes my frenemy / I could've trained a pigeon / To deliver emails more speedy.",
    "Monotonous meals, day after day / My taste buds threaten to run away / Perhaps I'll start a food rebellion / And eat my socks for this meal's hellion.",
    "Battery drains, oh cruel device! / Always fails at moments precise / I'll invent a phone powered by sighs / Or just yell my messages to the skies.",
    "Keys play hide and seek, what a game! / As I'm rushing out, they're to blame / I'll tie them to a giant balloon / So finding them won't spell my doom.",
    "Open-mouthed chewers, please beware / Your dinner sounds pollute the air / I'll invent a mute button for mouths / Or dine exclusively down south.",
    "Lines stretch long when time is tight / A cosmic joke, an endless plight / I'll master teleportation soon / Or just camp out since last June.",
    "Raindrops fall as umbrellas hide / Weather forecasts have surely lied / I'll grow a waterproof hairdo / Or just pretend I'm at the zoo.",
    "Turn signals forgotten, cars swerve / Testing each driver's last nerve / I'll invent telepathic cars / Or stick big arrows to their fars."
]

In [29]:
train_dataloader, test_dataloader, tokenizer = prepare_poem_dataset(angry_sentences, funny_poems)

print(f"Number of training batches: {len(train_dataloader)}")
print(f"Number of test batches: {len(test_dataloader)}")

# Example of accessing a batch
for batch in train_dataloader:
    print("Input shape:", batch['input_ids'].shape)
    print("Attention mask shape:", batch['attention_mask'].shape)
    print("Labels shape:", batch['labels'].shape)
    break

Number of training batches: 1
Number of test batches: 1
Input shape: torch.Size([8, 128])
Attention mask shape: torch.Size([8, 128])
Labels shape: torch.Size([8, 128])


In [48]:
def train_model(train_dataloader, model, optimizer, scheduler, device, num_epochs=3):
    model.train()
    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")
        for batch in tqdm(train_dataloader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            scheduler.step()

In [35]:
def evaluate_model(test_dataloader, model, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in tqdm(test_dataloader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            total_loss += outputs.loss.item()

    avg_loss = total_loss / len(test_dataloader)
    print(f"Average test loss: {avg_loss}")

In [49]:
def generate_poem(sentence, model, tokenizer, device, max_length=128):
    model.eval()
    input_ids = tokenizer.encode(sentence, return_tensors='pt').to(device)

    with torch.no_grad():
        output = model.generate(input_ids, max_length=max_length, num_return_sequences=1, do_sample=True, top_k=50, top_p=0.95)

    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    poem = generated_text[len(sentence):].strip()  # Remove the input sentence from the output
    lines = poem.split('.')[:4]  # Get first 4 sentences
    return '\n'.join(line.strip() for line in lines if line.strip())


In [50]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.to(device)

# Set up optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * 3)


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [51]:
train_model(train_dataloader, model, optimizer, scheduler, device)

Epoch 1/3


100%|██████████| 1/1 [00:28<00:00, 28.62s/it]


Epoch 2/3


100%|██████████| 1/1 [00:19<00:00, 19.42s/it]


Epoch 3/3


100%|██████████| 1/1 [00:17<00:00, 17.51s/it]


In [52]:
new_angry_sentence = "I can't believe I missed my bus!"
generated_poem = generate_poem(new_angry_sentence, model, tokenizer, device)
print(f"Input: {new_angry_sentence}")
print(f"Generated poem:\n{generated_poem}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Input: I can't believe I missed my bus!
Generated poem:
[unused252] [unused92] [unused193] [unused193] є [unused313] [unused276] attacked credit [unused285] protect invitation carried [unused12] [unused193] [unused193] < [unused333] ∅ [unused279] ll [unused601] [unused279] influence [unused282] [unused257] looks [unused281] [unused257] contention [unused12] [unused193] [unused193] [unused38] [unused988] reserve [unused333] [unused461] [unused321] [unused12] [unused193] [unused193] [unused459] yanked [unused10] [unused503] [unused542] ⁷ mathematics [unused350] [unused479] [unused542] [unused279] ！ [unused423] carried [unused10] ♥ [unused279] [unused257] guardian ロ [unused10] 」 [unused506] hurling? [unused350] [unused479] reason [unused279] [unused257] keith [unused12] [unused193] [unused193] [unused38] [unused988] [unused193] [unused193] [unused38] [unused988] [unused193] [unused193] ″ [unused479] [unused461] [unused321] [unused285] [unused646] [unused498] [unused10] [unused257] ɛ [unus