In [1]:
# !pip install wget
!pip install torch -q
!pip install transformers -q
!pip install datasets -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import torch

# Choose the compute device once: CUDA when PyTorch can see a GPU, else the CPU.
if not torch.cuda.is_available():
    print("No GPU available, using the CPU instead.")
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    device_count = torch.cuda.device_count()
    # The name reported is that of GPU index 0.
    device_name = torch.cuda.get_device_name(0)

    print(f"There are {device_count} GPU(s) available.")
    print(f"We will use the GPU: {device_name}")

No GPU available, using the CPU instead.


In [6]:
import torch
from transformers import DistilBertTokenizer
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split


In [26]:
class PoemDataset(Dataset):
    """Paired (sentence, poem) dataset tokenized as a single two-segment sequence.

    Each item encodes ``sentences[idx]`` together with ``poems[idx]`` as a
    text pair, padded or truncated to exactly ``max_length`` tokens.

    Args:
        sentences: Indexable, sized collection of source sentences.
        poems: Indexable, sized collection of poems aligned with ``sentences``.
        tokenizer: Hugging Face-style tokenizer. It is called as
            ``tokenizer(text, text_pair=..., ...)`` and must return a mapping
            containing ``input_ids`` and ``attention_mask`` tensors.
        max_length: Fixed token length of every encoded example.

    Raises:
        ValueError: If ``sentences`` and ``poems`` have different lengths.
    """

    # Label value skipped by torch.nn.CrossEntropyLoss (its default
    # ``ignore_index``); used so padding never contributes to the loss.
    IGNORE_INDEX = -100

    def __init__(self, sentences, poems, tokenizer, max_length):
        if len(sentences) != len(poems):
            raise ValueError(
                f"sentences and poems must have the same length "
                f"(got {len(sentences)} and {len(poems)})"
            )
        self.sentences = sentences
        self.poems = poems
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of (sentence, poem) pairs."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Tokenize pair ``idx``.

        Returns:
            dict with 1-D tensors of length ``max_length``:
            ``input_ids``, ``attention_mask``, and ``labels``. ``labels`` is an
            independent copy of ``input_ids`` in which every padded position
            (``attention_mask == 0``) is replaced by ``IGNORE_INDEX``.
        """
        sentence = self.sentences[idx]
        poem = self.poems[idx]

        # Calling the tokenizer directly is the current API; `encode_plus`
        # is the legacy spelling of the same operation.
        encoding = self.tokenizer(
            sentence,
            text_pair=poem,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # return_tensors='pt' adds a leading batch dim of 1; drop it so the
        # DataLoader can stack examples into (batch, max_length).
        input_ids = encoding['input_ids'].flatten()
        attention_mask = encoding['attention_mask'].flatten()

        # Clone so labels never share storage with input_ids, then mask out
        # padding so the loss is computed on real tokens only.
        labels = input_ids.clone()
        labels[attention_mask == 0] = self.IGNORE_INDEX

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels,
        }

In [27]:
def prepare_poem_dataset(angry_sentences, funny_poems, model_name='distilbert-base-uncased', max_length=128, batch_size=8, test_size=0.2, random_state=42):
    """Split paired sentences/poems and wrap them in tokenizing DataLoaders.

    Args:
        angry_sentences: Sequence of source sentences.
        funny_poems: Sequence of poems; ``funny_poems[i]`` pairs with
            ``angry_sentences[i]``.
        model_name: Checkpoint name passed to
            ``DistilBertTokenizer.from_pretrained``.
        max_length: Token length every example is padded/truncated to.
        batch_size: Batch size used by both DataLoaders.
        test_size: Passed to ``train_test_split``; share of pairs held out
            for the test set. Defaults to the previously hard-coded 0.2.
        random_state: Passed to ``train_test_split`` so the split is
            reproducible. Defaults to the previously hard-coded 42.

    Returns:
        ``(train_dataloader, test_dataloader, tokenizer)``. Only the training
        loader shuffles.

    Raises:
        AssertionError: If the two input sequences differ in length.
    """
    # Ensure sentences and poems are paired correctly
    assert len(angry_sentences) == len(funny_poems), "Mismatch in number of sentences and poems"

    # Split both sequences with the same indices so pairs stay aligned
    train_sentences, test_sentences, train_poems, test_poems = train_test_split(
        angry_sentences, funny_poems, test_size=test_size, random_state=random_state
    )

    # Initialize the tokenizer (shared by both datasets and returned to the caller)
    tokenizer = DistilBertTokenizer.from_pretrained(model_name)

    # Create datasets
    train_dataset = PoemDataset(train_sentences, train_poems, tokenizer, max_length)
    test_dataset = PoemDataset(test_sentences, test_poems, tokenizer, max_length)

    # Create dataloaders; evaluation order is left deterministic (no shuffle)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

    return train_dataloader, test_dataloader, tokenizer


In [28]:
# Toy corpus: ten everyday complaints. Entry i is paired with funny_poems[i],
# so the two lists must stay the same length and in the same order.
angry_sentences = [
    "I can't believe they forgot my birthday!",
    "This traffic is driving me crazy!",
    "Why is the WiFi so slow today?",
    "I'm so tired of eating the same thing every day!",
    "My phone battery always dies when I need it most!",
    "Why do I always lose my keys right when I'm late?",
    "I hate it when people chew with their mouth open!",
    "How come the line is always longest when I'm in a hurry?",
    "Why does it always rain when I forget my umbrella?",
    "I can't stand it when people don't use their turn signals!"
]

# One four-line poem per sentence above, in the same order.
# Lines within a poem are separated by " / " inside a single string.
funny_poems = [
    "Forgotten day, oh what a blight! / But who needs cake at midnight? / Perhaps they plan a grand surprise / Or simply can't read calendar's guise.",
    "Cars crawl like snails on hot concrete / A turtle race can't be beat / In this jam, I'll grow a beard / Road rage? Nah, I'm just weird.",
    "Internet crawls, my patience thins / Loading bar becomes my frenemy / I could've trained a pigeon / To deliver emails more speedy.",
    "Monotonous meals, day after day / My taste buds threaten to run away / Perhaps I'll start a food rebellion / And eat my socks for this meal's hellion.",
    "Battery drains, oh cruel device! / Always fails at moments precise / I'll invent a phone powered by sighs / Or just yell my messages to the skies.",
    "Keys play hide and seek, what a game! / As I'm rushing out, they're to blame / I'll tie them to a giant balloon / So finding them won't spell my doom.",
    "Open-mouthed chewers, please beware / Your dinner sounds pollute the air / I'll invent a mute button for mouths / Or dine exclusively down south.",
    "Lines stretch long when time is tight / A cosmic joke, an endless plight / I'll master teleportation soon / Or just camp out since last June.",
    "Raindrops fall as umbrellas hide / Weather forecasts have surely lied / I'll grow a waterproof hairdo / Or just pretend I'm at the zoo.",
    "Turn signals forgotten, cars swerve / Testing each driver's last nerve / I'll invent telepathic cars / Or stick big arrows to their fars."
]

In [29]:
# Build the loaders with the default tokenizer, length and batch size.
train_dataloader, test_dataloader, tokenizer = prepare_poem_dataset(angry_sentences, funny_poems)

print(f"Number of training batches: {len(train_dataloader)}")
print(f"Number of test batches: {len(test_dataloader)}")

# Sanity check: report the tensor shapes of the first training batch only.
for batch in train_dataloader:
    for label, key in (
        ("Input", 'input_ids'),
        ("Attention mask", 'attention_mask'),
        ("Labels", 'labels'),
    ):
        print(f"{label} shape:", batch[key].shape)
    break

Number of training batches: 1
Number of test batches: 1
Input shape: torch.Size([8, 128])
Attention mask shape: torch.Size([8, 128])
Labels shape: torch.Size([8, 128])
