**Step 1: Import Libraries**

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random
import re
import pandas as pd
from torch.utils.data import Dataset, DataLoader


**Step 2: Load and Preprocess Dataset**

In [2]:
# Load the dataset
dataset_path = "/kaggle/input/poems/poems-100.csv"
df = pd.read_csv(dataset_path)

# Check column names
print("Column names in the dataset:", df.columns)

# Preprocess text (tokenization, lowercasing, etc.)
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9\s.,!?]", "", text)  # Keep basic punctuation
    return text.split()

# Ensure correct column name
correct_column_name = "text"  # Update if column name is different
df["tokens"] = df[correct_column_name].apply(preprocess_text)

# Display the first few rows
print(df.head())


Column names in the dataset: Index(['text'], dtype='object')
                                                text  \
0  O my Luve's like a red, red rose\nThat’s newly...   
1  The rose is red,\nThe violet's blue,\nSugar is...   
2  How do I love thee? Let me count the ways.\nI ...   
3  Had I the heavens' embroidered cloths,\nEnwrou...   
4  I.\n    Enough! we're tired, my heart and I.\n...   

                                              tokens  
0  [o, my, luves, like, a, red,, red, rose, thats...  
1  [the, rose, is, red,, the, violets, blue,, sug...  
2  [how, do, i, love, thee?, let, me, count, the,...  
3  [had, i, the, heavens, embroidered, cloths,, e...  
4  [i., enough!, were, tired,, my, heart, and, i....  


**Step 3: Create Vocabulary**

In [3]:
# Create vocabulary
all_words = [word for tokens in df["tokens"] for word in tokens]
unique_words = list(set(all_words))
word_vocab = {word: idx for idx, word in enumerate(unique_words)}
vocab_reverse = {idx: word for word, idx in word_vocab.items()}

# Vocabulary size
vocab_size = len(word_vocab)
print(f"Vocabulary size: {vocab_size}")


Vocabulary size: 6725


**Step 4: Convert Data to One-Hot Encoded Sequences**

In [4]:
sequence_length = 5  # Number of words used as input

# One-hot encoding function
def one_hot_encode(indices, vocab_size):
    batch_size = len(indices)
    seq_length = len(indices[0])
    one_hot = torch.zeros(batch_size, seq_length, vocab_size)  # Shape: (batch, seq_len, vocab_size)
    
    for i in range(batch_size):
        for j in range(seq_length):
            one_hot[i, j, indices[i][j]] = 1.0  # Set correct index to 1

    return one_hot

# Create sequences
def create_sequences(tokens, seq_length):
    input_seqs, target_seqs = [], []
    for i in range(len(tokens) - seq_length):
        input_seqs.append([word_vocab[t] for t in tokens[i:i+seq_length]])
        target_seqs.append(word_vocab[tokens[i+seq_length]])
    return input_seqs, target_seqs

# Prepare dataset
input_seqs, target_seqs = [], []
for tokens in df["tokens"]:
    inp, tgt = create_sequences(tokens, sequence_length)
    input_seqs.extend(inp)
    target_seqs.extend(tgt)

# Convert to tensors
input_seqs = one_hot_encode(input_seqs, vocab_size)  # One-hot encode inputs
target_seqs = torch.tensor(target_seqs, dtype=torch.long)


**Step 5: Define Dataset & DataLoader**

In [5]:
class PoemDataset(Dataset):
    def __init__(self, inputs, targets):
        self.inputs = inputs
        self.targets = targets

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return self.inputs[idx], self.targets[idx]

batch_size = 64
dataset = PoemDataset(input_seqs, target_seqs)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)


**Step 6: Define the RNN Model**

In [6]:
class PoemRNN(nn.Module):
    def __init__(self, vocab_size, hidden_size, num_layers):
        super(PoemRNN, self).__init__()
        self.lstm = nn.LSTM(vocab_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden):
        out, hidden = self.lstm(x, hidden)
        out = self.fc(out[:, -1, :])  
        return out, hidden


**Step 7: Initialize Model & Optimizer**

In [7]:
# Hyperparameters
hidden_size = 512  
num_layers = 2  

# Model
model = PoemRNN(vocab_size, hidden_size, num_layers).cuda()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


**Step 8: Train the Model**

In [8]:
num_epochs = 200  

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0

    for inputs, targets in dataloader:
        inputs, targets = inputs.cuda(), targets.cuda()

        hidden = (
            torch.zeros(num_layers, inputs.size(0), hidden_size).cuda(),
            torch.zeros(num_layers, inputs.size(0), hidden_size).cuda(),
        )

        optimizer.zero_grad()
        outputs, hidden = model(inputs, hidden)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    avg_loss = epoch_loss / len(dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

torch.save(model.state_dict(), "poem_rnn_onehot.pth")
print("Training complete. Model saved as 'poem_rnn_onehot.pth'.")


Epoch 1/200, Loss: 7.2379
Epoch 2/200, Loss: 6.6455
Epoch 3/200, Loss: 6.2180
Epoch 4/200, Loss: 5.6761
Epoch 5/200, Loss: 4.9289
Epoch 6/200, Loss: 3.9498
Epoch 7/200, Loss: 2.8212
Epoch 8/200, Loss: 1.7596
Epoch 9/200, Loss: 0.9969
Epoch 10/200, Loss: 0.5510
Epoch 11/200, Loss: 0.2950
Epoch 12/200, Loss: 0.1587
Epoch 13/200, Loss: 0.0897
Epoch 14/200, Loss: 0.0602
Epoch 15/200, Loss: 0.0468
Epoch 16/200, Loss: 0.0387
Epoch 17/200, Loss: 0.0417
Epoch 18/200, Loss: 0.0642
Epoch 19/200, Loss: 0.0986
Epoch 20/200, Loss: 0.0744
Epoch 21/200, Loss: 0.0474
Epoch 22/200, Loss: 0.0319
Epoch 23/200, Loss: 0.0230
Epoch 24/200, Loss: 0.0216
Epoch 25/200, Loss: 0.0233
Epoch 26/200, Loss: 0.0209
Epoch 27/200, Loss: 0.0407
Epoch 28/200, Loss: 0.1539
Epoch 29/200, Loss: 0.0400
Epoch 30/200, Loss: 0.0173
Epoch 31/200, Loss: 0.0141
Epoch 32/200, Loss: 0.0119
Epoch 33/200, Loss: 0.0118
Epoch 34/200, Loss: 0.0116
Epoch 35/200, Loss: 0.0119
Epoch 36/200, Loss: 0.0137
Epoch 37/200, Loss: 0.0967
Epoch 38/2

**Step 9: Generate Poem**

In [9]:
def generate_poem(start_words, length=100, temperature=0.8, top_k=5):
    model.eval()
    current_words = start_words
    result = [word.capitalize() for word in current_words]

    batch_size = 1
    hidden = (
        torch.zeros(num_layers, batch_size, hidden_size).cuda(),
        torch.zeros(num_layers, batch_size, hidden_size).cuda(),
    )

    for _ in range(length - len(start_words)):  
        input_idx = [[word_vocab.get(current_words[-1], 0)]]
        one_hot_input = one_hot_encode(input_idx, vocab_size).cuda()

        output, hidden = model(one_hot_input, hidden)

        output = output / temperature
        probabilities = F.softmax(output, dim=1).squeeze()
        top_k_values, top_k_indices = torch.topk(probabilities, top_k)

        best_word_idx = top_k_indices[random.randint(0, len(top_k_indices) - 1)]
        next_word = vocab_reverse.get(best_word_idx.item(), "<UNK>")

        if next_word in ["<PAD>", "<UNK>"]:
            break

        result.append(next_word)
        current_words.append(next_word)

        # Add line breaks
        if len(result) % 8 == 0:
            result.append("\n")  

    poem = " ".join(result)
    poem = re.sub(r"\s+([.,!?])", r"\1", poem)  
    poem = re.sub(r"\n\s*", "\n", poem)  
    return poem


**Step 10: Generate & Display Poem**

In [10]:
start_words = ["she", "married"]
poem = generate_poem(start_words, length=50, temperature=0.8, top_k=5)

print("Generated Poem:\n", poem)


Generated Poem:
 She Married myriads nothing, are, that and me 
moment moment the vain being eyes beauty, 
let hasten as think same. indescribable who 
what what have just good ten young 
tell listen, id lost lives have may 
lives but fast flaw, ruind let work 
me, me, we i so know i 

