IMPORT

In [33]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import Counter
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

LOADING DATASET

In [34]:
with open("/kaggle/input/datasets/akashdeep31/poems-100/poems-100.csv", "r", encoding="utf-8") as f:
    text = f.read().lower()

print(text[:500])


text
"o my luve's like a red, red rose
that’s newly sprung in june;
o my luve's like the melodie
that’s sweetly play'd in tune.

as fair art thou, my bonnie lass,
so deep in luve am i:
and i will luve thee still, my dear,
till a’ the seas gang dry:

till a’ the seas gang dry, my dear,
and the rocks melt wi’ the sun:
i will luve thee still, my dear,
while the sands o’ life shall run.

and fare thee well, my only luve
and fare thee well, a while!
and i will come again, my luve,
tho’ it were ten th


PREPROCESSING

In [35]:
tokens = text.split()
print("Total tokens:", len(tokens))

Total tokens: 24801


In [36]:
vocab = sorted(set(tokens))
vocab_size = len(vocab)

word_to_idx = {word:i for i,word in enumerate(vocab)}
idx_to_word = {i:word for word,i in word_to_idx.items()}

print("Vocabulary size:", vocab_size)


Vocabulary size: 7045


In [37]:
seq_length = 5
sequences = []

for i in range(len(tokens) - seq_length):
    seq = tokens[i:i+seq_length]
    target = tokens[i+seq_length]
    sequences.append((seq, target))

print("Total sequences:", len(sequences))


Total sequences: 24796


ONE HOT ENCODING

In [38]:
def one_hot_encode(word_idx, vocab_size):
    vec = torch.zeros(vocab_size)
    vec[word_idx] = 1
    return vec


In [39]:
X_onehot = []
y_onehot = []

for seq, target in sequences:
    seq_vec = [one_hot_encode(word_to_idx[w], vocab_size) for w in seq]
    X_onehot.append(torch.stack(seq_vec))
    y_onehot.append(word_to_idx[target])

X_onehot = torch.stack(X_onehot)
y_onehot = torch.tensor(y_onehot)

X_onehot = X_onehot.to(device)
y_onehot = y_onehot.to(device)


RNN Model

In [40]:
class RNN_OneHot(nn.Module):
    def __init__(self, vocab_size, hidden_size):
        super().__init__()
        self.rnn = nn.RNN(vocab_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        out, _ = self.rnn(x)
        out = self.fc(out[:, -1, :])
        return out


LSTM Model

In [41]:
class LSTM_OneHot(nn.Module):
    def __init__(self, vocab_size, hidden_size):
        super().__init__()
        self.lstm = nn.LSTM(vocab_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])
        return out


Training

In [42]:
def train_model(model, X, y, epochs=10):
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        optimizer.zero_grad()
        output = model(X)
        loss = criterion(output, y)
        loss.backward()
        optimizer.step()

        print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")


In [43]:
model_onehot_RNN = RNN_OneHot(vocab_size, 256)
train_model(model_onehot, X_onehot, y_onehot, epochs=40)


Epoch 1, Loss: 7.8987
Epoch 2, Loss: 7.7698
Epoch 3, Loss: 7.6390
Epoch 4, Loss: 7.5126
Epoch 5, Loss: 7.3973
Epoch 6, Loss: 7.2989
Epoch 7, Loss: 7.2201
Epoch 8, Loss: 7.1601
Epoch 9, Loss: 7.1149
Epoch 10, Loss: 7.0797
Epoch 11, Loss: 7.0501
Epoch 12, Loss: 7.0234
Epoch 13, Loss: 6.9983
Epoch 14, Loss: 6.9741
Epoch 15, Loss: 6.9509
Epoch 16, Loss: 6.9289
Epoch 17, Loss: 6.9083
Epoch 18, Loss: 6.8896
Epoch 19, Loss: 6.8734
Epoch 20, Loss: 6.8600
Epoch 21, Loss: 6.8498
Epoch 22, Loss: 6.8430
Epoch 23, Loss: 6.8395
Epoch 24, Loss: 6.8386
Epoch 25, Loss: 6.8396
Epoch 26, Loss: 6.8414
Epoch 27, Loss: 6.8432
Epoch 28, Loss: 6.8441
Epoch 29, Loss: 6.8438
Epoch 30, Loss: 6.8421
Epoch 31, Loss: 6.8393
Epoch 32, Loss: 6.8357
Epoch 33, Loss: 6.8318
Epoch 34, Loss: 6.8279
Epoch 35, Loss: 6.8242
Epoch 36, Loss: 6.8206
Epoch 37, Loss: 6.8172
Epoch 38, Loss: 6.8138
Epoch 39, Loss: 6.8102
Epoch 40, Loss: 6.8066


In [44]:
model_onehot_LSTM = LSTM_OneHot(vocab_size, 256)
train_model(model_onehot, X_onehot, y_onehot, epochs=40)


Epoch 1, Loss: 6.8030
Epoch 2, Loss: 6.8000
Epoch 3, Loss: 6.7912
Epoch 4, Loss: 6.7882
Epoch 5, Loss: 6.7852
Epoch 6, Loss: 6.7795
Epoch 7, Loss: 6.7750
Epoch 8, Loss: 6.7716
Epoch 9, Loss: 6.7666
Epoch 10, Loss: 6.7608
Epoch 11, Loss: 6.7564
Epoch 12, Loss: 6.7520
Epoch 13, Loss: 6.7468
Epoch 14, Loss: 6.7419
Epoch 15, Loss: 6.7379
Epoch 16, Loss: 6.7332
Epoch 17, Loss: 6.7280
Epoch 18, Loss: 6.7236
Epoch 19, Loss: 6.7191
Epoch 20, Loss: 6.7142
Epoch 21, Loss: 6.7099
Epoch 22, Loss: 6.7056
Epoch 23, Loss: 6.7009
Epoch 24, Loss: 6.6966
Epoch 25, Loss: 6.6924
Epoch 26, Loss: 6.6878
Epoch 27, Loss: 6.6837
Epoch 28, Loss: 6.6794
Epoch 29, Loss: 6.6750
Epoch 30, Loss: 6.6708
Epoch 31, Loss: 6.6664
Epoch 32, Loss: 6.6621
Epoch 33, Loss: 6.6579
Epoch 34, Loss: 6.6533
Epoch 35, Loss: 6.6490
Epoch 36, Loss: 6.6444
Epoch 37, Loss: 6.6399
Epoch 38, Loss: 6.6353
Epoch 39, Loss: 6.6305
Epoch 40, Loss: 6.6258


In [45]:
def generate_text_onehot(model, start_words, num_words=20):
    model.eval()
    words = start_words.copy()

    for _ in range(num_words):
        seq = words[-seq_length:]
        seq_vec = [one_hot_encode(word_to_idx[w], vocab_size) for w in seq]
        seq_tensor = torch.stack(seq_vec).unsqueeze(0).to(device)

        with torch.no_grad():
            output = model(seq_tensor)
            predicted = torch.argmax(output).item()

        words.append(idx_to_word[predicted])

    return " ".join(words)


In [46]:
print(generate_text_onehot(model_onehot_RNN, tokens[:5]))

text "o my luve's like salt frigate under frigate by stud zones, prove run. leaf frigate zones, rais'd invalids, horn. owning stud rolls, salt square


In [47]:
print(generate_text_onehot(model_onehot_LSTM, tokens[:5]))

text "o my luve's like people, uncurled gladdest i! nodding nodding prevent surly nodding ribs. nodding nodding ribs. nodding ribs. nodding nodding ribs. nodding ribs.


Trainable Word Embeddings Approach

RNN with Embedding

In [48]:
class RNN_Embedding(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.RNN(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        out, _ = self.rnn(x)
        out = self.fc(out[:, -1, :])
        return out


LSTM with Embedding

In [49]:
class LSTM_Embedding(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])
        return out


In [50]:
X_embed = []
y_embed = []

for seq, target in sequences:
    seq_idx = [word_to_idx[w] for w in seq]
    X_embed.append(seq_idx)
    y_embed.append(word_to_idx[target])

X_embed = torch.tensor(X_embed)
y_embed = torch.tensor(y_embed)


In [51]:
X_embed = X_embed.to(device)
y_embed = y_embed.to(device)


Training

In [52]:
model_embed_RNN = RNN_Embedding(vocab_size, 100, 256)
train_model(model_embed, X_embed, y_embed, epochs=40)


Epoch 1, Loss: 8.0428
Epoch 2, Loss: 7.9350
Epoch 3, Loss: 7.8227
Epoch 4, Loss: 7.7083
Epoch 5, Loss: 7.5947
Epoch 6, Loss: 7.4847
Epoch 7, Loss: 7.3813
Epoch 8, Loss: 7.2869
Epoch 9, Loss: 7.2035
Epoch 10, Loss: 7.1320
Epoch 11, Loss: 7.0722
Epoch 12, Loss: 7.0232
Epoch 13, Loss: 6.9837
Epoch 14, Loss: 6.9519
Epoch 15, Loss: 6.9263
Epoch 16, Loss: 6.9052
Epoch 17, Loss: 6.8874
Epoch 18, Loss: 6.8717
Epoch 19, Loss: 6.8574
Epoch 20, Loss: 6.8438
Epoch 21, Loss: 6.8305
Epoch 22, Loss: 6.8172
Epoch 23, Loss: 6.8039
Epoch 24, Loss: 6.7907
Epoch 25, Loss: 6.7781
Epoch 26, Loss: 6.7660
Epoch 27, Loss: 6.7546
Epoch 28, Loss: 6.7438
Epoch 29, Loss: 6.7336
Epoch 30, Loss: 6.7238
Epoch 31, Loss: 6.7142
Epoch 32, Loss: 6.7047
Epoch 33, Loss: 6.6952
Epoch 34, Loss: 6.6856
Epoch 35, Loss: 6.6758
Epoch 36, Loss: 6.6657
Epoch 37, Loss: 6.6555
Epoch 38, Loss: 6.6450
Epoch 39, Loss: 6.6344
Epoch 40, Loss: 6.6237


In [53]:
model_embed_LSTM = LSTM_Embedding(vocab_size, 100, 256)
train_model(model_embed, X_embed, y_embed, epochs=40)


Epoch 1, Loss: 6.6130
Epoch 2, Loss: 6.5938
Epoch 3, Loss: 6.5787
Epoch 4, Loss: 6.5644
Epoch 5, Loss: 6.5496
Epoch 6, Loss: 6.5343
Epoch 7, Loss: 6.5189
Epoch 8, Loss: 6.5036
Epoch 9, Loss: 6.4882
Epoch 10, Loss: 6.4725
Epoch 11, Loss: 6.4567
Epoch 12, Loss: 6.4407
Epoch 13, Loss: 6.4246
Epoch 14, Loss: 6.4083
Epoch 15, Loss: 6.3918
Epoch 16, Loss: 6.3753
Epoch 17, Loss: 6.3586
Epoch 18, Loss: 6.3418
Epoch 19, Loss: 6.3248
Epoch 20, Loss: 6.3077
Epoch 21, Loss: 6.2904
Epoch 22, Loss: 6.2729
Epoch 23, Loss: 6.2553
Epoch 24, Loss: 6.2374
Epoch 25, Loss: 6.2193
Epoch 26, Loss: 6.2010
Epoch 27, Loss: 6.1825
Epoch 28, Loss: 6.1637
Epoch 29, Loss: 6.1448
Epoch 30, Loss: 6.1256
Epoch 31, Loss: 6.1063
Epoch 32, Loss: 6.0867
Epoch 33, Loss: 6.0668
Epoch 34, Loss: 6.0467
Epoch 35, Loss: 6.0263
Epoch 36, Loss: 6.0057
Epoch 37, Loss: 5.9848
Epoch 38, Loss: 5.9637
Epoch 39, Loss: 5.9422
Epoch 40, Loss: 5.9206


In [54]:
def generate_text_embed(model, start_words, num_words=20):
    model.eval()
    words = start_words.copy()

    for _ in range(num_words):
        seq = words[-seq_length:]
        seq_idx = torch.tensor([[word_to_idx[w] for w in seq]]).to(device)

        with torch.no_grad():
            output = model(seq_idx)
            predicted = torch.argmax(output).item()

        words.append(idx_to_word[predicted])

    return " ".join(words)


In [55]:
print(generate_text_embed(model_embed_RNN, tokens[:5]))


text "o my luve's like pushing man. coon-seekers slaughter'd brutish low rainbow as exhibition-gallery responses wreathed thither,) undisguised surpasses rivulet, man, offset seeming only tall


In [56]:
print(generate_text_embed(model_embed_LSTM, tokens[:5]))


text "o my luve's like squads yet—she decorum, such sorry poems? fares clothes shines sun,—the gladness, sun,—the sun,—the summits rous'd trifle purer lean canvas drape
