Importing libraries

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import re
import time

In [2]:
# Load the poem corpus into a DataFrame.
# NOTE(review): the absolute path looks like a hosted-notebook working
# directory -- adjust it when running elsewhere.
csv_path = '/content/poems-100.csv'
df = pd.read_csv(csv_path)

In [3]:
# Preview the first rows of the loaded frame (poems live in the "text" column).
df.head()

Unnamed: 0,text
0,"O my Luve's like a red, red rose\nThat’s newly..."
1,"The rose is red,\nThe violet's blue,\nSugar is..."
2,How do I love thee? Let me count the ways.\nI ...
3,"Had I the heavens' embroidered cloths,\nEnwrou..."
4,"I.\n Enough! we're tired, my heart and I.\n..."


In [4]:
# (rows, columns) of the loaded frame.
df.shape

(100, 1)

Preprocessing

In [5]:
# Anything that is not a lowercase ASCII letter or whitespace is removed
# (digits, punctuation, apostrophes, non-ASCII characters), so a word such as
# "luve's" becomes "luves".
non_letter = re.compile(r'[^a-z\s]')

# Flat list of every word in the corpus, in reading order.
tokens = []
for poem in df["text"].fillna("").map(str):
    tokens.extend(non_letter.sub("", poem.lower()).split())

# Alphabetically sorted vocabulary, so word ids are stable across runs.
unique_terms = sorted(set(tokens))
size_of_vocab = len(unique_terms)

# Two-way lookup between words and their integer ids.
term_to_id = {term: idx for idx, term in enumerate(unique_terms)}
id_to_term = dict(enumerate(unique_terms))

In [6]:
# Number of distinct words found after cleaning.
print("Vocabulary size:", size_of_vocab)

Vocabulary size: 5439


In [7]:
# The vocabulary is sorted, so these are the alphabetically first words.
print("Sample words:", unique_terms[:10])

Sample words: ['a', 'abase', 'abased', 'abbeystones', 'abeyance', 'abide', 'abode', 'abodes', 'about', 'above']


In [8]:
# The whole corpus as a flat list of word ids, in the same order as `tokens`.
encoded_words = list(map(term_to_id.__getitem__, tokens))

In [9]:
# Sanity check: ids of the first 20 corpus words.
print("First 20 encoded words:")
print(encoded_words[:20])

First 20 encoded words:
[3167, 3054, 2775, 2664, 0, 3748, 3748, 3897, 4726, 3109, 4407, 2351, 2473, 3167, 3054, 2775, 2664, 4727, 2896, 4726]


In [10]:
# Context window length: SEQ_LEN consecutive words are used to predict the
# word that follows them.
SEQ_LEN = 5

# Containers for the input windows (X) and their next-word targets (y).
X, y = [], []

In [11]:
# Slide a SEQ_LEN-word window over the corpus: every window is one input and
# the word immediately after it is the target.
#
# X and y are rebuilt from scratch here rather than appended to lists created
# in another cell, which keeps this cell idempotent: running it again neither
# duplicates samples nor fails once X and y have been turned into tensors.
n_windows = max(len(encoded_words) - SEQ_LEN, 0)
X = [encoded_words[start:start + SEQ_LEN] for start in range(n_windows)]
y = [encoded_words[start + SEQ_LEN] for start in range(n_windows)]

In [12]:
# Turn the Python lists of word ids into tensors for PyTorch.
X, y = torch.tensor(X), torch.tensor(y)

In [13]:
# X holds one SEQ_LEN-long window per row; y holds one target id per window.
print("Input shape:", X.shape)
print("Target shape:", y.shape)

Input shape: torch.Size([24671, 5])
Target shape: torch.Size([24671])


RNN From Scratch

In [14]:
class SimpleRNN_Numpy:
    """Forward-only vanilla RNN written with plain NumPy.

    Per time step:
        h_t = tanh(Wxh @ x_t + Whh @ h_{t-1} + bh)
        y_t = Why @ h_t + by

    Only the forward pass is implemented; there is no training code.
    """

    def __init__(self, vocab_size, hidden_size):
        """Create the parameters.

        Args:
            vocab_size: Length of each input vector and of each output vector.
            hidden_size: Number of hidden units.

        Weights are standard-normal draws scaled by 0.01; biases start at zero.
        """
        init_scale = 0.01

        # Draw order (Wxh, Whh, Why) determines the values under a fixed seed.
        self.Wxh = init_scale * np.random.randn(hidden_size, vocab_size)
        self.Whh = init_scale * np.random.randn(hidden_size, hidden_size)
        self.Why = init_scale * np.random.randn(vocab_size, hidden_size)

        self.bh = np.zeros((hidden_size, 1))
        self.by = np.zeros((vocab_size, 1))

    def forward(self, inputs):
        """Run the RNN over a sequence, starting from a zero hidden state.

        Args:
            inputs: Indexable sequence of 1-D vectors of length ``vocab_size``
                (for example one-hot rows), one per time step.

        Returns:
            A list with one ``(vocab_size, 1)`` output column per time step.
        """
        h = np.zeros((self.bh.shape[0], 1))
        outputs = []
        for t in range(len(inputs)):
            # Turn the step's vector into a column so the products line up.
            x_t = np.expand_dims(inputs[t], axis=1)
            h = np.tanh(self.Wxh.dot(x_t) + self.Whh.dot(h) + self.bh)
            outputs.append(self.Why.dot(h) + self.by)
        return outputs

In [15]:
# Instantiate the NumPy RNN as a smoke test; none of the cells shown here
# train it or call its forward pass.
rnn_np = SimpleRNN_Numpy(vocab_size=size_of_vocab, hidden_size=32)
print("NumPy RNN initialized")

NumPy RNN initialized


One-Hot Encoding Approach

In [16]:
# One-hot encode every word id in X: X_onehot[n, t, v] is 1 exactly when
# X[n, t] == v.
#
# A single in-place scatter along the vocabulary axis replaces the Python
# double loop over every (sample, position) pair, and it writes straight into
# the float tensor so no second vocabulary-sized buffer is allocated.
X_onehot = torch.zeros(X.size(0), SEQ_LEN, size_of_vocab)
X_onehot.scatter_(2, X.unsqueeze(-1), 1.0)

print("One-hot input shape:", X_onehot.shape)

One-hot input shape: torch.Size([24671, 5, 5439])


In [17]:
class OneHotRNN(nn.Module):
    """Next-word predictor: an ``nn.RNN`` over one-hot word vectors plus a linear head."""

    def __init__(self, vocab_size, hidden_size):
        """Build the layers.

        Args:
            vocab_size: Width of each one-hot input vector and number of
                output logits.
            hidden_size: Number of hidden units in the RNN.
        """
        super().__init__()
        self.rnn = nn.RNN(vocab_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Return next-word logits for a batch of one-hot sequences.

        Args:
            x: Tensor of shape ``(batch, seq_len, vocab_size)``.

        Returns:
            Tensor of shape ``(batch, vocab_size)`` computed from the RNN
            output at the last time step only.
        """
        step_outputs, _ = self.rnn(x)
        return self.fc(step_outputs[:, -1])

In [18]:
# One-hot model with 128 hidden units.
onehot_model = OneHotRNN(vocab_size=size_of_vocab, hidden_size=128)

# Cross-entropy over the vocabulary logits; the embedding training cell
# further down uses this same `criterion` object.
criterion = nn.CrossEntropyLoss()

# NOTE: the name `optimizer` is rebound when the embedding model is set up
# later in the notebook. Re-run this cell before re-running the one-hot
# training loop, otherwise the loop would step the other model's parameters.
optimizer = torch.optim.Adam(onehot_model.parameters(), lr=3e-3)

print(onehot_model)

OneHotRNN(
  (rnn): RNN(5439, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=5439, bias=True)
)


In [19]:
# Ten full-batch gradient steps on the one-hot inputs: every step runs the
# whole of X_onehot through the model. Wall-clock time is recorded for the
# comparison with the embedding model.
n_epochs = 10
start_time = time.time()

for epoch_idx in range(1, n_epochs + 1):
    optimizer.zero_grad()
    predictions = onehot_model(X_onehot)
    loss = criterion(predictions, y)
    loss.backward()
    optimizer.step()

onehot_time = time.time() - start_time

In [20]:
# `loss` is the training loss of the last step of the one-hot loop.
print("Final One-Hot Loss:", loss.item())
print("One-Hot Training Time:", onehot_time)

Final One-Hot Loss: 6.762184143066406
One-Hot Training Time: 111.4195032119751


In [21]:
def generate_onehot(start_word, length=15, temperature=1.0):
    """Sample a word sequence from ``onehot_model``, beginning with ``start_word``.

    Each step feeds the most recent words (at most ``SEQ_LEN``) to the model
    and samples the next word from the temperature-scaled softmax.

    Args:
        start_word: First word of the result. A word missing from
            ``term_to_id`` is fed to the model as an all-zero vector.
        length: Number of words to generate after ``start_word``.
        temperature: Positive divisor applied to the logits before the
            softmax; values below 1 sharpen the distribution, values above 1
            flatten it.

    Returns:
        ``start_word`` followed by the generated words, joined by spaces.

    Raises:
        ValueError: If ``temperature`` is not strictly positive.

    Side effects:
        Puts ``onehot_model`` into eval mode and leaves it there.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be > 0, got {temperature}")

    words = [start_word]
    onehot_model.eval()

    with torch.no_grad():
        for _ in range(length):
            recent = words[-SEQ_LEN:]

            # Size the input to the context that actually exists. With a fixed
            # SEQ_LEN window filled from position 0, short contexts were
            # followed by trailing all-zero steps, so the last-step prediction
            # came after the RNN had consumed blanks rather than right after
            # the newest word.
            inp = torch.zeros((1, len(recent), size_of_vocab))
            for pos, token in enumerate(recent):
                token_id = term_to_id.get(token)
                if token_id is not None:
                    inp[0, pos, token_id] = 1

            logits = onehot_model(inp)
            probs = torch.softmax(logits / temperature, dim=1)
            chosen_index = torch.multinomial(probs, 1).item()
            words.append(id_to_term[chosen_index])

    return " ".join(words)

# Sampling is random, so the generated text differs from run to run.
print("One-hot Sample:", generate_onehot("love", temperature=0.8))

One-hot Sample: love what you a words up the be the time o how like little the or


Word Embedding Approach

In [22]:
class EmbeddingRNN(nn.Module):
    """Next-word predictor: learned word embeddings -> ``nn.RNN`` -> linear head."""

    def __init__(self, vocab_size, embed_size, hidden_size):
        """Build the layers.

        Args:
            vocab_size: Number of rows in the embedding table and number of
                output logits.
            embed_size: Dimension of each word embedding.
            hidden_size: Number of hidden units in the RNN.
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.RNN(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Return next-word logits for a batch of word-id sequences.

        Args:
            x: Integer tensor of word ids with shape ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(batch, vocab_size)`` computed from the RNN
            output at the last time step only.
        """
        step_outputs, _ = self.rnn(self.embedding(x))
        return self.fc(step_outputs[:, -1])


# Embedding model: 100-dimensional word vectors, 128 hidden units.
embed_model = EmbeddingRNN(vocab_size=size_of_vocab, embed_size=100, hidden_size=128)

# NOTE: this rebinds `optimizer`, the same name the one-hot section uses for
# its own Adam instance; from here on `optimizer` updates embed_model only.
optimizer = torch.optim.Adam(embed_model.parameters(), lr=3e-3)

In [23]:
# Show the layer layout of the embedding model.
print(embed_model)

EmbeddingRNN(
  (embedding): Embedding(5439, 100)
  (rnn): RNN(100, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=5439, bias=True)
)


In [24]:
# Ten full-batch gradient steps on the integer word ids in X, mirroring the
# one-hot training loop. `criterion` is the loss object created alongside the
# one-hot model; `optimizer` is expected to be the Adam instance built for
# embed_model.
start_time = time.time()
max_epochs = 10

for epoch_idx in range(1, max_epochs + 1):
    preds = embed_model(X)
    loss_embed = criterion(preds, y)
    optimizer.zero_grad(set_to_none=True)
    loss_embed.backward()
    optimizer.step()

embed_time = time.time() - start_time

# Training loss of the last step, kept as a plain float for reporting.
final_loss_value = loss_embed.item()
print("Final Embedding Loss:", final_loss_value)
print("Embedding Training Time:", embed_time)

Final Embedding Loss: 6.801928520202637
Embedding Training Time: 38.527023792266846


In [25]:
# Last-step training losses of the two models, side by side.
print("Final One-Hot Loss:", loss.item())
print("Final Embedding Loss:", loss_embed.item())

Final One-Hot Loss: 6.762184143066406
Final Embedding Loss: 6.801928520202637


In [26]:
# Compare the two models on their last-step *training* loss and on wall-clock
# training time. No held-out data is involved, so "better" here only means a
# lower training loss after the same number of steps.
onehot_loss = loss.item()
embed_loss = loss_embed.item()

print("\n========= COMPARISON SUMMARY =========")
print(f"One-Hot Encoding  -> Loss: {onehot_loss:.4f}, Time: {onehot_time:.2f}s")
print(f"Word Embeddings   -> Loss: {embed_loss:.4f}, Time: {embed_time:.2f}s")

verdict = (
    "Embedding model performs better based on loss."
    if embed_loss < onehot_loss
    else "One-Hot model performs better based on loss."
)
print(verdict)


One-Hot Encoding  -> Loss: 6.7622, Time: 111.42s
Word Embeddings   -> Loss: 6.8019, Time: 38.53s
One-Hot model performs better based on loss.


In [27]:
def generate_embedding(start_word, length=15, temperature=1.0):
    """Sample a word sequence from ``embed_model``, beginning with ``start_word``.

    Each step feeds the ids of the most recent known words (at most
    ``SEQ_LEN``) to the model and samples the next word from the
    temperature-scaled softmax.

    Args:
        start_word: First word of the result. If it is not in ``term_to_id``
            the first prediction is made from a placeholder id (see below).
        length: Number of words to generate after ``start_word``.
        temperature: Positive divisor applied to the logits before the
            softmax; values below 1 sharpen the distribution, values above 1
            flatten it.

    Returns:
        ``start_word`` followed by the generated words, joined by spaces.

    Raises:
        ValueError: If ``temperature`` is not strictly positive.

    Side effects:
        Puts ``embed_model`` into eval mode and leaves it there.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be > 0, got {temperature}")

    words = [start_word]
    embed_model.eval()

    with torch.no_grad():
        for _ in range(length):
            # Feed only the context that exists. The vocabulary has no
            # dedicated padding token -- id 0 is a real word -- so padding
            # short contexts with zeros would inject that word as fake
            # history. The RNN accepts sequences shorter than SEQ_LEN.
            indices = [term_to_id[w] for w in words[-SEQ_LEN:] if w in term_to_id]

            if not indices:
                # Only reachable on the first step with an unknown start word:
                # use id 0 as a stand-in so generation can still proceed.
                indices = [0]

            seq = torch.tensor(indices).view(1, -1)
            logits = embed_model(seq)

            probs = torch.softmax(logits / temperature, dim=1)
            best_idx = torch.multinomial(probs, 1).item()
            words.append(id_to_term[best_idx])

    return " ".join(words)

# Sampling is random, so the generated text differs from run to run.
print("Embedding Sample:", generate_embedding("love", temperature=0.8))

Embedding Sample: love trees which and enough will with from oaths at shelf but there might has your
