In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from collections import Counter
import numpy as np

# Read the whole corpus file as a single lower-cased string. The CSV is
# consumed as raw text here; no column parsing is performed.
with open("data/poems-100.csv", mode="r", encoding="utf-8") as corpus_file:
    text = corpus_file.read().lower()

# Whitespace tokenisation: punctuation stays attached to the neighbouring word.
tokens = text.split()

# Sorting the unique tokens gives a deterministic word -> id assignment.
vocab = sorted(set(tokens))
vocab_size = len(vocab)

# Forward and reverse lookup tables between words and integer ids.
idx2word = dict(enumerate(vocab))
word2idx = {word: index for index, word in idx2word.items()}

One-Hot Encoding

In [2]:

def one_hot_encode(idx, vocab_size):
    """Return a 1-D tensor of length ``vocab_size`` that is 1.0 at ``idx`` and 0.0 elsewhere.

    Args:
        idx: Position to switch on (used directly as a tensor index).
        vocab_size: Length of the returned vector.

    Returns:
        A ``torch.Tensor`` created with ``torch.zeros`` (default dtype) with a
        single entry set to 1.0.
    """
    encoding = torch.zeros(vocab_size)
    encoding[idx] = 1.0
    return encoding

Create Sequences

In [3]:
seq_length = 5

# Slide a window of `seq_length` tokens across the corpus; the token that
# immediately follows each window is its prediction target.
num_windows = len(tokens) - seq_length
window_ids = [
    [word2idx[word] for word in tokens[start:start + seq_length]]
    for start in range(num_windows)
]
target_ids = [word2idx[tokens[start + seq_length]] for start in range(num_windows)]

# Integer-id tensors: one row per window in X, one target id per window in y.
X = torch.tensor(window_ids)
y = torch.tensor(target_ids)

In [4]:
seq_length = X.shape[1]

# One-hot encode every window at once. The scatter writes 1.0 at
# X_onehot[i, j, X[i, j]] for all windows i and positions j, which is the same
# tensor the element-by-element Python loop produced, without one tensor
# write per (i, j) pair.
X_onehot = torch.zeros(X.shape[0], seq_length, vocab_size)
X_onehot.scatter_(2, X.unsqueeze(-1), 1.0)

In [5]:
from torch.utils.data import TensorDataset, DataLoader

# Pair each one-hot window with its target id and serve them in shuffled
# mini-batches of 64.
dataset = TensorDataset(X_onehot, y)
loader = DataLoader(dataset=dataset, shuffle=True, batch_size=64)

RNN Model (One-Hot Inputs)

In [6]:

class RNN_OneHot(nn.Module):
    """Next-word classifier: a single ``nn.RNN`` over one-hot inputs plus a linear head.

    Args:
        vocab_size: Size of each one-hot input vector and number of output classes.
        hidden_size: Width of the recurrent hidden state.
    """

    def __init__(self, vocab_size, hidden_size):
        super().__init__()
        # batch_first=True -> inputs are laid out as (batch, seq, vocab_size).
        self.rnn = nn.RNN(vocab_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Return ``(batch, vocab_size)`` logits computed from the last time step of ``x``."""
        hidden_states, _final_hidden = self.rnn(x)
        # Only the hidden state at the final position feeds the classifier.
        last_step = hidden_states[:, -1, :]
        return self.fc(last_step)


In [7]:
# Settings for the one-hot vanilla RNN run.
hidden_size = 64
epochs = 20

model = RNN_OneHot(vocab_size, hidden_size)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

for epoch_number in range(1, epochs + 1):
    running_loss = 0.0

    for batch_inputs, batch_targets in loader:
        # Drop gradients from the previous step before the new backward pass.
        optimizer.zero_grad()

        logits = model(batch_inputs)
        batch_loss = criterion(logits, batch_targets)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

    # Reported value is the mean of the per-batch losses for this epoch.
    mean_epoch_loss = running_loss / len(loader)
    print(f"Epoch {epoch_number}, Loss: {mean_epoch_loss:.4f}")

     

Epoch 1, Loss: 7.4052
Epoch 2, Loss: 6.7771
Epoch 3, Loss: 6.4894
Epoch 4, Loss: 6.1827
Epoch 5, Loss: 5.9227
Epoch 6, Loss: 5.6639
Epoch 7, Loss: 5.4004
Epoch 8, Loss: 5.1239
Epoch 9, Loss: 4.8403
Epoch 10, Loss: 4.5602
Epoch 11, Loss: 4.2792
Epoch 12, Loss: 4.0130
Epoch 13, Loss: 3.7480
Epoch 14, Loss: 3.4983
Epoch 15, Loss: 3.2590
Epoch 16, Loss: 3.0278
Epoch 17, Loss: 2.8061
Epoch 18, Loss: 2.5956
Epoch 19, Loss: 2.3907
Epoch 20, Loss: 2.1990


In [8]:
# Prompt that starts the generated text, and how many words to append to it.
seed_words = [
    "the",
    "sun",
    "shines",
    "bright",
    "upon",
]
num_words_to_generate = 50
def generate_text_for_onehot(model, seed_words, num_words_to_generate, word2idx, idx2word, vocab_size, seq_length):
    """Greedily extend ``seed_words`` with a model that consumes one-hot windows.

    At each step the last ``seq_length`` words produced so far (or all of them
    when fewer exist) are one-hot encoded into a ``(1, window, vocab_size)``
    float32 tensor, passed to ``model``, and the arg-max word is appended.

    Args:
        model: Module called on the one-hot window. It is switched to eval
            mode with ``model.eval()`` and left in that mode.
        seed_words: Iterable of words that starts the text.
        num_words_to_generate: Number of generation steps to attempt.
        word2idx: Mapping from word to integer id.
        idx2word: Mapping from integer id to word.
        vocab_size: Width of the one-hot encoding.
        seq_length: Maximum number of trailing words fed to the model.

    Returns:
        The seed words followed by the generated words, joined by single spaces.
        If a word in the current window is missing from ``word2idx`` a warning
        is printed and the text produced so far is returned.

    Raises:
        ValueError: If a word must be generated but the context window is
            empty (for example ``seed_words`` is empty).
    """
    model.eval()
    generated_words = list(seed_words)

    for _ in range(num_words_to_generate):
        # Slicing from -seq_length already yields the whole list when it is
        # shorter than seq_length, so no separate length check is needed.
        current_sequence_words = generated_words[-seq_length:]
        if not current_sequence_words:
            # Fail early with a clear message instead of an opaque error from
            # running the model on a zero-length sequence.
            raise ValueError(
                "Cannot generate text from an empty context window; "
                "seed_words must contain at least one word."
            )

        try:
            input_indices = [word2idx[w] for w in current_sequence_words]
        except KeyError as e:
            print(f"Warning: Word '{e.args[0]}' not in vocabulary. Skipping generation for this word.")
            break

        # Build the (1, window, vocab_size) one-hot batch with a single
        # indexed assignment rather than one write per position.
        window_len = len(input_indices)
        one_hot_input_tensor = torch.zeros(1, window_len, vocab_size, dtype=torch.float32)
        one_hot_input_tensor[0, torch.arange(window_len), torch.tensor(input_indices)] = 1.0

        with torch.no_grad():
            output = model(one_hot_input_tensor)

        # Accept models that return per-step logits (3-D) as well as models
        # that already return only the last step (2-D).
        if output.dim() == 3:
            output = output[:, -1, :]
        predicted_idx = torch.argmax(output).item()
        generated_words.append(idx2word[predicted_idx])

    return ' '.join(generated_words)
# Generate from the trained one-hot RNN and show the result.
generated_text = generate_text_for_onehot(
    model=model,
    seed_words=seed_words,
    num_words_to_generate=num_words_to_generate,
    word2idx=word2idx,
    idx2word=idx2word,
    vocab_size=vocab_size,
    seq_length=seq_length,
)
print("Generated Text:", generated_text, sep="\n")

Generated Text:
the sun shines bright upon warmer than the chain for hate old the eyes of who days, the do you close waste more as you thought not me upon you and i am him i am the i of the old of my i were the them of my own in that them in my


In [9]:
class LSTM_OneHot(nn.Module):
    """Next-word classifier: a single ``nn.LSTM`` over one-hot inputs plus a linear head.

    Args:
        vocab_size: Size of each one-hot input vector and number of output classes.
        hidden_size: Width of the LSTM hidden state.
    """

    def __init__(self, vocab_size, hidden_size):
        super().__init__()
        # batch_first=True -> inputs are laid out as (batch, seq, vocab_size).
        self.lstm = nn.LSTM(vocab_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Return ``(batch, vocab_size)`` logits computed from the last time step of ``x``."""
        hidden_states, _final_state = self.lstm(x)
        # Only the hidden state at the final position feeds the classifier.
        last_step = hidden_states[:, -1, :]
        return self.fc(last_step)


In [10]:
# Settings for the one-hot LSTM run (same values as the one-hot RNN run).
hidden_size = 64
epochs = 20

model = LSTM_OneHot(vocab_size, hidden_size)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

for epoch_number in range(1, epochs + 1):
    running_loss = 0.0

    for batch_inputs, batch_targets in loader:
        # Drop gradients from the previous step before the new backward pass.
        optimizer.zero_grad()

        logits = model(batch_inputs)
        batch_loss = criterion(logits, batch_targets)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

    # Reported value is the mean of the per-batch losses for this epoch.
    mean_epoch_loss = running_loss / len(loader)
    print(f"Epoch {epoch_number}, Loss: {mean_epoch_loss:.4f}")

Epoch 1, Loss: 7.4545
Epoch 2, Loss: 6.8877
Epoch 3, Loss: 6.7441
Epoch 4, Loss: 6.5649
Epoch 5, Loss: 6.3762
Epoch 6, Loss: 6.1679
Epoch 7, Loss: 5.9415
Epoch 8, Loss: 5.6986
Epoch 9, Loss: 5.4406
Epoch 10, Loss: 5.1744
Epoch 11, Loss: 4.9083
Epoch 12, Loss: 4.6471
Epoch 13, Loss: 4.3956
Epoch 14, Loss: 4.1462
Epoch 15, Loss: 3.9052
Epoch 16, Loss: 3.6692
Epoch 17, Loss: 3.4372
Epoch 18, Loss: 3.2127
Epoch 19, Loss: 2.9921
Epoch 20, Loss: 2.7744


In [11]:
# Same prompt and length as the one-hot RNN run, now using the one-hot LSTM.
seed_words = [
    "the",
    "sun",
    "shines",
    "bright",
    "upon",
]
num_words_to_generate = 50

generated_text = generate_text_for_onehot(
    model=model,
    seed_words=seed_words,
    num_words_to_generate=num_words_to_generate,
    word2idx=word2idx,
    idx2word=idx2word,
    vocab_size=vocab_size,
    seq_length=seq_length,
)
print("Generated Text:", generated_text, sep="\n")

Generated Text:
the sun shines bright upon one the great of my voice, or me, i am the air and my old like for one to the little of the grass sails is a stars galleon, with an nor in in the earth, or not in a few or large or few where the few pleas'd with


Word Embeddings

In [12]:

# Map the corpus to integer ids once, then cut windows out of the id list.
token_ids = [word2idx[word] for word in tokens]
num_windows = len(token_ids) - seq_length

# X holds `seq_length` consecutive ids per row; y holds the id that follows each row.
X = torch.tensor([token_ids[start:start + seq_length] for start in range(num_windows)])
y = torch.tensor([token_ids[start + seq_length] for start in range(num_windows)])

In [13]:
class RNN_Embedding(nn.Module):
    """Next-word classifier: embedding lookup, a single ``nn.RNN``, then a linear head.

    Args:
        vocab_size: Number of rows in the embedding table and number of output classes.
        embed_dim: Size of each embedding vector.
        hidden_size: Width of the recurrent hidden state.
    """

    def __init__(self, vocab_size, embed_dim, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # batch_first=True -> the RNN sees (batch, seq, embed_dim).
        self.rnn = nn.RNN(embed_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Return ``(batch, vocab_size)`` logits for a batch of word-id sequences ``x``."""
        embedded = self.embedding(x)
        hidden_states, _final_hidden = self.rnn(embedded)
        # Only the hidden state at the final position feeds the classifier.
        last_step = hidden_states[:, -1, :]
        return self.fc(last_step)

     


In [14]:
from torch.utils.data import TensorDataset, DataLoader

# The embedding models take integer word ids, so this loader wraps the id
# tensor X instead of the one-hot tensor.
embedding_dataset = TensorDataset(X, y)
embedding_loader = DataLoader(dataset=embedding_dataset, shuffle=True, batch_size=64)

epochs = 20

model = RNN_Embedding(vocab_size, embed_dim=100, hidden_size=64)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

for epoch_number in range(1, epochs + 1):
    running_loss = 0.0

    for batch_ids, batch_targets in embedding_loader:
        # Drop gradients from the previous step before the new backward pass.
        optimizer.zero_grad()

        logits = model(batch_ids)
        batch_loss = criterion(logits, batch_targets)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

    # Reported value is the mean of the per-batch losses for this epoch.
    mean_epoch_loss = running_loss / len(embedding_loader)
    print(f"Epoch {epoch_number}, Loss: {mean_epoch_loss:.4f}")

Epoch 1, Loss: 7.4771
Epoch 2, Loss: 6.5974
Epoch 3, Loss: 6.2451
Epoch 4, Loss: 5.9251
Epoch 5, Loss: 5.6143
Epoch 6, Loss: 5.3149
Epoch 7, Loss: 5.0240
Epoch 8, Loss: 4.7382
Epoch 9, Loss: 4.4628
Epoch 10, Loss: 4.1942
Epoch 11, Loss: 3.9346
Epoch 12, Loss: 3.6894
Epoch 13, Loss: 3.4584
Epoch 14, Loss: 3.2408
Epoch 15, Loss: 3.0372
Epoch 16, Loss: 2.8492
Epoch 17, Loss: 2.6681
Epoch 18, Loss: 2.5012
Epoch 19, Loss: 2.3435
Epoch 20, Loss: 2.1923


In [15]:
def generate_text(model, seed_words, num_words_to_generate, word2idx, idx2word, seq_length):
    """Greedily extend ``seed_words`` with a model that consumes word-id sequences.

    At each step the last ``seq_length`` words produced so far (or all of them
    when fewer exist) are mapped to ids, passed to ``model`` as a
    ``(1, window)`` integer tensor, and the arg-max word is appended.

    Args:
        model: Module called on the id tensor. It is switched to eval mode
            with ``model.eval()`` and left in that mode.
        seed_words: Iterable of words that starts the text.
        num_words_to_generate: Number of generation steps to attempt.
        word2idx: Mapping from word to integer id.
        idx2word: Mapping from integer id to word.
        seq_length: Maximum number of trailing words fed to the model.

    Returns:
        The seed words followed by the generated words, joined by single spaces.
        If a word in the current window is missing from ``word2idx`` a warning
        is printed and the text produced so far is returned.

    Raises:
        ValueError: If a word must be generated but the context window is
            empty (for example ``seed_words`` is empty).
    """
    model.eval()
    generated_words = list(seed_words)

    for _ in range(num_words_to_generate):
        # Slicing from -seq_length already yields the whole list when it is
        # shorter than seq_length, so no separate length check is needed.
        current_sequence_words = generated_words[-seq_length:]
        if not current_sequence_words:
            # An empty id list would become a float tensor and fail inside the
            # model with an unrelated-looking error; report the real cause.
            raise ValueError(
                "Cannot generate text from an empty context window; "
                "seed_words must contain at least one word."
            )

        try:
            input_indices = [word2idx[w] for w in current_sequence_words]
        except KeyError as e:
            print(f"Warning: Word '{e.args[0]}' not in vocabulary. Skipping generation for this word.")
            break

        # Add a leading batch dimension of size 1.
        input_tensor = torch.tensor(input_indices).unsqueeze(0)

        with torch.no_grad():
            output = model(input_tensor)

        # Accept models that return per-step logits (3-D) as well as models
        # that already return only the last step (2-D).
        if output.dim() == 3:
            output = output[:, -1, :]
        predicted_idx = torch.argmax(output).item()
        generated_words.append(idx2word[predicted_idx])

    return ' '.join(generated_words)

In [16]:
# Same prompt and length as the one-hot runs, now using the embedding RNN.
seed_words = [
    "the",
    "sun",
    "shines",
    "bright",
    "upon",
]
num_words_to_generate = 50

generated_text = generate_text(
    model=model,
    seed_words=seed_words,
    num_words_to_generate=num_words_to_generate,
    word2idx=word2idx,
    idx2word=idx2word,
    seq_length=seq_length,
)
print("Generated Text:", generated_text, sep="\n")
     

Generated Text:
the sun shines bright upon the masts and never chaws; by of my own, and gave you have them.” around me, a little birds in a row sat musing. a few quadrillions of the young men glisten'd the whole earth. of the moon and then the chaff for payment receiving, a few octillions of his


LSTM with Word Embeddings

In [17]:
class LSTM_Embedding(nn.Module):
    """Next-word classifier: embedding lookup, a single ``nn.LSTM``, then a linear head.

    Args:
        vocab_size: Number of rows in the embedding table and number of output classes.
        embed_dim: Size of each embedding vector.
        hidden_size: Width of the LSTM hidden state.
    """

    def __init__(self, vocab_size, embed_dim, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # batch_first=True -> the LSTM sees (batch, seq, embed_dim).
        self.lstm = nn.LSTM(embed_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Return ``(batch, vocab_size)`` logits for a batch of word-id sequences ``x``."""
        embedded = self.embedding(x)
        hidden_states, _final_state = self.lstm(embedded)
        # Only the hidden state at the final position feeds the classifier.
        last_step = hidden_states[:, -1, :]
        return self.fc(last_step)

In [18]:
epochs = 20

# Embedding LSTM with the same embedding and hidden sizes as the embedding RNN run.
model = LSTM_Embedding(vocab_size, embed_dim=100, hidden_size=64)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

for epoch_number in range(1, epochs + 1):
    running_loss = 0.0

    for batch_ids, batch_targets in embedding_loader:
        # Drop gradients from the previous step before the new backward pass.
        optimizer.zero_grad()

        logits = model(batch_ids)
        batch_loss = criterion(logits, batch_targets)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

    # Reported value is the mean of the per-batch losses for this epoch.
    mean_epoch_loss = running_loss / len(embedding_loader)
    print(f"Epoch {epoch_number}, Loss: {mean_epoch_loss:.4f}")
     


Epoch 1, Loss: 7.4637
Epoch 2, Loss: 6.7051
Epoch 3, Loss: 6.4248
Epoch 4, Loss: 6.1401
Epoch 5, Loss: 5.8556
Epoch 6, Loss: 5.5730
Epoch 7, Loss: 5.2892
Epoch 8, Loss: 5.0060
Epoch 9, Loss: 4.7251
Epoch 10, Loss: 4.4501
Epoch 11, Loss: 4.1797
Epoch 12, Loss: 3.9165
Epoch 13, Loss: 3.6609
Epoch 14, Loss: 3.4143
Epoch 15, Loss: 3.1800
Epoch 16, Loss: 2.9527
Epoch 17, Loss: 2.7422
Epoch 18, Loss: 2.5397
Epoch 19, Loss: 2.3531
Epoch 20, Loss: 2.1770


In [19]:
# Same prompt and length as the earlier runs, now using the embedding LSTM.
seed_words = [
    "the",
    "sun",
    "shines",
    "bright",
    "upon",
]
num_words_to_generate = 50

generated_text = generate_text(
    model=model,
    seed_words=seed_words,
    num_words_to_generate=num_words_to_generate,
    word2idx=word2idx,
    idx2word=idx2word,
    seq_length=seq_length,
)
print("Generated Text:", generated_text, sep="\n")

Generated Text:
the sun shines bright upon his own dashings—yet—the are scatter'd, in the woods are lovely, and their employments, and all times, and the woods i am the poet of the dying sun: ‘and the young men in the greatest and frozen lake of light! to-night it seems to me and i am not a few
