In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import re

In [2]:
# --- Hyperparameters (tune here) --------------------------------------------
SEQ_LENGTH = 5          # number of context words in each training window
BATCH_SIZE = 64         # samples per DataLoader batch
HIDDEN_SIZE = 64        # hidden-state width passed to the RNN/LSTM models
EMBED_DIM = 100         # embedding width passed to the *_Embedding models
LEARNING_RATE = 0.001   # step size handed to the Adam optimisers
EPOCHS = 20             # passes over the data for each model

# Seed PyTorch's global RNG so that weight initialisation and DataLoader
# shuffling -- and therefore the printed losses -- are repeatable after
# Restart Kernel -> Run All.
SEED = 42
torch.manual_seed(SEED)

def load_and_process_data(filename):
    """Read a UTF-8 text file and return its contents as lowercase word tokens.

    The text is lower-cased, every character other than ``a``-``z`` and
    whitespace is deleted (so ``"it's"`` becomes ``"its"`` and digits or
    accented letters vanish), and the remainder is split on whitespace.

    NOTE(review): the file is treated as plain text, not parsed as CSV, so
    any header or delimiter-adjacent text in a ``.csv`` input is tokenised
    like the rest -- confirm that is intended.

    Args:
        filename: Path of the file to read.

    Returns:
        list[str]: The tokens in file order.
    """
    non_letter = re.compile(r'[^a-z\s]')
    with open(filename, "r", encoding="utf-8") as handle:
        lowered = handle.read().lower()
    return non_letter.sub('', lowered).split()

# Tokenise the poem corpus once; the vocabulary and the training windows
# are both derived from this flat word list.
tokens = load_and_process_data(filename="poems-100.csv")

In [3]:
# Vocabulary: the distinct tokens in sorted order, so a word's id is simply
# its position in `vocab`.
vocab = sorted(set(tokens))
vocab_size = len(vocab)

# Two lookup tables over the same numbering: id -> word and word -> id.
ix_to_word = dict(enumerate(vocab))
word_to_ix = {word: ix for ix, word in ix_to_word.items()}

# Slide a window over the corpus: each sample is SEQ_LENGTH consecutive word
# ids, and its target is the id of the word that immediately follows.
token_ids = [word_to_ix[word] for word in tokens]
num_windows = len(tokens) - SEQ_LENGTH  # may be <= 0, giving no samples

inputs = [token_ids[start : start + SEQ_LENGTH] for start in range(num_windows)]
targets = [token_ids[start + SEQ_LENGTH] for start in range(num_windows)]

# Integer (long) tensors: X_tensor holds one window per row, y_tensor the
# matching next-word id.
X_tensor = torch.tensor(inputs, dtype=torch.long)
y_tensor = torch.tensor(targets, dtype=torch.long)

# Dense one-hot view of the windows: X_one_hot[i, t, k] is 1.0 exactly when
# inputs[i][t] == k, and 0.0 elsewhere.
# NOTE: this float32 tensor holds len(inputs) * SEQ_LENGTH * vocab_size
# values, so it grows quickly with corpus and vocabulary size.
X_one_hot = torch.zeros(len(inputs), SEQ_LENGTH, vocab_size)

# Fill it with one vectorised scatter along the vocabulary axis instead of
# len(inputs) * SEQ_LENGTH separate Python-level tensor writes.
# reshape (rather than unsqueeze) keeps the index 3-D even when there are no
# windows at all, in which case X_tensor is a flat empty tensor.
X_one_hot.scatter_(2, X_tensor.reshape(len(inputs), SEQ_LENGTH, 1), 1.0)

# The same samples in two encodings, sharing one target tensor:
# dense one-hot vectors for the *_OneHot models, raw word ids for the
# *_Embedding models.
dataset_onehot = TensorDataset(X_one_hot, y_tensor)
dataset_embed = TensorDataset(X_tensor, y_tensor)

# Both loaders reshuffle the samples every epoch.
loader_onehot = DataLoader(dataset=dataset_onehot, shuffle=True, batch_size=BATCH_SIZE)
loader_embed = DataLoader(dataset=dataset_embed, shuffle=True, batch_size=BATCH_SIZE)

In [4]:
class RNN_OneHot(nn.Module):
    """Next-word predictor: ``nn.RNN`` over one-hot word vectors, then a linear head.

    Args:
        vocab_size: Width of each input vector and number of output logits.
        hidden_size: Width of the recurrent hidden state.
    """

    def __init__(self, vocab_size, hidden_size):
        super().__init__()
        # batch_first=True -> inputs are laid out (batch, time, features).
        self.rnn = nn.RNN(
            input_size=vocab_size,
            hidden_size=hidden_size,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x):
        """Map a ``(batch, time, vocab_size)`` batch to ``(batch, vocab_size)`` logits."""
        hidden_per_step, _ = self.rnn(x)
        # Only the hidden state after the final time step feeds the classifier.
        final_hidden = hidden_per_step[:, -1, :]
        return self.fc(final_hidden)

In [5]:
class LSTM_OneHot(nn.Module):
    """Next-word predictor: ``nn.LSTM`` over one-hot word vectors, then a linear head.

    Args:
        vocab_size: Width of each input vector and number of output logits.
        hidden_size: Width of the LSTM hidden state.
    """

    def __init__(self, vocab_size, hidden_size):
        super().__init__()
        # batch_first=True -> inputs are laid out (batch, time, features).
        self.lstm = nn.LSTM(
            input_size=vocab_size,
            hidden_size=hidden_size,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x):
        """Map a ``(batch, time, vocab_size)`` batch to ``(batch, vocab_size)`` logits."""
        # The returned (h_n, c_n) state tuple is not needed here.
        hidden_per_step, _ = self.lstm(x)
        # Only the hidden state after the final time step feeds the classifier.
        final_hidden = hidden_per_step[:, -1, :]
        return self.fc(final_hidden)

In [6]:
class RNN_Embedding(nn.Module):
    """Next-word predictor: learned word embeddings -> ``nn.RNN`` -> linear head.

    Args:
        vocab_size: Number of embedding rows and number of output logits.
        embed_dim: Width of each word embedding.
        hidden_size: Width of the recurrent hidden state.
    """

    def __init__(self, vocab_size, embed_dim, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        # batch_first=True -> inputs are laid out (batch, time, features).
        self.rnn = nn.RNN(
            input_size=embed_dim,
            hidden_size=hidden_size,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x):
        """Map a ``(batch, time)`` batch of word ids to ``(batch, vocab_size)`` logits."""
        embedded = self.embedding(x)
        hidden_per_step, _ = self.rnn(embedded)
        # Only the hidden state after the final time step feeds the classifier.
        final_hidden = hidden_per_step[:, -1, :]
        return self.fc(final_hidden)

In [7]:
class LSTM_Embedding(nn.Module):
    """Next-word predictor: learned word embeddings -> ``nn.LSTM`` -> linear head.

    Args:
        vocab_size: Number of embedding rows and number of output logits.
        embed_dim: Width of each word embedding.
        hidden_size: Width of the LSTM hidden state.
    """

    def __init__(self, vocab_size, embed_dim, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        # batch_first=True -> inputs are laid out (batch, time, features).
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_size,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x):
        """Map a ``(batch, time)`` batch of word ids to ``(batch, vocab_size)`` logits."""
        embedded = self.embedding(x)
        # The returned (h_n, c_n) state tuple is not needed here.
        hidden_per_step, _ = self.lstm(embedded)
        # Only the hidden state after the final time step feeds the classifier.
        final_hidden = hidden_per_step[:, -1, :]
        return self.fc(final_hidden)

In [8]:
def train_model(model, dataloader, criterion, optimizer, epochs):
    """Train ``model`` in place and print the mean batch loss every 5th epoch.

    Args:
        model: Module to optimise; it is switched to training mode.
        dataloader: Iterable of ``(inputs, targets)`` batches; must support
            ``len()`` on the epochs where the loss is reported.
        criterion: Callable ``criterion(outputs, targets)`` returning a scalar loss.
        optimizer: Optimiser already bound to ``model``'s parameters.
        epochs: Number of passes over ``dataloader``.

    Returns:
        None. Progress is only printed.
    """
    model.train()
    for epoch_no in range(1, epochs + 1):
        running_loss = 0
        for batch_inputs, batch_targets in dataloader:
            predictions = model(batch_inputs)
            batch_loss = criterion(predictions, batch_targets)
            # Clear gradients left over from the previous step before backprop.
            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()
            running_loss += batch_loss.item()

        # Report only on epochs 5, 10, 15, ...
        if epoch_no % 5 != 0:
            continue
        print(f"Epoch {epoch_no}, Loss: {running_loss / len(dataloader):.4f}")

def generate_text(model, seed_text, next_words=50, is_one_hot=False):
    """Greedily extend ``seed_text`` with up to ``next_words`` predicted words.

    At each step the most recent in-vocabulary words (at most ``SEQ_LENGTH``
    of them) are fed to ``model`` and the arg-max word is appended.

    Seed words are looked up the way the corpus was tokenised (lower-cased,
    letters only), so ``"Night,"`` matches the vocabulary entry ``"night"``.
    Words that are still unknown are skipped rather than aborting generation,
    and a seed with fewer than ``SEQ_LENGTH`` known words is used as a shorter
    context; the recurrent models in this notebook accept any sequence length
    of at least one. If the seed contains no known word at all, it is
    returned without any generated text.

    Side effect: ``model`` is left in evaluation mode.

    Args:
        model: Trained module mapping a ``(1, time)`` id tensor -- or a
            ``(1, time, vocab_size)`` one-hot tensor when ``is_one_hot`` is
            true -- to ``(1, vocab_size)`` scores.
        seed_text: Starting text; it is lower-cased and split on whitespace.
        next_words: Maximum number of words to append.
        is_one_hot: Whether ``model`` expects one-hot vectors instead of ids.

    Returns:
        str: The lower-cased seed words followed by the generated words,
        joined by single spaces.
    """
    model.eval()
    words = seed_text.lower().split()

    # Rolling context of word ids, built from every seed word that is in the
    # vocabulary once non-letters are stripped.
    context = []
    for word in words:
        key = re.sub(r'[^a-z]', '', word)
        if key in word_to_ix:
            context.append(word_to_ix[key])
    context = context[-SEQ_LENGTH:]

    if not context:
        # Nothing to condition on: hand the seed back without generating.
        return " ".join(words)

    # Build inputs on whatever device the model's weights live on.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    for _ in range(next_words):
        input_seq = torch.tensor(context, dtype=torch.long, device=device).unsqueeze(0)

        if is_one_hot:
            # (1, time) ids -> (1, time, vocab_size) float one-hot vectors.
            input_seq = nn.functional.one_hot(input_seq, num_classes=vocab_size).float()

        with torch.no_grad():
            output = model(input_seq)
            predicted_ix = torch.argmax(output, dim=1).item()

        words.append(ix_to_word[predicted_ix])
        # Slide the window forward, keeping at most SEQ_LENGTH ids.
        context = (context + [predicted_ix])[-SEQ_LENGTH:]

    return " ".join(words)

In [9]:
# --- One-hot input models ----------------------------------------------------
# Each model gets its own Adam optimiser and a fresh cross-entropy loss, and
# both are trained on the same one-hot loader.
print("Training RNN One-Hot...")
rnn_oh = RNN_OneHot(vocab_size=vocab_size, hidden_size=HIDDEN_SIZE)
opt_rnn_oh = optim.Adam(rnn_oh.parameters(), lr=LEARNING_RATE)
train_model(
    model=rnn_oh,
    dataloader=loader_onehot,
    criterion=nn.CrossEntropyLoss(),
    optimizer=opt_rnn_oh,
    epochs=EPOCHS,
)

print("Training LSTM One-Hot...")
lstm_oh = LSTM_OneHot(vocab_size=vocab_size, hidden_size=HIDDEN_SIZE)
opt_lstm_oh = optim.Adam(lstm_oh.parameters(), lr=LEARNING_RATE)
train_model(
    model=lstm_oh,
    dataloader=loader_onehot,
    criterion=nn.CrossEntropyLoss(),
    optimizer=opt_lstm_oh,
    epochs=EPOCHS,
)

Training RNN One-Hot...
Epoch 5, Loss: 5.7891
Epoch 10, Loss: 4.6250
Epoch 15, Loss: 3.5483
Epoch 20, Loss: 2.6350
Training LSTM One-Hot...
Epoch 5, Loss: 6.1563
Epoch 10, Loss: 5.1319
Epoch 15, Loss: 4.0177
Epoch 20, Loss: 3.0138


In [10]:
# --- Embedding input models --------------------------------------------------
# Each model gets its own Adam optimiser and a fresh cross-entropy loss, and
# both are trained on the same word-id loader.
print("Training RNN Embedding...")
rnn_emb = RNN_Embedding(vocab_size=vocab_size, embed_dim=EMBED_DIM, hidden_size=HIDDEN_SIZE)
opt_rnn_emb = optim.Adam(rnn_emb.parameters(), lr=LEARNING_RATE)
train_model(
    model=rnn_emb,
    dataloader=loader_embed,
    criterion=nn.CrossEntropyLoss(),
    optimizer=opt_rnn_emb,
    epochs=EPOCHS,
)

print("Training LSTM Embedding...")
lstm_emb = LSTM_Embedding(vocab_size=vocab_size, embed_dim=EMBED_DIM, hidden_size=HIDDEN_SIZE)
opt_lstm_emb = optim.Adam(lstm_emb.parameters(), lr=LEARNING_RATE)
train_model(
    model=lstm_emb,
    dataloader=loader_embed,
    criterion=nn.CrossEntropyLoss(),
    optimizer=opt_lstm_emb,
    epochs=EPOCHS,
)

Training RNN Embedding...
Epoch 5, Loss: 5.4800
Epoch 10, Loss: 4.3378
Epoch 15, Loss: 3.3797
Epoch 20, Loss: 2.6403
Training LSTM Embedding...
Epoch 5, Loss: 5.6586
Epoch 10, Loss: 4.4942
Epoch 15, Loss: 3.4390
Epoch 20, Loss: 2.5628
