In [2]:
!pip install requests beautifulsoup4

import requests
from bs4 import BeautifulSoup
import torch
import torch.nn as nn
import torch.optim as optim
import re



In [3]:
url = "https://www.gutenberg.org/cache/epub/84/pg84-images.html"
response = requests.get(url)

soup = BeautifulSoup(response.text, "html.parser")
text = soup.get_text()

text = text.lower()
text = re.sub(r'\s+', ' ', text)

In [4]:
tokens = re.findall(r'\b\w+\b', text)
print("Total words:", len(tokens))

Total words: 78532


In [5]:
# Create vocabulary
vocab = list(set(tokens))
word2idx = {word: i for i, word in enumerate(vocab)}
idx2word = {i: word for word, i in word2idx.items()}

encoded = [word2idx[word] for word in tokens]
vocab_size = len(vocab)

In [6]:
window_size = 100
sequence_length = 99

data = []
targets = []

for i in range(len(encoded) - window_size):
    data.append(encoded[i:i+sequence_length])
    targets.append(encoded[i+sequence_length])

print(len(data))
print(len(data[0]))  # should be 99

78432
99


In [7]:
X = torch.tensor(data, dtype=torch.long)
y = torch.tensor(targets, dtype=torch.long)

In [8]:
class SimpleRNN(nn.Module):
    def __init__(self, vocab_size):
        super(SimpleRNN, self).__init__()

        self.embedding = nn.Embedding(vocab_size, 64)
        self.rnn = nn.RNN(64, 128, batch_first=True)
        self.fc = nn.Linear(128, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        output, hidden = self.rnn(x)
        last_output = output[:, -1, :]
        out = self.fc(last_output)
        return out

model = SimpleRNN(vocab_size)

In [9]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

epochs = 3
batch_size = 128

for epoch in range(epochs):
    total_loss = 0

    for i in range(0, len(X), batch_size):
        inputs = X[i:i+batch_size]
        labels = y[i:i+batch_size]

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print("Epoch:", epoch+1, "Loss:", total_loss)

Epoch: 1 Loss: 4019.318055152893
Epoch: 2 Loss: 3552.2107486724854
Epoch: 3 Loss: 3343.4099159240723


In [10]:
def generate_text(seed_text, length=100):
    model.eval()

    words = seed_text.lower().split()

    for _ in range(length):
        input_ids = [word2idx.get(w, 0) for w in words[-99:]]
        input_tensor = torch.tensor([input_ids], dtype=torch.long)

        with torch.no_grad():
            output = model(input_tensor)
            predicted_id = torch.argmax(output, dim=1).item()

        predicted_word = idx2word[predicted_id]
        words.append(predicted_word)

    return " ".join(words)

print(generate_text("i saw the", 120))

i saw the work of the project gutenberg literary archive foundation or any other project gutenberg electronic works or any other project gutenberg electronic works or any other project gutenberg electronic works or any other project gutenberg electronic works or any other project gutenberg electronic works or any other project gutenberg electronic works or any other project gutenberg electronic works or any other project gutenberg electronic works or any other project gutenberg electronic works or any other project gutenberg electronic works or any other project gutenberg electronic works or any other project gutenberg electronic works or any other project gutenberg electronic works or any other project gutenberg electronic works or any other project gutenberg electronic works or any other project gutenberg electronic works
