## Web scrapping

In [1]:
import requests
from bs4 import BeautifulSoup
import re

In [2]:
url = "https://www.gutenberg.org/cache/epub/84/pg84-images.html"

headers = {
    "User-Agent": "Mozilla/5.0"
}

response = requests.get(url, headers=headers)

soup = BeautifulSoup(response.text, "html.parser")

text = soup.get_text()

start_marker = "To Mrs. Saville, England."
end_marker = "*** END OF THE PROJECT GUTENBERG EBOOK FRANKENSTEIN; OR, THE MODERN PROMETHEUS ***"

start = text.find(start_marker)
end = text.find(end_marker)

if start != -1:
    text = text[start + len(start_marker):]

if end != -1:
    text = text[:end]

print(text[:2000])




St. Petersburgh, Dec. 11th, 17—.


You will rejoice to hear that no disaster has accompanied the commencement of
an enterprise which you have regarded with such evil forebodings. I arrived
here yesterday, and my first task is to assure my dear sister of my welfare and
increasing confidence in the success of my undertaking.


I am already far north of London, and as I walk in the streets of Petersburgh,
I feel a cold northern breeze play upon my cheeks, which braces my nerves and
fills me with delight. Do you understand this feeling? This breeze, which has
travelled from the regions towards which I am advancing, gives me a foretaste
of those icy climes. Inspirited by this wind of promise, my daydreams become
more fervent and vivid. I try in vain to be persuaded that the pole is the seat
of frost and desolation; it ever presents itself to my imagination as the
region of beauty and delight. There, Margaret, the sun is for ever visible, its
broad disk just skirting the h

## Text Preprocessing

In [4]:
text = re.sub(r'\n\s*(letter|chapter)\s+\d+\s*\n', '\n', text, flags=re.IGNORECASE)

text = re.sub(
    r'(^|\n)[A-Za-z.\- ]*,?\s*(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\.?\,?\s*(\d{1,2}(st|nd|rd|th)?)?,?\s*(17—\.)?\s*',
    r'\1',
    text
)

text = text.lower()
text = re.sub(r'\s+', ' ', text)
text = text.strip()

print(text[:2000])

you will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings. i arrived here yesterday, and my first task is to assure my dear sister of my welfare and increasing confidence in the success of my undertaking. i am already far north of london, and as i walk in the streets of petersburgh, i feel a cold northern breeze play upon my cheeks, which braces my nerves and fills me with delight. do you understand this feeling? this breeze, which has travelled from the regions towards which i am advancing, gives me a foretaste of those icy climes. inspirited by this wind of promise, my daydreams become more fervent and vivid. i try in vain to be persuaded that the pole is the seat of frost and desolation; it ever presents itself to my imagination as the garet, the sun is for ever visible, its broad disk just skirting the horizon and diffusing a perpetual splendour. there—for with your leave, my sister, i will put som

In [5]:
text = re.sub(r'([.!?])', r'\1 <END>', text)

text = re.sub(r'[^\w\s<>]', '', text)

print(text[:2000])

you will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings <END> i arrived here yesterday and my first task is to assure my dear sister of my welfare and increasing confidence in the success of my undertaking <END> i am already far north of london and as i walk in the streets of petersburgh i feel a cold northern breeze play upon my cheeks which braces my nerves and fills me with delight <END> do you understand this feeling <END> this breeze which has travelled from the regions towards which i am advancing gives me a foretaste of those icy climes <END> inspirited by this wind of promise my daydreams become more fervent and vivid <END> i try in vain to be persuaded that the pole is the seat of frost and desolation it ever presents itself to my imagination as the garet the sun is for ever visible its broad disk just skirting the horizon and diffusing a perpetual splendour <END> therefor with your leave my

## Tokenization

In [6]:
tokens = text.split()
print(tokens[:20])
print("Total tokens:", len(tokens))

['you', 'will', 'rejoice', 'to', 'hear', 'that', 'no', 'disaster', 'has', 'accompanied', 'the', 'commencement', 'of', 'an', 'enterprise', 'which', 'you', 'have', 'regarded', 'with']
Total tokens: 78430


In [7]:
vocab = sorted(set(tokens))

word_to_idx = {word: idx for idx, word in enumerate(vocab)}
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

vocab_size = len(vocab)
print("Vocab size:", vocab_size)

Vocab size: 7188


In [8]:
encoded = [word_to_idx[word] for word in tokens]

In [9]:
window_size = 100
sequence_length = window_size - 1

data = []
targets = []

for i in range(len(encoded) - window_size):
    data.append(encoded[i:i + sequence_length])
    targets.append(encoded[i + sequence_length])


print(len(data))
print(len(data[0]))
print(len(targets))

78330
99
78330


## Model Training

In [10]:
import torch
import torch.nn as nn

class MyRNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super(MyRNN, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn = nn.RNN(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        output, hidden = self.rnn(x)

        last_output = output[:, -1, :]

        out = self.fc(last_output)
        return out

In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

model = MyRNN(vocab_size, embed_dim=128, hidden_dim=256).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

cuda


In [16]:
X = torch.tensor(data, dtype=torch.long)
y = torch.tensor(targets, dtype=torch.long)

print(X.shape, y.shape)

torch.Size([78330, 99]) torch.Size([78330])


In [17]:
from torch.utils.data import TensorDataset, DataLoader

dataset = TensorDataset(X, y)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

In [19]:
num_epochs = 30

for epoch in range(num_epochs):

    model.train()
    total_loss = 0

    for batch_X, batch_y in dataloader:

        batch_X = batch_X.to(device)
        batch_y = batch_y.to(device)

        optimizer.zero_grad()

        outputs = model(batch_X)

        loss = criterion(outputs, batch_y)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(dataloader):.4f}")

Epoch [1/30], Loss: 1.7925
Epoch [2/30], Loss: 1.6252
Epoch [3/30], Loss: 1.4826
Epoch [4/30], Loss: 1.3626
Epoch [5/30], Loss: 1.2524
Epoch [6/30], Loss: 1.1630
Epoch [7/30], Loss: 1.0799
Epoch [8/30], Loss: 1.0147
Epoch [9/30], Loss: 0.9500
Epoch [10/30], Loss: 0.9024
Epoch [11/30], Loss: 0.8536
Epoch [12/30], Loss: 0.8162
Epoch [13/30], Loss: 0.7818
Epoch [14/30], Loss: 0.7491
Epoch [15/30], Loss: 0.7231
Epoch [16/30], Loss: 0.7009
Epoch [17/30], Loss: 0.6856
Epoch [18/30], Loss: 0.6644
Epoch [19/30], Loss: 0.6515
Epoch [20/30], Loss: 0.6443
Epoch [21/30], Loss: 0.6204
Epoch [22/30], Loss: 0.6262
Epoch [23/30], Loss: 0.6130
Epoch [24/30], Loss: 0.6096
Epoch [25/30], Loss: 0.6038
Epoch [26/30], Loss: 0.6086
Epoch [27/30], Loss: 0.6030
Epoch [28/30], Loss: 0.6049
Epoch [29/30], Loss: 0.6032
Epoch [30/30], Loss: 0.6053


## Next Word Generation

In [30]:
def generate_text(model, seed_text, length, device):
    model.eval()

    words = seed_text.lower().split()
    words = [w for w in words if w in word_to_idx]

    if len(words) == 0:
        return "Seed words not found in vocabulary."

    input_seq = [word_to_idx[w] for w in words]

    for _ in range(length):

        if len(input_seq) < 99:
            padded = [0]*(99 - len(input_seq)) + input_seq
        else:
            padded = input_seq[-99:]

        input_tensor = torch.tensor([padded], dtype=torch.long).to(device)

        with torch.no_grad():
            output = model(input_tensor)
            predicted_idx = torch.argmax(output, dim=1).item()
            predicted_word = idx_to_word[predicted_idx]

        if predicted_word == "<END>":
            words.append(".")
            break

        words.append(predicted_word)
        input_seq.append(predicted_idx)

    return " ".join(words)

In [31]:
seed_text = ["i will explore", "i world like to know", "my dream was", "i felt a strange", "at night"]

for i in range(len(seed_text)):
    test = generate_text(model, seed_text[i], length=10, device=device)
    print(f"Sentence {i+1}:", test)

Sentence 1: i will explore into fear and at home the greatest affection and even
Sentence 2: i world like to know the shore i might have put into the harbour ignorant
Sentence 3: my dream was as a child there to another in mutual bonds .
Sentence 4: i felt a strange nature pressed upon me .
Sentence 5: at night on her head passing with the experience of safie who
