In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import random

In [4]:
# Sample text data
text = """In the beginning God created the heavens and the earth.
And the earth was without form, and void; and darkness was upon the face of the deep.
And the Spirit of God moved upon the face of the waters.
And God said, Let there be light: and there was light."""

# Create a function to preprocess the text
def preprocess_text(text):
    text = text.lower()
    words = text.split()
    return words

# Preprocess the data
words = preprocess_text(text)
print(words)

['in', 'the', 'beginning', 'god', 'created', 'the', 'heavens', 'and', 'the', 'earth.', 'and', 'the', 'earth', 'was', 'without', 'form,', 'and', 'void;', 'and', 'darkness', 'was', 'upon', 'the', 'face', 'of', 'the', 'deep.', 'and', 'the', 'spirit', 'of', 'god', 'moved', 'upon', 'the', 'face', 'of', 'the', 'waters.', 'and', 'god', 'said,', 'let', 'there', 'be', 'light:', 'and', 'there', 'was', 'light.']


In [5]:
# Create a vocabulary
word_counts = Counter(words)
vocab = sorted(word_counts, key=word_counts.get, reverse=True)
vocab_size = len(vocab)

# Create a mapping from words to indices
word_to_index = {word: i for i, word in enumerate(vocab)}
index_to_word = {i: word for i, word in enumerate(vocab)}

In [6]:
# Hyperparameters
seq_length = 5
n_samples = len(words) - seq_length

# Create input and target sequences
input_sequences = []
target_words = []

for i in range(n_samples):
    input_seq = words[i:i + seq_length]
    target_word = words[i + seq_length]
    input_sequences.append([word_to_index[w] for w in input_seq])
    target_words.append(word_to_index[target_word])

# Convert to numpy arrays
input_sequences = np.array(input_sequences)
target_words = np.array(target_words)

In [7]:
class TextDataset(Dataset):
    def __init__(self, inputs, targets):
        self.inputs = inputs
        self.targets = targets

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return torch.tensor(self.inputs[idx], dtype=torch.long), torch.tensor(self.targets[idx], dtype=torch.long)

# Create dataset and dataloader
dataset = TextDataset(input_sequences, target_words)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

In [8]:
class RNNTextGenerator(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(RNNTextGenerator, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        x = self.embeddings(x)
        rnn_out, _ = self.rnn(x)
        logits = self.fc(rnn_out[:, -1, :])  # Take the last output for prediction
        return logits

In [9]:
# Hyperparameters
embedding_dim = 10
hidden_dim = 10
num_epochs = 100
learning_rate = 0.001

# Initialize the model, loss function, and optimizer
model = RNNTextGenerator(vocab_size, embedding_dim, hidden_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    for inputs, targets in dataloader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [10/100], Loss: 2.4661
Epoch [20/100], Loss: 1.8597
Epoch [30/100], Loss: 2.3912
Epoch [40/100], Loss: 1.1995
Epoch [50/100], Loss: 2.1404
Epoch [60/100], Loss: 1.9005
Epoch [70/100], Loss: 0.8989
Epoch [80/100], Loss: 0.4906
Epoch [90/100], Loss: 0.6231
Epoch [100/100], Loss: 1.7548


In [None]:
def generate_text(model, start_text, generation_length):
    model.eval()  # Set the model to evaluation mode
    generated = start_text.split()
    
    for _ in range(generation_length):
        input_seq = [word_to_index[word] for word in generated[-seq_length:]]
        input_tensor = torch.tensor(input_seq, dtype=torch.long).unsqueeze(0)  # Add batch dimension
        with torch.no_grad():
            output = model(input_tensor)
        
        # Get the predicted word index
        predicted_idx = torch.argmax(output, dim=1).item()
        predicted_word = index_to_word[predicted_idx]
        generated.append(predicted_word)
    
    return ' '.join(generated)

# Generate text
start_text = "and darkness was"
generated_text = generate_text(model, start_text, generation_length=10)
print(f'Generated text: {generated_text}')

KeyError: 'said'