In [None]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim

# Load the whole corpus into memory as a single string.
# 'text8' is opened as an ordinary text file, so a plain read() returns the
# decoded contents; namelist() and decode() belong to the zipfile/bytes APIs
# and do not exist on a text-mode file handle.
with open('text8', 'r', encoding='utf-8') as file:
    text = file.read()

# --- Vocabulary --------------------------------------------------------------
# Whitespace-tokenise the corpus; every distinct token is a vocabulary entry.
tokens = text.split()
vocab = set(tokens)

# Enumerate the vocabulary in sorted order so the word <-> index assignment is
# identical on every kernel restart. Iterating the raw set depends on string
# hash randomisation, so indices (and anything saved against them) would
# otherwise change from run to run.
word_to_index = {word: i for i, word in enumerate(sorted(vocab))}
index_to_word = {i: word for word, i in word_to_index.items()}

token_indices = [word_to_index[token] for token in tokens]

# --- Training windows --------------------------------------------------------
sequence_length = 50

if len(token_indices) <= sequence_length:
    raise ValueError(
        f"corpus has {len(token_indices)} tokens; at least "
        f"{sequence_length + 1} are required to build one training window"
    )

# Row i is token_indices[i : i + sequence_length + 1], for every start position
# i in range(len(token_indices) - sequence_length).
# unfold() returns a strided *view* of the flat index tensor, so the overlapping
# windows are never materialised as a Python list of lists or a dense copy.
# The rows share storage: treat `sequences`, `X` and `y` as read-only.
sequences = torch.tensor(token_indices, dtype=torch.long).unfold(0, sequence_length + 1, 1)

# The first `sequence_length` tokens of each window are the model input,
# the final token is the prediction target.
X = sequences[:, :-1]
y = sequences[:, -1]

class TextDataset(Dataset):
    """Map-style dataset that pairs each stored input with its target.

    Args:
        X: Indexable collection of inputs; item ``i`` is the i-th example.
        y: Indexable collection of targets, indexed the same way as ``X``.
    """

    def __init__(self, X, y):
        self.X, self.y = X, y

    def __len__(self):
        # The example count is taken from the inputs alone.
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` pair stored at position ``idx``."""
        features = self.X[idx]
        target = self.y[idx]
        return features, target

# Wrap the input/target tensors in a Dataset and expose them as shuffled
# mini-batches of 64 examples.
dataset = TextDataset(X=X, y=y)
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=64)

In [None]:
class LSTMModel(nn.Module):
    """Embedding -> stacked LSTM -> linear projection over the vocabulary.

    Args:
        vocab_size: Number of embedding rows and number of output scores.
        embed_size: Width of each embedding vector (the LSTM input size).
        hidden_size: Width of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # batch_first=True: the LSTM consumes and produces (batch, seq, feature).
        self.lstm = nn.LSTM(
            input_size=embed_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Score every vocabulary entry from the final time step.

        Args:
            x: Integer tensor of embedding indices; with ``batch_first=True``
                the layout is (batch, seq_len).

        Returns:
            Tensor of shape (batch, vocab_size) holding unnormalised scores.
        """
        embedded = self.embedding(x)
        hidden_states, _ = self.lstm(embedded)
        # Only the output at the last sequence position feeds the classifier.
        last_step = hidden_states[:, -1]
        return self.fc(last_step)

# Model hyperparameters.
vocab_size = len(vocab)  # one embedding row / output score per distinct token
embed_size, hidden_size, num_layers = 128, 256, 2

model = LSTMModel(
    vocab_size=vocab_size,
    embed_size=embed_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
)

In [None]:
criterion = nn.CrossEntropyLoss()
# NOTE(review): the original line was cut off after `lr=0`. 0.001 (Adam's
# default step size) is assumed here; confirm the intended value.
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Hold out 20% of the examples so progress can be measured on data the
# optimiser never sees.
# NOTE(review): the windows are built with stride 1, so neighbouring examples
# overlap heavily and a random split is an optimistic estimate of
# generalisation; consider a contiguous split of the corpus instead.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=128, shuffle=False)

num_epochs = 10
for epoch in range(num_epochs):
    # --- training pass: only the training split updates the weights ---
    model.train()
    tot_loss = 0.0
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        tot_loss += loss.item()
    # Report the mean per-batch loss so the number does not scale with the
    # number of batches; max(..., 1) guards against an empty loader.
    train_loss = tot_loss / max(len(train_loader), 1)

    # --- validation pass: no gradients, no weight updates ---
    model.eval()
    val_total = 0.0
    with torch.no_grad():
        for inputs, targets in val_loader:
            val_total += criterion(model(inputs), targets).item()
    val_loss = val_total / max(len(val_loader), 1)

    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')

In [None]:
def generate_text(model, seed_text, next_words, max_sequence_len):
    """Greedily extend ``seed_text`` by ``next_words`` predicted words.

    Each step feeds the current context window to ``model``, takes the
    highest-scoring vocabulary index, appends the matching word to the text
    and slides the window forward.

    Args:
        model: Module mapping a (1, seq_len) tensor of word indices to a
            (1, vocab_size) tensor of scores. It is switched to eval mode and
            left in eval mode.
        seed_text: Whitespace-separated prompt. Each word is looked up in the
            module-level ``word_to_index`` as written, then lower-cased; words
            found neither way are left out of the model context (they remain
            in the returned text).
        next_words: Number of words to append.
        max_sequence_len: Maximum number of trailing tokens kept as context.

    Returns:
        ``seed_text`` followed by the generated words, separated by spaces.

    Raises:
        ValueError: If no word of ``seed_text`` is present in the vocabulary.
    """
    model.eval()

    # Map the prompt to indices, tolerating out-of-vocabulary words instead of
    # failing with a KeyError on the first unknown token.
    token_ids = []
    for word in seed_text.split():
        index = word_to_index.get(word)
        if index is None:
            index = word_to_index.get(word.lower())
        if index is not None:
            token_ids.append(index)
    if not token_ids:
        raise ValueError("seed_text contains no words from the vocabulary")

    # Build the context on the same device as the model's weights; fall back
    # to CPU for a parameter-free model.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    token_list = torch.tensor([token_ids], dtype=torch.long, device=device)
    # Clip the prompt to the context window before the first prediction too,
    # matching the per-step clipping below.
    token_list = token_list[:, -max_sequence_len:]

    for _ in range(next_words):
        with torch.no_grad():
            output = model(token_list)
        predicted = torch.argmax(output, dim=1).item()

        seed_text += " " + index_to_word[predicted]

        # Append on-tensor (no round trip through a Python list) and keep only
        # the most recent max_sequence_len tokens.
        next_token = torch.tensor([[predicted]], dtype=torch.long, device=device)
        token_list = torch.cat([token_list, next_token], dim=1)[:, -max_sequence_len:]

    return seed_text

# Demo: extend a short prompt by 20 predicted words, using the training
# window length as the context size.
next_words = 20
seed_text = "I want to eat an orange and"
generated_text = generate_text(
    model=model,
    seed_text=seed_text,
    next_words=next_words,
    max_sequence_len=sequence_length,
)
print(generated_text)