In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import random


#Load and Tokenize Data
##################################

#Read the whole corpus into memory as one string
with open('last_data.txt', 'r', encoding='utf-8') as f:
    text = f.read()

#Character-level vocabulary: every distinct character, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)
print(f"Unique characters in the dataset: {vocab_size}")

#Lookup tables between characters and integer ids (id = position in chars)
idx_to_char = dict(enumerate(chars))
char_to_idx = {ch: i for i, ch in idx_to_char.items()}

#Encode the entire text as a 1-D tensor of integer ids
encoded = torch.tensor([char_to_idx[ch] for ch in text], dtype=torch.long)

#Train-validation split: the leading split_ratio share is used for training,
#the remainder for validation (no shuffling, the text order is kept)
split_ratio = 0.9
n = int(split_ratio * len(encoded))
train_data, val_data = encoded[:n], encoded[n:]


#Dataset Class
##############################################################

class TextDataset(Dataset):
    """Sliding-window dataset for next-token prediction.

    Sample ``i`` is the pair
    ``(data[i:i + block_size], data[i + 1:i + block_size + 1])``, i.e. the
    target is the input window shifted one position to the right.

    Args:
        data: Sequence of token ids that supports ``len()`` and slicing
            (the notebook passes a 1-D ``torch.long`` tensor).
        block_size: Number of tokens in each input/target window; must be
            positive.

    Raises:
        ValueError: If ``block_size`` is not positive.
    """

    def __init__(self, data, block_size):
        if block_size <= 0:
            raise ValueError(f"block_size must be positive, got {block_size}")
        self.data = data
        self.block_size = block_size

    def __len__(self):
        """Return the number of complete (input, target) windows."""
        # Every window consumes block_size + 1 tokens (the inputs plus the last
        # shifted target). Clamp at zero: len() raises ValueError on a negative
        # result, which is what happened when data was shorter than block_size.
        return max(0, len(self.data) - self.block_size)

    def __getitem__(self, idx):
        """Return the ``(x, y)`` window pair starting at position ``idx``.

        Negative indices count from the end, as for built-in sequences.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        size = len(self)
        # Not in-place: idx may be a tensor owned by the caller.
        if idx < 0:
            idx = idx + size
        # Without this check an out-of-range index silently produced truncated
        # or empty windows, and iterating the dataset never terminated.
        if not 0 <= idx < size:
            raise IndexError(f"index out of range for TextDataset of length {size}")
        x = self.data[idx:idx + self.block_size]
        y = self.data[idx + 1:idx + self.block_size + 1]
        return x, y

block_size = 32  #Tokens per training window
batch_size = 16  #Windows per batch, shared by both loaders

train_dataset = TextDataset(train_data, block_size)
val_dataset = TextDataset(val_data, block_size)

#Training: reshuffle every epoch and drop the ragged final batch
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
#Validation: fixed order, and keep the final partial batch so that every
#validation window is scored and a validation set smaller than one batch
#still yields a batch instead of an empty loader
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, drop_last=False)


#LSTM-Based Language Model
################################################################

class LSTMLanguageModel(nn.Module):
    """Token-level language model: embedding -> stacked LSTM -> linear head.

    Args:
        vocab_size: Number of distinct tokens; size of the embedding table and
            of the output logits.
        embedding_dim: Width of each token embedding.
        hidden_dim: Width of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x, hidden_state=None):
        """Compute next-token logits for a batch of token-id sequences.

        Args:
            x: Integer token ids of shape (B, T).
            hidden_state: Optional ``(h, c)`` tuple returned by a previous
                call; ``None`` starts from the LSTM's default initial state.

        Returns:
            ``(logits, hidden_state)`` where ``logits`` has shape
            (B, T, vocab_size) and ``hidden_state`` is the LSTM's final
            ``(h, c)`` tuple.
        """
        embedded = self.embedding(x)  # (B, T, embedding_dim)

        # nn.LSTM accepts None for the state and then uses its default
        # initial state, so the stateless and stateful calls share one path.
        lstm_out, next_state = self.lstm(embedded, hidden_state)  # (B, T, hidden_dim)

        # Project every time step onto the vocabulary.
        return self.fc(lstm_out), next_state  # logits: (B, T, vocab_size)


#Training Setup
##########################################################

#Seed the Python and PyTorch RNGs so weight initialisation, batch shuffling
#and text sampling repeat from run to run (some GPU kernels may still be
#non-deterministic)
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

#Use the GPU when one is available, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'

#Model hyperparameters
vocab_size = len(chars)
embedding_dim = 128
hidden_dim = 256
num_layers = 2

model = LSTMLanguageModel(vocab_size, embedding_dim, hidden_dim, num_layers).to(device)
optimizer = optim.Adam(model.parameters(), lr=3e-4)
#Default reduction: mean loss over all tokens in the batch
criterion = nn.CrossEntropyLoss()


#Training Loop
############################################################

model.train()  #Switch the model to training mode
num_epochs = 1  #Increase for better training
for epoch in range(num_epochs):
    for step, (x_batch, y_batch) in enumerate(train_loader):
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)

        #Clear the gradients accumulated by the previous step
        optimizer.zero_grad()

        logits, _ = model(x_batch)  # (B, T, vocab_size)
        B, T, V = logits.shape
        #The criterion takes (N, classes) scores and (N,) targets, so fold the
        #batch and time axes together
        loss = criterion(logits.reshape(B * T, V), y_batch.reshape(B * T))

        loss.backward()
        optimizer.step()

        #Report progress every 100 steps
        if not step % 100:
            print(f"Epoch {epoch}, Step {step}, Loss: {loss.item():.4f}")


# Evaluation
######################################################

model.eval()  #Switch the model to evaluation mode
val_loss = 0.0   #Running sum of per-token losses
val_steps = 0    #Number of batches seen
val_tokens = 0   #Number of target tokens seen
with torch.no_grad():  #No gradients are needed for scoring
    for x_batch, y_batch in val_loader:
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)
        logits, _ = model(x_batch)
        B, T, V = logits.shape
        loss = criterion(logits.view(B*T, V), y_batch.view(B*T))
        #criterion returns the mean over this batch's B*T tokens; weight it by
        #the token count so batches of different sizes average correctly
        val_loss += loss.item() * (B * T)
        val_tokens += B * T
        val_steps += 1

if val_steps == 0:
    #An empty loader used to end in ZeroDivisionError; report it instead
    val_loss = float('nan')
    print("Validation loader produced no batches; validation loss is undefined.")
else:
    val_loss /= val_tokens
print(f"Validation loss: {val_loss:.4f}")


# Text Generation
#################################################

def generate_text(model, start_char, char_to_idx, idx_to_char, max_length=500, temperature=1.0):
    """Sample text from the model one token at a time.

    Starting from ``start_char``, the model is fed its own previous sample
    together with the carried-over recurrent state, and the next token is
    drawn from the softmax distribution over its logits.

    Args:
        model: Module whose call ``model(ids, hidden_state)`` returns
            ``(logits, hidden_state)`` with logits of shape
            (1, T, vocab_size). It is put into eval mode and left there.
        start_char: Token to start from; must be a key of ``char_to_idx``.
        char_to_idx: Mapping from token to integer id.
        idx_to_char: Mapping from integer id back to token.
        max_length: Number of tokens to sample after ``start_char``.
        temperature: Positive divisor applied to the logits before the
            softmax; 1.0 samples from the unmodified distribution, smaller
            values sharpen it and larger values flatten it.

    Returns:
        ``start_char`` followed by ``max_length`` sampled tokens, joined into
        one string.

    Raises:
        KeyError: If ``start_char`` is not in ``char_to_idx``.
        ValueError: If ``temperature`` is not positive.
    """
    if start_char not in char_to_idx:
        raise KeyError(f"start_char {start_char!r} is not in the vocabulary")
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}")

    model.eval()

    # Build the input on the device the model lives on instead of depending on
    # a notebook-level `device` global; fall back to CPU for parameter-free models.
    first_param = next(iter(model.parameters()), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    input_idx = torch.tensor([[char_to_idx[start_char]]], dtype=torch.long, device=run_device)
    generated_text = [start_char]

    hidden_state = None  # None lets the model start from its default state
    with torch.no_grad():
        for _ in range(max_length):
            logits, hidden_state = model(input_idx, hidden_state)  #(1, 1, vocab_size)
            # Only the last time step predicts the next token.
            probs = torch.softmax(logits[0, -1, :] / temperature, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            generated_text.append(idx_to_char[next_token.item()])
            # Reshape the sampled id from (1,) to (1, 1) for the next step.
            input_idx = next_token.unsqueeze(0)

    return ''.join(generated_text)

#Seed the generation with a character drawn at random from the vocabulary
random_start_char = random.choice(list(char_to_idx))
print(f"Starting generation with random character: {random_start_char}")
generated_text = generate_text(model, random_start_char, char_to_idx, idx_to_char)
print("Generated text:", generated_text, sep="\n")


Unique characters in the dataset: 75
Epoch 0, Step 0, Loss: 4.3091
Epoch 0, Step 100, Loss: 2.9286
Epoch 0, Step 200, Loss: 2.5332
Epoch 0, Step 300, Loss: 2.4774
Epoch 0, Step 400, Loss: 2.3912
Epoch 0, Step 500, Loss: 2.2219
Epoch 0, Step 600, Loss: 2.1376
Epoch 0, Step 700, Loss: 2.0660
Epoch 0, Step 800, Loss: 2.1510
Epoch 0, Step 900, Loss: 2.0758
Epoch 0, Step 1000, Loss: 2.1155
Epoch 0, Step 1100, Loss: 2.0024
Epoch 0, Step 1200, Loss: 1.9641
Epoch 0, Step 1300, Loss: 1.9954
Epoch 0, Step 1400, Loss: 2.0020
Epoch 0, Step 1500, Loss: 1.9022
Epoch 0, Step 1600, Loss: 1.8235
Epoch 0, Step 1700, Loss: 1.8522
Epoch 0, Step 1800, Loss: 1.8937
Epoch 0, Step 1900, Loss: 1.8449
Epoch 0, Step 2000, Loss: 1.8247
Epoch 0, Step 2100, Loss: 1.9656
Epoch 0, Step 2200, Loss: 1.8165
Epoch 0, Step 2300, Loss: 1.7533
Epoch 0, Step 2400, Loss: 1.7896
Epoch 0, Step 2500, Loss: 1.8126
Epoch 0, Step 2600, Loss: 1.6990
Epoch 0, Step 2700, Loss: 1.7064
Epoch 0, Step 2800, Loss: 1.7683
Epoch 0, Step 2900

In [3]:
def generate_text(model, start_char, char_to_idx, idx_to_char, max_length=2000, temperature=1.0):
    """Sample text from the model one token at a time.

    This cell redefines ``generate_text`` with a longer default ``max_length``
    (2000); from here on this definition shadows the earlier one.

    Starting from ``start_char``, the model is fed its own previous sample
    together with the carried-over recurrent state, and the next token is
    drawn from the softmax distribution over its logits.

    Args:
        model: Module whose call ``model(ids, hidden_state)`` returns
            ``(logits, hidden_state)`` with logits of shape
            (1, T, vocab_size). It is put into eval mode and left there.
        start_char: Token to start from; must be a key of ``char_to_idx``.
        char_to_idx: Mapping from token to integer id.
        idx_to_char: Mapping from integer id back to token.
        max_length: Number of tokens to sample after ``start_char``.
        temperature: Positive divisor applied to the logits before the
            softmax; 1.0 samples from the unmodified distribution, smaller
            values sharpen it and larger values flatten it.

    Returns:
        ``start_char`` followed by ``max_length`` sampled tokens, joined into
        one string.

    Raises:
        KeyError: If ``start_char`` is not in ``char_to_idx``.
        ValueError: If ``temperature`` is not positive.
    """
    if start_char not in char_to_idx:
        raise KeyError(f"start_char {start_char!r} is not in the vocabulary")
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}")

    model.eval()

    # Build the input on the device the model lives on instead of depending on
    # a notebook-level `device` global; fall back to CPU for parameter-free models.
    first_param = next(iter(model.parameters()), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    input_idx = torch.tensor([[char_to_idx[start_char]]], dtype=torch.long, device=run_device)
    generated_text = [start_char]

    hidden_state = None  # None lets the model start from its default state
    with torch.no_grad():
        for _ in range(max_length):
            logits, hidden_state = model(input_idx, hidden_state)  # (1, 1, vocab_size)
            # Only the last time step predicts the next token.
            probs = torch.softmax(logits[0, -1, :] / temperature, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            generated_text.append(idx_to_char[next_token.item()])
            # Reshape the sampled id from (1,) to (1, 1) for the next step.
            input_idx = next_token.unsqueeze(0)

    return ''.join(generated_text)

#Sample again from the start character chosen in the earlier cell, using the
#generate_text defined just above (default max_length=2000)
generated_text = generate_text(model, random_start_char, char_to_idx, idx_to_char)
print("Generated text:", generated_text, sep="\n")


Generated text:
the beds of the Dorn altogewhere out
this place, "quite closeness," with most for again, and smulled at the thing left up; any good occasy:  "I think then.  But, I have been their proke, and to see well out of that yourself about her hand and leathing at the passenger, I demand him, of an awn of anybody's head was to shore and soloud and his little opportunity--or a Ungive that.  I saw well vigh look, and you will you can have the immoving the guiler of improvement.  Don't be fell, and was.  Her say, as your Twife to-night; I mean to, and I beginnt him besome you, my soul, my business, it begars, _no," I shall not be made to the knives of the attitud by no other, the knowledge thing, when he
ever turned out," said the coot with secret and obey wig had reveiged his blue timely consideration.  "What is it it."  He did not have been readies with
surrounded skees, and on his mind that was sure the Paris.


"Be done to me.  Everything at all, if it were hure you feel upon th