In [1]:
# Load text
with open("combined.txt", 'r', encoding='utf-8') as f:
    text = f.read().lower()

# Create a character-level vocabulary
chars = sorted(set(text))
char2idx = {ch: idx for idx, ch in enumerate(chars)}
idx2char = {idx: ch for idx, ch in enumerate(chars)}
vocab_size = len(chars)
# Encode the entire text
encoded_text = [char2idx[c] for c in text]


In [2]:

print("chars",char2idx)

chars {'\n': 0, ' ': 1, '!': 2, "'": 3, '(': 4, ')': 5, ',': 6, '.': 7, '0': 8, '1': 9, '2': 10, '3': 11, '4': 12, '5': 13, '6': 14, '7': 15, '8': 16, '9': 17, ':': 18, ';': 19, '?': 20, 'a': 21, 'b': 22, 'c': 23, 'd': 24, 'e': 25, 'f': 26, 'g': 27, 'h': 28, 'i': 29, 'j': 30, 'k': 31, 'l': 32, 'm': 33, 'n': 34, 'o': 35, 'p': 36, 'q': 37, 'r': 38, 's': 39, 't': 40, 'u': 41, 'v': 42, 'w': 43, 'x': 44, 'y': 45, 'z': 46}


In [2]:
    import torch
from torch.utils.data import Dataset, DataLoader

class CharDataset(Dataset):
    def __init__(self, data, block_size):
        self.data = data
        self.block_size = block_size

    def __len__(self):
        return len(self.data) - self.block_size

    def __getitem__(self, idx):
        x = torch.tensor(self.data[idx:idx+self.block_size])
        y = torch.tensor(self.data[idx+1:idx+self.block_size+1])
        return x, y

block_size = 100
dataset = CharDataset(encoded_text, block_size)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)


In [3]:
import torch.nn as nn

class CharLSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=256, num_layers=1):
        super(CharLSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        x = self.embedding(x)
        output, hidden = self.lstm(x, hidden)
        logits = self.fc(output)
        return logits, hidden


In [4]:
def train_model(model, data_loader, iters, char2idx, idx2char, save_path, device='cpu', max_batches_per_iter=10):
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.003)
    criterion = nn.CrossEntropyLoss()

    model.train()
    for iteration in range(iters):
        print(f"\nIteration {iteration+1}/{iters}")
        for i, (x, y) in enumerate(data_loader):
            if i >= max_batches_per_iter:
                break
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            logits, _ = model(x)
            loss = criterion(logits.view(-1, logits.size(-1)), y.view(-1))
            loss.backward()
            optimizer.step()

        print(f"Loss: {loss.item():.4f}")

    # Save the trained model
    torch.save({
        'model_state_dict': model.state_dict(),
        'char2idx': char2idx,
        'idx2char': idx2char
    }, save_path)
    print(f"\n Model saved to: {save_path}")


In [6]:
def generate_text(model, start_text, length, device='cpu'):
    model.eval()
    input_eval = torch.tensor([char2idx[c] for c in start_text if c in char2idx], dtype=torch.long).unsqueeze(0).to(device)
    hidden = None
    output_text = start_text

    for _ in range(length):
        output, hidden = model(input_eval, hidden)
        probs = torch.softmax(output[:, -1, :], dim=-1)
        next_idx = torch.multinomial(probs, num_samples=1).item()
        next_char = idx2char[next_idx]
        output_text += next_char
        input_eval = torch.tensor([[next_idx]], device=device)

    return output_text


In [7]:
model = CharLSTMModel(vocab_size)

print(f"\n--- Training for 500 iterations ---")
train_model(model, dataloader, iters=500, char2idx=char2idx, idx2char=idx2char, save_path='char_LSTM_500_iters.pth')
sample = generate_text(model, start_text="a", length=500)
print(f"\nGenerated after 500 iterations:\n{sample}\n")



--- Training for 500 iterations ---

Iteration 1/500
Loss: 2.9481

Iteration 2/500
Loss: 2.6855

Iteration 3/500
Loss: 2.4492

Iteration 4/500
Loss: 2.3153

Iteration 5/500
Loss: 2.2072

Iteration 6/500
Loss: 2.1046

Iteration 7/500
Loss: 1.9953

Iteration 8/500
Loss: 1.8971

Iteration 9/500
Loss: 1.8159

Iteration 10/500
Loss: 1.8077

Iteration 11/500
Loss: 1.7359

Iteration 12/500
Loss: 1.6477

Iteration 13/500
Loss: 1.6118

Iteration 14/500
Loss: 1.5753

Iteration 15/500
Loss: 1.5813

Iteration 16/500
Loss: 1.5108

Iteration 17/500
Loss: 1.5129

Iteration 18/500
Loss: 1.4479

Iteration 19/500
Loss: 1.4572

Iteration 20/500
Loss: 1.4516

Iteration 21/500
Loss: 1.3850

Iteration 22/500
Loss: 1.4001

Iteration 23/500
Loss: 1.3877

Iteration 24/500
Loss: 1.2730

Iteration 25/500
Loss: 1.3073

Iteration 26/500
Loss: 1.3099

Iteration 27/500
Loss: 1.2796

Iteration 28/500
Loss: 1.2849

Iteration 29/500
Loss: 1.2315

Iteration 30/500
Loss: 1.3185

Iteration 31/500
Loss: 1.2673

Iteration 

In [8]:
print(f"\n--- Training for 10 iterations ---")
train_model(model, dataloader, iters=1000, char2idx=char2idx, idx2char=idx2char, save_path='char_LSTM_1000_iters.pth')
sample = generate_text(model, start_text="a", length=500)
print(f"\nGenerated after 1000 iterations:\n{sample}\n")



--- Training for 10 iterations ---

Iteration 1/1000
Loss: 0.2489

Iteration 2/1000
Loss: 0.2387

Iteration 3/1000
Loss: 0.2329

Iteration 4/1000
Loss: 0.2063

Iteration 5/1000
Loss: 0.2024

Iteration 6/1000
Loss: 0.2021

Iteration 7/1000
Loss: 0.1937

Iteration 8/1000
Loss: 0.1959

Iteration 9/1000
Loss: 0.1965

Iteration 10/1000
Loss: 0.1891

Iteration 11/1000
Loss: 0.1958

Iteration 12/1000
Loss: 0.1907

Iteration 13/1000
Loss: 0.1972

Iteration 14/1000
Loss: 0.1977

Iteration 15/1000
Loss: 0.1974

Iteration 16/1000
Loss: 0.2027

Iteration 17/1000
Loss: 0.1976

Iteration 18/1000
Loss: 0.1986

Iteration 19/1000
Loss: 0.1908

Iteration 20/1000
Loss: 0.2094

Iteration 21/1000
Loss: 0.1912

Iteration 22/1000
Loss: 0.2026

Iteration 23/1000
Loss: 0.2090

Iteration 24/1000
Loss: 0.2080

Iteration 25/1000
Loss: 0.2055

Iteration 26/1000
Loss: 0.2086

Iteration 27/1000
Loss: 0.2292

Iteration 28/1000
Loss: 0.2105

Iteration 29/1000
Loss: 0.2090

Iteration 30/1000
Loss: 0.2062

Iteration 31

In [9]:
print(f"\n--- Training for 2000 iterations ---")
train_model(model, dataloader, iters=2000, char2idx=char2idx, idx2char=idx2char, save_path='char_LSTM_2000_iters.pth')
sample = generate_text(model, start_text="a", length=500)
print(f"\nGenerated after 2000 iterations:\n{sample}\n")



--- Training for 2000 iterations ---

Iteration 1/2000
Loss: 0.1993

Iteration 2/2000
Loss: 0.1961

Iteration 3/2000
Loss: 0.1941

Iteration 4/2000
Loss: 0.1815

Iteration 5/2000
Loss: 0.1706

Iteration 6/2000
Loss: 0.1776

Iteration 7/2000
Loss: 0.1761

Iteration 8/2000
Loss: 0.1635

Iteration 9/2000
Loss: 0.1608

Iteration 10/2000
Loss: 0.1652

Iteration 11/2000
Loss: 0.1557

Iteration 12/2000
Loss: 0.1655

Iteration 13/2000
Loss: 0.1669

Iteration 14/2000
Loss: 0.1668

Iteration 15/2000
Loss: 0.1736

Iteration 16/2000
Loss: 0.1692

Iteration 17/2000
Loss: 0.1747

Iteration 18/2000
Loss: 0.1728

Iteration 19/2000
Loss: 0.1788

Iteration 20/2000
Loss: 0.1869

Iteration 21/2000
Loss: 0.1706

Iteration 22/2000
Loss: 0.1736

Iteration 23/2000
Loss: 0.1726

Iteration 24/2000
Loss: 0.1642

Iteration 25/2000
Loss: 0.1588

Iteration 26/2000
Loss: 0.1750

Iteration 27/2000
Loss: 0.1811

Iteration 28/2000
Loss: 0.1801

Iteration 29/2000
Loss: 0.1719

Iteration 30/2000
Loss: 0.1704

Iteration 