In [15]:
# Load text
with open("combined.txt", 'r', encoding='utf-8') as f:
    text = f.read().lower()

# Create a character-level vocabulary
chars = sorted(set(text))
print(chars)
char2idx = {ch: idx for idx, ch in enumerate(chars)}
idx2char = {idx: ch for idx, ch in enumerate(chars)}
vocab_size = len(chars)

# Encode the entire text
encoded_text = [char2idx[c] for c in text]


['\n', ' ', '!', "'", '(', ')', ',', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [2]:
print(vocab_size)

47


In [3]:

print("chars",char2idx)

chars {'\n': 0, ' ': 1, '!': 2, "'": 3, '(': 4, ')': 5, ',': 6, '.': 7, '0': 8, '1': 9, '2': 10, '3': 11, '4': 12, '5': 13, '6': 14, '7': 15, '8': 16, '9': 17, ':': 18, ';': 19, '?': 20, 'a': 21, 'b': 22, 'c': 23, 'd': 24, 'e': 25, 'f': 26, 'g': 27, 'h': 28, 'i': 29, 'j': 30, 'k': 31, 'l': 32, 'm': 33, 'n': 34, 'o': 35, 'p': 36, 'q': 37, 'r': 38, 's': 39, 't': 40, 'u': 41, 'v': 42, 'w': 43, 'x': 44, 'y': 45, 'z': 46}


In [4]:
print(encoded_text[47])

22


In [5]:
import torch
from torch.utils.data import Dataset, DataLoader

class CharDataset(Dataset):
    def __init__(self, data, block_size):
        self.data = data
        self.block_size = block_size

    def __len__(self):
        return len(self.data) - self.block_size

    def __getitem__(self, idx):
        x = torch.tensor(self.data[idx:idx+self.block_size])
        y = torch.tensor(self.data[idx+1:idx+self.block_size+1])
        return x, y
#50–200
block_size = 100
dataset = CharDataset(encoded_text, block_size)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)


In [6]:
import torch.nn as nn

class GRUCharModel(nn.Module):
    def __init__(self, vocab_size, hidden_size):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden=None):
        x = self.embed(x)
        out, hidden = self.gru(x, hidden)
        logits = self.fc(out)
        return logits, hidden


In [7]:
def train_model(model, data_loader, iters, char2idx, idx2char, save_path, device='cpu', max_batches_per_iter=10):
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.003)
    criterion = nn.CrossEntropyLoss()

    model.train()
    for iteration in range(iters):
        print(f"\nIteration {iteration+1}/{iters}")
        for i, (x, y) in enumerate(data_loader):
            if i >= max_batches_per_iter:
                break
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            logits, _ = model(x)
            loss = criterion(logits.view(-1, logits.size(-1)), y.view(-1))
            loss.backward()
            optimizer.step()

        print(f"Loss: {loss.item():.4f}")

    # Save the trained model
    torch.save({
        'model_state_dict': model.state_dict(),
        'char2idx': char2idx,
        'idx2char': idx2char
    }, save_path)
    print(f"\n Model saved to: {save_path}")


In [8]:
def generate_text(model, start_text, length, device='cpu'):
    model.eval()
    input_eval = torch.tensor([char2idx[c] for c in start_text], dtype=torch.long).unsqueeze(0).to(device)
    hidden = None
    output_text = start_text

    with torch.no_grad():
        for _ in range(length):
            logits, hidden = model(input_eval, hidden)
            probs = torch.softmax(logits[:, -1, :], dim=-1)
            next_char_idx = torch.multinomial(probs, num_samples=1).item()
            next_char = idx2char[next_char_idx]
            output_text += next_char
            input_eval = torch.tensor([[next_char_idx]], device=device)

    return output_text


In [11]:
model = GRUCharModel(vocab_size=vocab_size, hidden_size=256)


In [6]:

print(f"\n--- Training for 500 iterations ---")
train_model(model, dataloader, iters=500, char2idx=char2idx, idx2char=idx2char, save_path='char_gru_500_iters.pth')
sample = generate_text(model, start_text="a", length=500)
print(f"\nGenerated after 500 iterations:\n{sample}\n")



--- Training for 500 iterations ---

Iteration 1/500
Loss: 2.5203

Iteration 2/500
Loss: 2.2897

Iteration 3/500
Loss: 2.1028

Iteration 4/500
Loss: 2.0030

Iteration 5/500
Loss: 1.8680

Iteration 6/500
Loss: 1.8329

Iteration 7/500
Loss: 1.7654

Iteration 8/500
Loss: 1.6684

Iteration 9/500
Loss: 1.6416

Iteration 10/500
Loss: 1.6087

Iteration 11/500
Loss: 1.5374

Iteration 12/500
Loss: 1.4605

Iteration 13/500
Loss: 1.4913

Iteration 14/500
Loss: 1.4484

Iteration 15/500
Loss: 1.4195

Iteration 16/500
Loss: 1.4657

Iteration 17/500
Loss: 1.4116

Iteration 18/500
Loss: 1.3775

Iteration 19/500
Loss: 1.3724

Iteration 20/500
Loss: 1.3537

Iteration 21/500
Loss: 1.3073

Iteration 22/500
Loss: 1.3117

Iteration 23/500
Loss: 1.3074

Iteration 24/500
Loss: 1.2976

Iteration 25/500
Loss: 1.2213

Iteration 26/500
Loss: 1.2862

Iteration 27/500
Loss: 1.2691

Iteration 28/500
Loss: 1.2431

Iteration 29/500
Loss: 1.2088

Iteration 30/500
Loss: 1.2381

Iteration 31/500
Loss: 1.1693

Iteration 

In [7]:
print(f"\n--- Training for 1000 iterations ---")
train_model(model, dataloader, iters=1000, char2idx=char2idx, idx2char=idx2char, save_path='char_gru_1000_iters.pth')
sample = generate_text(model, start_text="a", length=500)
print(f"\nGenerated after 1000 iterations:\n{sample}\n")



--- Training for 1000 iterations ---

Iteration 1/1000
Loss: 0.7529

Iteration 2/1000
Loss: 0.7415

Iteration 3/1000
Loss: 0.7415

Iteration 4/1000
Loss: 0.7187

Iteration 5/1000
Loss: 0.7438

Iteration 6/1000
Loss: 0.7464

Iteration 7/1000
Loss: 0.6923

Iteration 8/1000
Loss: 0.6999

Iteration 9/1000
Loss: 0.7157

Iteration 10/1000
Loss: 0.6948

Iteration 11/1000
Loss: 0.7084

Iteration 12/1000
Loss: 0.6731

Iteration 13/1000
Loss: 0.6790

Iteration 14/1000
Loss: 0.6774

Iteration 15/1000
Loss: 0.6859

Iteration 16/1000
Loss: 0.6427

Iteration 17/1000
Loss: 0.7046

Iteration 18/1000
Loss: 0.6782

Iteration 19/1000
Loss: 0.6657

Iteration 20/1000
Loss: 0.6749

Iteration 21/1000
Loss: 0.6926

Iteration 22/1000
Loss: 0.6925

Iteration 23/1000
Loss: 0.6808

Iteration 24/1000
Loss: 0.6848

Iteration 25/1000
Loss: 0.7258

Iteration 26/1000
Loss: 0.7258

Iteration 27/1000
Loss: 0.7252

Iteration 28/1000
Loss: 0.6772

Iteration 29/1000
Loss: 0.7213

Iteration 30/1000
Loss: 0.6875

Iteration 

In [12]:
print(f"\n--- Training for 2000 iterations ---")
train_model(model, dataloader, iters=2000, char2idx=char2idx, idx2char=idx2char, save_path='char_gru_2000_iters.pth')

print(f"\nGenerated after 2000 iterations:\n{sample}\n")



--- Training for 2000 iterations ---

Iteration 1/2000
Loss: 2.4783

Iteration 2/2000
Loss: 2.2665

Iteration 3/2000
Loss: 2.0841

Iteration 4/2000
Loss: 1.9871

Iteration 5/2000
Loss: 1.8664

Iteration 6/2000
Loss: 1.7860

Iteration 7/2000
Loss: 1.7354

Iteration 8/2000
Loss: 1.6907

Iteration 9/2000
Loss: 1.6151

Iteration 10/2000
Loss: 1.5913

Iteration 11/2000
Loss: 1.5719

Iteration 12/2000
Loss: 1.4787

Iteration 13/2000
Loss: 1.4790

Iteration 14/2000
Loss: 1.4776

Iteration 15/2000
Loss: 1.3521

Iteration 16/2000
Loss: 1.4615

Iteration 17/2000
Loss: 1.3556

Iteration 18/2000
Loss: 1.4027

Iteration 19/2000
Loss: 1.3685

Iteration 20/2000
Loss: 1.2900

Iteration 21/2000
Loss: 1.2645

Iteration 22/2000
Loss: 1.2932

Iteration 23/2000
Loss: 1.2457

Iteration 24/2000
Loss: 1.2982

Iteration 25/2000
Loss: 1.2701

Iteration 26/2000
Loss: 1.2560

Iteration 27/2000
Loss: 1.2348

Iteration 28/2000
Loss: 1.2126

Iteration 29/2000
Loss: 1.2324

Iteration 30/2000
Loss: 1.2339

Iteration 

NameError: name 'sample' is not defined

In [13]:
sample = generate_text(model, start_text="a", length=500)


In [14]:
print(f"\nGenerated after 2000 iterations:\n{sample}\n")



Generated after 2000 iterations:
apers't of lold arts of dideted ai researchers (thing paramyrrosply; their desdation or a the easistive tropage direptive real in predictial sucse suble and soiks kurences. the network's emible though (hused stace in 195 a multiplayer aning yar ocheing in a mundomation at in the current neural network is representations. hummored and cetures ant becamor of ken projection sit 22 speakive the defficiallegmy . the process scaing in the use solved the image is were pressentiapssoly connectors won has

