In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

################################################################
# Load and Tokenize Data
##############################################################

# Read the whole training corpus into memory as one string.
with open('last_data.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Character-level tokenization: the vocabulary is every distinct character
# that occurs in the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)
print(f"Unique characters in the dataset: {vocab_size}")

# Character ids keep the example simple; word-level or sub-word tokenization
# could be used here instead.
# Lookup tables in both directions: id -> character and character -> id.
idx_to_char = dict(enumerate(chars))
char_to_idx = {ch: i for i, ch in idx_to_char.items()}

# Encode the full corpus as a single 1-D tensor of integer ids.
encoded = torch.tensor([char_to_idx[ch] for ch in text], dtype=torch.long)

# Train/validation split: the leading 90% of the ids are used for training,
# the remainder is held out for validation.
split_ratio = 0.9
n = int(split_ratio * len(encoded))
train_data, val_data = encoded[:n], encoded[n:]


##############################################################
# Dataset Class
##################################################

class TextDataset(Dataset):
    """Sliding-window next-token dataset over a sequence of token ids.

    Sample ``i`` is the pair ``(x, y)`` with ``x = data[i : i + block_size]``
    and ``y = data[i + 1 : i + block_size + 1]``; ``y`` is ``x`` shifted one
    position to the right.

    Args:
        data: Sequence supporting ``len()`` and slicing (the notebook passes
            a 1-D tensor of token ids).
        block_size: Number of tokens in each window.
    """

    def __init__(self, data, block_size):
        self.data = data
        self.block_size = block_size

    def __len__(self):
        # A window starting at i also needs data[i + block_size] for its
        # target, so there are len(data) - block_size valid start positions.
        # Clamp at zero: a negative value would make len() raise when the
        # data is shorter than one block.
        return max(0, len(self.data) - self.block_size)

    def __getitem__(self, idx):
        """Return the ``(x, y)`` window pair starting at position ``idx``.

        Negative indices count from the end, as for a list.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        size = len(self)
        if idx < 0:
            idx += size
        # Slicing never fails on its own, so without this check an
        # out-of-range index would silently yield truncated windows and plain
        # iteration over the dataset would never terminate.
        if not 0 <= idx < size:
            raise IndexError(f"index {idx} is out of range for dataset of size {size}")
        x = self.data[idx:idx + self.block_size]
        y = self.data[idx + 1:idx + self.block_size + 1]
        return x, y

# Window length: how many consecutive token ids make up one example.
block_size = 32

# Wrap each split in a sliding-window dataset of (input, shifted-target) pairs.
train_dataset, val_dataset = (
    TextDataset(split, block_size) for split in (train_data, val_data)
)

# Training batches are reshuffled on every pass; validation keeps file order.
# Both loaders drop a trailing partial batch.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=16,
    shuffle=True,
    drop_last=True,
)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=16,
    shuffle=False,
    drop_last=True,
)

######################################################
# MiniGPT Model
#########################################################

class MiniGPT(nn.Module):
    """Small GPT-style language model.

    Token ids are embedded, summed with learned positional embeddings, passed
    through ``num_layers`` ``TransformerBlock`` modules, layer-normalised and
    projected to vocabulary logits by a bias-free linear head.

    Args:
        vocab_size: Number of distinct token ids.
        block_size: Maximum sequence length; size of the positional table.
        embedding_dim: Width of the embeddings and of every block.
        num_heads: Attention heads per block (forwarded to ``TransformerBlock``).
        num_layers: Number of stacked ``TransformerBlock`` modules.
        ff_hidden_mult: Feed-forward width multiplier (forwarded to
            ``TransformerBlock``).
    """

    def __init__(self, vocab_size, block_size, embedding_dim, num_heads, num_layers, ff_hidden_mult):
        super().__init__()
        # Remembered so forward() can reject over-long sequences up front.
        self.block_size = block_size
        self.token_embed = nn.Embedding(vocab_size, embedding_dim)
        self.pos_embed = nn.Embedding(block_size, embedding_dim)
        self.blocks = nn.ModuleList([
            TransformerBlock(embedding_dim, num_heads, ff_hidden_mult) for _ in range(num_layers)
        ])
        self.ln_f = nn.LayerNorm(embedding_dim)
        self.head = nn.Linear(embedding_dim, vocab_size, bias=False)

    def forward(self, idx):
        """Compute next-token logits.

        Args:
            idx: Integer tensor of token ids with shape (B, T), T <= block_size.

        Returns:
            Tensor of shape (B, T, vocab_size) with unnormalised logits.

        Raises:
            ValueError: If T exceeds ``block_size``.
        """
        B, T = idx.shape
        # Positions beyond block_size have no row in pos_embed. Fail with a
        # readable message instead of an out-of-range embedding lookup.
        if T > self.block_size:
            raise ValueError(
                f"sequence length {T} exceeds the model's block_size {self.block_size}"
            )
        token_embeddings = self.token_embed(idx)  #(B, T, C)
        positions = torch.arange(0, T, device=idx.device).unsqueeze(0)  #(1, T)
        positional_embeddings = self.pos_embed(positions)  #(1, T, C)
        # The (1, T, C) positional term broadcasts over the batch dimension.
        x = token_embeddings + positional_embeddings  #(B, T, C)
        for block in self.blocks:
            x = block(x)
        x = self.ln_f(x)  #(B, T, C)
        logits = self.head(x)  #(B, T, vocab_size)
        return logits

class TransformerBlock(nn.Module):
    """Pre-norm residual block: self-attention followed by a feed-forward net.

    Each sub-layer receives a layer-normalised copy of its input and its
    output is added back onto the un-normalised input.
    """

    def __init__(self, embedding_dim, num_heads, ff_hidden_mult):
        super().__init__()
        # Attention sub-layer with its input normalisation.
        self.ln1 = nn.LayerNorm(embedding_dim)
        self.attn = MultiHeadAttention(embedding_dim, num_heads)
        # Feed-forward sub-layer with its input normalisation.
        self.ln2 = nn.LayerNorm(embedding_dim)
        self.ff = FeedForward(embedding_dim, ff_hidden_mult)

    def forward(self, x):
        """Apply both residual sub-layers to ``x`` and return the result."""
        attended = self.attn(self.ln1(x))
        hidden = x + attended
        transformed = self.ff(self.ln2(hidden))
        return hidden + transformed

class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention with a fused query/key/value projection.

    ``embedding_dim`` must be divisible by ``num_heads``; each head works on
    ``embedding_dim // num_heads`` channels.
    """

    def __init__(self, embedding_dim, num_heads):
        super().__init__()
        assert embedding_dim % num_heads == 0
        self.num_heads = num_heads
        self.head_dim = embedding_dim // num_heads
        # A single linear layer produces queries, keys and values together.
        self.qkv = nn.Linear(embedding_dim, embedding_dim * 3)
        self.output = nn.Linear(embedding_dim, embedding_dim)

    def forward(self, x):
        """Attend over ``x`` of shape (B, T, C) and return a (B, T, C) tensor."""
        batch, seq_len, channels = x.shape

        # (B, T, 3*C) -> (B, T, 3, heads, head_dim), then pick q/k/v and move
        # the head axis in front of time: each is (B, heads, T, head_dim).
        fused = self.qkv(x).reshape(batch, seq_len, 3, self.num_heads, self.head_dim)
        q, k, v = (fused[:, :, part].transpose(1, 2) for part in range(3))

        # Scaled dot-product scores, (B, heads, T, T).
        scores = torch.matmul(q, k.transpose(-2, -1)) / (self.head_dim ** 0.5)

        # True strictly above the diagonal marks "future" positions; they are
        # set to -inf so softmax gives them zero weight.
        future = torch.ones(seq_len, seq_len, device=x.device).triu(diagonal=1).bool()
        scores = scores.masked_fill(future, float('-inf'))
        weights = scores.softmax(dim=-1)

        # Weighted values, heads merged back into the channel dimension.
        merged = torch.matmul(weights, v).transpose(1, 2).reshape(batch, seq_len, channels)
        return self.output(merged)

class FeedForward(nn.Module):
    """Position-wise two-layer MLP: Linear -> ReLU -> Linear.

    The hidden width is ``embedding_dim * ff_hidden_mult``; the output has the
    same last dimension as the input.
    """

    def __init__(self, embedding_dim, ff_hidden_mult):
        super().__init__()
        hidden_dim = embedding_dim * ff_hidden_mult
        layers = [
            nn.Linear(embedding_dim, hidden_dim),   # expand to the hidden width
            nn.ReLU(),
            nn.Linear(hidden_dim, embedding_dim),   # project back down
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the MLP and return the result."""
        out = self.net(x)
        return out

####################################################
# Training Setup
#########################################################

# Seed PyTorch's global RNG before anything random happens so that weight
# initialisation, DataLoader shuffling and sampling start from the same state
# on every "Restart & Run All".
seed = 1337
torch.manual_seed(seed)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Model hyperparameters (these may be tuned).
vocab_size = len(chars)
embedding_dim = 128
num_heads = 4
num_layers = 2
ff_hidden_mult = 4

# Build the model on the chosen device, then the optimizer and the loss.
model = MiniGPT(vocab_size, block_size, embedding_dim, num_heads, num_layers, ff_hidden_mult).to(device)
optimizer = optim.Adam(model.parameters(), lr=3e-4)
criterion = nn.CrossEntropyLoss()

#############################################
# Training Loop
##################################################

# Switch the model to training mode for the optimisation loop.
model.train()
num_epochs = 1  # number of full passes over train_loader
for epoch in range(num_epochs):
    for step, batch in enumerate(train_loader):
        # Move the input and target windows to the model's device.
        x_batch, y_batch = (part.to(device) for part in batch)

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        logits = model(x_batch)  # (B, T, vocab_size)
        # Merge batch and time so every position is one classification
        # example: logits (B*T, vocab_size) against targets (B*T,).
        loss = criterion(logits.flatten(0, 1), y_batch.flatten())

        loss.backward()
        optimizer.step()

        # Report the current batch loss every 100 steps.
        if step % 100 == 0:
            print(f"Epoch {epoch}, Step {step}, Loss: {loss.item():.4f}")

################################################################
# Evaluation
###############################################################

# Evaluate on the held-out split: eval mode, no gradient tracking.
model.eval()
val_loss = 0.0
val_steps = 0
with torch.no_grad():
    for x_batch, y_batch in val_loader:
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)
        logits = model(x_batch)
        B, T, V = logits.shape  # batch size, sequence length, vocabulary size
        # Flatten batch and time so each position is scored as one example.
        loss = criterion(logits.view(B*T, V), y_batch.view(B*T))
        val_loss += loss.item()
        val_steps += 1

# val_loader drops partial batches, so a small validation split can yield no
# batches at all; guard the average instead of dividing by zero.
if val_steps > 0:
    val_loss /= val_steps
    print(f"Validation loss: {val_loss:.4f}")
else:
    val_loss = float('nan')
    print("Validation loss: n/a (val_loader produced no batches)")

##############################################################
# Text Generation
##############################################################

def sample_tokens(model, start_idx, num_tokens, block_size, device):
    """Sample token ids from ``model`` one at a time.

    At each step the last ``block_size`` ids generated so far are fed to the
    model and the next id is drawn from the softmax of the logits at the final
    position.

    Args:
        model: Callable mapping a (1, T) tensor of ids to (1, T, vocab_size)
            logits; it is switched to eval mode.
        start_idx: Token id used as the initial context.
        num_tokens: Number of new ids to sample.
        block_size: Maximum number of ids passed to the model per step.
        device: Device on which the context tensor is created.

    Returns:
        List of ``num_tokens + 1`` ids, beginning with ``start_idx``.
    """
    generated = [start_idx]
    model.eval()
    with torch.no_grad():
        for _ in range(num_tokens):
            # Only the most recent block_size ids are passed to the model.
            context = torch.tensor([generated[-block_size:]], dtype=torch.long).to(device)
            logits = model(context)  #(1, T, vocab_size)
            last_logits = logits[0, -1, :]  #(vocab_size,)
            probs = torch.softmax(last_logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            generated.append(next_token.item())
    return generated


start_char = 'H'  # Start with any character that occurs in the training text
if start_char not in char_to_idx:
    # Name the missing character instead of raising a bare KeyError from the lookup.
    raise KeyError(f"start character {start_char!r} is not in the vocabulary")

generated = sample_tokens(model, char_to_idx[start_char], 100, block_size, device)  #Generate 100 characters

generated_text = ''.join(idx_to_char[idx] for idx in generated)
print("Generated text:")
print(generated_text)


Unique characters in the dataset: 75
Epoch 0, Step 0, Loss: 4.4044
Epoch 0, Step 100, Loss: 2.7750
Epoch 0, Step 200, Loss: 2.6739
Epoch 0, Step 300, Loss: 2.5266
Epoch 0, Step 400, Loss: 2.3911
Epoch 0, Step 500, Loss: 2.3605
Epoch 0, Step 600, Loss: 2.1128
Epoch 0, Step 700, Loss: 2.2691
Epoch 0, Step 800, Loss: 2.0592
Epoch 0, Step 900, Loss: 2.0888
Epoch 0, Step 1000, Loss: 2.0338
Epoch 0, Step 1100, Loss: 1.9427
Epoch 0, Step 1200, Loss: 1.9500
Epoch 0, Step 1300, Loss: 1.9116
Epoch 0, Step 1400, Loss: 1.8988
Epoch 0, Step 1500, Loss: 1.8699
Epoch 0, Step 1600, Loss: 1.9773
Epoch 0, Step 1700, Loss: 1.9243
Epoch 0, Step 1800, Loss: 1.9980
Epoch 0, Step 1900, Loss: 1.8662
Epoch 0, Step 2000, Loss: 1.9142
Epoch 0, Step 2100, Loss: 1.9270
Epoch 0, Step 2200, Loss: 1.8875
Epoch 0, Step 2300, Loss: 1.8200
Epoch 0, Step 2400, Loss: 1.9507
Epoch 0, Step 2500, Loss: 1.8174
Epoch 0, Step 2600, Loss: 1.6995
Epoch 0, Step 2700, Loss: 1.9164
Epoch 0, Step 2800, Loss: 1.8613
Epoch 0, Step 2900

In [6]:
# Seed the sample with a single character (any character in the vocabulary).
generated = [char_to_idx['B']]
context = torch.tensor([generated], dtype=torch.long).to(device)

model.eval()
with torch.no_grad():
    for _ in range(2000):  # Generate 2000 characters
        # Logits for the position after the current context, (vocab_size,).
        next_logits = model(context)[0, -1, :]
        probs = next_logits.softmax(dim=-1)
        next_token = torch.multinomial(probs, num_samples=1)
        generated.append(next_token.item())
        # Feed back only the most recent block_size ids.
        window = generated[-block_size:]
        context = torch.tensor([window], dtype=torch.long).to(device)

# Map the sampled ids back to characters.
generated_text = ''.join(map(idx_to_char.__getitem__, generated))
print("Generated text:")
print(generated_text)

Generated text:
Buri and youth a more of
ribbor staircade, to Darnay him, and Miss Pross?" as this leaving him, beard place out; and ever been step whethere Sydney,
which seen hers in the
mody as but.  And he comes.  But to be taverned them, this for the have
been looked by
the court, but of the heavy.




VII


It was no lixt wasted, by Lucie, and a garret; there was that little and
his stood was, "how the neck.  Devil, I resumed in
unconscious at him escores, and crossed in his hand across the work lying and were bling of the two tore ock had aland was as turn, and not?"


"Mr. Don!  O Doctor Manette way,
the purved, my cannot some pull under her herself.  I won't know to make yard her eyes restful
of morestnoned picture, so have dislivered him.  The avail to Death."


What the rain had a gyt, to property frightfulness; but, this quite was complimbed, swell who was stone of the trees well and like annoin, the side, all accompassed be his cheek room what it have been
the shink,
Defarg