<a href="https://colab.research.google.com/github/ChVenkatSai/Scratch-GPT/blob/main/GPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torchtyping

Collecting torchtyping
  Downloading torchtyping-0.1.4-py3-none-any.whl (17 kB)
Collecting typeguard>=2.11.1 (from torchtyping)
  Downloading typeguard-4.3.0-py3-none-any.whl (35 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.7.0->torchtyping)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.7.0->torchtyping)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.7.0->torchtyping)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.7.0->torchtyping)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.7.0->torchtyping)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Colle

In [2]:
import torch
import torch.nn as nn
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
from torchtyping import TensorType
from collections import OrderedDict
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import requests

In [3]:
class GPT(nn.Module):

    def __init__(self, vocab_size: int, context_length: int, model_dim: int, num_blocks: int, num_heads: int):
        super().__init__()
        torch.manual_seed(0)
        self.context_length = context_length
        self.model_dim = model_dim
        self.word = nn.Embedding(vocab_size,model_dim)
        self.position = nn.Embedding(context_length,model_dim)
        od = OrderedDict()
        for i in range(num_blocks):
            od.update({f"block {i}":self.TransformerBlock(model_dim,num_heads)})
        self.layer = nn.Sequential(od)
        self.norm = nn.LayerNorm(model_dim)
        self.final = nn.Linear(model_dim, vocab_size)

    def forward(self, context: TensorType[int]) -> TensorType[float]:
        torch.manual_seed(0)
        word_embed = self.word(context)
        batch_size = context.shape[0]
        pos = torch.arange(self.context_length).to(device)
        pos_embed = self.position(pos).unsqueeze(0).expand(batch_size,-1,-1)
        embed = word_embed+pos_embed
        block = self.layer(embed)
        norm = self.norm(block)
        fin = self.final(norm)
        return fin

    class TransformerBlock(nn.Module):

      def __init__(self, model_dim: int, num_heads: int):
          super().__init__()
          torch.manual_seed(0)
          self.first = nn.LayerNorm(model_dim)
          self.attention = self.MultiHeadedSelfAttention(model_dim,model_dim,num_heads)
          self.second = nn.LayerNorm(model_dim)
          self.linear = self.VanillaNeuralNetwork(model_dim)

      def forward(self, embedded: TensorType[float]) -> TensorType[float]:
          # Round answer to 4 decimal places
          torch.manual_seed(0)
          norm1 = self.first(embedded)
          first = self.attention(norm1)
          new = first+embedded
          norm2 = self.second(new)
          second = self.linear(norm2)
          return second + new

      class MultiHeadedSelfAttention(nn.Module):

        def __init__(self, embedding_dim: int, attention_dim: int, num_heads: int):
            super().__init__()
            torch.manual_seed(0)
            # Hint: nn.ModuleList() will be useful. It works the same as a Python list
            # but is useful here since instance variables of any subclass of nn.Module
            # must also be subclasses of nn.Module

            # Use self.SingleHeadAttention(embedding_dim, head_size) to instantiate. You have to calculate head_size.
            self.head_size = attention_dim//num_heads
            self.multiattention = nn.ModuleList([self.SingleHeadAttention(embedding_dim,self.head_size) for _ in range(num_heads)])


        def forward(self, embedded: TensorType[float]) -> TensorType[float]:
            # Return answer to 4 decimal places
            l = []
            for layer in (self.multiattention):
                l.append(layer(embedded))
            return torch.cat(l, dim=2)

        class SingleHeadAttention(nn.Module):

          def __init__(self, embedding_dim: int, attention_dim: int):
              super().__init__()
              torch.manual_seed(0)
              self.attention_dim = attention_dim
              self.key = nn.Linear(embedding_dim,attention_dim,bias=False)
              self.query = nn.Linear(embedding_dim,attention_dim,bias=False)
              self.value = nn.Linear(embedding_dim,attention_dim,bias=False)

          def forward(self, embedded: TensorType[float]) -> TensorType[float]:
              # Return your answer to 4 decimal places
              keys = self.key(embedded)
              queries = self.query(embedded)
              values = self.value(embedded)
              attention = torch.matmul(queries,torch.transpose(keys,1,2))
              attention = attention/torch.sqrt(torch.tensor(self.attention_dim))
              mask = torch.ones(attention.shape)
              mask = torch.triu(mask,diagonal=1).bool()
              masked = attention.masked_fill(mask.to(device), float('-inf'))
              norm = nn.functional.softmax(masked,dim=2)
              final = torch.matmul(norm,values)
              return final

      class VanillaNeuralNetwork(nn.Module):

        def __init__(self, model_dim: int):
            super().__init__()
            torch.manual_seed(0)
            self.up_projection = nn.Linear(model_dim, model_dim * 4)
            self.relu = nn.ReLU()
            self.down_projection = nn.Linear(model_dim * 4, model_dim)
            self.dropout = nn.Dropout(0.2) # using p = 0.2

        def forward(self, x: TensorType[float]) -> TensorType[float]:
            torch.manual_seed(0)
            return self.dropout(self.down_projection(self.relu(self.up_projection(x))))

In [4]:
def generate(self, model, new_chars: int, context: TensorType[int], context_length: int, int_to_char: dict) -> str:

        generator = torch.manual_seed(0)
        initial_state = generator.get_state()
        res=""
        for i in range(new_chars):
            if context.shape[1]>context_length:
                output = model(context[:,-context_length:])
            else:
                output = model(context)
            output = nn.functional.softmax(output,dim=2)
            samples = torch.multinomial(output.squeeze(0),1,generator=generator)
            # YOUR CODE (arbitrary number of lines)
            # The line where you call torch.multinomial(). Pass in the generator as well.
            generator.set_state(initial_state)
            res+=int_to_char[int(samples.squeeze(0))]
            context = torch.cat((context,samples),dim=-1)

        return res

In [5]:
from torch.utils.data import Subset
import random

In [6]:


# Download a book from Project Gutenberg
url = "http://www.gutenberg.org/files/1342/1342-0.txt"  # Example: Pride and Prejudice
response = requests.get(url)
text = response.text

# Preprocess the text
text = text.lower()
text = text.replace('\r', ' ').replace('\n', ' ').replace('\t', ' ')
text = ''.join([c for c in text if c.isalpha() or c.isspace()])

# Prepare the dataset
class TextDataset(Dataset):
    def __init__(self, text, context_length):
        self.text = text
        self.context_length = context_length
        self.vocab = sorted(set(text))
        self.char_to_idx = {ch: idx for idx, ch in enumerate(self.vocab)}
        self.idx_to_char = {idx: ch for idx, ch in enumerate(self.vocab)}
        self.data = self.encode(text)

    def encode(self, text):
        return [self.char_to_idx[ch] for ch in text]

    def __len__(self):
        return len(self.data) - self.context_length

    def __getitem__(self, idx):
        context = self.data[idx:idx + self.context_length]
        target = self.data[idx + 1:idx + self.context_length + 1]
        return torch.tensor(context, dtype=torch.long), torch.tensor(target, dtype=torch.long)

# Hyperparameters
vocab_size = len(set(text))
context_length = 128
model_dim = 512
num_blocks = 6
num_heads = 8
batch_size = 64
num_epochs = 5
learning_rate = 0.001

# Instantiate the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GPT(vocab_size, context_length, model_dim, num_blocks, num_heads).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Load your dataset
dataset = TextDataset(text, context_length)
subset_indices = random.sample(range(len(dataset)), k=100000)  # Choose 5000 samples randomly
print("Unencoded text:", dataset.text[:context_length + 1])
print(dataset[0])
subset_dataset = Subset(dataset, subset_indices)
dataloader = DataLoader(subset_dataset, batch_size=batch_size, shuffle=True)

# Training loop
for epoch in range(num_epochs):
    model.train()
    for batch_idx, (context, target) in enumerate(dataloader):
        context, target = context.to(device), target.to(device)

        # Forward pass
        output = model(context)

        # Reshape output and target to match the dimensions for CrossEntropyLoss
        output = output.view(-1, vocab_size)
        target = target.view(-1)

        # Compute loss
        loss = criterion(output, target)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        if batch_idx % 100 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{batch_idx}/{len(dataloader)}], Loss: {loss.item():.4f}')

print("Training complete!")


Unencoded text: ïthe project gutenberg ebook of pride and prejudice by jane austen    this ebook is for the use of anyone anywhere in the united 
(tensor([32, 20,  8,  5,  0, 16, 18, 15, 10,  5,  3, 20,  0,  7, 21, 20,  5, 14,
         2,  5, 18,  7,  0,  5,  2, 15, 15, 11,  0, 15,  6,  0, 16, 18,  9,  4,
         5,  0,  1, 14,  4,  0, 16, 18,  5, 10, 21,  4,  9,  3,  5,  0,  2, 25,
         0, 10,  1, 14,  5,  0,  1, 21, 19, 20,  5, 14,  0,  0,  0,  0, 20,  8,
         9, 19,  0,  5,  2, 15, 15, 11,  0,  9, 19,  0,  6, 15, 18,  0, 20,  8,
         5,  0, 21, 19,  5,  0, 15,  6,  0,  1, 14, 25, 15, 14,  5,  0,  1, 14,
        25, 23,  8,  5, 18,  5,  0,  9, 14,  0, 20,  8,  5,  0, 21, 14,  9, 20,
         5,  4]), tensor([20,  8,  5,  0, 16, 18, 15, 10,  5,  3, 20,  0,  7, 21, 20,  5, 14,  2,
         5, 18,  7,  0,  5,  2, 15, 15, 11,  0, 15,  6,  0, 16, 18,  9,  4,  5,
         0,  1, 14,  4,  0, 16, 18,  5, 10, 21,  4,  9,  3,  5,  0,  2, 25,  0,
        10,  1, 14,  5,  0,  1, 21,

KeyboardInterrupt: 

In [7]:
# After training is complete
WEIGHT_PATH = 'weights.pt'
torch.save(model.state_dict(), WEIGHT_PATH)
print(f"Model weights saved to {WEIGHT_PATH}")

Model weights saved to weights.pt


In [15]:
with torch.no_grad():
    context = torch.zeros(1, context_length, dtype=torch.long).to(device)  # Example initial context
    logits = model(context)
    print(logits)

tensor([[[  1.5186,   1.3980,   0.0777,  ...,  -7.7479,  -8.6362,  -9.4504],
         [  3.1690,   0.5301,  -0.1313,  ...,  -7.7939,  -8.2301,  -9.4291],
         [  4.7322,   0.3038,   0.2443,  ...,  -5.9061,  -7.0487,  -8.8097],
         ...,
         [  7.0719,  -0.2045,  -0.5549,  ...,  -8.0985,  -7.6886, -10.0902],
         [  7.5740,   0.1351,  -0.0592,  ...,  -8.2591,  -7.7744, -10.7178],
         [  7.9012,   0.0654,  -0.1203,  ...,  -8.0369,  -7.8386, -10.6654]]],
       device='cuda:0')


In [10]:
def generate(model, new_chars: int, context, context_length: int, int_to_char: dict) -> str:
    res = []
    for i in range(new_chars):
        if len(context.T) > context_length:
            context = context[:, -context_length:]
        prediction = model(context) # B, T, Vocab_Size
        last_time_step = prediction[:, -1, :] # B, Vocab_Size
        probabilities = nn.functional.softmax(last_time_step, dim = -1)
        next_char = torch.multinomial(probabilities, 1)
        context = torch.cat((context, next_char), dim = -1)
        res.append(int_to_char[next_char.item()])
    return ''.join(res)

In [19]:
# Assuming you have a trained model and the TextDataset class from earlier

# Define the context length
context_length = 128

# Your initial context as a string
initial_context = "it is a truth universally acknowledged that a single man in possession of a good fortune must be in want of a wife"

# Ensure the initial context is exactly `context_length` characters long
# Truncate if it's longer, or pad with spaces if it's shorter
initial_context = initial_context[:context_length].ljust(context_length)

# Encode the initial context
encoded_context = [dataset.char_to_idx[ch] for ch in initial_context]

# Convert the list of indices to a PyTorch tensor
context_tensor = torch.tensor(encoded_context, dtype=torch.long).unsqueeze(0).to(device)  # Add batch dimension

# Print the initial context and its tensor
print("Initial context (string):", initial_context)
print("Initial context (tensor):", context_tensor)

# Generate text using the model
with torch.no_grad():
    generated_text =generate(model, new_chars=500, context=context_tensor, context_length=context_length, int_to_char=dataset.idx_to_char)

print("Generated text:", generated_text)


Initial context (string): it is a truth universally acknowledged that a single man in possession of a good fortune must be in want of a wife              
Initial context (tensor): tensor([[ 9, 20,  0,  9, 19,  0,  1,  0, 20, 18, 21, 20,  8,  0, 21, 14,  9, 22,
          5, 18, 19,  1, 12, 12, 25,  0,  1,  3, 11, 14, 15, 23, 12,  5,  4,  7,
          5,  4,  0, 20,  8,  1, 20,  0,  1,  0, 19,  9, 14,  7, 12,  5,  0, 13,
          1, 14,  0,  9, 14,  0, 16, 15, 19, 19,  5, 19, 19,  9, 15, 14,  0, 15,
          6,  0,  1,  0,  7, 15, 15,  4,  0,  6, 15, 18, 20, 21, 14,  5,  0, 13,
         21, 19, 20,  0,  2,  5,  0,  9, 14,  0, 23,  1, 14, 20,  0, 15,  6,  0,
          1,  0, 23,  9,  6,  5,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0]], device='cuda:0')
Generated text:                                                                                                                                                                                                         

In [14]:
new_chars = 1000 # Number of characters to generate
context = torch.ones(1, 128, dtype = torch.int64).to(device)  # Initial context, e.g., empty tensor
generated_text = generate(model, new_chars, context,context_length, dataset.idx_to_char)

# Print or use the generated text
print(generated_text)

n  heart of what i said of the very begin to         with a proper a                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    