In [None]:
import torch
import torch.nn as nn
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [None]:
from GPTModel import GPTModel

# Hyperparameters handed to the project-local GPTModel. The key spellings
# (including "qKv_bias") are kept exactly as written because they are passed
# to that module unchanged.
GPT_CONFIG_124M = {
    "vocab_size": 50257,   # rows of the token-embedding table
    "ctx_len": 256,        # rows of the positional-embedding table (max sequence length)
    "emb_dim": 768,        # embedding width
    "n_heads": 12,         # attention heads per block
    "n_layers": 12,        # number of transformer blocks
    "drop_rate": 0.1,      # dropout probability
    "qKv_bias": False,     # bias flag for the query/key/value projections
}

# Seed the RNG before building the model so the randomly initialised weights
# (and therefore every downstream result) are reproducible on a fresh kernel.
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)

In [None]:
# Display the module tree to check that the layers match the configuration above.
model

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(256, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_block): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (w_keys): Linear(in_features=768, out_features=768, bias=False)
        (w_query): Linear(in_features=768, out_features=768, bias=False)
        (w_values): Linear(in_features=768, out_features=768, bias=False)
        (dropout): Dropout(p=0.1, inplace=False)
        (out_prj): Linear(in_features=768, out_features=768, bias=True)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GeLU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
          (3): Dropout(p=0.1, inplace=False)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_resid): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttent

In [None]:
!pip install tiktoken
import tiktoken
from GPTModel import generate_text_simple

def text_to_token_ids(text,tokenizer):
    """Encode `text` and return the ids as a tensor with a leading batch axis.

    Args:
        text: String to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a sequence of integer token ids.

    Returns:
        A ``torch.long`` tensor of shape ``(1, num_tokens)``.
    """
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    # Pin the dtype: torch.tensor([]) would otherwise default to float32, so an
    # empty prompt would produce a tensor that cannot be used as embedding indices.
    encoded_tensor = torch.tensor(encoded, dtype=torch.long).unsqueeze(0)

    return encoded_tensor

def token_ids_to_text(token_ids,tokenizer):
    """Decode a tensor of token ids back into a string.

    A size-1 leading dimension is squeezed away before the ids are converted
    to a Python list and passed to ``tokenizer.decode``.
    """
    id_list = token_ids.squeeze(0).tolist()
    return tokenizer.decode(id_list)


start_context = "What are you doing ? "
tokenizer = tiktoken.get_encoding("gpt2")

# A freshly constructed nn.Module is in training mode, which keeps dropout
# active. Switch to eval mode and disable autograd so sampling is not perturbed
# by dropout and does not build a graph.
model.eval()
with torch.no_grad():
    token_ids = generate_text_simple(model=model,
                                     idx=text_to_token_ids(start_context,tokenizer),
                                     max_new_tokens=10,
                                     # read from the config instead of repeating 256
                                     context_scale=GPT_CONFIG_124M["ctx_len"])

print("Output text: ",token_ids_to_text(token_ids,tokenizer))

Collecting tiktoken
  Downloading tiktoken-0.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.8 MB[0m [31m4.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tiktoken
Successfully installed tiktoken-0.6.0
Output text:  What are you doing ?  resisted�� implantedinger Mid palp Mermaiddule ABOUT563


In [None]:
with open("dosto.txt","r",encoding="utf-8") as f:
    text_data = f.read()

# Show the size and a short preview only; echoing the whole corpus would bloat
# the notebook and bury the cells that follow.
print(f"Characters: {len(text_data):,}")
text_data[:500]

In [None]:
from DataSet_Loader import create_dataloader

# Character-level 90/10 split of the raw text into training and validation parts.
train_ratio = 0.9
split_idx = int(train_ratio * len(text_data))
train_data = text_data[:split_idx]
val_data = text_data[split_idx:]

# Window length and stride both come from the model's context length so the
# loaders cannot drift out of sync with the model configuration. With
# stride == max_length the two arguments are equal, as in the original call.
train_dl = create_dataloader(train_data,
                             batch_size=8,
                             max_length=GPT_CONFIG_124M["ctx_len"],
                             stride=GPT_CONFIG_124M["ctx_len"],
                             shuffle=True)

# Validation batches are not shuffled: evaluation only looks at the first
# `eval_iter` batches, so a fixed order makes successive validation losses
# comparable with each other.
val_dl = create_dataloader(
    val_data,
    batch_size=8,
    max_length=GPT_CONFIG_124M["ctx_len"],
    stride=GPT_CONFIG_124M["ctx_len"],
    shuffle=False
)

In [None]:
# Sanity-check the loader with the batch count and the shape of one batch,
# rather than printing a line for every batch in the epoch.
print(f"Batches per epoch: {len(train_dl)}")
for x,y in train_dl:
    print(x.shape,y.shape)
    break

torch.Size([8, 256]) torch.Size([8, 256])
torch.Size([8, 256]) torch.Size([8, 256])
torch.Size([8, 256]) torch.Size([8, 256])
torch.Size([8, 256]) torch.Size([8, 256])
torch.Size([8, 256]) torch.Size([8, 256])
torch.Size([8, 256]) torch.Size([8, 256])
torch.Size([8, 256]) torch.Size([8, 256])
torch.Size([8, 256]) torch.Size([8, 256])
torch.Size([8, 256]) torch.Size([8, 256])
torch.Size([8, 256]) torch.Size([8, 256])
torch.Size([8, 256]) torch.Size([8, 256])
torch.Size([8, 256]) torch.Size([8, 256])
torch.Size([8, 256]) torch.Size([8, 256])
torch.Size([8, 256]) torch.Size([8, 256])
torch.Size([8, 256]) torch.Size([8, 256])
torch.Size([8, 256]) torch.Size([8, 256])
torch.Size([8, 256]) torch.Size([8, 256])
torch.Size([8, 256]) torch.Size([8, 256])
torch.Size([8, 256]) torch.Size([8, 256])
torch.Size([8, 256]) torch.Size([8, 256])
torch.Size([8, 256]) torch.Size([8, 256])
torch.Size([8, 256]) torch.Size([8, 256])
torch.Size([8, 256]) torch.Size([8, 256])
torch.Size([8, 256]) torch.Size([8

In [None]:
def calc_loss_batch(input_batch, target_batch, model, device):
    """Return the cross-entropy loss of `model` on a single batch.

    Both tensors are moved to `device`, the model output has its first two
    dimensions merged, and the targets are fully flattened, so each position
    is scored as an independent classification.
    (presumably the logits are (batch, seq, vocab) -- confirm against GPTModel)
    """
    inputs = input_batch.to(device)
    targets = target_batch.to(device)

    per_position_logits = model(inputs).flatten(0, 1)
    return torch.nn.functional.cross_entropy(per_position_logits, targets.flatten())


def calc_loss_loader(data_loader, model, device, num_batches=None):
    """Average `calc_loss_batch` over the first batches of `data_loader`.

    Args:
        data_loader: Sized iterable yielding ``(input_batch, target_batch)``.
        model: Model forwarded to ``calc_loss_batch``.
        device: Device forwarded to ``calc_loss_batch``.
        num_batches: Maximum number of batches to evaluate. ``None`` means the
            whole loader; larger values are capped at ``len(data_loader)``.

    Returns:
        The mean per-batch loss as a float, or ``nan`` when there is nothing to
        evaluate (empty loader or ``num_batches <= 0``). The previous version
        raised ``ZeroDivisionError`` in that situation.
    """
    available = len(data_loader)
    if num_batches is None:
        num_batches = available
    else:
        num_batches = min(num_batches, available)

    # Nothing to average over: report "no value" instead of dividing by zero.
    if num_batches <= 0:
        return float("nan")

    total_loss = 0.
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i >= num_batches:
            break
        loss = calc_loss_batch(input_batch, target_batch, model, device)
        total_loss += loss.item()
    return total_loss / num_batches


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# nn.Module.to() returns the module; the trailing semicolon keeps the notebook
# from printing the entire layer tree a second time.
model.to(device);


GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(256, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_block): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (w_keys): Linear(in_features=768, out_features=768, bias=False)
        (w_query): Linear(in_features=768, out_features=768, bias=False)
        (w_values): Linear(in_features=768, out_features=768, bias=False)
        (dropout): Dropout(p=0.1, inplace=False)
        (out_prj): Linear(in_features=768, out_features=768, bias=True)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GeLU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
          (3): Dropout(p=0.1, inplace=False)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_resid): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttent

In [None]:

def train_model_simple(model, train_loader, val_loader, optimizer, device, num_epochs,
                       eval_freq, eval_iter, start_context):
    """Train `model` for `num_epochs` passes over `train_loader`.

    Every `eval_freq` optimizer steps (counting from step 0) the train and
    validation losses are estimated with `evaluate_model` on `eval_iter`
    batches, recorded, and printed. After each epoch a text sample continuing
    `start_context` is printed.

    Returns:
        ``(train_losses, val_losses, tokens_seen)``: three parallel lists with
        one entry per evaluation point; the last holds the cumulative number
        of input elements processed at that point.
    """
    train_history = []
    val_history = []
    tokens_history = []
    tokens_seen = 0
    # Starts at -1 because it is incremented before it is first inspected,
    # making the first optimizer step "step 0".
    global_step = -1

    for epoch in range(num_epochs):
        model.train()

        for inputs, targets in train_loader:
            # Standard update: clear old gradients, backpropagate, step.
            optimizer.zero_grad()
            batch_loss = calc_loss_batch(inputs, targets, model, device)
            batch_loss.backward()
            optimizer.step()

            tokens_seen += inputs.numel()
            global_step += 1

            # Only evaluate on every `eval_freq`-th step.
            if global_step % eval_freq != 0:
                continue

            train_loss, val_loss = evaluate_model(
                model, train_loader, val_loader, device, eval_iter)
            train_history.append(train_loss)
            val_history.append(val_loss)
            tokens_history.append(tokens_seen)
            print(f"Ep {epoch+1} (Step {global_step:06d}): "
                  f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")

        # End-of-epoch qualitative check; the tokenizer is taken from the
        # training dataset object.
        generate_and_print_sample(
            model, train_loader.dataset.tokenizer, device, start_context
        )

    return train_history, val_history, tokens_history


def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    """Estimate train and validation loss on up to `eval_iter` batches each.

    The model is put in eval mode and autograd is disabled for the
    measurement; the model is switched back to train mode before returning.

    Returns:
        ``(train_loss, val_loss)``.
    """
    model.eval()
    with torch.no_grad():
        # Train loader first, then validation loader -- same order as the result.
        losses = [
            calc_loss_loader(loader, model, device, num_batches=eval_iter)
            for loader in (train_loader, val_loader)
        ]
    model.train()
    return tuple(losses)


def generate_and_print_sample(model, tokenizer, device, start_context):
    """Print a 50-token continuation of `start_context` on a single line.

    The model is switched to eval mode for sampling and back to train mode
    afterwards, so the function can be called from inside a training loop.

    Args:
        model: Model exposing a ``pos_emb`` embedding layer.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        device: Device the encoded prompt is moved to.
        start_context: Prompt text.
    """
    model.eval()
    # The positional-embedding table has one row per position, so its row count
    # is the context length. Using it here (instead of a literal 256) keeps
    # generation correct if the model is built with a different context length.
    context_size = model.pos_emb.weight.shape[0]
    encoded = text_to_token_ids(start_context, tokenizer).to(device)
    with torch.no_grad():
        token_ids = generate_text_simple(
            model=model, idx=encoded,
            max_new_tokens=50, context_scale=context_size
        )
    decoded_text = token_ids_to_text(token_ids, tokenizer)
    # Collapse newlines so each sample occupies one line of the training log.
    print(decoded_text.replace("\n", " "))
    model.train()

In [None]:
# Ask PyTorch to release cached GPU memory blocks that are no longer in use.
torch.cuda.empty_cache()

In [None]:
# NOTE: this cell trains whatever `model` currently holds, so re-running it
# continues training the same weights. Re-create the model first for a fresh run.
torch.cuda.empty_cache()
model.to(device)

# Fixed seed so batch shuffling and dropout are repeatable from run to run.
torch.manual_seed(123)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4, weight_decay=0.1)

num_epochs = 10
train_losses, val_losses, tokens_seen = train_model_simple(
    model, train_dl, val_dl, optimizer, device,
    num_epochs=num_epochs, eval_freq=5, eval_iter=5,
    start_context="Does God exist ?",
)

Ep 1 (Step 000000): Train loss 1.865, Val loss 5.220
Ep 1 (Step 000005): Train loss 1.864, Val loss 5.339
Ep 1 (Step 000010): Train loss 1.905, Val loss 5.352
Ep 1 (Step 000015): Train loss 1.891, Val loss 5.273
Ep 1 (Step 000020): Train loss 1.755, Val loss 5.446
Ep 1 (Step 000025): Train loss 1.842, Val loss 5.303
Ep 1 (Step 000030): Train loss 1.904, Val loss 5.436
Ep 1 (Step 000035): Train loss 1.762, Val loss 5.452
Ep 1 (Step 000040): Train loss 1.886, Val loss 5.334
Ep 1 (Step 000045): Train loss 1.884, Val loss 5.277
Ep 1 (Step 000050): Train loss 1.752, Val loss 5.240
Ep 1 (Step 000055): Train loss 1.748, Val loss 5.359
Ep 1 (Step 000060): Train loss 1.867, Val loss 5.313
Ep 1 (Step 000065): Train loss 1.748, Val loss 5.407
Ep 1 (Step 000070): Train loss 1.783, Val loss 5.443
Ep 1 (Step 000075): Train loss 1.804, Val loss 5.408
Ep 1 (Step 000080): Train loss 1.636, Val loss 5.406
Ep 1 (Step 000085): Train loss 1.753, Val loss 5.352
Ep 1 (Step 000090): Train loss 1.764, Val loss

In [None]:
start_context = "Am I the best ?  "

# Move the model to the CPU explicitly (the original did this in place inside
# the argument list) so it matches the CPU prompt tensor built below.
model.cpu()
# train_model_simple leaves the model in training mode; switch to eval mode and
# disable autograd so dropout does not perturb the generated text.
model.eval()
with torch.no_grad():
    token_ids = generate_text_simple(model=model,
                                     idx=text_to_token_ids(start_context,tokenizer),
                                     max_new_tokens=25,
                                     # read from the config instead of repeating 256
                                     context_scale=GPT_CONFIG_124M["ctx_len"])
print(f"Question:{start_context} \n\nResponse:{token_ids_to_text(token_ids,tokenizer)}")

Question:Am I the best ?   

Response:Am I the best ?  
and he has always be witness in the other, 
and you have seen him now to open with him. 


In [None]:
# Persist only the model's parameters/buffers (its state_dict) to "model.pth".
torch.save(model.state_dict(), "model.pth")

In [None]:
# Rebuild the architecture, then restore the saved parameters into it.
model = GPTModel(GPT_CONFIG_124M)
# map_location="cpu" lets the checkpoint load on machines without a GPU even if
# it was saved from CUDA tensors; weights_only=True restricts unpickling to
# tensors and plain containers instead of arbitrary Python objects.
state_dict = torch.load("model.pth", map_location="cpu", weights_only=True)
model.load_state_dict(state_dict)