In [1]:
import torch
from Bangala_LLM.components.data_loader import create_dataloader_v1
from Bangala_LLM.components.models import GPT2
from Bangala_LLM.utils.common import (
    calc_loss_loader,
    calc_loss_batch,
    generate,
    token_indx_to_text,
    text_to_token_indx,
)
import tiktoken
import tqdm


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


GPT_CONFIG_124M = {
    "vocab_size": 50257,
    "context_length": 1024,
    "emb_dim": 384,
    "num_heads": 12,
    "dropout": 0.1,
    "num_layers": 12,
    "qkv_bias": False,
}


model = GPT2(GPT_CONFIG_124M).to(device)
torch.manual_seed(123)  # For reproducibility due to the shuffling in the data loader


trai_data_path = "/home/amzad/Desktop/bangla_GPT/dataset/tiny-shakespeare.txt"
val_data_path = "/home/amzad/Desktop/bangla_GPT/dataset/blank.txt"

# Load the train and validation data
with open(trai_data_path, "r" , encoding='utf-8') as f:
    train_data = f.read()


with open(val_data_path, "r",encoding='utf-8') as f:
    val_data = f.read()

print(val_data[:1000])
val_loader = create_dataloader_v1(
    val_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=True,
    shuffle=True,
    num_workers=4,
)

train_loader = create_dataloader_v1(
    train_data,
    batch_size=1,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=False,
    shuffle=False,
    num_workers=4,
)

print("data loaded")


def train_model_simple(
    model,
    train_loader,
    optimizer,
    device,
    num_epochs,
    eval_freq,
    start_context,
    tokenizer,
    eval_iter=100,
    val_loader = None,
):
    # Initialize lists to track losses and tokens seen
    train_losses, val_losses, track_tokens_seen = [], [], []
    tokens_seen, global_step = 0, -1

    # Main training loop
    for epoch in range(num_epochs):
        model.train()  # Set model to training mode

        for input_batch, target_batch in train_loader:
            optimizer.zero_grad()  # Reset loss gradients from previous batch iteration
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            loss.backward()  # Calculate loss gradients
            optimizer.step()  # Update model weights using loss gradients
            tokens_seen += input_batch.numel()
            global_step += 1

            # Optional evaluation step
            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(
                    model, train_loader, val_loader, device, eval_iter
                )
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_tokens_seen.append(tokens_seen)
                print(
                    f"Ep {epoch+1} (Step {global_step:06d}): "
                    f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}"
                )

        # Print a sample text after each epoch
                generate_and_print_sample(model, tokenizer, device, start_context)

    return train_losses, val_losses, track_tokens_seen


def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    model.eval()
    with torch.no_grad():
        train_loss = calc_loss_loader(
            train_loader, model, device, num_batches=eval_iter
        )
        val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)
    model.train()
    return train_loss, val_loss


def generate_and_print_sample(model, tokenizer, device, start_context):
    model.eval()
    context_size = model.pos_emb.weight.shape[0]
    encoded = text_to_token_indx(start_context, tokenizer).to(device)
    with torch.no_grad():
        token_ids = generate(
            model=model, idx=encoded, max_new_tokens=500, context_size=context_size
        )
    decoded_text = token_indx_to_text(token_ids, tokenizer)
    print(decoded_text.replace("\n", " "))  # Compact print format
    model.train()


optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)


tokenizer = tiktoken.get_encoding("gpt2")
num_epochs = 10000
train_losses, val_losses, tokens_seen = train_model_simple(
    model,
    train_loader,
    optimizer,
    device,
    num_epochs=num_epochs,
    eval_freq=100,
    eval_iter=500,
    start_context="ABHORSON: Come on, bawd; I will instruct thee in my trade; follow.",
    tokenizer=tokenizer,
)



ANTONIO:
Nor I; my spirits are nimble.
They fell together all, as by consent;
They dropp'd, as by a thunder-stroke. What might,
Worthy Sebastian? O, what might?--No more:--
And yet me thinks I see it in thy face,
What thou shouldst be: the occasion speaks thee, and
My strong imagination sees a crown
Dropping upon thy head.

SEBASTIAN:
What, art thou waking?

ANTONIO:
Do you not hear me speak?

SEBASTIAN:
I do; and surely
It is a sleepy language and thou speak'st
Out of thy sleep. What is it thou didst say?
This is a strange repose, to be asleep
With eyes wide open; standing, speaking, moving,
And yet so fast asleep.

ANTONIO:
Noble Sebastian,
Thou let'st thy fortune sleep--die, rather; wink'st
Whiles thou art waking.



ValueError: num_samples should be a positive integer value, but got num_samples=0