In [1]:
import torch
from untrained_model import GPTModel

import os
curr_dir = os.getcwd()
parent_dir = os.path.dirname(curr_dir)
file_path = "dataset/natural_language/large_lang_repo/part01.txt"
final_path = os.path.join(parent_dir, file_path)
print(final_path)

GPT_CONFIG_124M = {
    "vocab_size": 50257,   # Vocabulary size
    "context_length": 256, # Shortened context length (orig: 1024)
    "emb_dim": 256,        # Embedding dimension
    "n_heads": 4,         # Number of attention heads
    "n_layers": 12,        # Number of layers
    "drop_rate": 0.1,      # Dropout rate
    "qkv_bias": False      # Query-key-value bias
}

torch.manual_seed(123)

checkpoint = torch.load(os.path.join(parent_dir, "model_and_optimizer_small.pth"), weights_only=True)

model = GPTModel(GPT_CONFIG_124M)
model.load_state_dict(checkpoint["model_state_dict"])

optimizer = torch.optim.AdamW(model.parameters(), lr=0.0005, weight_decay=0.1)
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
model.train();

/media/ashmit/External Storage/System/python/llm_from_scratch/llm_from_scratch/dataset/natural_language/large_lang_repo/part01.txt


In [2]:
import tiktoken
from untrained_model import generate_text_simple


def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add batch dimension
    return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0) # remove batch dimension
    return tokenizer.decode(flat.tolist())

start_context = "Every effort moves you"
tokenizer = tiktoken.get_encoding("gpt2")

token_ids = generate_text_simple(
    model=model,
    idx=text_to_token_ids(start_context, tokenizer),
    max_new_tokens=10,
    context_size=GPT_CONFIG_124M["context_length"]
)

print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

Output text:
 Every effort moves you can do something that others was possible!
<|endoftext|>


In [3]:
with open(final_path, "r", encoding="utf-8") as file:
    text_data = file.read()
print(text_data[:10])
print(text_data[-10:])


Once upon
our best.



In [4]:
total_characters = len(text_data)
total_tokens = len(tokenizer.encode(text_data, allowed_special={'<|endoftext|>'}))

print("Characters:", total_characters)
print("Tokens:", total_tokens)

Characters: 3141952
Tokens: 764650


In [5]:

from untrained_model import create_dataloader_v1


# Train/validation ratio
train_ratio = 0.90
split_idx = int(train_ratio * len(text_data))
train_data = text_data[:split_idx]
val_data = text_data[split_idx:]


torch.manual_seed(123)

train_loader = create_dataloader_v1(
    train_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=True,
    shuffle=True,
    num_workers=0
)

val_loader = create_dataloader_v1(
    val_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=False,
    shuffle=False,
    num_workers=0
)

In [6]:
def calc_loss_batch(input_batch, target_batch, model, device):
    input_batch, target_batch = input_batch.to(device), target_batch.to(device)
    logits = model(input_batch)
    loss = torch.nn.functional.cross_entropy(logits.flatten(0, 1), target_batch.flatten())
    return loss


def calc_loss_loader(data_loader, model, device, num_batches=None):
    total_loss = 0.
    if len(data_loader) == 0:
        return float("nan")
    elif num_batches is None:
        num_batches = len(data_loader)
    else:
        # Reduce the number of batches to match the total number of batches in the data loader
        # if num_batches exceeds the number of batches in the data loader
        num_batches = min(num_batches, len(data_loader))
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            total_loss += loss.item()
        else:
            break
    return total_loss / num_batches

In [16]:
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    
    major, minor = map(int, torch.__version__.split(".")[:2])
    if (major, minor) >= (2, 9):
        device = torch.device("mps")
    else:
        device = torch.device("cpu")
else:
    device = torch.device("cpu")


print(f"Using {device} device.")


model.to(device) 


with torch.no_grad(): 
    train_loss = calc_loss_loader(train_loader, model, device)
    val_loss = calc_loss_loader(val_loader, model, device)

print("Training loss:", train_loss)
print("Validation loss:", val_loss)

Using cpu device.
Training loss: 0.9241066625525569
Validation loss: 2.9753213265846514


In [7]:
def train_model_simple(model, train_loader, val_loader, optimizer, device, num_epochs,
                       eval_freq, eval_iter, start_context, tokenizer):
    # Initialize lists to track losses and tokens seen
    train_losses, val_losses, track_tokens_seen = [], [], []
    tokens_seen, global_step = 0, -1

    # Main training loop
    for epoch in range(num_epochs):
        model.train()  # Set model to training mode
        
        for input_batch, target_batch in train_loader:
            optimizer.zero_grad() # Reset loss gradients from previous batch iteration
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            loss.backward() # Calculate loss gradients
            optimizer.step() # Update model weights using loss gradients
            tokens_seen += input_batch.numel()
            global_step += 1

            # Optional evaluation step
            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(
                    model, train_loader, val_loader, device, eval_iter)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_tokens_seen.append(tokens_seen)
                print(f"Ep {epoch+1} (Step {global_step:06d}): "
                      f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")

        # Print a sample text after each epoch
        generate_and_print_sample(
            model, tokenizer, device, start_context
        )

    return train_losses, val_losses, track_tokens_seen


def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    model.eval()
    with torch.no_grad():
        train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)
        val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)
    model.train()
    return train_loss, val_loss


def generate_and_print_sample(model, tokenizer, device, start_context):
    model.eval()
    context_size = model.pos_emb.weight.shape[0]
    encoded = text_to_token_ids(start_context, tokenizer).to(device)
    with torch.no_grad():
        token_ids = generate_text_simple(
            model=model, idx=encoded,
            max_new_tokens=50, context_size=context_size
        )
    decoded_text = token_ids_to_text(token_ids, tokenizer)
    print(decoded_text.replace("\n", " "))  # Compact print format
    model.train()

In [8]:
# import time
# start_time = time.time()

# model.to(device)


# num_epochs = 13
# train_losses, val_losses, tokens_seen = train_model_simple(
#     model, train_loader, val_loader, optimizer, device,
#     num_epochs=num_epochs, eval_freq=5, eval_iter=5,
#     start_context="Every effort moves you", tokenizer=tokenizer
# )

# end_time = time.time()
# execution_time_minutes = (end_time - start_time) / 60
# print(f"Training completed in {execution_time_minutes:.2f} minutes.")

# checkpoint = torch.load("model_and_optimizer_small.pth", weights_only=True)

# model = GPTModel(GPT_CONFIG_124M)
# model.load_state_dict(checkpoint["model_state_dict"])

# optimizer = torch.optim.AdamW(model.parameters(), lr=0.0005, weight_decay=0.1)
# optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
# model.train();

In [13]:
from untrained_model import create_dataloader_v1
import time
file_path = "dataset/natural_language/large_lang_repo/part15.txt"
final_path = os.path.join(parent_dir, file_path)

with open(final_path, "r", encoding="utf-8") as file:
    text_data = file.read()

train_ratio = 0.90
split_idx = int(train_ratio * len(text_data))
train_data = text_data[:split_idx]
val_data = text_data[split_idx:]


torch.manual_seed(123)

train_loader = create_dataloader_v1(
    train_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=True,
    shuffle=True,
    num_workers=0
)

val_loader = create_dataloader_v1(
    val_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=False,
    shuffle=False,
    num_workers=0
)


if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    
    major, minor = map(int, torch.__version__.split(".")[:2])
    if (major, minor) >= (2, 9):
        device = torch.device("mps")
    else:
        device = torch.device("cpu")
else:
    device = torch.device("cpu")


print(f"Using {device} device.")


model.to(device) 

start_time = time.time()

num_epochs = 10
train_losses, val_losses, tokens_seen = train_model_simple(
    model, train_loader, val_loader, optimizer, device,
    num_epochs=num_epochs, eval_freq=5, eval_iter=5,
    start_context="Every effort moves you", tokenizer=tokenizer
)

end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")

torch.save({
    "model_state_dict": model.state_dict(),
    "optimizer_state_dict": optimizer.state_dict(),
    }, 
    os.path.join(parent_dir, "model_and_optimizer_small.pth")
)
print(f"Model saved at {os.path.join(parent_dir, "model_and_optimizer_small.pth")}")

Using cpu device.
Ep 1 (Step 000000): Train loss 2.666, Val loss 2.423
Ep 1 (Step 000005): Train loss 2.505, Val loss 2.390
Ep 1 (Step 000010): Train loss 2.672, Val loss 2.362
Ep 1 (Step 000015): Train loss 2.405, Val loss 2.346
Ep 1 (Step 000020): Train loss 2.435, Val loss 2.334
Ep 1 (Step 000025): Train loss 2.457, Val loss 2.312
Ep 1 (Step 000030): Train loss 2.506, Val loss 2.293
Ep 1 (Step 000035): Train loss 2.502, Val loss 2.270
Ep 1 (Step 000040): Train loss 2.528, Val loss 2.240
Ep 1 (Step 000045): Train loss 2.397, Val loss 2.229
Ep 1 (Step 000050): Train loss 2.442, Val loss 2.231
Ep 1 (Step 000055): Train loss 2.278, Val loss 2.226
Ep 1 (Step 000060): Train loss 2.268, Val loss 2.229
Ep 1 (Step 000065): Train loss 2.578, Val loss 2.235
Ep 1 (Step 000070): Train loss 2.285, Val loss 2.221
Ep 1 (Step 000075): Train loss 2.376, Val loss 2.217
Ep 1 (Step 000080): Train loss 2.469, Val loss 2.218
Ep 1 (Step 000085): Train loss 2.708, Val loss 2.218
Ep 1 (Step 000090): Train lo

In [14]:
file_path = "dataset/natural_language/large_lang_repo/part16.txt"
final_path = os.path.join(parent_dir, file_path)

with open(final_path, "r", encoding="utf-8") as file:
    text_data = file.read()

train_ratio = 0.90
split_idx = int(train_ratio * len(text_data))
train_data = text_data[:split_idx]
val_data = text_data[split_idx:]


torch.manual_seed(123)

train_loader = create_dataloader_v1(
    train_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=True,
    shuffle=True,
    num_workers=0
)

val_loader = create_dataloader_v1(
    val_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=False,
    shuffle=False,
    num_workers=0
)


if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    
    major, minor = map(int, torch.__version__.split(".")[:2])
    if (major, minor) >= (2, 9):
        device = torch.device("mps")
    else:
        device = torch.device("cpu")
else:
    device = torch.device("cpu")


print(f"Using {device} device.")


model.to(device) 

start_time = time.time()

num_epochs = 10
train_losses, val_losses, tokens_seen = train_model_simple(
    model, train_loader, val_loader, optimizer, device,
    num_epochs=num_epochs, eval_freq=5, eval_iter=5,
    start_context="Every effort moves you", tokenizer=tokenizer
)

end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")

torch.save({
    "model_state_dict": model.state_dict(),
    "optimizer_state_dict": optimizer.state_dict(),
    }, 
    os.path.join(parent_dir, "model_and_optimizer_small.pth")
)
print(f"Model saved at {os.path.join(parent_dir, "model_and_optimizer_small.pth")}")

Using cpu device.
Ep 1 (Step 000000): Train loss 2.741, Val loss 2.020
Ep 1 (Step 000005): Train loss 2.759, Val loss 2.002
Ep 1 (Step 000010): Train loss 2.911, Val loss 1.980
Ep 1 (Step 000015): Train loss 2.912, Val loss 1.973
Ep 1 (Step 000020): Train loss 2.845, Val loss 1.936
Ep 1 (Step 000025): Train loss 2.557, Val loss 1.928
Ep 1 (Step 000030): Train loss 2.563, Val loss 1.929
Ep 1 (Step 000035): Train loss 2.707, Val loss 1.929
Ep 1 (Step 000040): Train loss 2.530, Val loss 1.905
Ep 1 (Step 000045): Train loss 2.284, Val loss 1.900
Ep 1 (Step 000050): Train loss 2.513, Val loss 1.895
Ep 1 (Step 000055): Train loss 2.283, Val loss 1.898
Ep 1 (Step 000060): Train loss 2.520, Val loss 1.893
Ep 1 (Step 000065): Train loss 2.663, Val loss 1.881
Ep 1 (Step 000070): Train loss 2.128, Val loss 1.878
Ep 1 (Step 000075): Train loss 2.286, Val loss 1.878
Ep 1 (Step 000080): Train loss 2.301, Val loss 1.869
Ep 1 (Step 000085): Train loss 2.196, Val loss 1.860
Ep 1 (Step 000090): Train lo

In [8]:
# file_path = "dataset/natural_language/large_lang_repo/part17.txt"
# final_path = os.path.join(parent_dir, file_path)

# with open(final_path, "r", encoding="utf-8") as file:
#     text_data = file.read()

# train_ratio = 0.90
# split_idx = int(train_ratio * len(text_data))
# train_data = text_data[:split_idx]
# val_data = text_data[split_idx:]


# torch.manual_seed(123)

# train_loader = create_dataloader_v1(
#     train_data,
#     batch_size=2,
#     max_length=GPT_CONFIG_124M["context_length"],
#     stride=GPT_CONFIG_124M["context_length"],
#     drop_last=True,
#     shuffle=True,
#     num_workers=0
# )

# val_loader = create_dataloader_v1(
#     val_data,
#     batch_size=2,
#     max_length=GPT_CONFIG_124M["context_length"],
#     stride=GPT_CONFIG_124M["context_length"],
#     drop_last=False,
#     shuffle=False,
#     num_workers=0
# )


# if torch.cuda.is_available():
#     device = torch.device("cuda")
# elif torch.backends.mps.is_available():
    
#     major, minor = map(int, torch.__version__.split(".")[:2])
#     if (major, minor) >= (2, 9):
#         device = torch.device("mps")
#     else:
#         device = torch.device("cpu")
# else:
#     device = torch.device("cpu")


# print(f"Using {device} device.")


# model.to(device) 

# start_time = time.time()

# num_epochs = 10
# train_losses, val_losses, tokens_seen = train_model_simple(
#     model, train_loader, val_loader, optimizer, device,
#     num_epochs=num_epochs, eval_freq=5, eval_iter=5,
#     start_context="Every effort moves you", tokenizer=tokenizer
# )

# end_time = time.time()
# execution_time_minutes = (end_time - start_time) / 60
# print(f"Training completed in {execution_time_minutes:.2f} minutes.")

# torch.save({
#     "model_state_dict": model.state_dict(),
#     "optimizer_state_dict": optimizer.state_dict(),
#     }, 
#     os.path.join(parent_dir, "model_and_optimizer_small.pth")
# )
# print(f"Model saved at {os.path.join(parent_dir, "model_and_optimizer_small.pth")}")

In [13]:
def generate(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=None, eos_id=None):

    # For-loop is the same as before: Get logits, and only focus on last time step
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :]

        # New: Filter logits with top_k sampling
        if top_k is not None:
            # Keep only top_k values
            top_logits, _ = torch.topk(logits, top_k)
            min_val = top_logits[:, -1]
            logits = torch.where(logits < min_val, torch.tensor(float("-inf")).to(logits.device), logits)

        # New: Apply temperature scaling
        if temperature > 0.0:
            logits = logits / temperature

            # New (not in book): numerical stability tip to get equivalent results on mps device
            # subtract rowwise max before softmax
            logits = logits - logits.max(dim=-1, keepdim=True).values
            
            # Apply softmax to get probabilities
            probs = torch.softmax(logits, dim=-1)  # (batch_size, context_len)

            # Sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (batch_size, 1)

        # Otherwise same as before: get idx of the vocab entry with the highest logits value
        else:
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)  # (batch_size, 1)

        if idx_next == eos_id:  # Stop generating early if end-of-sequence token is encountered and eos_id is specified
            break

        # Same as before: append sampled index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1)  # (batch_size, num_tokens+1)

    return idx

In [15]:
inference_device = torch.device("cpu")

model.to(inference_device)
model.eval()

torch.manual_seed(123)

token_ids = generate(
    model=model,
    idx=text_to_token_ids("Every effort moves you", tokenizer).to(inference_device),
    max_new_tokens=15,
    context_size=GPT_CONFIG_124M["context_length"],
    top_k=25,
    temperature=1.4
)

print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

Output text:
 Every effort moves you will never be dependable. You will always have this somewhere safe.



In [17]:
torch.save({
    "model_state_dict": model.state_dict(),
    "optimizer_state_dict": optimizer.state_dict(),
    }, 
    os.path.join(parent_dir, "model_and_optimizer_small.pth")
)
print(f"Model saved at {os.path.join(parent_dir, "model_and_optimizer_small.pth")}")

Model saved at /media/ashmit/External Storage/System/python/llm_from_scratch/llm_from_scratch/model_and_optimizer_small.pth


In [12]:
# file_path = "dataset/sarcasm/sarcastic.txt"
# with open(file_path, "r", encoding="utf-8") as file:
#     text_data = file.read()


# text_data = text_data[:200081]

# total_characters = len(text_data)
# total_tokens = len(tokenizer.encode(text_data, allowed_special={'<|endoftext|>'}))

# print("Characters:", total_characters)
# print("Tokens:", total_tokens)

In [11]:
# # Train/validation ratio
# train_ratio = 0.90
# split_idx = int(train_ratio * len(text_data))
# train_data = text_data[:split_idx]
# val_data = text_data[split_idx:]


# torch.manual_seed(123)

# train_loader = create_dataloader_v1(
#     train_data,
#     batch_size=2,
#     max_length=GPT_CONFIG_124M["context_length"],
#     stride=GPT_CONFIG_124M["context_length"],
#     drop_last=True,
#     shuffle=True,
#     num_workers=0
# )

# val_loader = create_dataloader_v1(
#     val_data,
#     batch_size=2,
#     max_length=GPT_CONFIG_124M["context_length"],
#     stride=GPT_CONFIG_124M["context_length"],
#     drop_last=False,
#     shuffle=False,
#     num_workers=0
# )

In [21]:
# checkpoint = torch.load("model_and_optimizer_small.pth", weights_only=True)

# model = GPTModel(GPT_CONFIG_124M)
# model.load_state_dict(checkpoint["model_state_dict"])

# optimizer = torch.optim.AdamW(model.parameters(), lr=0.0005, weight_decay=0.1)
# optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
# model.train();

In [9]:
# import time
# start_time = time.time()

# torch.manual_seed(123)
# model = GPTModel(GPT_CONFIG_124M)
# model.to(device)
# optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)

# num_epochs = 15
# train_losses, val_losses, tokens_seen = train_model_simple(
#     model, train_loader, val_loader, optimizer, device,
#     num_epochs=num_epochs, eval_freq=5, eval_iter=5,
#     start_context="Every effort moves you", tokenizer=tokenizer
# )

# end_time = time.time()
# execution_time_minutes = (end_time - start_time) / 60
# print(f"Training completed in {execution_time_minutes:.2f} minutes.")

In [10]:
# inference_device = torch.device("cpu")

# model.to(inference_device)
# model.eval()

# torch.manual_seed(123)

# token_ids = generate(
#     model=model,
#     idx=text_to_token_ids("God is great", tokenizer).to(inference_device),
#     max_new_tokens=15,
#     context_size=GPT_CONFIG_124M["context_length"],
#     top_k=25,
#     temperature=1.5
# )

# print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

In [None]:
# torch.save({
#     "model_state_dict": model.state_dict(),
#     "optimizer_state_dict": optimizer.state_dict(),
#     }, 
#     "model_and_optimizer_small_sarcasm.pth"
# )