# Pretraining on Unlabeled Data

- Configuring the global environment

In [31]:
import torch
import tiktoken

# Choose the compute device: CUDA first, then Apple MPS, otherwise the CPU
device = torch.device(
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)

print("Accelerate device: ", device)

Accelerate device:  mps


## Initialize a GPT Model 

In [32]:
from architecture import GPTModel

# Training hyper-parameters used by the data loaders, the optimizer and the trainer
HYPER_PARAMS_CONFIG = dict(
    batch_size=2,        # Samples per batch for both data loaders
    num_workers=0,       # DataLoader worker processes
    num_epochs=10,       # Number of training epochs
    lr=4e-4,             # AdamW learning rate
    weight_decay=0.1,    # AdamW weight decay
)

# Model configuration for gpt2-small (124M parameters)
GPT_CONFIG_124M = dict(
    vocab_size=50257,      # Vocabulary size
    context_length=256,    # Shortened context length (orig: 1024)
    emb_dim=768,           # Embedding dimension
    n_heads=12,            # Number of attention heads
    n_layers=12,           # Number of layers
    dropout=0.0,           # Dropout rate
    qkv_bias=False,        # Query, Key, and Value bias
)

# Seed torch's global RNG before building the model.
# NOTE(review): presumably GPTModel initializes its weights randomly, so the seed makes
# the starting weights reproducible — confirm against architecture.py.
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)


## Dataset and Dataloader

- Create dataset and dataloader that extract chunks from the input text dataset

In [33]:
from torch.utils.data import Dataset, DataLoader
# Re-seed torch's global RNG (seed 123); the definitions in this cell draw no random numbers themselves
torch.manual_seed(123)

class GPTDatasetV1(Dataset):
    """Sliding-window next-token-prediction dataset built from one text.

    The text is tokenized once. Each sample is a pair ``(input, target)`` of
    1-D tensors holding exactly ``sample_length`` token ids, where ``target``
    is ``input`` shifted one token to the right.

    Args:
        txt: Raw text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a list of token ids (e.g. a ``tiktoken.Encoding``).
        sample_length: Number of tokens in every input/target chunk.
        stride: Distance, in tokens, between the starts of consecutive chunks.
    """

    # The annotation is a string so the class can be defined without tiktoken imported.
    def __init__(self, txt, tokenizer: "tiktoken.Encoding", sample_length, stride) -> None:
        super().__init__()
        self.input_idx = []
        self.target_idx = []

        token_idx = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # The target is shifted by one token, so a window starting at i needs
        # tokens i .. i + sample_length inclusive. The last valid start is
        # therefore len(token_idx) - sample_length - 1. Allowing one more start
        # (the previous "+ 1" bound) yields a target that is one token short.
        for i in range(0, len(token_idx) - sample_length, stride):
            input_chunk = token_idx[i: i + sample_length]
            target_chunk = token_idx[i + 1: i + sample_length + 1]
            self.input_idx.append(torch.tensor(input_chunk))
            self.target_idx.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of (input, target) samples."""
        return len(self.target_idx)

    def __getitem__(self, index):
        """Return the ``(input, target)`` tensor pair stored at ``index``."""
        return self.input_idx[index], self.target_idx[index]
    
def create_dataloader_V1(txt, batch_size=4, max_length=256, stride=128,
                         shuffle=True, drop_last=True, num_workers = 7):
    """Build a DataLoader over GPT-2-tokenized sliding windows of ``txt``.

    Args:
        txt: Raw text to turn into training samples.
        batch_size: Number of samples per batch.
        max_length: Tokens per sample (passed to GPTDatasetV1 as its sample length).
        stride: Step in tokens between consecutive samples.
        shuffle: Forwarded to the DataLoader's ``shuffle`` argument.
        drop_last: Forwarded to the DataLoader's ``drop_last`` argument.
        num_workers: Forwarded to the DataLoader's ``num_workers`` argument.

    Returns:
        A ``DataLoader`` wrapping a ``GPTDatasetV1`` built from ``txt``.
    """
    gpt2_encoding = tiktoken.get_encoding('gpt2')
    windows = GPTDatasetV1(txt, gpt2_encoding, max_length, stride)

    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )



- We use a relatively small dataset [the-verdict.txt](./the-verdict.txt) for training the LLM (in fact, only one short story)

In [34]:
file_path = "the-verdict.txt"

# Read the whole story into memory as a single string
with open(file_path, "r", encoding="utf-8") as file:
    text_data = file.read()

# Character-level split: the first 90% is training text, the remainder is validation text
train_ratio = 0.90
split_idx = int(train_ratio * len(text_data))
train_data, val_data = text_data[:split_idx], text_data[split_idx:]

torch.manual_seed(123)

# Settings shared by both loaders; stride == max_length gives non-overlapping windows
_loader_kwargs = dict(
    batch_size=HYPER_PARAMS_CONFIG["batch_size"],
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    num_workers=HYPER_PARAMS_CONFIG["num_workers"],
)

# Training loader: shuffled, with a trailing incomplete batch dropped
train_loader = create_dataloader_V1(
    txt=train_data, shuffle=True, drop_last=True, **_loader_kwargs
)

# Validation loader: fixed order, every batch kept
val_loader = create_dataloader_V1(
    txt=val_data, shuffle=False, drop_last=False, **_loader_kwargs
)

## Create a BPE Tokenizer

In [35]:
# GPT-2 byte-pair encoding from tiktoken; this object is later passed to the Trainer
tokenizer = tiktoken.get_encoding("gpt2")
# Sanity check: with the special token allowed, "<|endoftext|>" encodes to a single token id
tokenizer.encode("<|endoftext|>", allowed_special={"<|endoftext|>"})

[50256]

## Optimizer

In [36]:
# AdamW over all model parameters, configured from HYPER_PARAMS_CONFIG
optimizer = torch.optim.AdamW(
    params=model.parameters(),
    lr=HYPER_PARAMS_CONFIG["lr"],
    weight_decay=HYPER_PARAMS_CONFIG["weight_decay"],
)

## Define the Trainer

Trainer class to encapsulate the training, evaluation, and testing procedures for a PyTorch model.

This `Trainer()` supports various features including ***learning rate warmup, cosine decay, gradient clipping, and periodic evaluation***. It can handle both classification and regression tasks.

```Python
class Trainer():
    def __init__(
            self, 
            model: nn.Module, 
            device: torch.device, 
            num_epochs: int,
            optimizer: torch.optim.Optimizer,
            train_loader: DataLoader = None, 
            valid_loader: DataLoader = None, 
            eval_freq: int = None, 
            eval_iter: int = None,
            is_classification: bool = False, 
            warmup: bool = False,
            cos_dec: bool = False,
            grd_clip: bool = False, 
            inital_lr: float = 3e-5, 
            min_lr: float = 1e-6,
            checkpoint_path: str = None,
            tokenizer: tiktoken.Encoding = None,
    ):
        """
        Initializes the Trainer object with the provided model, training parameters, and options.

        Args:
        - model: The PyTorch model.
        - device: The device to run the model on ('cpu', 'mps', or 'cuda').
        - num_epochs: The number of epochs for training.
        - optimizer: The optimizer for training the model.
        - train_loader: The DataLoader for training data (default: None).
        - valid_loader: The DataLoader for validation data (default: None).
        - eval_freq: Evaluation frequency to control how often (in training steps) the model is 
                     evaluated during training (default: None).
        - eval_iter: Evaluation iterations to control the number of batches processed (default: None).
        - is_classification: Whether the task is classification, if is_classification is True, only 
                             the last time step of the model's output is used for loss calculation
                             (default: False).
        - warmup: Whether to use learning rate warmup (default: False).
        - cos_dec: Whether to use cosine decay for learning rate (default: False).
        - grd_clip: Whether to use gradient clipping (default: False).
        - initial_lr: Initial learning rate with warmup (default: 3e-5).
        - min_lr: Minimum learning rate with cosine decay (default: 1e-6).
        - checkpoint_path: The path to the directory containing the model checkpoints (default: None).
        - tokenizer: The `tiktoken.Encoding` tokenizer (default: None).
        """
```

In [37]:
from trainer import Trainer

# Wire the model, data loaders, optimizer and tokenizer into the Trainer
trainer = Trainer(
    model=model,
    device=device,
    num_epochs=HYPER_PARAMS_CONFIG["num_epochs"],
    optimizer=optimizer,
    train_loader=train_loader,
    valid_loader=val_loader,
    eval_freq=5,
    eval_iter=5,
    is_classification=False,
    tokenizer=tokenizer,
)

# Baseline: sample 20 new tokens from the model before any training
gen_text = trainer.text_generator("Every effort moves you", max_new_tokens=20)
print(gen_text)

Every effort moves you rentingetic wasnم refres RexMeCHicular stren Mortgage TT remember gard ACTIONSussedOND Land Engeledded


## Pretraining the GPT Model

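- Based on the training and validation set losses, we can see that the model starts overfitting: the training loss keeps falling (down to 0.388 in the last epoch), while the validation loss bottoms out at about 6.08 in epoch 6 and then slowly rises again.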

In [38]:
# Run the training loop; the trainer prints the train/validation loss as it goes
trainer.training()

Ep 1 (Step 000000): Train loss 10.000, Val loss 10.117
Ep 1 (Step 000005): Train loss 8.031, Val loss 8.250
Ep 2 (Step 000010): Train loss 6.772, Val loss 7.064
Ep 2 (Step 000015): Train loss 5.879, Val loss 6.577
Ep 3 (Step 000020): Train loss 5.706, Val loss 6.478
Ep 3 (Step 000025): Train loss 5.135, Val loss 6.478
Ep 4 (Step 000030): Train loss 4.768, Val loss 6.325
Ep 4 (Step 000035): Train loss 4.636, Val loss 6.556
Ep 5 (Step 000040): Train loss 3.741, Val loss 6.183
Ep 6 (Step 000045): Train loss 3.201, Val loss 6.079
Ep 6 (Step 000050): Train loss 2.750, Val loss 6.119
Ep 7 (Step 000055): Train loss 2.122, Val loss 6.152
Ep 7 (Step 000060): Train loss 1.965, Val loss 6.148
Ep 8 (Step 000065): Train loss 1.466, Val loss 6.200
Ep 8 (Step 000070): Train loss 1.144, Val loss 6.280
Ep 9 (Step 000075): Train loss 0.727, Val loss 6.312
Ep 9 (Step 000080): Train loss 0.562, Val loss 6.372
Ep 10 (Step 000085): Train loss 0.388, Val loss 6.398


- We generate text again, this time with the trained model. Before training, the model produced incomprehensible strings of words; after training, it is able to produce more or less grammatically correct sentences.

In [39]:
# Same prompt as before training, now asking for 50 new tokens
gen_text = trainer.text_generator("Every effort moves you", max_new_tokens=50)
print(gen_text)


Every effort moves you?"

"Yes--quite insensible to the irony. She wanted him vindicated--and by me!"

He laughed again, and threw back his head to look up at the sketch of the donkey. "There were days when I


- We can control the distribution and selection process via a concept called temperature scaling. Temperatures greater than 1 will result in more uniformly distributed token probabilities after applying the softmax. Temperatures smaller than 1 will result in more confident (sharper) distributions after applying the softmax. In the cell below we use `temperature=2` together with `top_k=5`.

In [40]:
# Same prompt again, this time passing temperature=2 and top_k=5 to the generator
gen_text = trainer.text_generator("Every effort moves you", max_new_tokens=50, temperature=2, top_k=5)
print(gen_text)

Every effort moves you in the inevitable up-rooms, in the deep arm one of the Sevres and silver ofident moust slight shade of constraint degree to the display of this false that he's his pictures--the quality of looking cleverer than he had to
