In [1]:
import torch

## Instruction Fine-Tuning

### Step 1: Preparing Dataset
- Dataset [JSON](https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch07/01_main-chapter-code/instruction-data.json)
- [Github](https://github.com/rasbt/LLMs-from-scratch/blob/main/ch07/01_main-chapter-code/instruction-data.json)

In [2]:
import json
import os
import urllib
import ssl
import urllib.request

def download_and_load_file(file_path, url):
    ssl_context = ssl.create_default_context()
    ssl_context.check_hostname = False
    ssl_context.verify_mode = ssl.CERT_NONE

    if not os.path.exists(file_path):
        with urllib.request.urlopen(url, context=ssl_context) as response:
            text_data = response.read().decode('utf-8')
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(text_data)

    else:
        with open(file_path, 'r', encoding='utf-8') as file:
            text_data = file.read()

    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    return data


In [3]:
file_path = "instruction-data.json"
url = (
    "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch"
    "/main/ch07/01_main-chapter-code/instruction-data.json"
)

data = download_and_load_file(file_path, url)
print("Number of entries:", len(data))

Number of entries: 1100


In [4]:
print("Example entry:\n", data[50])

Example entry:
 {'instruction': 'Identify the correct spelling of the following word.', 'input': 'Ocassion', 'output': "The correct spelling is 'Occasion.'"}


In [5]:
print("Example entry:\n", data[999])

Example entry:
 {'instruction': "What is an antonym of 'complicated'?", 'input': '', 'output': "An antonym of 'complicated' is 'simple'."}


#### Converting Instructions Into ALPACA Format
- Link [ALPACA](https://github.com/tatsu-lab/stanford_alpaca)

In [6]:
# Format the input data into ALPACA Prompt Style
def format_input(entry):
    instruction_text = (
        f"Below is an instruction that describes a task."
        f"Write a response that appropriately completes the request."
        f"\n\n### Instruction:\n{entry['instruction']}"
    )
    input_text = f"\n\n### Input:\n{entry['input']}" if entry['input'] else ""

    return instruction_text + input_text

In [7]:
# Test
model_input = format_input(data[50])
desired_response = f"\n\n### Response:\n{data[50]['output']}"

print(model_input + desired_response)


Below is an instruction that describes a task.Write a response that appropriately completes the request.

### Instruction:
Identify the correct spelling of the following word.

### Input:
Ocassion

### Response:
The correct spelling is 'Occasion.'


In [8]:
# Test
model_input = format_input(data[999])
desired_response = f"\n\n### Response:\n{data[999]['output']}"

print(model_input + desired_response)


Below is an instruction that describes a task.Write a response that appropriately completes the request.

### Instruction:
What is an antonym of 'complicated'?

### Response:
An antonym of 'complicated' is 'simple'.


#### Spliting Dataset Into Train-Test-Validation

In [9]:
train_portion = int(len(data) * 0.85) # 85%
test_portion = int(len(data) * 0.1) # 10% 
valid_portion = len(data) - train_portion - test_portion # 5%

train_data = data[:train_portion]
test_data = data[train_portion:train_portion + test_portion]
valid_data = data[train_portion + test_portion:]

In [10]:
print("Number of training examples:", len(train_data))
print("Number of testing examples:", len(test_data))
print("Number of validation examples:", len(valid_data))

Number of training examples: 935
Number of testing examples: 110
Number of validation examples: 55


### Step 2: Organizing Data Into Training Batches

In [11]:
import torch 
from torch.utils.data import Dataset

class InstructionDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data

        # Pre-tokenize texts
        self.encoded_texts = []
        for entry in data:
            instruction_plus_input = format_input(entry)
            response_text = f"\n\n### Response:\n{entry['output']}"
            full_text = instruction_plus_input + response_text
            self.encoded_texts.append(
                tokenizer.encode(full_text)
            )
    
    def __getitem__(self, index):
        return self.encoded_texts[index]
    
    def __len__(self):
        return len(self.data)


In [12]:
import tiktoken

tokenizer = tiktoken.get_encoding('gpt2')
print(tokenizer.encode("<|endoftext|>", allowed_special={"<|endoftext|>"}))

[50256]


In [13]:
def custom_collate_draft_1(batch, pad_token_id=50256, device="cpu"):
    # Find the longest sequence in the current batch
    # and increase it by 1, which will add one extra
    # padding token below
    batch_max_length = max(len(item)+1 for item in batch)

    # Pad and prepare the inputs
    inputs_lst = []
    for item in batch:
        new_item = item.copy() 
        # Add an <|endoftext|> token 
        new_item += [pad_token_id]
        # Pad sequences to batch_max_length
        padded = (
            new_item + [pad_token_id] * (batch_max_length - len(new_item))
        )

        # Via padded[:-1]n we remove the extra padded token
        # that has been added via the +1 setting in batch_max_length
        inputs = torch.tensor(padded[:-1])
        inputs_lst.append(inputs)

    # Convert list of inputs to tensor and transfer to target device
    inputs_tensor = torch.stack(inputs_lst).to(device)

    return inputs_tensor


In [14]:
# Example 
inputs_1 = [0, 1, 2, 3, 4]
inputs_2 = [5, 6]
inputs_3 = [7, 8, 9]

batch = [inputs_1, inputs_2, inputs_3]

print(custom_collate_draft_1(batch))

tensor([[    0,     1,     2,     3,     4],
        [    5,     6, 50256, 50256, 50256],
        [    7,     8,     9, 50256, 50256]])


### Creating Target Token Ids for Training

In [15]:
def custom_collate_draft_2(batch, pad_token_id=50256, device="cpu"):

    # Find the longest sequence in the current batch
    batch_max_length = max(len(item)+1 for item in batch)

    inputs_lst = []
    targets_lst = []
    
    for item in batch:
        new_item = item.copy() 
        # Add an <|endoftext|> token 
        new_item += [pad_token_id]
        # Pad sequences to batch_max_length
        padded = (
            new_item + [pad_token_id] * (batch_max_length - len(new_item))
        )
        inputs = torch.tensor(padded[:-1]) # Truncate the last token for inputs
        targets = torch.tensor(padded[1:]) # Shift +1 to thr right for targets
        inputs_lst.append(inputs)
        targets_lst.append(targets)

    # Convert list of inputs to tensor and transfer to target device
    inputs_tensor = torch.stack(inputs_lst).to(device)
    targets_tensor = torch.stack(targets_lst).to(device)

    return inputs_tensor, targets_tensor


In [16]:
# Example 
inputs_1 = [0, 1, 2, 3, 4]
inputs_2 = [5, 6]
inputs_3 = [7, 8, 9]

batch = [inputs_1, inputs_2, inputs_3]

inputs, targets = custom_collate_draft_2(batch)
print(inputs)
print("\n",targets)

tensor([[    0,     1,     2,     3,     4],
        [    5,     6, 50256, 50256, 50256],
        [    7,     8,     9, 50256, 50256]])

 tensor([[    1,     2,     3,     4, 50256],
        [    6, 50256, 50256, 50256, 50256],
        [    8,     9, 50256, 50256, 50256]])


In [17]:
# assign a -100 placeholder to the padding tokens

def custom_collate_draft_fn(batch, pad_token_id=50256,ignore_index=-100,allowed_max_length=None, device="cpu"):

    # Find the longest sequence in the current batch
    batch_max_length = max(len(item)+1 for item in batch)

    inputs_lst = []
    targets_lst = []
    
    for item in batch:
        new_item = item.copy() 
        # Add an <|endoftext|> token 
        new_item += [pad_token_id]
        # Pad sequences to batch_max_length
        padded = (
            new_item + [pad_token_id] * (batch_max_length - len(new_item))
        )
        inputs = torch.tensor(padded[:-1]) # Truncate the last token for inputs
        targets = torch.tensor(padded[1:]) # Shift +1 to thr right for targets

        # New: Replace all but the first padding tokens in targets by ignore_index
        mask = targets == pad_token_id
        indices = torch.nonzero(mask).squeeze()
        if indices.numel() > 1:
            targets[indices[1:]] = ignore_index

        # New: Optionally truncate to maximum sequence length
        if allowed_max_length is not None:
            inputs = inputs[:allowed_max_length]
            targets = targets[:allowed_max_length]

        inputs_lst.append(inputs)
        targets_lst.append(targets)

    # Convert list of inputs to tensor and transfer to target device
    inputs_tensor = torch.stack(inputs_lst).to(device)
    targets_tensor = torch.stack(targets_lst).to(device)

    return inputs_tensor, targets_tensor


In [18]:
# Example 
inputs_1 = [0, 1, 2, 3, 4]
inputs_2 = [5, 6]
inputs_3 = [7, 8, 9]

batch = [inputs_1, inputs_2, inputs_3]

inputs, targets = custom_collate_draft_fn(batch)
print(inputs)
print("\n",targets)

tensor([[    0,     1,     2,     3,     4],
        [    5,     6, 50256, 50256, 50256],
        [    7,     8,     9, 50256, 50256]])

 tensor([[    1,     2,     3,     4, 50256],
        [    6, 50256,  -100,  -100,  -100],
        [    8,     9, 50256,  -100,  -100]])


### Step 3: Creating Dataloaders from an Instruction Dataset

In [19]:
device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')
print(device)

cpu


In [20]:
from functools import partial

custom_collate_draft_fn = partial(
    custom_collate_draft_fn, device=device, allowed_max_length=1024
)

In [21]:
from torch.utils.data import DataLoader

num_workers = 0
batch_size = 8

torch.manual_seed(123)

train_dataset = InstructionDataset(train_data, tokenizer)
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    collate_fn=custom_collate_draft_fn,
    shuffle=True,
    drop_last=True,
    num_workers=num_workers
)

val_dataset = InstructionDataset(valid_data, tokenizer)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    collate_fn=custom_collate_draft_fn,
    shuffle=False,
    drop_last=False,
    num_workers=num_workers
)

test_dataset = InstructionDataset(test_data, tokenizer)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    collate_fn=custom_collate_draft_fn,
    shuffle=False,
    drop_last=False,
    num_workers=num_workers
)

In [25]:
print("Train loader:")
for inputs, targets in train_loader:
    print(inputs.shape, targets.shape)

Train loader:
torch.Size([8, 61]) torch.Size([8, 61])
torch.Size([8, 61]) torch.Size([8, 61])
torch.Size([8, 83]) torch.Size([8, 83])
torch.Size([8, 71]) torch.Size([8, 71])
torch.Size([8, 63]) torch.Size([8, 63])
torch.Size([8, 63]) torch.Size([8, 63])
torch.Size([8, 61]) torch.Size([8, 61])
torch.Size([8, 63]) torch.Size([8, 63])
torch.Size([8, 61]) torch.Size([8, 61])
torch.Size([8, 70]) torch.Size([8, 70])
torch.Size([8, 75]) torch.Size([8, 75])
torch.Size([8, 71]) torch.Size([8, 71])
torch.Size([8, 70]) torch.Size([8, 70])
torch.Size([8, 60]) torch.Size([8, 60])
torch.Size([8, 77]) torch.Size([8, 77])
torch.Size([8, 73]) torch.Size([8, 73])
torch.Size([8, 62]) torch.Size([8, 62])
torch.Size([8, 72]) torch.Size([8, 72])
torch.Size([8, 68]) torch.Size([8, 68])
torch.Size([8, 74]) torch.Size([8, 74])
torch.Size([8, 75]) torch.Size([8, 75])
torch.Size([8, 66]) torch.Size([8, 66])
torch.Size([8, 64]) torch.Size([8, 64])
torch.Size([8, 69]) torch.Size([8, 69])
torch.Size([8, 81]) torch.

### Step 4: Loading a Pretrained LLM

In [26]:
import numpy as np

def assign(left, right):
    if left.shape != right.shape:
        raise ValueError(f"Shapes mismatch. Left: {left.shape}, Right: {right.shape}")
    return torch.nn.Parameter(torch.tensor(right))


def load_weights_into_gpt(gpt, params):
    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params['wpe'])
    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params['wte'])

    for b in range(len(params["blocks"])):
        q_w, k_w, v_w = np.split(
            (params['blocks'][b]['attn']['c_attn'])['w'], 3, axis=-1)
        gpt.trf_blocks[b].att.W_query.weight = assign(
            gpt.trf_blocks[b].att.W_query.weight, q_w.T)
        gpt.trf_blocks[b].att.W_key.weight = assign(
            gpt.trf_blocks[b].att.W_key.weight, k_w.T)
        gpt.trf_blocks[b].att.W_value.weight = assign(
            gpt.trf_blocks[b].att.W_value.weight, v_w.T)
        
        q_b, k_b, v_b = np.split(
            (params['blocks'][b]['attn']['c_attn'])['b'], 3, axis=-1)
        gpt.trf_blocks[b].att.W_query.bias = assign(
            gpt.trf_blocks[b].att.W_query.bias, q_b)
        gpt.trf_blocks[b].att.W_key.bias = assign(
            gpt.trf_blocks[b].att.W_key.bias, k_b)
        gpt.trf_blocks[b].att.W_value.bias = assign(
            gpt.trf_blocks[b].att.W_value.bias, v_b)
        
        gpt.trf_blocks[b].att.out_proj.weight = assign(
            gpt.trf_blocks[b].att.out_proj.weight, 
            params['blocks'][b]['attn']['c_proj']['w'].T)
        gpt.trf_blocks[b].att.out_proj.bias = assign(
            gpt.trf_blocks[b].att.out_proj.bias, 
            params['blocks'][b]['attn']['c_proj']['b'])
        
        gpt.trf_blocks[b].ff.layers[0].weight = assign(
            gpt.trf_blocks[b].ff.layers[0].weight, 
            params['blocks'][b]['mlp']['c_fc']['w'].T)
        gpt.trf_blocks[b].ff.layers[0].bias = assign(
            gpt.trf_blocks[b].ff.layers[0].bias, 
            params['blocks'][b]['mlp']['c_fc']['b'])
        gpt.trf_blocks[b].ff.layers[2].weight = assign(
            gpt.trf_blocks[b].ff.layers[2].weight, 
            params['blocks'][b]['mlp']['c_proj']['w'].T)
        gpt.trf_blocks[b].ff.layers[2].bias = assign(
            gpt.trf_blocks[b].ff.layers[2].bias, 
            params['blocks'][b]['mlp']['c_proj']['b'])

        gpt.trf_blocks[b].norm1.scale = assign(
            gpt.trf_blocks[b].norm1.scale, 
            params['blocks'][b]['ln_1']['g'])
        gpt.trf_blocks[b].norm1.shift = assign(
            gpt.trf_blocks[b].norm1.shift, 
            params['blocks'][b]['ln_1']['b'])
        gpt.trf_blocks[b].norm2.scale = assign(
            gpt.trf_blocks[b].norm2.scale, 
            params['blocks'][b]['ln_2']['g'])
        gpt.trf_blocks[b].norm2.shift = assign(
            gpt.trf_blocks[b].norm2.shift, 
            params['blocks'][b]['ln_2']['b'])
        
    gpt.final_norm.scale = assign(gpt.final_norm.scale, params['g'])
    gpt.final_norm.shift = assign(gpt.final_norm.shift, params['b'])
    gpt.out_head.weight = assign(gpt.out_head.weight, params['wte'])


In [28]:
#in this section we will use a lager model (larger than 124M)
# gpt2-medium (355M) 1.5GB

from gpt_download3 import download_and_load_gpt2
from modules import GPTModel


BASE_CONFIG = {
    'vocab_size': 50257,
    'context_length': 1024,
    'drop_rate': 0.0,
    'qkv_bias': True,
}
model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25}
}

CHOOSE_MODEL = "gpt2-small (124M)"
# CHOOSE_MODEL = "gpt2-medium (355M)"

BASE_CONFIG.update(model_configs[CHOOSE_MODEL])

model_size = CHOOSE_MODEL.split(" ")[-1].lstrip('(').rstrip(')')
settings, params = download_and_load_gpt2(
    model_size=model_size,
    models_dir='gpt2'
)

model = GPTModel(BASE_CONFIG)
load_weights_into_gpt(model, params)
model.eval();



File already exists and is up-to-date: gpt2\124M\checkpoint




File already exists and is up-to-date: gpt2\124M\encoder.json




File already exists and is up-to-date: gpt2\124M\hparams.json




File already exists and is up-to-date: gpt2\124M\model.ckpt.data-00000-of-00001




File already exists and is up-to-date: gpt2\124M\model.ckpt.index




File already exists and is up-to-date: gpt2\124M\model.ckpt.meta




File already exists and is up-to-date: gpt2\124M\vocab.bpe


In [29]:
model

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.0, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=True)
        (W_key): Linear(in_features=768, out_features=768, bias=True)
        (W_value): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcuts): Dropout(p=0.0, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=

In [30]:
# Test
torch.manual_seed(123)
input_text = format_input(valid_data[0])
print(input_text)

Below is an instruction that describes a task.Write a response that appropriately completes the request.

### Instruction:
Convert the active sentence to passive: 'The chef cooks the meal every day.'


In [31]:
import torch

# the same func in 8-Lec27
def generate(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=None, eos_id=None):
    # get logits, and only focus on last time step
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :]

        # filter logits with tok_k sampling
        if top_k is not None:
            # keep only top_k values
            top_logits, _ = torch.topk(logits, top_k)
            min_val = top_logits[:, -1]
            logits = torch.where(logits < min_val, torch.tensor(float('-inf')).to(logits.device), logits)

        # apply temperature scaling
        if temperature > 0.0:
            logits = logits / temperature

            # apply softmax to get probas
            probs = torch.softmax(logits, dim=-1) # (batch size, context len)

            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (batch_size, 1)
        
        # otherwise get idx of the vocab entry with the highest logits value
        else:
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)

        if idx_next == eos_id:
            break # stop generating early if end-of-seq token is encountered

        idx = torch.cat((idx, idx_next), dim=1) # (batch_size, num_tokens+1)
    
    return idx


In [32]:
# Test before train the LLM with this data
from modules import text_to_token_ids, token_ids_to_text

token_ids = generate(
    model=model,
    idx=text_to_token_ids(input_text, tokenizer),
    max_new_tokens=35,
    context_size=BASE_CONFIG['context_length'],
    eos_id=50256
)
generated_text = token_ids_to_text(token_ids, tokenizer)
print(generated_text)

Below is an instruction that describes a task.Write a response that appropriately completes the request.

### Instruction:
Convert the active sentence to passive: 'The chef cooks the meal every day.'

### Instruction:

Convert the active sentence to passive: 'The chef cooks the meal every day.'

### Instruction:

Convert the active


In [33]:
response_text = generated_text[len(input_text):].strip()
print(response_text)

### Instruction:

Convert the active sentence to passive: 'The chef cooks the meal every day.'

### Instruction:

Convert the active


### Step 5: Finetuning the LLM on Instruction Data

In [35]:
def calc_loss_batch(input_batch, target_batch, model, device):
    input_batch, target_batch = input_batch.to(device), target_batch.to(device)
    logits = model(input_batch)
    loss = torch.nn.functional.cross_entropy(logits.flatten(0, 1), target_batch.flatten())
    return loss


# same as before 
# compute loss for a user specified number of batches
def calc_loss_loader(data_loader, model, device, num_batches=None):
    total_loss = 0.
    if len(data_loader) == 0:
        return float('nan')
    elif num_batches is None:
        num_batches = len(data_loader)
    else:
        # Reduce the number of batches to match the total number of batches in the data 
        # if num_batches exceeds the number of batches in the data loader
        num_batches = min(num_batches, len(data_loader))
    
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            total_loss += loss.item()
        else:
            break

    return total_loss / num_batches

In [36]:
from modules import text_to_token_ids, token_ids_to_text, generate_text_simple

# Calculate the loss over the training and validation set while ensuring the model is in evaluation mode with gradient tracking 
def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    model.eval() # dropout disable
    with torch.no_grad():
        train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)
        val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)
    model.train()
    return train_loss, val_loss


# convenience func thet we use to track whether th model improves during the training, its takes a text snippet (strat_context) as input, converts it into token IDs and feeds it to the LLM to generate a text
def generate_and_print_sample(model, tokenizer, device, start_context):
    model.eval()
    context_size = model.pos_emb.weight.shape[0]
    encoded = text_to_token_ids(start_context, tokenizer).to(device)
    with torch.no_grad():
        token_ids = generate_text_simple(
            model=model, idx=encoded, max_new_tokens=50, context_size=context_size
        )
    decoded_text = token_ids_to_text(token_ids, tokenizer)
    print(decoded_text.replace('\n', ' ')) # Compact print format
    model.train()


def train_model_simple(model, train_loader, val_loader, optimizer, device, num_epochs, eval_freq, eval_iter, start_context, tokenizer):

    # Initialize lists to track losses and tokens seen
    train_losses = []
    val_losses = []
    track_tokens_seen = []
    tokens_seen = 0
    global_step = -1

    # Main training loop
    for epoch in range(num_epochs):
        model.train() # Set model to training mode

        for input_batch, target_batch in train_loader:
            optimizer.zero_grad() # Reset loss gradients from previous batch iteration
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            loss.backward() # Calculate loss gradients
            optimizer.step() # Update model weights using loss grads
            tokens_seen += input_batch.numel() # Returns the total number of elements (or tokens) in 
            global_step += 1

            # Optional evaluation step
            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(
                    model, train_loader, val_loader, device, eval_iter)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_tokens_seen.append(tokens_seen)
                print(f'Ep {epoch+1} (Step {global_step:06d}): '
                      f'Train loss {train_loss:.3f}, Val loss {val_loss:.3f}')
                
        # Print a sample text after each epoch
        generate_and_print_sample(
            model, tokenizer, device, start_context
        )
    
    return train_losses, val_losses, track_tokens_seen

In [37]:
model.to(device)

torch.manual_seed(123)

with torch.no_grad():
    train_loss = calc_loss_loader(train_loader, model, device, num_batches=5)
    val_loss = calc_loss_loader(val_loader, model, device, num_batches=5)

print("Training loss:", train_loss)
print("Validation loss:", val_loss)

Training loss: 4.273827838897705
Validation loss: 4.151291799545288


In [38]:
format_input(valid_data[0])

"Below is an instruction that describes a task.Write a response that appropriately completes the request.\n\n### Instruction:\nConvert the active sentence to passive: 'The chef cooks the meal every day.'"

In [None]:
import time 

start_time = time.time()

torch.manual_seed(123)

optimizer = torch.optim.AdamW(model.parameters(), lr=0.00005, weight_decay=0.1)

num_epochs = 1

# train_losses, val_losses, tokens_seen = train_model_simple(
#     model, train_loader, val_loader, optimizer, device, num_epochs=num_epochs, eval_freq=5, eval_iter=5, start_context=format_input(valid_data[0]), tokenizer=tokenizer
# )
# every after each batches printing the training and validation loss
# huge number of params +120M
end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f'Training completed in {execution_time_minutes:.2f} minutes')

Ep 1 (Step 000000): Train loss 3.197, Val loss 3.139
Ep 1 (Step 000005): Train loss 1.656, Val loss 1.544
Ep 1 (Step 000010): Train loss 1.081, Val loss 1.159
Ep 1 (Step 000015): Train loss 1.055, Val loss 1.087
Ep 1 (Step 000020): Train loss 0.972, Val loss 1.040
Ep 1 (Step 000025): Train loss 0.921, Val loss 1.005
Ep 1 (Step 000030): Train loss 0.956, Val loss 0.973
Ep 1 (Step 000035): Train loss 0.865, Val loss 0.945
Ep 1 (Step 000040): Train loss 0.841, Val loss 0.934
Ep 1 (Step 000045): Train loss 0.764, Val loss 0.921
Ep 1 (Step 000050): Train loss 0.862, Val loss 0.908
Ep 1 (Step 000055): Train loss 0.926, Val loss 0.892
Ep 1 (Step 000060): Train loss 0.870, Val loss 0.875
Ep 1 (Step 000065): Train loss 0.796, Val loss 0.865
Ep 1 (Step 000070): Train loss 0.689, Val loss 0.858
Ep 1 (Step 000075): Train loss 0.705, Val loss 0.856
Ep 1 (Step 000080): Train loss 0.747, Val loss 0.842
Ep 1 (Step 000085): Train loss 0.675, Val loss 0.832
Ep 1 (Step 000090): Train loss 0.725, Val loss

### Save Model

In [69]:
torch.save(model.state_dict(), "finetuned_instructions.pth")

### Load Model

In [39]:
model_state_dict = torch.load("finetuned_instructions.pth")
model.load_state_dict(model_state_dict)

  model_state_dict = torch.load("finetuned_instructions.pth")


<All keys matched successfully>

### Step 6: Extracting and Saving Responses

In [None]:
torch.manual_seed(123)

for entry in test_data[:3]:
    input_text = format_input(entry)
    token_ids = generate(
        model=model,
        idx=text_to_token_ids(input_text, tokenizer).to(device),
        max_new_tokens=256,
        context_size=BASE_CONFIG['context_length'],
        eos_id=50256
    )
    generated_text = token_ids_to_text(token_ids, tokenizer)
    response_text = (
        generated_text[len(input_text):]
        .replace("### Response:","")
        .strip()
    )

    print(input_text) # instruction
    print(f"\nCorrect response:\n>> {entry['output']}")
    print(f"\Model response:\n>> {response_text.strip()}")
    print("-----------------------------------------")

Below is an instruction that describes a task.Write a response that appropriately completes the request.

### Instruction:
Rewrite the sentence using a simile.

### Input:
The car is very fast.

Correct response:
>> The car is as fast as lightning.
\Model response:
>> The car is very fast.
-----------------------------------------
Below is an instruction that describes a task.Write a response that appropriately completes the request.

### Instruction:
What type of cloud is typically associated with thunderstorms?

Correct response:
>> The type of cloud typically associated with thunderstorms is cumulonimbus.
\Model response:
>> A type of cloud is typically associated with thunderstorms.
-----------------------------------------
Below is an instruction that describes a task.Write a response that appropriately completes the request.

### Instruction:
Name the author of 'Pride and Prejudice'.

Correct response:
>> Jane Austen.
\Model response:
>> The author of 'Pride and Prejudice' is Rob

In [None]:
from tqdm import tqdm

for i, entry in tqdm(enumerate(test_data), total=len(test_data)):

    input_text = format_input(entry)

    token_ids = generate(
        model=model,
        idx=text_to_token_ids(input_text, tokenizer).to(device),
        max_new_tokens=256,
        context_size=BASE_CONFIG['context_length'],
        eos_id=50256
    )
    generated_text = token_ids_to_text(token_ids, tokenizer)
    response_text = (
        generated_text[len(input_text):]
        .replace("### Response:","")
        .strip()
    )
    test_data[i]['model_response'] = response_text

# with open("insturction-data-with-response.json", "w") as file:
#     json.dump(test_data, file, indent=4)

100%|██████████| 110/110 [19:51<00:00, 10.83s/it] 


In [40]:
print(test_data[0])

{'instruction': 'Rewrite the sentence using a simile.', 'input': 'The car is very fast.', 'output': 'The car is as fast as lightning.'}


### Step 7: Evaluating the Fine-Tuned LLM
- After extracting the responses by our finetuned LLM, we use another larger LLM to automatically evaluate these responses

- we utilize an existing instruction-finetuned 8 billion parameter Llama 3 model developed by Meta AI https://ollama.com

- we need to install ollama locally

In [None]:
# check if ollam is running
# run command : ollama serve

import psutil

def check_if_running(process_name):
    running = False
    for proc in psutil.process_iter(['name']):
        if process_name in proc.info['name']:
            running = True
            break
    return running

ollama_running = check_if_running('ollama')
if not ollama_running:
    raise RuntimeError('Ollama not running. Launch ollama before proceeding.')
print('Ollama running:', check_if_running('ollama'))

Ollama running: True


In [46]:
import urllib.request

def query_model(
        prompt,
        model='llama3',
        url='http://localhost:11434/api/chat'
):
    # Create the data payload as a dictionary
    data = {
        "model": model,
        "message": [
            {"role": "user", "content": prompt}
        ],
        "options": { 
            # Settings below are required for deterministic responses
            "seed": 123,
            "temperature": 0,
            "num_ctx": 2048
        }
    }

    # Convert the dictionary to a JSON formatted string and encode it to bytes
    payload = json.dumps(data).encode("utf-8")

    # Create a request object, setting the method to POST and adding necessary headers
    request = urllib.request.Request(
        url,
        data=payload,
        method="POST"
    )
    request.add_header('Content-Type', 'application/json')

    # Send the request and capture the response
    response_data = ""
    with urllib.request.urlopen(request) as response:
        # Read and decode the response
        while True:
            line = response.readline().decode('utf-8')
            if not line:
                break
            response_json = json.loads(line)
            response_data += response_json['message']['content']
    
    return response_data

In [48]:
model = 'llama3'
result = query_model('What do Llamas eat?', model)
print(result)




In [None]:
print(result)




In [None]:
for entry in test_data[:2]:
    prompt = (
        f"Given the input `{format_input(entry)}` "
        f"and correct output `{entry['output']}`, "
        f"score the model response `{entry['model_response']}`"
        f" on a scale from 0 to 100, where 100 is the best score."
    )

    print("\nDataset response:")
    print(">>", entry['output'])
    print("\nModel response:")
    print(">>", entry['model_response'])
    print("\nScore:")
    print(">>", query_model(prompt))
    print('------------------------------')

In [None]:
for entry in test_data[:2]:
    prompt = (
        f"Given the input `{format_input(entry)}` "
        f"and correct output `{entry['output']}`, "
        f"score the model response `{entry['model_response']}`"
        f" on a scale from 0 to 100, where 100 is the best score. "
        # New
        f"Respond with the integer number only."
    )

    score = query_model(prompt, model)
    print("\nDataset response:")
    print(">>", entry['output'])
    print("\nModel response:")
    print(">>", entry['model_response'])
    print("\nScore:")
    print(">>", query_model(prompt, model))
    print('------------------------------')