In [None]:
## preparing the dataset for supervised instruction fine-tuning 
import json
import os 
import urllib
import urllib.request
import urllib.response

def download_load_file(file_path , url) : 
    """Return the parsed JSON document stored at ``file_path``.

    If ``file_path`` does not exist yet, the document is first fetched from
    ``url`` and written to ``file_path`` so that later calls read the local
    copy instead of downloading again.

    Args:
        file_path: Location of the local copy of the JSON file.
        url: Address to download from; only used when ``file_path`` is missing.

    Returns:
        The Python object produced by decoding the JSON text.

    Raises:
        urllib.error.URLError: If the download is needed and fails.
        json.JSONDecodeError: If the text is not valid JSON.
    """
    if not os.path.exists(file_path):
        with urllib.request.urlopen(url) as response:
            text_data = response.read().decode("utf-8")
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(text_data)
    else:
        with open(file_path, "r", encoding="utf-8") as file:
            text_data = file.read()
    # Parse the text already in memory. Re-opening the file without an explicit
    # encoding would decode it with the platform default, which is not always UTF-8.
    return json.loads(text_data)

In [None]:
# Address of the instruction dataset (a JSON file in the
# rasbt/LLMs-from-scratch GitHub repository, chapter 7 folder).
url = (
"https://raw.githubusercontent.com/rasbt/LLMs-from-scratch"
"/main/ch07/01_main-chapter-code/instruction-data.json"
)
# Local copy; download_load_file only uses the URL when this file is missing.
file_path = "instruction-data.json"

data = download_load_file(file_path , url=url)
print(f"Number of entries : {len(data)}")

In [None]:
"""" 
Example of data : {'instruction': 'Identify the correct spelling of the following word.', 
                    'input': 'Ocassion', 'output': "The correct spelling is 'Occasion.'"}
"""
# Show one raw record (index 50) to inspect the fields of a dataset entry.
print(f"Example of data : {data[50]}")

In [None]:
### Converting the entries in the dataset into Alpaca style
def format_into_alpaca(entry) : 
    """Format one dataset entry as an Alpaca-style prompt and its response.

    Args:
        entry: Mapping with the string fields ``"instruction"``, ``"input"``
            and ``"output"``.

    Returns:
        A ``(prompt, response)`` tuple of strings. ``prompt`` is the fixed
        preamble followed by the ``### Instruction:`` and ``### Input:``
        sections; ``response`` is the ``### Response:`` section. Concatenating
        the two gives the full training example.

    Raises:
        KeyError: If one of the three fields is missing from ``entry``.
    """
    instruction_text = (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request."
        f"\n\n### Instruction:\n{entry['instruction']}"
    )
    # NOTE(review): the "### Input:" header is emitted even when the input is an
    # empty string. Kept as is, because changing the prompt layout would no longer
    # match what an already fine-tuned checkpoint was trained on - confirm intent.
    input_text = f"\n\n### Input:\n{entry['input']}"
    # Single quotes inside the replacement field: reusing the enclosing double
    # quotes is a SyntaxError on Python versions older than 3.12.
    output_text = f"\n\n### Response:\n{entry['output']}"
    return instruction_text + input_text , output_text

In [None]:
# Render one complete example: the prompt followed by the expected response.
model_input , target_output = format_into_alpaca(data[50])
print(model_input + target_output)

In [None]:
# Split sizes: 85 % training, 10 % validation, whatever is left for testing.
n_entries = len(data)
train_portion = int(n_entries * 0.85)
val_portion = int(n_entries * 0.1)
test_portion = n_entries - train_portion - val_portion

# The three subsets are consecutive slices of `data`, taken in file order.
val_end = train_portion + val_portion
train_data, val_data, test_data = (
    data[:train_portion],
    data[train_portion:val_end],
    data[val_end:],
)

print(f"training set length   : {len(train_data)}")
print(f"validation set length : {len(val_data)}")
print(f"test set length       : {len(test_data)}")

In [None]:
### organizing data into training batches : 
import torch 
from torch.utils.data import Dataset

class InstructionDataset(Dataset) : 
    """Dataset whose items are instruction examples tokenized at construction.

    Every entry of ``data`` is rendered with ``format_into_alpaca`` (prompt
    followed by response) and passed once through ``tokenizer.encode``.
    Indexing returns the stored encoding of the entry at that position.
    """

    def __init__(self,data,tokenizer) : 
        self.data = data
        self.tokenizer = tokenizer

        def encode_entry(entry):
            # Prompt and response are joined into the single text the model sees.
            prompt, response = format_into_alpaca(entry)
            return tokenizer.encode(prompt + response)

        # Encode everything up front so that item access is a plain lookup.
        self.encoded_texts = [encode_entry(entry) for entry in data]

    def __len__(self) : 
        """Number of encoded examples."""
        return len(self.encoded_texts)

    def __getitem__(self, index):
        """Encoding of the example at ``index``."""
        return self.encoded_texts[index]

In [None]:
import tiktoken
# GPT-2 encoding from tiktoken; the same tokenizer object is reused by the later cells.
tokenizer = tiktoken.get_encoding("gpt2")
treated_data = InstructionDataset(train_data , tokenizer)
# Sanity check: the encoded form of the first example, then the same ids decoded back to text.
print(treated_data[0])
print(f"\nraw data :\n",tokenizer.decode(treated_data[0]))

In [None]:
def custom_collate_draft_1(batch,pad_token_id=50256,device="cpu") : 
    """Pad token-id lists to a common length and stack them into one tensor.

    First draft of the collate function: it only builds the model inputs.

    Args:
        batch: Collection of token-id lists (each item must offer ``.copy()``).
        pad_token_id: Id used to fill the shorter sequences.
        device: Device the stacked tensor is moved to.

    Returns:
        Tensor with one row per item; the row width equals the length of the
        longest item in ``batch``.
    """
    # Each sequence receives one extra pad token, hence "longest + 1" ...
    width = max(len(seq) + 1 for seq in batch)
    rows = []
    for seq in batch:
        tokens = seq.copy()  # leave the caller's list untouched
        tokens.append(pad_token_id)
        tokens.extend([pad_token_id] * (width - len(tokens)))
        # ... and dropping the final position brings the row back to "longest".
        rows.append(torch.tensor(tokens[:-1]))
    return torch.stack(rows).to(device)

In [None]:
# Toy batch: three token-id lists of different lengths, used to exercise the padding.
inputs_1 = [0, 1, 2, 3, 4]
inputs_2 = [5, 6]
inputs_3 = [7, 8, 9]
batch = (
inputs_1,
inputs_2,
inputs_3
)
print(custom_collate_draft_1(batch))

In [None]:
def custom_collate_draft_2(batch,pad_token_id=50256,device="cpu") : 
    """Build padded input and target tensors from token-id lists.

    Second draft of the collate function: besides the inputs it also returns
    the targets, which are the same padded sequence shifted left by one.

    Args:
        batch: Collection of token-id lists (each item must offer ``.copy()``).
        pad_token_id: Id appended after each sequence and used as filler.
        device: Device both tensors are moved to.

    Returns:
        ``(inputs, targets)``; both have one row per item and a width equal to
        the length of the longest item in ``batch``.
    """
    width = max(len(seq) + 1 for seq in batch)
    input_rows, target_rows = [], []
    for seq in batch:
        tokens = seq.copy()  # leave the caller's list untouched
        tokens.append(pad_token_id)
        tokens.extend([pad_token_id] * (width - len(tokens)))
        # Inputs drop the last position, targets drop the first one.
        input_rows.append(torch.tensor(tokens[:-1]))
        target_rows.append(torch.tensor(tokens[1:]))
    stacked_inputs = torch.stack(input_rows).to(device)
    stacked_targets = torch.stack(target_rows).to(device)
    return stacked_inputs, stacked_targets

In [None]:
# Run the second draft on the toy batch and show inputs and targets.
inputs , targets = custom_collate_draft_2(batch)
print(inputs)

print(targets)

In [None]:
"""    
tensor([[    1,     2,     3,     4, 50256],      In order to prevent the padding tokens from contributing to the loss calculation during training,
        [    6, 50256, 50256, 50256, 50256],      we replace them with the placeholder value -100, except for the first one (the end-of-text marker),
        [    8,     9, 50256, 50256, 50256]])     ===> only meaningful tokens can contribute to the loss.
        
tensor([[    1,     2,     3,     4, 50256],
        [    6, 50256,  -100,  -100,  -100],
        [    8,     9, 50256,  -100,  -100]])

"""
def custom_collate_draft_fn(batch,pad_token_id=50256,device="cpu",ignore_index=-100,allowed_max_length=None) : 
    """Collate token-id sequences into padded ``(inputs, targets)`` tensors.

    Each sequence gets one ``pad_token_id`` appended as its end-of-text marker
    and is then padded to the longest sequence of the batch. ``targets`` is the
    same padded sequence shifted left by one position. In ``targets`` the
    end-of-text marker is kept and every padding position after it is replaced
    by ``ignore_index`` so that padding can be left out of the loss.

    Example with the default arguments and
    ``batch = ([0, 1, 2, 3, 4], [5, 6], [7, 8, 9])``::

        inputs  = [[0, 1, 2, 3, 4], [5, 6, 50256, 50256, 50256], [7, 8, 9, 50256, 50256]]
        targets = [[1, 2, 3, 4, 50256], [6, 50256, -100, -100, -100], [8, 9, 50256, -100, -100]]

    Args:
        batch: Non-empty collection of token-id sequences (lists, tuples, ...).
            It is traversed twice, so it must not be a one-shot iterator.
        pad_token_id: Id used both as end-of-text marker and as filler.
        device: Device both tensors are moved to.
        ignore_index: Value written over the padding positions of ``targets``.
        allowed_max_length: If given, every row is cut to at most this many
            positions after padding and masking.

    Returns:
        ``(inputs, targets)`` tensors with one row per item of ``batch``.

    Raises:
        ValueError: If ``batch`` is empty.
    """
    batch_max_length = max(len(item)+1 for item in batch)
    inputs_lst , targets_lst = [] , []
    for item in batch : 
        # list() copies the item and also accepts tuples and other sequences.
        tokens = list(item)
        # At least one pad token is always appended (the end-of-text marker).
        padded = tokens + [pad_token_id] * (batch_max_length - len(tokens))
        inputs = torch.tensor(padded[:-1])
        targets = torch.tensor(padded[1:])

        # Mask by position, not by value: a genuine `pad_token_id` inside the
        # content must not be mistaken for padding. In `targets` the marker
        # sits at index len(tokens) - 1; everything after it is filler.
        first_pad = max(len(tokens) - 1, 0)
        targets[first_pad + 1:] = ignore_index

        if allowed_max_length is not None : 
            inputs = inputs[:allowed_max_length]
            targets = targets[:allowed_max_length]
        inputs_lst.append(inputs)
        targets_lst.append(targets)
    inputs_tensor = torch.stack(inputs_lst).to(device)
    targets_tensor = torch.stack(targets_lst).to(device)
    return inputs_tensor , targets_tensor

In [None]:
# Run the final collate function on the toy batch; the targets should now show the ignore value in the padding positions.
inputs , targets = custom_collate_draft_fn(batch)
print(inputs)
print(f"\ntargets:")
print(targets)

In [None]:
# Use the GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

In [None]:
from functools import partial
# Pre-bind the target device and a 1024-position cap so the resulting callable
# only needs the batch argument.
customized_collate_draft_fn = partial(
    custom_collate_draft_fn,
    device=device,
    allowed_max_length = 1024
)

In [None]:
### Creating DataLoaders for the instruction dataset
from torch.utils.data import DataLoader
# Kept at 0: the collate function moves tensors to `device`, and PyTorch advises
# against returning CUDA tensors from DataLoader worker processes.
num_workers = 0
batch_size  = 8
torch.manual_seed(123)

# All three loaders use `customized_collate_draft_fn`, the partial that pre-binds
# `device` and `allowed_max_length`. Passing the bare `custom_collate_draft_fn`
# would silently drop the length cap and leave the batches on the CPU.

# Training: shuffled, and the last incomplete batch is dropped.
train_dataset = InstructionDataset(train_data,tokenizer)
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    collate_fn=customized_collate_draft_fn,
    shuffle=True,
    drop_last=True,
    num_workers=num_workers
)
# Validation and test: fixed order, every example kept.
val_dataset = InstructionDataset(val_data,tokenizer)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=batch_size,
    collate_fn=customized_collate_draft_fn,
    shuffle=False,
    drop_last=False,
    num_workers=num_workers
)
test_dataset = InstructionDataset(test_data,tokenizer)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    collate_fn=customized_collate_draft_fn,
    shuffle=False,
    drop_last=False,
    num_workers=num_workers
)

In [None]:
# Peek at the tensor shapes of the first five training batches.
print("Train Loader : ")
i = 0
for i, (inputs, targets) in enumerate(train_loader, start=1):
    print("Inputs shape : ", inputs.shape, "-- Targets shape : ", targets.shape)
    if i == 5:
        break
    

In [None]:
from gpt_download import download_and_load_gpt2
from modules import GPTModel
from modules import load_weights_into_gpt

In [None]:
chosen_model = "gpt2_medium (355M)"
input_prompt = "Every effort moves you"

# Size-specific dimensions, keyed by the model name.
models_config = {
    "gpt2_small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2_medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2_large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2_xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

# Settings shared by every size; the chosen model's dimensions are merged in below.
base_config = {
    "vocab_size": 50257,
    "context_length": 1024,
    "drop_rate": 0.0,
    "qkv_bias": True,
}
base_config.update(models_config[chosen_model])
print(base_config)

In [None]:
# Extract the size tag from the model name, e.g. "gpt2_medium (355M)" -> "355M".
model_size = chosen_model.split(" ")[-1].lstrip("(").rstrip(")")
print(f"model size : ",model_size)
# download_and_load_gpt2 comes from the local gpt_download module; presumably it
# fetches the checkpoint into the "gpt2" directory - confirm there.
settings , params = download_and_load_gpt2(
    model_size=model_size,
    models_dir="gpt2"
)

In [None]:
# Build the model from base_config, then hand it and the loaded `params` to
# load_weights_into_gpt (presumably copies the pretrained weights in - see modules).
gpt = GPTModel(base_config)
load_weights_into_gpt(gpt,params)
# Switch to evaluation mode before generating text.
gpt.eval()

In [None]:
# Fixed seed, then build the prompt for the first validation entry.
torch.manual_seed(123)
input_text , target_text = format_into_alpaca(val_data[0])
print(input_text)

In [None]:
from modules import generate , text_to_token_ids , ids_token_to_text
# Let the not-yet-fine-tuned model continue the prompt; 50256 is passed as the
# end-of-sequence id and at most 35 new tokens are requested.
token_ids = generate(
    model=gpt,
    idx=text_to_token_ids(input_text,tokenizer),
    max_new_tokens=35,
    context_size=base_config["context_length"],
    eos_id=50256
)


In [None]:
print(f"Generated text : \n{ids_token_to_text(token_ids , tokenizer)}")
### The generate function was originally written for the pre-training stage, where
### the task is text completion, so its output contains the prompt followed by the
### newly generated text.

In [None]:
# Cut off the first len(input_text) characters (the echoed prompt) to keep only the model's answer.
response_text = ids_token_to_text(token_ids,tokenizer)[len(input_text):].strip()
print(response_text) ## only the response to the input instruction is shown here

In [None]:
# Re-import the local `modules` file so that edits made to it since the first
# import are picked up without restarting the kernel.
from importlib import reload
import modules
reload(modules)

from modules import train_model_simple , calc_loss_loader


In [None]:
# Move the model to the selected device before measuring the loss.
gpt.to(device)
# Baseline loss before fine-tuning; gradients are not needed for this measurement.
with torch.no_grad():
    # num_batches=5 is forwarded to calc_loss_loader - presumably it limits the
    # evaluation to five batches; confirm in modules.
    train_loss = calc_loss_loader(train_loader,gpt,device,num_batches=5)
    val_loss = calc_loss_loader(val_loader,gpt,device,num_batches=5)
print(f"Training Loss   : {train_loss:.3f}")
print(f"Validation Loss : {val_loss:.3f}")

In [None]:
"""  
    Training will be done on Kaggle; then we will download the weights and complete the work here.
"""
import time 
# Prompt of the first validation entry; handed to the training helper as start_context.
start_context  , target_context= format_into_alpaca(val_data[0])
# Wall-clock timing of the whole training call.
start_time = time.time()
optimizer = torch.optim.AdamW(gpt.parameters(),lr=0.00005,weight_decay=0.1)
num_epochs = 5
# train_model_simple comes from the local modules file; the exact meaning of
# eval_freq / eval_iter is defined there - TODO confirm.
train_losses , val_losses , tokens_seen = train_model_simple(
    model=gpt,train_loader=train_loader,
    val_loader=val_loader,optimizer=optimizer,
    device=device,num_epochs=num_epochs,
    eval_freq=50,eval_iter=1,
    start_context=start_context,tokenizer=tokenizer
)

end_time = time.time()
# Elapsed seconds converted to minutes.
exec_time = (end_time - start_time) / 60
print(f"Execution time : {exec_time:.3f} minutes")


In [None]:
# Rebuild the architecture and load the fine-tuned weights from disk.
model_fin_tuned = GPTModel(base_config)
# map_location remaps the stored tensors onto the local device. Without it, a
# checkpoint saved from a GPU cannot be loaded on a machine that has no CUDA.
state_dict = torch.load("instruction.pth", map_location=device, weights_only=True)
model_fin_tuned.load_state_dict(state_dict)
model_fin_tuned.to(device)
# Evaluation mode for the generation below.
model_fin_tuned.eval()


In [None]:
# Compare the fine-tuned model's answers with the reference answers for the
# first three test entries.
for entry in test_data[:3]:
    input_text, target = format_into_alpaca(entry)
    prompt_ids = text_to_token_ids(input_text, tokenizer).to(device)
    token_ids = generate(
        model=model_fin_tuned,
        idx=prompt_ids,
        max_new_tokens=35,
        context_size=base_config["context_length"],
        eos_id=50256,
    )
    gen_text = ids_token_to_text(token_ids, tokenizer)

    # Skip the echoed prompt, then remove any "### Response:" header from the rest.
    completion = gen_text[len(input_text):]
    response_text = completion.replace("### Response:", "").strip()

    print(input_text)
    print(f"\nCorrect response:\n>> {entry['output']}")
    print(f"\nModel response:\n>> {response_text.strip()}")
    print("-------------------------------------")