Let's load the dataset

In [1]:
import json

dataset_file = "New_data.json"

# The records contain non-ASCII text, so decode explicitly as UTF-8 instead of
# relying on the platform's default locale encoding.
with open(dataset_file, "r", encoding="utf-8") as f:
    alpaca = json.load(f)

In [2]:
type(alpaca), alpaca[0:3], len(alpaca)

(list,
 [{'instruction': 'Generate search keywords for searching images/videos from the given paragraph. Consider relevant elements and keywords should capture the essence and context of the paragraph while aligning with the overall meaning of the input. Your output should be a list of python strings, and each string will correspond to a sentence in the input with a search keyword. All should have string values. The length of the list and the number of sentences in the given paragraph should be the same.',
   'input': " In a memorable opening match, Germany triumphed over Scotland, leaving the Scots in despair. The final score: Germany 5, Scotland 1. After Wirtz scored in the 10th minute, the game turned into a nightmare for Scotland.   Germany's victory was a history-making event, with Musiala, Havertz, Füllkrug, and Can scoring four additional goals. Scotland's defeat was sealed when Ryan Porteous was sent off, leaving Havertz to score from the penalty spot.   This match was unique i

In [3]:
import wandb

# log to wandb
# Log the raw dataset to W&B, both as a versioned file artifact and as a browsable table.
with wandb.init(project="data_ft_llama3LoRA"):
    at = wandb.Artifact(
        name="data",
        type="dataset",
        description="Using the data we finetune",
    )
    at.add_file(dataset_file)
    # Building the artifact is not enough: it has to be logged to be attached to the run.
    wandb.log_artifact(at)

    # log as a table
    # Columns come from the first record; every row is assumed to list the same keys in the same order.
    table = wandb.Table(columns=list(alpaca[0].keys()))
    for row in alpaca:
        table.add_data(*row.values())
    wandb.log({"data_table": table})

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33manishproshort[0m ([33mps-ml-team[0m). Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='31.893 MB of 31.893 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

In [4]:
import random

# Seed reused by the later cells that read `seed`.
seed = 42

# Seed Python's RNG, then shuffle `alpaca` in place.
# NOTE(review): the shuffle mutates the list, so re-running this cell without
# reloading the data shuffles an already-shuffled list and yields a different order.
random.seed(seed)
random.shuffle(alpaca)  # this could also be a parameter

In [5]:
# Hold out the last 1000 records of the shuffled list for evaluation; the rest is training data.
train_dataset = alpaca[:-1000]
eval_dataset = alpaca[-1000:]

In [6]:
import pandas as pd

# Materialize both splits as DataFrames so they can be written to disk and logged as tables.
train_df = pd.DataFrame(train_dataset)
eval_df = pd.DataFrame(eval_dataset)

train_table = wandb.Table(dataframe=train_df)
eval_table  = wandb.Table(dataframe=eval_df)

# One JSON object per line (JSON Lines); these files are read back by a later cell.
train_df.to_json("data_train.jsonl", orient='records', lines=True)
eval_df.to_json("data_eval.jsonl", orient='records', lines=True)

# Record the split in its own W&B run: both files go into one artifact, and the
# two tables are logged for inspection.
with wandb.init(project="data_ft_llama3LoRA", job_type="split_data"):
    at = wandb.Artifact(
        name="data_splitted", 
        type="dataset",
        description="Using the data we finetune",
    )
    at.add_file("data_train.jsonl")
    at.add_file("data_eval.jsonl")
    wandb.log_artifact(at)
    wandb.log({"train_dataset":train_table, "eval_dataset":eval_table})

VBox(children=(Label(value='31.894 MB of 31.894 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

In [7]:
def prompt_input(row):
    """Render one record as an instruction/input prompt that ends at the response header.

    `row` must be a mapping providing the `instruction` and `input` keys; any
    other keys are ignored.
    """
    preamble = ("Below is an instruction that describes a task, paired with an input that provides further context. "
                "Write a response that appropriately completes the request.\n\n")
    sections = "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"
    template = preamble + sections
    return template.format_map(row)

In [8]:
'''import json
from wandb import Api

api = Api()
artifact = api.artifact('data_ft/data_splitted', type='dataset')
dataset_dir = artifact.download()'''

def load_jsonl(file_path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        file_path: Path to a file holding one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit UTF-8 keeps decoding independent of the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        # Blank lines (e.g. a trailing empty line) carry no record; skip them
        # rather than letting json.loads fail on an empty string.
        return [json.loads(line) for line in file if line.strip()]
    
train_dataset = load_jsonl("data_train.jsonl")
eval_dataset = load_jsonl("data_eval.jsonl")

In [9]:
print(train_dataset[0])

{'instruction': 'Generate search keywords for searching images/videos from the given paragraph. Consider relevant elements and keywords should capture the essence and context of the paragraph while aligning with the overall meaning of the input. Your output should be a list of python strings, and each string will correspond to a sentence in the input with a search keyword. All should have string values. The length of the list and the number of sentences in the given paragraph should be the same.', 'input': " The small village of Eldoria lay nestled in a lush valley, encircled by the towering peaks of the Azure Mountains.   Among the villagers was a young girl named Elara. She was known for her vibrant spirit and insatiable curiosity, always asking questions about the world beyond their serene village.   Her grandmother, Liora, was the village's wise woman, revered for her knowledge of ancient spells and forgotten lore. Elara spent countless hours in her grandmother’s cozy cottage, list

In [10]:
train_prompts = [prompt_input(row) for row in train_dataset]
eval_prompts = [prompt_input(row) for row in eval_dataset]

In [11]:
print(train_prompts[0])

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Generate search keywords for searching images/videos from the given paragraph. Consider relevant elements and keywords should capture the essence and context of the paragraph while aligning with the overall meaning of the input. Your output should be a list of python strings, and each string will correspond to a sentence in the input with a search keyword. All should have string values. The length of the list and the number of sentences in the given paragraph should be the same.

### Input:
 The small village of Eldoria lay nestled in a lush valley, encircled by the towering peaks of the Azure Mountains.   Among the villagers was a young girl named Elara. She was known for her vibrant spirit and insatiable curiosity, always asking questions about the world beyond their serene village.   Her grandmother, Liora, wa

In [12]:
def pad_eos(ds, eos_token="</s>"):
    """Return each row's `output` text with an end-of-sequence marker appended.

    Args:
        ds: Iterable of mappings that each contain an `output` key.
        eos_token: String appended to every output. Defaults to "</s>", the value
            that used to be hard-coded here, so existing calls behave the same.

    Returns:
        A list of strings, one per row, in input order.
    """
    # NOTE(review): this notebook later loads a Llama-3 tokenizer; confirm that "</s>"
    # matches `tokenizer.eos_token`, otherwise pass the tokenizer's EOS string explicitly.
    return [f"{row['output']}{eos_token}" for row in ds]

In [13]:
train_outputs = pad_eos(train_dataset)
eval_outputs = pad_eos(eval_dataset)
train_outputs[0]

"['Small Village Eldoria Azure Mountains', 'Curious young girl village', 'Grandmother telling stories cottage', 'Girl finding old map in forest', 'girl showing map to grandmother', 'Phoenix map leading to Sacred Springs']</s>"

In [14]:
train_dataset = [{"prompt":s, "output":t, "example": s + t} for s, t in zip(train_prompts, train_outputs)]
eval_dataset = [{"prompt":s, "output":t, "example": s + t} for s, t in zip(eval_prompts, eval_outputs)]

In [15]:
print(train_dataset[0]["example"])

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Generate search keywords for searching images/videos from the given paragraph. Consider relevant elements and keywords should capture the essence and context of the paragraph while aligning with the overall meaning of the input. Your output should be a list of python strings, and each string will correspond to a sentence in the input with a search keyword. All should have string values. The length of the list and the number of sentences in the given paragraph should be the same.

### Input:
 The small village of Eldoria lay nestled in a lush valley, encircled by the towering peaks of the Azure Mountains.   Among the villagers was a young girl named Elara. She was known for her vibrant spirit and insatiable curiosity, always asking questions about the world beyond their serene village.   Her grandmother, Liora, wa

In [16]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

In [17]:
model_id = 'meta-llama/Meta-Llama-3-8B-Instruct'
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### Packing

We will pack multiple short examples into a longer chunk, so we can train more efficiently!

The main idea here is that the instruction/output samples are short, so let's concatenate a bunch of them together separated by the `EOS` token. We can also pre-tokenize and pre-pack the dataset and make everything faster!  If we define a `max_seq_len = 1024` the code to pack would look something like this:

In [18]:
max_sequence_len = 1024

def pack(dataset, max_seq_len=max_sequence_len):
    """Tokenize every `example`, concatenate the ids, and cut them into fixed-size rows.

    The token stream is split into consecutive windows of `max_seq_len + 1` ids.
    Each full window yields one row whose `labels` are its `input_ids` shifted
    left by one position; a trailing window that is too short is dropped.
    Uses the notebook-level `tokenizer`.
    """
    texts = [sample["example"] for sample in dataset]

    # Flatten the per-example id lists into one long stream.
    token_stream = []
    for ids in tokenizer(texts)["input_ids"]:
        token_stream += ids

    print(f"Total number of tokens: {len(token_stream)}")

    window = max_seq_len + 1  # one extra id so inputs and shifted labels both have max_seq_len entries
    packed_rows = []
    for start in range(0, len(token_stream), window):
        chunk = token_stream[start:start + window]
        if len(chunk) != window:
            continue  # incomplete tail
        # the shift is done here because the training loss is computed directly on the logits
        packed_rows.append({"input_ids": chunk[:-1], "labels": chunk[1:]})
    return packed_rows

In [19]:
train_ds_packed = pack(train_dataset)
eval_ds_packed = pack(eval_dataset)
len(train_ds_packed)

Total number of tokens: 3260445
Total number of tokens: 379440


3180

### DataLoader
As we are training for completion, the labels (or targets) will be the inputs shifted by one. We are going to train with regular Cross Entropy and predict the next token on this packed dataset.

In [20]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

torch.manual_seed(seed)
batch_size = 4  # sized for an A100 GPU with 40GB of RAM

# Both loaders share the batch size and the stock collator; the packed rows need no special collation.
loader_kwargs = dict(batch_size=batch_size, collate_fn=default_data_collator)

train_dataloader = DataLoader(train_ds_packed, **loader_kwargs)
eval_dataloader = DataLoader(eval_ds_packed, shuffle=False, **loader_kwargs)

It is always a good idea to check what a batch looks like; you can quickly do this by sampling from the DataLoader.

In [21]:
b = next(iter(train_dataloader))
b

{'input_ids': tensor([[128000,  39314,    374,  ...,   3862,   6634, 101565],
         [  5320, 107633,     11,  ...,  50228,    106, 100278],
         [   100,  87648,  28025,  ...,    223,  11372,    100],
         [   223,  36278,    237,  ...,   1988,    430,   5825]]),
 'labels': tensor([[ 39314,    374,    459,  ...,   6634, 101565,  14115],
         [107633,     11,    841,  ...,    106, 100278,  36278],
         [ 87648,  28025,    222,  ...,  11372,    100,  28025],
         [ 36278,    237,  11372,  ...,    430,   5825,   4726]])}

We can also decode the batch just to be super sure.

In [22]:
tokenizer.decode(b["input_ids"][0])[:250]

'<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nGenerate search keywords for searching images/videos from'

In [23]:
tokenizer.decode(b["labels"][0])[:250]

'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nGenerate search keywords for searching images/videos from the given paragr'

## Train

I like storing all my hyperparameters into a `SimpleNamespace`, it's like a dict but with .dot attribute access. Then I can access my batch size by doing config.batch_size instead of config["batch_size"]. 

In [24]:
from types import SimpleNamespace

# Number of micro-batches whose gradients are combined into one optimizer update.
gradient_accumulation_steps = 2

# All run hyperparameters in one place; this namespace is also handed to wandb.init as the run config.
config = SimpleNamespace(
    model_id='meta-llama/Meta-Llama-3-8B-Instruct',
    dataset_name="data",
    precision="bf16",  # faster and better than fp16, requires new GPUs
    lr=2e-5,
    n_eval_samples=10, # How many samples to generate on validation
    max_seq_len=max_sequence_len, # Length of the sequences to pack
    epochs=3,  # we do 3 passes over the dataset.
    gradient_accumulation_steps=gradient_accumulation_steps,  # every how many iterations we update the weights; simulates larger batch sizes
    batch_size=batch_size,  # what my GPU can handle, depends on how many layers we are training
    log_model=False,  # upload the model to W&B?
    gradient_checkpointing = True,  # saves even more memory; NOTE(review): not read anywhere in the visible notebook
    freeze_embed = True,  # why train this? let's keep them frozen; NOTE(review): not read anywhere in the visible notebook
    seed=seed,
)

# Planned number of optimizer updates: (epochs * batches per epoch) floor-divided by the accumulation factor.
config.total_train_steps = config.epochs * len(train_dataloader) // config.gradient_accumulation_steps

In [25]:
print(f"We will train for {config.total_train_steps} steps and evaluate every epoch")

We will train for 1192 steps and evaluate every epoch


We first get a pretrained model with some configuration parameters

In [26]:
from transformers import BitsAndBytesConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model
# 4-bit NF4 quantization settings with bf16 compute.
# NOTE(review): `quantization_config` is built here but never passed to
# `from_pretrained` below, so the model is loaded in plain bf16. Either pass it
# or remove it, depending on whether quantized loading is intended.
quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,)
# Load the base model onto device 0 in bf16, with the KV cache disabled
model_id = 'meta-llama/Meta-Llama-3-8B-Instruct'
model = AutoModelForCausalLM.from_pretrained(model_id, device_map=0,trust_remote_code=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
    use_cache=False)

# Configure LoRA: rank-128 adapters on the attention and MLP projection layers listed below
lora_config = LoraConfig(
    r=128,
    lora_alpha=64,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules= ['k_proj', 'q_proj', 'v_proj', 'o_proj', "gate_proj", "down_proj", "up_proj"]
)

# Wrap the model with LoRA; `model` now refers to the PEFT-wrapped model
model = get_peft_model(model, lora_config)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [27]:
def param_count(m):
    """Print and return `(total, trainable)` parameter counts of `m`, both in millions."""
    total = 0
    trainable = 0
    for p in m.parameters():
        n = p.numel()
        total += n
        if p.requires_grad:
            trainable += n
    params = total / 1_000_000
    trainable_params = trainable / 1_000_000
    print(f"Total params: {params:.2f}M, Trainable: {trainable_params:.2f}M")
    return params, trainable_params

params, trainable_params = param_count(model)

Total params: 8365.81M, Trainable: 335.54M


### Optimizer

We setup the standard optimization stuff...

In [28]:
from transformers import get_cosine_schedule_with_warmup

# Adam over the model's parameters, driven by a cosine learning-rate schedule
# whose warmup spans the first tenth of the planned optimizer steps.
planned_steps = config.total_train_steps
optim = torch.optim.Adam(
    model.parameters(),
    lr=config.lr,
    betas=(0.9, 0.99),
    eps=1e-5,
)
scheduler = get_cosine_schedule_with_warmup(
    optim,
    num_warmup_steps=planned_steps // 10,
    num_training_steps=planned_steps,
)

In [29]:
def loss_fn(x, y):
    "Cross entropy over all positions: logits `x` are flattened to 2-D and targets `y` to 1-D."
    vocab_size = x.shape[-1]
    flat_logits = x.view(-1, vocab_size)
    flat_targets = y.view(-1)
    return torch.nn.functional.cross_entropy(flat_logits, flat_targets)

## Testing during training

We are almost there! Let's create a simple function to sample from the model every now and then, so we can visually inspect what the model is outputting.
Let's wrap the `model.generate` method for simplicity. You can grab the default sampling parameters from `GenerationConfig` by passing the corresponding `model_id`. This gives you the defaults for parameters like temperature, top p, etc.


In [30]:
from types import SimpleNamespace
from transformers import GenerationConfig

gen_config = GenerationConfig.from_pretrained(config.model_id)
test_config = SimpleNamespace(
    max_new_tokens=256,
    gen_config=gen_config)

In [31]:
def generate(prompt, max_new_tokens=test_config.max_new_tokens, gen_config=gen_config):
    """Generate a completion for a single prompt and return only the newly generated text.

    Args:
        prompt: The prompt string to complete.
        max_new_tokens: Upper bound on the number of generated tokens.
        gen_config: Generation config forwarded to `model.generate`.

    Returns:
        The decoded continuation, without the prompt and without special tokens.
    """
    # Keep the full tokenizer output (ids and attention mask) and move it to the
    # model's device instead of assuming the default CUDA device.
    inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
    prompt_len = inputs['input_ids'].shape[1]
    with torch.inference_mode():
        # The pad token is set to the EOS token in this notebook, so the mask
        # cannot be inferred from the ids; pass it explicitly.
        output = model.generate(input_ids=inputs['input_ids'],
                                attention_mask=inputs['attention_mask'],
                                max_new_tokens=max_new_tokens,
                                generation_config=gen_config,
                                pad_token_id=tokenizer.eos_token_id)
    # Drop the echoed prompt tokens so only the continuation is decoded.
    return tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True)

LoL 🤷

In [33]:
prompt = eval_dataset[16]["prompt"]
print(prompt + generate(prompt, 128))

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Generate search keywords for searching images/videos from the given paragraph. Consider relevant elements and keywords should capture the essence and context of the paragraph while aligning with the overall meaning of the input. Your output should be a list of python strings, and each string will correspond to a sentence in the input with a search keyword. All should have string values. The length of the list and the number of sentences in the given paragraph should be the same.

### Input:
 The video '7 Days Stranded On An Island' on YouTube shows a group of people surviving in a remote location for a week.   They faced various challenges such as finding food and shelter, and had to rely on their survival skills.   The group managed to build a fire, catch fish for food, and find sources of fresh water.   They al

In [34]:
import wandb
from tqdm.auto import tqdm

def prompt_table(examples, log=False, table_name="predictions"):
    """Generate a completion for each example and collect the results in a W&B table.

    Each example must provide `prompt` and `output`. One row is added per example
    with the prompt, the generation, their concatenation, the reference output and
    the sampling settings taken from `test_config`. When `log` is true the table is
    also logged under `table_name`. The table is returned either way.
    """
    columns = ["prompt", "generation", "concat", "output", "max_new_tokens", "temperature", "top_p"]
    table = wandb.Table(columns=columns)

    token_budget = test_config.max_new_tokens
    sampling = test_config.gen_config

    for example in tqdm(examples, leave=False):
        prompt = example["prompt"]
        reference = example["output"]
        generation = generate(prompt, token_budget, sampling)
        table.add_data(
            prompt,
            generation,
            prompt + generation,
            reference,
            token_budget,
            sampling.temperature,
            sampling.top_p,
        )

    if log:
        wandb.log({table_name: table})
    return table

def to_gpu(tensor_dict):
    """Return a new dict with the same keys whose values have been moved to 'cuda'."""
    moved = {}
    for key, value in tensor_dict.items():
        moved[key] = value.to('cuda')
    return moved

class Accuracy:
    "Running token-level accuracy: fraction of positions where the argmax of the logits equals the label."
    def __init__(self):
        self.count = 0  # number of positions seen so far
        self.tp = 0.    # number of correct positions seen so far
    def update(self, logits, labels):
        "Add one batch to the running totals and return the accuracy of that batch alone."
        # Compare on CPU with both tensors flattened to one dimension.
        preds, labels = logits.argmax(dim=-1).view(-1).cpu(), labels.view(-1).cpu()
        tp = (preds == labels).sum()
        self.count += len(preds)
        self.tp += tp
        return tp / len(preds)
    def compute(self):
        "Return the accuracy over everything seen so far, or 0.0 if nothing has been added yet."
        if self.count == 0:
            # Avoid ZeroDivisionError when compute() is called before any update().
            return 0.0
        return self.tp / self.count

In [35]:
from rouge_score import rouge_scorer

# Initialize the ROUGE scorer for bigrams (ROUGE-2)
scorer = rouge_scorer.RougeScorer(['rouge2'], use_stemmer=True)

def compute_rouge2(predictions, references):
    """Return the mean ROUGE-2 F-measure over paired predictions and references.

    Args:
        predictions: Sequence of predicted strings.
        references: Sequence of reference strings, paired with `predictions` by position.

    Returns:
        The average F-measure as a float; 0.0 when there are no pairs to score.
    """
    # The notebook-level `scorer` expects the reference (target) first, then the prediction.
    rouge2_scores = [
        scorer.score(ref, pred)['rouge2'].fmeasure
        for pred, ref in zip(predictions, references)
    ]
    if not rouge2_scores:
        # Nothing to average; avoid dividing by zero.
        return 0.0
    return sum(rouge2_scores) / len(rouge2_scores)

You can also quickly add validation if you like; the predictions table can also be created at this step:

In [36]:
@torch.no_grad()
def validate():
    """Evaluate on `eval_dataloader`, log averaged metrics to W&B, and log a sample predictions table.

    Logs `eval/loss` (mean over batches), `eval/rouge2_score` (mean over all
    evaluated sequences) and `eval/accuracy` (token accuracy over all positions).
    The model is switched to eval mode for the pass and back to train mode at the end.
    """
    model.eval()
    eval_acc = Accuracy()
    loss_sum, total_steps = 0., 0
    rouge2_sum, n_sequences = 0., 0
    for batch in (pbar := tqdm(eval_dataloader, leave=False)):
        pbar.set_description("doing validation")
        batch = to_gpu(batch)
        total_steps += 1
        with torch.amp.autocast("cuda", dtype=torch.bfloat16):
            out = model(**batch)
            loss_sum += loss_fn(out.logits, batch["labels"]).item()  # you could use out.loss and not shift the dataset
        eval_acc.update(out.logits, batch["labels"])
        predictions = tokenizer.batch_decode(out.logits.argmax(dim=-1), skip_special_tokens=True)
        references = tokenizer.batch_decode(batch["labels"], skip_special_tokens=True)
        # Accumulate ROUGE-2 weighted by batch size, so the logged value covers the
        # whole eval set rather than only the final batch.
        rouge2_sum += compute_rouge2(predictions, references) * len(predictions)
        n_sequences += len(predictions)
    # we log results at the end (skipped when the eval loader yielded no batches)
    if total_steps:
        wandb.log({"eval/loss": loss_sum / total_steps,
                   "eval/rouge2_score": rouge2_sum / max(n_sequences, 1),
                   "eval/accuracy": float(eval_acc.compute())})
    prompt_table(eval_dataset[:config.n_eval_samples], log=True)
    model.train()

Let's define a loop that computes the evaluation metrics and logs a Table with model predictions

## The actual Loop
It's actually nothing fancy, and very short! It has:
- Gradient accumulation and gradient scaling
- sampling and model checkpoint saving (this trains very fast, no need to save multiple checkpoints)
- We compute token accuracy, better metric than loss.

In [None]:
wandb.init(project="data_ft_llama3LoRA", # the project I am working on
           tags=["baseline","8b"],
           job_type="train",
           config=config) # the Hyperparameters I want to keep track of

# Training

acc = Accuracy()
model.train()
train_step = 0  # optimizer updates performed so far
micro_step = 0  # micro-batches processed so far, counted across epochs
for epoch in tqdm(range(config.epochs)):
    for batch in tqdm(train_dataloader):
        batch = to_gpu(batch)
        with torch.amp.autocast("cuda", dtype=torch.bfloat16):
            out = model(**batch)
            # Divide so the accumulated gradient is the average over the micro-batches.
            loss = loss_fn(out.logits, batch["labels"]) / config.gradient_accumulation_steps  # you could use out.loss and not shift the dataset
        # backward() belongs outside the autocast region (see the PyTorch AMP docs).
        loss.backward()
        micro_step += 1
        # Update only after a full group of micro-batches has been accumulated.
        # Counting across epochs (instead of testing the per-epoch index against 0)
        # avoids an update after the very first micro-batch and keeps the number of
        # updates equal to config.total_train_steps, which the scheduler was built for.
        # A partial group left at an epoch boundary is completed in the next epoch.
        if micro_step % config.gradient_accumulation_steps == 0:
            # Convert logits to predictions and references to text
            predictions = tokenizer.batch_decode(out.logits.argmax(dim=-1), skip_special_tokens=True)
            references = tokenizer.batch_decode(batch["labels"], skip_special_tokens=True)

            # Compute ROUGE-2 score and token accuracy on the latest micro-batch
            rouge2_score = compute_rouge2(predictions, references)
            batch_accuracy = float(acc.update(out.logits, batch["labels"]))
            # we can log the metrics to W&B
            wandb.log({"train/loss": loss.item() * config.gradient_accumulation_steps,
                       "train/rouge2_score": rouge2_score,
                       "train/accuracy": batch_accuracy,
                       "train/learning_rate": scheduler.get_last_lr()[0],
                       "train/global_step": train_step})
            optim.step()
            scheduler.step()
            optim.zero_grad(set_to_none=True)
            train_step += 1
    validate()

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/795 [00:00<?, ?it/s]

In [None]:
# we save the model checkpoint at the end
#save_model(model, model_name=config.model_id.replace("/", "_"), models_folder="models/", log=config.log_model)
    
wandb.finish()

This trains in around 60 minutes on an A100. 

## Full Eval Dataset evaluation

Let's log a table with model predictions on the eval_dataset (or at least the first 250 samples)

In [None]:
with wandb.init(project="data_ft_llama3LoRA", # the project I am working on
           job_type="eval",
           config=config): # the Hyperparameters I want to keep track of
    model.eval();
    prompt_table(eval_dataset[:250], log=True, table_name="eval_predictions")

In [None]:
model.save_pretrained("fine_tuned/data_new_ft_llama3QLoRA")
tokenizer.save_pretrained('fine_tuned/data_new_ft_llama3QLoRA')

In [None]:
# Convert a Hugging Face model directory to an f16 GGUF file named "llama3Q-ft.gguf".
# NOTE(review): this reads `fine_tuned/data_new_ft_llama3QLoRA_full`, but the only save shown in
# this notebook writes to `fine_tuned/data_new_ft_llama3QLoRA`. Confirm where the `_full`
# directory is produced.
!python llama.cpp/convert-hf-to-gguf.py fine_tuned/data_new_ft_llama3QLoRA_full --outfile "llama3Q-ft.gguf" --outtype f16

In [None]:
!./llama.cpp/ggml-quants.h "phi3-ft.gguf" "phi3-ft-Q5_K_M.gguf" "q5_k_m"