In [18]:
!pip install -qU datasets trl bitsandbytes accelerate

## Importing Libraries

In [19]:
import torch
import pandas as pd
from datasets import Dataset, load_dataset, load_from_disk

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model,
    PeftModel
)
from trl import SFTTrainer, SFTConfig

from google.colab import drive, userdata
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
from huggingface_hub import login
HUGGINGFACE_API_KEY = userdata.get('HUGGINGFACE_API_KEY')
login(token=HUGGINGFACE_API_KEY)

In [21]:
model_id = "meta-llama/Llama-2-7b-chat-hf"
device = "cuda" if torch.cuda.is_available() else "cpu"

## Model Loading

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto"
)

In [None]:
print(f"Model loaded on {model.device}")

## Model Testing

In [7]:
answering_prompt = [
    {
        "role": "system",
        "content": "You are a helpful AI assistant specialized in brainstorming tasks."
    },
     {
        "role": "user",
        "content": "Why do potato chip bags become stale after opening?"
    }
]

In [8]:
text = tokenizer.apply_chat_template(
    answering_prompt,
    tokenize=False,
    add_generation_prompt=True
)

model_inputs = tokenizer([text], return_tensors="pt").to(device)

generated_ids = model.generate(
    model_inputs.input_ids,
    max_new_tokens=1024,
    do_sample=False, top_k=None, temperature=None, top_p=None,
)

generated_ids = [
    output_ids[len(input_ids):]
    for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

In [9]:
print(response)

 Ah, a fascinating question! 😊 Potato chip bags can become stale after opening due to several reasons. Here are some possible explanations:

1. Oxidation: When you open a bag of potato chips, the air inside the bag is exposed to oxygen in the atmosphere. This can cause the fats and oils in the chips to oxidize, leading to a stale or rancid taste.
2. Moisture absorption: When you open a bag of chips, the air inside the bag can absorb moisture from the surrounding environment. This can cause the chips to become soft and stale more quickly.
3. Enzymatic activity: Some bags of potato chips may contain enzymes that are designed to help preserve the chips' freshness. However, these enzymes can also contribute to the staling process if they are not properly inhibited.
4. Lack of preservatives: Many potato chip bags do not contain preservatives to prevent spoilage. Without these preservatives, the chips can become stale more quickly.
5. Temperature and humidity: The temperature and humidity of

In [10]:
print(text)

<s>[INST] <<SYS>>
You are a helpful AI assistant specialized in brainstorming tasks.
<</SYS>>

Why do potato chip bags become stale after opening? [/INST]


## Dataset Preprocessing

In [11]:
ds = load_dataset("databricks/databricks-dolly-15k")
ds

README.md:   0%|          | 0.00/8.20k [00:00<?, ?B/s]

databricks-dolly-15k.jsonl:   0%|          | 0.00/13.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/15011 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['instruction', 'context', 'response', 'category'],
        num_rows: 15011
    })
})

In [12]:
df = ds["train"].to_pandas()
df.head()

Unnamed: 0,instruction,context,response,category
0,When did Virgin Australia start operating?,"Virgin Australia, the trading name of Virgin A...",Virgin Australia commenced services on 31 Augu...,closed_qa
1,Which is a species of fish? Tope or Rope,,Tope,classification
2,Why can camels survive for long without water?,,Camels use the fat in their humps to keep them...,open_qa
3,"Alice's parents have three daughters: Amy, Jes...",,The name of the third daughter is Alice,open_qa
4,When was Tomoaki Komorida born?,Komorida was born in Kumamoto Prefecture on Ju...,"Tomoaki Komorida was born on July 10,1981.",closed_qa


In [13]:
def data_format(row):
    content = row['instruction']
    if pd.notna(row['context']) and row['context'].strip():
        content = f"{row['instruction']}\n\nContext: {row['context']}"

    formatting_prompt = [
        {
            "role": "system",
            "content": f"You are a helpful AI assistant specialized in {row['category']} tasks."
        },
        {
            "role": "user",
            "content": content
        },
        {
            "role": "assistant",
            "content": row['response']
        }
    ]

    text = tokenizer.apply_chat_template(
        formatting_prompt,
        tokenize=False,
        add_generation_prompt=True
    )

    return text

In [14]:
df["text"] = df.apply(data_format, axis=1)

In [15]:
df = df[["text"]]
df

Unnamed: 0,text
0,<s>[INST] <<SYS>>\nYou are a helpful AI assist...
1,<s>[INST] <<SYS>>\nYou are a helpful AI assist...
2,<s>[INST] <<SYS>>\nYou are a helpful AI assist...
3,<s>[INST] <<SYS>>\nYou are a helpful AI assist...
4,<s>[INST] <<SYS>>\nYou are a helpful AI assist...
...,...
15006,<s>[INST] <<SYS>>\nYou are a helpful AI assist...
15007,<s>[INST] <<SYS>>\nYou are a helpful AI assist...
15008,<s>[INST] <<SYS>>\nYou are a helpful AI assist...
15009,<s>[INST] <<SYS>>\nYou are a helpful AI assist...


In [16]:
df['text'][0]

"<s>[INST] <<SYS>>\nYou are a helpful AI assistant specialized in closed_qa tasks.\n<</SYS>>\n\nWhen did Virgin Australia start operating?\n\nContext: Virgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based airline. It is the largest airline by fleet size to use the Virgin brand. It commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route. It suddenly found itself as a major airline in Australia's domestic market after the collapse of Ansett Australia in September 2001. The airline has since grown to directly serve 32 cities in Australia, from hubs in Brisbane, Melbourne and Sydney. [/INST] Virgin Australia commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route. </s>"

In [17]:
df['text'][5]

"<s>[INST] <<SYS>>\nYou are a helpful AI assistant specialized in information_extraction tasks.\n<</SYS>>\n\nIf I have more pieces at the time of stalemate, have I won?\n\nContext: Stalemate is a situation in chess where the player whose turn it is to move is not in check and has no legal move. Stalemate results in a draw. During the endgame, stalemate is a resource that can enable the player with the inferior position to draw the game rather than lose. In more complex positions, stalemate is much rarer, usually taking the form of a swindle that succeeds only if the superior side is inattentive.[citation needed] Stalemate is also a common theme in endgame studies and other chess problems.\n\nThe outcome of a stalemate was standardized as a draw in the 19th century. Before this standardization, its treatment varied widely, including being deemed a win for the stalemating player, a half-win for that player, or a loss for that player; not being permitted; and resulting in the stalemated p

In [18]:
df_path = "/content/drive/MyDrive/llama2-finetune/llama2_qa.csv"
df.to_csv(df_path, index=False)

In [19]:
ds = Dataset.from_pandas(df)
ds

Dataset({
    features: ['text'],
    num_rows: 15011
})

In [20]:
ds_path = "/content/drive/MyDrive/llama2-finetune/llama2-qa-ds/"
ds.save_to_disk(ds_path)

Saving the dataset (0/1 shards):   0%|          | 0/15011 [00:00<?, ? examples/s]

## Finetuning

In [5]:
ds_path = "/content/drive/MyDrive/llama2-finetune/llama2-qa-ds/"
ds = load_from_disk(ds_path)
ds

Dataset({
    features: ['text'],
    num_rows: 15011
})

In [6]:
ds = ds.map(lambda x: {"length": len(x["text"])})

In [7]:
ds = ds.sort("length")
ds = ds.select(range(10000))

In [8]:
ds = ds.remove_columns(["length"])
ds

Dataset({
    features: ['text'],
    num_rows: 10000
})

In [22]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

In [10]:
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [11]:
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.float16,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [12]:
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

In [13]:
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

In [17]:
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/llama2-finetune/model-checkpoints",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=True,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    report_to="tensorboard",
)

In [18]:
trainer = SFTTrainer(
    model=model,
    train_dataset=ds,
    args=training_args,
    peft_config=lora_config
)

Tokenizing train dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
25,2.4265
50,1.136
75,1.2947
100,0.9325
125,1.2886
150,0.8942
175,1.2749
200,0.8763
225,1.2756
250,0.8228


## LoRA Model

In [None]:
lora_checkpoint = "/content/drive/MyDrive/llama2-finetune/model-checkpoints/checkpoint-500"

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

In [None]:
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.float16,
)

In [None]:
model = PeftModel.from_pretrained(
    model,
    lora_checkpoint
)

In [None]:
model = model.merge_and_unload()

In [None]:
answering_prompt = [
    {
        "role": "system",
        "content": "You are a helpful AI assistant specialized in open_qa tasks."
    },
     {
        "role": "user",
        "content": "What makes a formula one car so fast?"
    }
]

In [None]:
text = tokenizer.apply_chat_template(
    answering_prompt,
    tokenize=False,
    add_generation_prompt=True
)

model_inputs = tokenizer([text], return_tensors="pt").to(device)

generated_ids = model.generate(
    model_inputs.input_ids,
    max_new_tokens=1024,
    do_sample=False, top_k=None, temperature=None, top_p=None,
)

generated_ids = [
    output_ids[len(input_ids):]
    for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

In [None]:
print(response)