In [1]:
%%capture
%pip install -U transformers accelerate

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, TextStreamer
import torch


base_model = "/kaggle/input/llama-3.2/transformers/3b-instruct/1"

tokenizer = AutoTokenizer.from_pretrained(base_model)

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    return_dict=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
#Setting pad_token_id to avoid receiving messages
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
if model.config.pad_token_id is None:
    model.config.pad_token_id = model.config.eos_token_id

In [4]:
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
)

**Beginning fine tune model**

In [5]:
%%capture
%pip install -U transformers 
%pip install -U datasets 
%pip install -U accelerate 
%pip install -U peft 
%pip install -U trl 
%pip install -U bitsandbytes 
%pip install -U wandb

In [6]:
#Load the Python packages and functions we will use throughout the fine-tuning and evaluation process
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch, wandb
from datasets import load_dataset
from trl import SFTTrainer, setup_chat_format

In [40]:
from huggingface_hub import login


hf_token = "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"
login(token = hf_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [8]:
# wandb api key - c1654cbca8f17919e30147f14e109f52673945e9


In [9]:
wb_token = "c1654cbca8f17919e30147f14e109f52673945e9"

wandb.login(key=wb_token)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [10]:
# PROJECT name- Fine-tune-LLAMA3.2
run = wandb.init(
    project='Fine-tune-LLAMA3.2', 
    job_type="training", 
    anonymous="allow"
)

[34m[1mwandb[0m: Currently logged in as: [33mdyssjsnke[0m ([33mdyssjsnke-panjab-univeristy[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [11]:
base_model = "/kaggle/input/llama-3.2/transformers/3b-instruct/1"
new_model = "llama-3.2-3b-uj712"

In [12]:
dataset_name = load_dataset("Ujjwal671021/jac-chandigarh-information-brochure")

README.md:   0%|          | 0.00/368 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/145k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/48.4k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1300 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/422 [00:00<?, ? examples/s]

In [13]:
dataset_name

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 1300
    })
    test: Dataset({
        features: ['text'],
        num_rows: 422
    })
})

In [14]:
# Set torch dtype and attention implementation
if torch.cuda.get_device_capability()[0] >= 8:
    !pip install -qqq flash-attn
    torch_dtype = torch.bfloat16
    attn_implementation = "flash_attention_2"
else:
    torch_dtype = torch.float16
    attn_implementation = "eager"

In [15]:
# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)
# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [16]:
# %pip install -U bitsandbytes
dataset_name


DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 1300
    })
    test: Dataset({
        features: ['text'],
        num_rows: 422
    })
})

In [18]:
instruction = "You are a helpful assistant. Format the following text for chat."

def format_chat_template(row):
    row_json = [
        {"role": "system", "content": instruction},
        {"role": "user", "content": row["text"]},  # Assuming "text" field exists in the dataset
        {"role": "assistant", "content": "Your response here."}  # Replace this with an appropriate response logic
    ]
    
    # Apply the chat template to the "text" field
    row["text"] = tokenizer.apply_chat_template(row_json, tokenize=False)
    return row

# Process the dataset with multiprocessing
dataset_name = dataset_name.map(
    format_chat_template,
    num_proc=4,
)

  self.pid = os.fork()


Map (num_proc=4):   0%|          | 0/1300 [00:00<?, ? examples/s]

  self.pid = os.fork()


Map (num_proc=4):   0%|          | 0/422 [00:00<?, ? examples/s]

In [19]:
torch_dtype = torch.float16
attn_implementation = "eager"

In [20]:
# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)

# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [22]:
from transformers import AutoTokenizer

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model)

# Reset chat template to None if it already exists
if hasattr(tokenizer, "chat_template") and tokenizer.chat_template is not None:
    tokenizer.chat_template = None  # Reset the chat template

# Set up the model and tokenizer with the desired chat format
model, tokenizer = setup_chat_format(model, tokenizer)

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [26]:
dataset_name

KeyError: 'text'

In [27]:
import bitsandbytes as bnb

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16 bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

modules = find_all_linear_names(model)

In [29]:
# Define LoRA configuration
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules,
)

# Reset chat_template if it exists
if hasattr(tokenizer, "chat_template") and tokenizer.chat_template is not None:
    tokenizer.chat_template = None  # Reset the chat template

# Set up the chat format
model, tokenizer = setup_chat_format(model, tokenizer)

# Apply LoRA configuration
model = get_peft_model(model, peft_config)

In [30]:
#Hyperparamter
training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=1,
    eval_strategy="steps",
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="wandb"
)

In [32]:
# Setting sft parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset_name["train"],
    eval_dataset=dataset_name["test"],
    peft_config=peft_config,
    max_seq_length= 512,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/1300 [00:00<?, ? examples/s]

Map:   0%|          | 0/422 [00:00<?, ? examples/s]

In [33]:
trainer.train()



Step,Training Loss,Validation Loss
130,0.8226,0.969285
260,1.0976,0.934418
390,0.4936,0.914597
520,0.8459,0.897863
650,0.2729,0.889605


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


TrainOutput(global_step=650, training_loss=0.9297352568919842, metrics={'train_runtime': 876.1505, 'train_samples_per_second': 1.484, 'train_steps_per_second': 0.742, 'total_flos': 1988118613991424.0, 'train_loss': 0.9297352568919842, 'epoch': 1.0})

In [34]:
wandb.finish()

0,1
eval/loss,█▅▃▂▁
eval/runtime,█▇▂▁▃
eval/samples_per_second,▁▂▇█▆
eval/steps_per_second,▁▂▇█▆
train/epoch,▁▁▁▁▁▂▂▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇█
train/global_step,▁▁▁▂▂▂▂▂▃▃▃▄▄▄▄▄▄▄▄▄▄▄▄▄▄▅▅▅▆▆▆▆▇▇▇▇▇███
train/grad_norm,▆▇█▁▂▃▂▃▇▁▃▃▂▃▃▃▅▃▃▄▄▃▃▃▄▂▁▂▂▂▃▁▂▂▃▃▁▅▄▂
train/learning_rate,▂▃███▇▇▇▇▇▇▇▆▆▆▅▅▅▄▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▁▁▁
train/loss,█▂▃▁▃▂▂▂▂▂▂▃▂▃▂▂▂▂▁▂▃▂▃▃▃▃▁▁▂▁▂▃▃▂▁▃▂▂▂▂

0,1
eval/loss,0.8896
eval/runtime,67.3476
eval/samples_per_second,6.266
eval/steps_per_second,6.266
total_flos,1988118613991424.0
train/epoch,1.0
train/global_step,650.0
train/grad_norm,0.62568
train/learning_rate,0.0
train/loss,0.2729


In [35]:
messages = [{"role": "system", "content": instruction},
    {"role": "user", "content": "What is full form of CCET College"}]

prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    
inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to("cuda")

outputs = model.generate(**inputs, max_new_tokens=150, num_return_sequences=1)

text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(text.split("assistant")[1])

. Format the following text for chat.
user
What is full form of CCET College



In [41]:
trainer.model.save_pretrained(new_model)
trainer.model.push_to_hub(new_model, use_temp_dir=False)

adapter_model.safetensors:   0%|          | 0.00/1.67G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Ujjwal671021/llama-3.2-3b-uj712/commit/c9fc6834a3b34be149391f128dae13e4020017e5', commit_message='Upload model', commit_description='', oid='c9fc6834a3b34be149391f128dae13e4020017e5', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Ujjwal671021/llama-3.2-3b-uj712', endpoint='https://huggingface.co', repo_type='model', repo_id='Ujjwal671021/llama-3.2-3b-uj712'), pr_revision=None, pr_num=None)