In [19]:
DATASET_NAME = "TRnlp/MixSub"
MODEL_NAME = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit"
TRAINED_MODEL_NAME = "Llama-3.2-1B-Instruct-bnb-4bit-MixSub"
TRAINED_MODEL_REPO = f"AdityaMayukhSom/{TRAINED_MODEL_NAME}"
MAX_SEQ_LEN = 2048
LOAD_IN_4BIT = True
DTYPE = None

In [20]:
from huggingface_hub import login, create_repo
from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("HF_TOKEN")
login(token=hf_token)
# create_repo(TRAINED_MODEL_REPO)

In [21]:
import os
import pandas as pd
from pathlib import Path
from datasets import load_dataset

In [22]:
from packaging.version import Version as V

try:
    import torch
    from torch.version import cuda
except Exception as e:
    raise ImportError("Install torch via `pip install torch`")

v = V(torch.__version__)
is_ampere = torch.cuda.get_device_capability()[0] >= 8
xformers = "xformers==0.0.27" if v < V("2.4.0") else "xformers"
device = "cuda" if torch.cuda.is_available() else "cpu"

if cuda != "12.1" and cuda != "11.8" and cuda != "12.4":
    raise RuntimeError(f"CUDA = {cuda} not supported!")
if   v <= V('2.1.0'):
    raise RuntimeError(f"Torch = {v} too old!")
elif v <= V('2.1.1'):
    x = 'cu{}{}-torch211'
elif v <= V('2.1.2'):
    x = 'cu{}{}-torch212'
elif v  < V('2.3.0'):
    x = 'cu{}{}-torch220'
elif v  < V('2.4.0'):
    x = 'cu{}{}-torch230'
elif v  < V('2.5.0'):
    x = 'cu{}{}-torch240'
elif v  < V('2.6.0'):
    x = 'cu{}{}-torch250'
else:
    raise RuntimeError(f"Torch = {v} too new!")

x = x.format(cuda.replace(".", ""), "-ampere" if is_ampere else "")
print(f'pip install --upgrade pip && pip install "unsloth[{x}] @ git+https://github.com/unslothai/unsloth.git"')

pip install --upgrade pip && pip install "unsloth[cu121-torch250] @ git+https://github.com/unslothai/unsloth.git"


In [23]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install --upgrade pip
!pip install --no-deps {xformers} trl peft accelerate bitsandbytes triton
!pip install "unsloth[cu124-torch250] @ git+https://github.com/unslothai/unsloth.git"

### **Reference Links For Fine Tunning Llama 3.2 With Unsloth** 

1. [Fine-tuning Llama 3.2 Using Unsloth](https://www.kdnuggets.com/fine-tuning-llama-using-unsloth)
2. [Fine-tuning Llama 3 with Unsloth: A Beginner’s Guide](https://medium.com/@seekmeai/fine-tuning-llama-3-with-unsloth-a-beginners-guide-d239d48eaf71)

In [24]:
from unsloth import FastLanguageModel

fast_language_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_NAME,
    max_seq_length = MAX_SEQ_LEN,
    dtype = DTYPE,
    load_in_4bit = LOAD_IN_4BIT
)

==((====))==  Unsloth 2025.3.9: Fast Llama patching. Transformers: 4.49.0.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.03G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.7k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

In [25]:
model = FastLanguageModel.get_peft_model(
    fast_language_model, 
    r = 16,
    target_modules = [
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "up_proj",
        "down_proj",
        "gate_proj",
    ],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 69,
    use_rslora = False,
    loftq_config = None,
)

Unsloth 2025.3.9 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


In [26]:
import os
from pathlib import Path
from datasets import load_dataset, load_from_disk, Dataset

# https://huggingface.co/docs/datasets/en/loading#hugging-face-hub
dataset = load_dataset(DATASET_NAME)
# Changing all the column names to have uniform singular forms
dataset = dataset.rename_column("Highlights", "Highlight")

# Only select 10 for training, 4 for testing, if everything goes well, 
# we can fine tune on a larger dataset, this if for easier handling only
train_dataset = dataset["train"].select(range(10))
eval_dataset = dataset["test"].select(range(4))

# Check train dataset before appending 'Prompt' column
# train_dataset.to_pandas().head()
# eval_dataset.to_pandas().head()

In [27]:
INSTRUCTIONS = """
You are instructed to generate a scientifically accurate highlight of the provided passage without additional 
sentences such as headings or introductions before or after the generated text as it will be used as summary 
in a custom dataset. The highlight should sound plausible and should not contain incorrect information. Generate 
3-5 concise highlight points from the provided research paper abstract, covering key contributions, methods and 
outcomes. Each point should contain 10 to 15 words only. Return the points in plain text format without bullets.

No Additional Commentary: Exclude lines like "Here are 3-5 concise highlight points".
"""

EOS_TOKEN = tokenizer.eos_token

def format_abstract_highlight_as_prompt(examples: list):  
    prompts: list[str] = []

    abstracts = examples["Abstract"]
    highlights = examples['Highlight']
    
    for abstract, highlight in zip(abstracts, highlights):
        row_json = [
            {"role": "system", "content": INSTRUCTIONS},
            {"role": "user", "content": abstract},
            # Must add EOS_TOKEN, otherwise your generation will go on forever!
            {"role": "assistant", "content": highlight + EOS_TOKEN}
        ]
        
        prompt = tokenizer.apply_chat_template(
            row_json, 
            tokenize=False, 
            add_generation_prompt=False,
            return_tensors="pt"
        )

        prompts.append(prompt)
        
    return { 
        "Prompt": prompts,
    }

In [28]:
# Append Prompt column on which the model will be trained

train_dataset = train_dataset.map(format_abstract_highlight_as_prompt, batched=True)
eval_dataset = eval_dataset.map(format_abstract_highlight_as_prompt, batched=True) 

# Check train dataset after adding 'Prompt' column
# train_dataset.to_pandas().head()
# eval_dataset.to_pandas().head()

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

In [29]:
train_dataset[0]['Prompt']

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 10 Mar 2025\n\nYou are instructed to generate a scientifically accurate highlight of the provided passage without additional \nsentences such as headings or introductions before or after the generated text as it will be used as summary \nin a custom dataset. The highlight should sound plausible and should not contain incorrect information. Generate \n3-5 concise highlight points from the provided research paper abstract, covering key contributions, methods and \noutcomes. Each point should contain 10 to 15 words only. Return the points in plain text format without bullets.\n\nNo Additional Commentary: Exclude lines like "Here are 3-5 concise highlight points".<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nRecent field data analyses have shown that lumbar spine fractures occurred more frequently in late model vehicles than the early ones in frontal crashes . Therefor

In [30]:
from trl import SFTTrainer
from unsloth import is_bfloat16_supported
from unsloth.chat_templates import train_on_responses_only
from transformers import TrainingArguments, DataCollatorForSeq2Seq

In [31]:
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset = eval_dataset,
    dataset_text_field = "Prompt",
    max_seq_length = MAX_SEQ_LEN,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    dataset_num_proc = 2,
    packing = False,
    args = TrainingArguments(
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        gradient_accumulation_steps=4,
        eval_strategy="steps",
        eval_steps=0.2,
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 60,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = TRAINED_MODEL_NAME,
        report_to = "none",
    )
)

Unsloth: We found double BOS tokens - we shall remove one automatically.


Tokenizing to ["Prompt"] (num_proc=2):   0%|          | 0/10 [00:00<?, ? examples/s]

Unsloth: We found double BOS tokens - we shall remove one automatically.


Tokenizing to ["Prompt"] (num_proc=2):   0%|          | 0/4 [00:00<?, ? examples/s]

In [32]:
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|start_header_id|>system<|end_header_id|>",
    response_part = "<|start_header_id|>assistant<|end_header_id|>",
)

Map (num_proc=4):   0%|          | 0/10 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4 [00:00<?, ? examples/s]

In [33]:
# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.741 GB.
4.602 GB of memory reserved.


In [34]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 10 | Num Epochs = 60 | Total steps = 60
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 11,272,192/760,547,328 (1.48% trained)


Step,Training Loss,Validation Loss


In [35]:
# @title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

147.048 seconds used for training.
2.45 minutes used for training.
Peak reserved memory = 4.602 GB.
Peak reserved memory for training = 0.0 GB.
Peak reserved memory % of max memory = 31.219 %.
Peak reserved memory for training % of max memory = 0.0 %.


In [36]:
trainer.push_to_hub(
    commit_message="first epoch fine tuning on mixsub",
    model_name=TRAINED_MODEL_NAME,
    # language="en",
    # finetuned_from=MODEL_NAME,
    # dataset=DATASET_NAME
)

training_args.bin:   0%|          | 0.00/5.62k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/45.1M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/AdityaMayukhSom/Llama-3.2-1B-Instruct-bnb-4bit-MixSub/commit/e59d4f4c59cc1b049a5b050ad3d6150f7bb0507b', commit_message='first epoch fine tuning on mixsub', commit_description='', oid='e59d4f4c59cc1b049a5b050ad3d6150f7bb0507b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/AdityaMayukhSom/Llama-3.2-1B-Instruct-bnb-4bit-MixSub', endpoint='https://huggingface.co', repo_type='model', repo_id='AdityaMayukhSom/Llama-3.2-1B-Instruct-bnb-4bit-MixSub'), pr_revision=None, pr_num=None)