In [1]:
from unsloth import FastLanguageModel
import torch
 


# --- Model loading configuration (reused by the reload cell further down) ---
max_seq_length = 2048  # Any value works; RoPE scaling is handled internally.
dtype = None  # None = auto-detect. Float16 for Tesla T4/V100, Bfloat16 for Ampere+.
load_in_4bit = True  # 4-bit quantization to cut memory usage; may be set to False.

# Reference list of pre-quantized 4-bit checkpoints (faster download, no OOMs).
# It is informational only: the from_pretrained call below names its model directly.
fourbit_models = [
    "unsloth/mistral-7b-v0.3-bnb-4bit",  # New Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/llama-3-8b-bnb-4bit",  # Llama-3 15 trillion tokens model 2x faster!
    "unsloth/llama-3-8b-Instruct-bnb-4bit",
    "unsloth/llama-3-70b-bnb-4bit",
    "unsloth/Phi-3-mini-4k-instruct",  # Phi-3 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",  # Gemma 2.2x faster!
]  # More models at https://huggingface.co/unsloth

# Load the base model and its tokenizer with the settings above.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-3B-Instruct",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    # token="hf_...",  # only needed for gated models like meta-llama/Llama-2-7b-hf
)


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2024.11.10: Fast Llama patching. Transformers:4.46.3.
   \\   /|    GPU: NVIDIA RTX A4000. Max memory: 15.992 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [2]:
# Attach LoRA adapters to the attention and MLP projection layers of the base model.
# Note: this rebinds `model`, so re-running the cell wraps the already-wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank: any number > 0; suggested 8, 16, 32, 64, 128
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,  # any value is supported, but 0 is the optimized path
    bias="none",  # any value is supported, but "none" is the optimized path
    # "unsloth" checkpointing uses 30% less VRAM and fits 2x larger batch sizes;
    # use True or "unsloth" for very long context.
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,  # rank-stabilized LoRA is supported
    loftq_config=None,  # LoftQ is supported as well
)

Unsloth 2024.11.10 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [3]:
import os

from datasets import load_dataset

# Location of the JSONL training set. The default is the original machine-specific
# path; set the TRAIN_DATASET_PATH environment variable to run the notebook on
# another machine without editing this cell.
file_path = os.environ.get(
    "TRAIN_DATASET_PATH",
    "/mnt/c/Users/IoT_lab_YU/dengyiliu/llama3/llama3.2finetune/llama3.2/data/train_dataset.jsonl",
)

# Alternative source kept for reference:
# dataset = load_dataset("databricks/databricks-dolly-15k", split="train")
dataset = load_dataset("json", data_files=file_path, split="train")

# Show the available columns; later cells rely on instruction/context/response.
print(dataset.column_names)

['instruction', 'context', 'response', 'category']


In [4]:
from unsloth import to_sharegpt

# Merge the `instruction` and `context` columns into a single prompt and use the
# `response` column as the target, producing a ShareGPT-style dataset.
# NOTE(review): the [[...]] part of merged_prompt is presumably an optional section
# that is dropped when `context` is empty — confirm against the unsloth docs.
dataset = to_sharegpt(
    dataset,
    merged_prompt="{instruction}[[\nYour input is:\n{context}]]",
    output_column_name="response",
    conversation_extension=3,  # raise this to handle longer conversations
)

In [5]:
from unsloth import standardize_sharegpt
# Pass the dataset produced by to_sharegpt through unsloth's standardize_sharegpt;
# the result replaces `dataset` and is what apply_chat_template receives next.
dataset = standardize_sharegpt(dataset)

In [6]:
from unsloth import apply_chat_template

# Prompt layout used for training. {INPUT} and {OUTPUT} are the placeholders that
# apply_chat_template fills in; the wording must stay exactly as written here.
chat_template = """Below are some instructions that describe some tasks. Write responses that appropriately complete each request.

### Instruction:
{INPUT}

### Response:
{OUTPUT}"""

# Render every conversation with the template above (the trainer reads the "text" field).
dataset = apply_chat_template(
    dataset,
    tokenizer=tokenizer,
    chat_template=chat_template,
    # default_system_message="You are a helpful assistant",  # optional
)

Unsloth: We automatically added an EOS token to stop endless generations.
Map: 100%|██████████| 12008/12008 [00:00<00:00, 12810.99 examples/s]


In [7]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Decide the mixed-precision mode once: bf16 where the GPU supports it, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

# Supervised fine-tuning setup.
# Training length is controlled by `max_steps` only. The previous configuration also
# passed num_train_epochs = 200, which the trainer ignores whenever max_steps is set
# (it logs "max_steps is given, it will override any value given in num_train_epochs"),
# so the contradictory argument has been dropped. To train by epochs instead, remove
# `max_steps` and set `num_train_epochs`.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 4, # effective batch size = 1 * 4 = 4
        warmup_steps = 5,
        max_steps = 200, # hard cap on optimizer steps
        learning_rate = 2e-4,
        fp16 = not use_bf16,
        bf16 = use_bf16,
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

Map (num_proc=2): 100%|██████████| 12008/12008 [00:10<00:00, 1119.49 examples/s]
max_steps is given, it will override any value given in num_train_epochs


In [8]:
# Run fine-tuning with the trainer configured above and keep whatever train()
# returns in `trainer_stats`.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 12,008 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 4
\        /    Total batch size = 4 | Total steps = 200
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
1,2.347
2,2.4284
3,2.5746
4,2.6107
5,2.1117
6,2.1381
7,1.8118
8,2.1601
9,1.7478
10,2.3649


In [9]:
# Write the fine-tuned model and the tokenizer to local directories.
# NOTE(review): the directory names are spelled "Instructe"/"instructe"; the reload
# cell uses the same checkpoint path, so keep both in sync if this is ever renamed.
# NOTE(review): the tokenizer is saved under a different directory tree than the
# model, while the reload cell only reads the checkpoint directory — confirm that
# this split is intended.
model.save_pretrained("../../checkpoints/Llama-3.2-3B-Instructe") # Local saving
tokenizer.save_pretrained("../../tokenizer/lora_llama3b-instructe")

('../../tokenizer/lora_llama3b-instructe/tokenizer_config.json',
 '../../tokenizer/lora_llama3b-instructe/special_tokens_map.json',
 '../../tokenizer/lora_llama3b-instructe/tokenizer.json')

In [10]:
from unsloth import FastLanguageModel

# Reload the model from the checkpoint directory written by the save cell, reusing
# the loading settings (max_seq_length, dtype, load_in_4bit) defined in the first cell.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="../../checkpoints/Llama-3.2-3B-Instructe",  # the model used for training
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
# Enable unsloth's native 2x faster inference mode.
FastLanguageModel.for_inference(model)

==((====))==  Unsloth 2024.11.10: Fast Llama patching. Transformers:4.46.3.
   \\   /|    GPU: NVIDIA RTX A4000. Max memory: 15.992 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 3072, padding_idx=128004)
        (layers): ModuleList(
          (0-27): 28 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3072, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lor

In [None]:
import json
import evaluate
from bert_score import score as bertscore
from sentence_transformers import SentenceTransformer, util

# Load evaluation metrics via the `evaluate` library.
# bleu_metric and rouge_metric are used by evaluate_predictions below.
# NOTE(review): accuracy_metric is not referenced anywhere in the visible code —
# confirm it is still needed before keeping the extra load.
bleu_metric = evaluate.load("bleu")
accuracy_metric = evaluate.load("accuracy")
rouge_metric = evaluate.load("rouge")

# Load predictions from your JSON file
def load_predictions(file_path):
    """Read a JSON file and return its parsed content.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The deserialized JSON value, exactly as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON is UTF-8 by specification; pass the encoding explicitly so the result
    # does not depend on the platform's locale default.
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)

# Function to compute contextual appropriateness (embedding-based similarity)
def compute_contextual_appropriateness(ground_truth, responses, model_name="all-MiniLM-L6-v2"):
    """Average the cosine similarity of paired ground-truth/response embeddings.

    Both inputs are encoded with a ``SentenceTransformer`` built from
    ``model_name``; the i-th ground truth is compared with the i-th response.

    Args:
        ground_truth: Reference texts.
        responses: Generated texts, aligned by position with ``ground_truth``.
        model_name: Name of the sentence-embedding model to load.

    Returns:
        The mean pairwise cosine similarity as a float, or ``0.0`` when there
        are no pairs. Pairing uses ``zip``, so surplus items in the longer
        input are ignored.
    """
    encoder = SentenceTransformer(model_name)

    # Encode each side in one batch.
    truth_vectors = encoder.encode(ground_truth, convert_to_tensor=True)
    reply_vectors = encoder.encode(responses, convert_to_tensor=True)

    # One scalar similarity per aligned pair.
    pair_scores = [
        util.cos_sim(truth_vec, reply_vec).item()
        for truth_vec, reply_vec in zip(truth_vectors, reply_vectors)
    ]

    if not pair_scores:
        return 0.0
    return sum(pair_scores) / len(pair_scores)

# Function to run evaluation
def evaluate_predictions(predictions):
    """Compute and print text-generation metrics for a set of predictions.

    Reports BLEU, exact-match accuracy, ROUGE, BERTScore and the average
    embedding cosine similarity between predictions and ground truth.

    Args:
        predictions: Iterable of mappings, each with a ``"predicted_response"``
            entry (model output) and a ``"response"`` entry (ground truth).
            The ground truth may be a plain string or a list/tuple of
            reference strings. In the latter case the first element is used
            for exact match, ROUGE, BERTScore and the embedding similarity;
            BLEU always receives the references unchanged.

    Raises:
        ValueError: If ``predictions`` contains no samples.
    """
    predicted_outputs = []
    references = []

    for sample in predictions:
        # Collect predicted and reference responses
        predicted_outputs.append(sample["predicted_response"])
        references.append(sample["response"])  # Ground truth is in the 'response' field

    if not references:
        # Fail early with a clear message instead of a ZeroDivisionError below.
        raise ValueError("predictions is empty: nothing to evaluate")

    # One reference text per sample. Indexing a plain string with [0] would yield
    # only its first character, so [0] is applied to list/tuple references only.
    reference_texts = [
        ref[0] if isinstance(ref, (list, tuple)) else ref
        for ref in references
    ]

    # Compute BLEU score (accepts references in their original form)
    bleu_score = bleu_metric.compute(predictions=predicted_outputs, references=references)

    # Compute Exact Match (EM)
    exact_match_accuracy = sum(
        p == r for p, r in zip(predicted_outputs, reference_texts)
    ) / len(reference_texts)

    # Compute ROUGE scores
    rouge_scores = rouge_metric.compute(predictions=predicted_outputs, references=reference_texts)

    # Compute BERTScore
    P, R, F1 = bertscore(predicted_outputs, reference_texts, lang="en")

    # Compute Contextual Appropriateness (Embedding-Based)
    contextual_appropriateness = compute_contextual_appropriateness(reference_texts, predicted_outputs)

    print("Evaluation Metrics:")
    print("====================")
    print(f"BLEU Score: {bleu_score['bleu']:.2f}")
    print(f"Exact Match Accuracy: {exact_match_accuracy:.2f}")
    print("ROUGE Scores:")
    for key, value in rouge_scores.items():
        print(f"  {key.upper()}: {value:.2f}")
    print("BERTScore:")
    print(f"  Precision: {P.mean():.2f}")
    print(f"  Recall: {R.mean():.2f}")
    print(f"  F1 Score: {F1.mean():.2f}")
    print(f"Cosine Similarity (Contextual Appropriateness): {contextual_appropriateness:.2f}")