In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datasets import load_dataset
from datasets import Dataset
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 2x faster
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # 4bit for 405b!
    "unsloth/Mistral-Small-Instruct-2409",     # Mistral 22b 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!

    "unsloth/Llama-3.2-1B-bnb-4bit",           # NEW! Llama 3.2 models
    "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-3B-bnb-4bit",
    "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-Instruct-bnb-4bit", # or choose "unsloth/Llama-3.2-1B-Instruct"
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2024.11.10: Fast Llama patching. Transformers:4.46.3.
   \\   /|    GPU: NVIDIA RTX A4000. Max memory: 15.992 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [3]:
from datasets import load_dataset
from unsloth.chat_templates import get_chat_template

# Define a LLaMA 3.2 compatible template
template_tokenizer = get_chat_template(
    tokenizer=tokenizer,  # Replace with your tokenizer if already defined
    chat_template="llama",  # Specify LLaMA template
    mapping={"role": "from", "content": "value", "user": "user", "assistant": "assistant"},
    map_eos_token=True,  # Map <|end|> to </s> token
)

# Function to format the dataset
def format_to_llama32(examples):
    formatted_texts = []
    for instruction, context, response in zip(examples["instruction"], examples["context"], examples["response"]):
        user_text = f"<|user|> Instruction: {instruction}\nContext: {context}"
        assistant_text = f"<|assistant|> {response}"
        formatted_texts.append(f"{user_text}\n{assistant_text}")
    return {"text": formatted_texts}

In [4]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.11.10 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [5]:
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    {"role": "user", "content": "Instruction: What is the capital of Australia?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

outputs = model.generate(input_ids = inputs, max_new_tokens = 128, use_cache = True,
                         temperature = 1.5, min_p = 0.1)
decoded_output = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0] # Get the first (and likely only) string from the list
response = decoded_output.split('assistant')[1].strip() if 'assistant' in decoded_output else decoded_output.strip()
print(response)  # Print the response to check

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


The capital of Australia is Canberra.


In [6]:
import json
from transformers import AutoTokenizer, AutoModelForCausalLM
from unsloth.chat_templates import get_chat_template
import pandas as pd
from datasets import Dataset
from tqdm import tqdm


# Function to generate response using the fine-tuned model
def generate_response(model, tokenizer, input_text):
    # Apply chat template using get_chat_template function
    tokenizer = get_chat_template(tokenizer, chat_template="llama-3.1")
    # Prepare messages in the chat format
    messages = [{"role": "user", "content": input} for input in input_text]
    # Tokenize the input text
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,  # Add prompt for generation
        return_tensors="pt",
    ).to("cuda")
    print(inputs.size())
    # Generate model output
    outputs = model.generate(
        input_ids=inputs,
        max_new_tokens=128,  # Adjust as necessary
        use_cache=True,
        temperature=1.5,
        min_p=0.1
    )
    # Decode model output
    decoded_output = tokenizer.batch_decode(outputs, skip_special_tokens=False)[0]
    return decoded_output

# Generate predictions and store them in a JSON file
def generate_predictions_and_store(dataset, model, tokenizer, output_file="predictions.json"):
    predictions = []
    input_text = []
    for sample in tqdm(dataset, desc="Generating predictions"): # Added tqdm here
        instruction = sample["instruction"]
        text = sample["text"]
        context = sample["context"]
        input = f"Instruction: {instruction}\nContext: {context}"
        # Combine the instruction and context with the text field for evaluation
        input_text.append(input)
    # Generate prediction using the model
    predicted = generate_response(model, tokenizer, input_text)
    print(predicted)
    predicted = predicted.split('<|assistant|>')
    for i, sample in enumerate(dataset):
        instruction = sample["instruction"]
        context = sample["context"]
        text = sample["text"]
        response = sample["response"]

        # predicted_response = predicted[i+1].split('<|eot_id|>')[0]
        # Store results in predictions
        predictions.append({
            "instruction": instruction,
            "context": context,
            "text": text,
            "predicted_response": response,
            "response": response
        })

    # Write predictions to JSON
    with open(output_file, "w") as f:
        json.dump(predictions, f, indent=4)
    print(f"Predictions saved to {output_file}")

# def format_to_llama32_test(examples):
#     formatted_texts = []
#     for instruction, context, response in zip(examples["instruction"], examples["context"], examples["response"]):
#         user_text = f"<|user|> Instruction: {instruction}\nContext: {context}"
#         assistant_text = f"<|assistant|> {response}"
#         formatted_texts.append(f"{user_text}")
#     return {"text": formatted_texts}

In [7]:
import json
from transformers import AutoTokenizer, AutoModelForCausalLM
from unsloth.chat_templates import get_chat_template
import pandas as pd
from datasets import Dataset
from tqdm import tqdm


# Function to generate response using the fine-tuned model
def generate_response(model, tokenizer, input_text):
    # Apply chat template using get_chat_template function
    tokenizer = get_chat_template(tokenizer, chat_template="llama-3.1")
    # Prepare messages in the chat format
    messages = [{"role": "user", "content": input_text},]
    # Tokenize the input text
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,  # Add prompt for generation
        return_tensors="pt",
    ).to("cuda")

    # Generate model output
    outputs = model.generate(
        input_ids=inputs,
        max_new_tokens=128,  # Adjust as necessary
        use_cache=True,
        temperature=1.5,
        min_p=0.1
    )
    # Decode model output
    decoded_output = tokenizer.batch_decode(outputs, skip_special_tokens=False)[0]
    return decoded_output

# Generate predictions and store them in a JSON file
def generate_predictions_and_store(dataset, model, tokenizer, output_file="predictions.json"):
    predictions = []
    input_text = []
    for sample in tqdm(dataset, desc="Generating predictions"): # Added tqdm here
        instruction = sample["instruction"]
        text = sample["text"]
        context = sample["context"]
        input = f"Instruction: {instruction}\nContext: {context}"
        response = sample["response"]

        # Generate prediction using the model
        predicted = generate_response(model, tokenizer, input)
        predicted = predicted.split('<|start_header_id|>assistant<|end_header_id|>')[1]
        predictions.append({
            "instruction": instruction,
            "context": context,
            "text": text,
            "predicted_response": predicted,
            "response": response
        })

    # Write predictions to JSON
    with open(output_file, "w") as f:
        json.dump(predictions, f, indent=4)
    print(f"Predictions saved to {output_file}")




In [None]:
test_data_dir = "/mnt/c/Users/IoT_lab_YU/dengyiliu/llama3/llama3.2finetune/llama3.2/data/test_categories"  # Replace with your actual directory

for filename in os.listdir(test_data_dir):
    if filename.endswith(".jsonl"):
        file_path = os.path.join(test_data_dir, filename)
        try:
            df = pd.read_json(file_path, lines=True)
            dataset_test = Dataset.from_pandas(df)
            dataset_test = dataset_test.map(format_to_llama32, batched=True)
            output_filename = f"/mnt/c/Users/IoT_lab_YU/dengyiliu/llama3/llama3.2finetune/llama3.2/evaluate/predictions_8b_beforeft_{filename.split('.')[0]}.jsonl" #create output filename
            generate_predictions_and_store(dataset_test, model, tokenizer, output_file=output_filename)
        except Exception as e:
            print(f"Error processing {filename}: {e}")

Map: 100%|██████████| 353/353 [00:00<00:00, 80357.63 examples/s]
Generating predictions:   0%|          | 0/353 [00:00<?, ?it/s]

Generating predictions: 100%|██████████| 353/353 [58:40<00:00,  9.97s/it]


Predictions saved to /mnt/c/Users/IoT_lab_YU/dengyiliu/llama3/llama3.2finetune/llama3.2/evaluate/predictions_8b_beforeft_test_dataset_brainstorming.jsonl


Map: 100%|██████████| 427/427 [00:00<00:00, 35613.51 examples/s]
Generating predictions: 100%|██████████| 427/427 [50:16<00:00,  7.06s/it] 


Predictions saved to /mnt/c/Users/IoT_lab_YU/dengyiliu/llama3/llama3.2finetune/llama3.2/evaluate/predictions_8b_beforeft_test_dataset_classification.jsonl


Map: 100%|██████████| 355/355 [00:00<00:00, 37218.87 examples/s]
Generating predictions: 100%|██████████| 355/355 [32:44<00:00,  5.53s/it]


Predictions saved to /mnt/c/Users/IoT_lab_YU/dengyiliu/llama3/llama3.2finetune/llama3.2/evaluate/predictions_8b_beforeft_test_dataset_closed_qa.jsonl


Map: 100%|██████████| 142/142 [00:00<00:00, 16048.91 examples/s]
Generating predictions: 100%|██████████| 142/142 [23:54<00:00, 10.11s/it]


Predictions saved to /mnt/c/Users/IoT_lab_YU/dengyiliu/llama3/llama3.2finetune/llama3.2/evaluate/predictions_8b_beforeft_test_dataset_creative_writing.jsonl


Map: 100%|██████████| 438/438 [00:00<00:00, 18893.46 examples/s]
Generating predictions: 100%|██████████| 438/438 [1:02:36<00:00,  8.58s/it]


Predictions saved to /mnt/c/Users/IoT_lab_YU/dengyiliu/llama3/llama3.2finetune/llama3.2/evaluate/predictions_8b_beforeft_test_dataset_general_qa.jsonl


Map: 100%|██████████| 301/301 [00:00<00:00, 6375.64 examples/s]
Generating predictions: 100%|██████████| 301/301 [24:51<00:00,  4.96s/it]


Predictions saved to /mnt/c/Users/IoT_lab_YU/dengyiliu/llama3/llama3.2finetune/llama3.2/evaluate/predictions_8b_beforeft_test_dataset_information_extraction.jsonl


Map: 100%|██████████| 749/749 [00:00<00:00, 28359.08 examples/s]
Generating predictions: 100%|██████████| 749/749 [1:38:56<00:00,  7.93s/it]


Predictions saved to /mnt/c/Users/IoT_lab_YU/dengyiliu/llama3/llama3.2finetune/llama3.2/evaluate/predictions_8b_beforeft_test_dataset_open_qa.jsonl


Map: 100%|██████████| 238/238 [00:00<00:00, 11781.06 examples/s]
Generating predictions: 100%|██████████| 238/238 [32:41<00:00,  8.24s/it]

Predictions saved to /mnt/c/Users/IoT_lab_YU/dengyiliu/llama3/llama3.2finetune/llama3.2/evaluate/predictions_8b_beforeft_test_dataset_summarization.jsonl





: 