# APRIL ONLY OFFER 🎁

First we check the GPU version available in the environment and install specific dependencies that are compatible with the detected GPU to prevent version conflicts.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
%%capture
import torch
major_version, minor_version = torch.cuda.get_device_capability()
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
if major_version >= 8:
    !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
else:
    !pip install --no-deps xformers trl peft accelerate bitsandbytes
pass

In [4]:
import torch
from unsloth import FastLanguageModel

# ---------------------------- PART 2: Load Model ----------------------------
max_seq_length = 512
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = None,
    load_in_4bit = True,
)


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Unsloth: Failed to patch Gemma3ForConditionalGeneration.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.3.19: Fast Mistral patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/4.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/155 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.13k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

In [5]:
# ---------------------------- PART 3: Apply LoRA Adapter ----------------------------
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 16,
    lora_dropout = 0.0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 42,
    use_rslora = False,
    loftq_config = None,
)

Unsloth 2025.3.19 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
import pandas as pd

try:
    news_data = pd.read_csv('/content/train.txt', delimiter='\t',header=None)
    display(news_data.head())
except FileNotFoundError:
    print("Error: 'train.txt' not found.")
    df_train = None


Unnamed: 0,0
0,{'sentence': 'The worst hotspot for violence h...
1,"{'sentence': ""The BBC's Middle East Correspond..."
2,"{'sentence': ""Human rights groups believe that..."
3,"{'sentence': 'Vanessa Bryant, the widow of Kob..."
4,{'sentence': 'Ms Bryant said that she had aske...


In [None]:
import pandas as pd

try:
    news_data = pd.read_csv('/content/train.txt', delimiter='\t',header=None)
    # Print the actual column names of your DataFrame
    print(news_data[0][0])
    display(news_data.head())
except FileNotFoundError:
    print("Error: 'train.txt' not found.")
    df_train = None

{'sentence': 'The worst hotspot for violence has been in the Palestinian village of Beita, where locals began organising the recent protests after hard-line Jewish settlers set up a new outpost on land claimed by Palestinian olive farmers.', 'triple': [['Palestinian', 'job_title', 'olive farmers']]}


Unnamed: 0,0
0,{'sentence': 'The worst hotspot for violence h...
1,"{'sentence': ""The BBC's Middle East Correspond..."
2,"{'sentence': ""Human rights groups believe that..."
3,"{'sentence': 'Vanessa Bryant, the widow of Kob..."
4,{'sentence': 'Ms Bryant said that she had aske...


In [None]:
# ---------------------------- PART 4: Format Dataset ----------------------------
from datasets import Dataset
import json
import ast

def convert_newskg21_to_alpaca(data):
    formatted = []
    column_name = data.columns[0]

    for index, row in data.iterrows():
        stringified_dict = row[column_name]
        parsed = ast.literal_eval(stringified_dict)  # Convert string to real dict

        sentence = parsed.get("sentence", "")
        triples = parsed.get("triple", [])

        formatted.append({
            "instruction": "Extract triples from the sentence.",
            "input": sentence,
            "output": str(triples)
        })
    return formatted


formatted_data = convert_newskg21_to_alpaca(news_data) # Pass the entire news_data DataFrame
dataset = Dataset.from_list(formatted_data)

alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token

def format_prompt(example):
    return {
        "text": alpaca_prompt.format(example["instruction"], example["input"], example["output"]) + EOS_TOKEN
    }

dataset = dataset.map(format_prompt)


Map:   0%|          | 0/414 [00:00<?, ? examples/s]

In [None]:
dataset

Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 414
})

In [None]:
# ---------------------------- PART 5: Fine-Tune ----------------------------
from trl import SFTTrainer
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="news_llm_outputs",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=2e-4,
    logging_steps=10,
    fp16=not torch.cuda.is_bf16_supported(),
    bf16=torch.cuda.is_bf16_supported(),
    optim="adamw_8bit",
    lr_scheduler_type="linear",
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    args=training_args,
    packing=False,
    max_seq_length=max_seq_length,
)

trainer.train()

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/414 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 414 | Num Epochs = 3 | Total steps = 153
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040/7,000,000,000 (0.60% trained)
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mamvamsi1308[0m ([33mamvamsi1308-zhaw[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
10,1.337
20,0.9231
30,0.8935
40,0.8711
50,0.8065
60,0.753
70,0.5979
80,0.6037
90,0.5558
100,0.5738


TrainOutput(global_step=153, training_loss=0.6411398512865204, metrics={'train_runtime': 776.83, 'train_samples_per_second': 1.599, 'train_steps_per_second': 0.197, 'total_flos': 7317971320504320.0, 'train_loss': 0.6411398512865204})

In [None]:
model.save_pretrained_merged(
    save_directory = "full_merged_3",
    tokenizer = tokenizer,
    save_method = "merged_16bit",  # Options: "merged_16bit", "merged_4bit", "lora"
)


Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 4.1G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 5.18 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


 47%|████▋     | 15/32 [00:01<00:01, 15.59it/s]
We will save to Disk and not RAM now.
100%|██████████| 32/32 [00:57<00:00,  1.79s/it]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving full_merged_3/pytorch_model-00001-of-00003.bin...
Unsloth: Saving full_merged_3/pytorch_model-00002-of-00003.bin...
Unsloth: Saving full_merged_3/pytorch_model-00003-of-00003.bin...
Done.


In [None]:
from google.colab import files
files.download("/content/full_merged_model")


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!cp -r full_merged_3 /content/drive/MyDrive/


## Testing

In [6]:
import pandas as pd
import html
import ast

try:
    news_test_data = pd.read_csv('/content/test.txt', delimiter='\t', header=None)
    print(" Test data loaded. Columns:")
    print(news_test_data.columns)
    display(news_test_data.head())

    # Clean and parse the second column (column with index 0)
    # Changed from news_test_data[1] to news_test_data[news_test_data.columns[0]]
    raw_text_column = news_test_data[news_test_data.columns[0]].apply(html.unescape)
    parsed_data = raw_text_column.apply(ast.literal_eval)

    # Create structured DataFrame with columns 'sentence' and 'triple'
    test_data = pd.DataFrame(parsed_data.tolist())

except FileNotFoundError:
    print(" Error: 'test.txt' not found.")
    test_data = None

 Test data loaded. Columns:
Index([0], dtype='int64')


Unnamed: 0,0
0,{'sentence': 'The rarified space is the cavern...
1,{'sentence': 'Strieck moved to Calgary and 199...
2,{'sentence': 'Board president Debra Wright say...
3,{'sentence': 'Ben Rabidoux of Edge Realty Anal...
4,{'sentence': 'The experience is just as stress...


In [None]:
def convert_to_alpaca_format(df):
    formatted = []
    for _, row in df.iterrows():
        formatted.append({
            "instruction": "Extract triples from the sentence.",
            "input": row["sentence"],
            "output": str(row["triple"])  # ground truth
        })
    return formatted

if test_data is not None:
    formatted_test = convert_to_alpaca_format(test_data)


In [8]:
#using the saved trained model

# Update the path to where your model is saved
model_path = "/content/drive/MyDrive/full_merged_3"

# Load the model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_path,
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True
)

# Prepare model for inference
FastLanguageModel.for_inference(model)


==((====))==  Unsloth 2025.3.19: Fast Mistral patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 

In [None]:
from tqdm import tqdm
FastLanguageModel.for_inference(model)

predictions = []
ground_truths = []

for item in tqdm(formatted_test):
    prompt = alpaca_prompt.format(item["instruction"], item["input"], "")
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    output = model.generate(**inputs, max_new_tokens=128)
    decoded = tokenizer.decode(output[0], skip_special_tokens=True)

    predictions.append(decoded.strip())
    ground_truths.append(item["output"])


NameError: name 'FastLanguageModel' is not defined

In [None]:
import json

with open("newsKG21_predictions.jsonl", "w") as f:
    for item, pred, gold in zip(formatted_test, predictions, ground_truths):
        f.write(json.dumps({
            "sentence": item["input"],
            "predicted_triples": pred,
            "true_triples": gold
        }) + "\n")


In [None]:
import os
import json

# Create folder in Google Drive if it doesn't exist
save_path = "/content/drive/MyDrive/newsKG21"
os.makedirs(save_path, exist_ok=True)

# Define full output path
output_file = os.path.join(save_path, "newsKG21_predictions.jsonl")

# Write predictions and ground truths
with open(output_file, "w") as f:
    for item, pred, gold in zip(formatted_test, predictions, ground_truths):
        f.write(json.dumps({
            "sentence": item["input"],
            "predicted_triples": pred,
            "true_triples": gold
        }) + "\n")

print(f"Predictions saved to: {output_file}")


✅ Predictions saved to: /content/drive/MyDrive/newsKG21/newsKG21_predictions.jsonl


In [None]:
import ast
import re

# STEP 1️: Normalize smart quotes (optional but helpful)
def normalize_quotes(text):
    return re.sub(r"[‘’]", "'", text)

# STEP 2️: Extract only the response (last part of the prompt output)
def extract_response(text):
    lines = text.strip().split('\n')
    for line in reversed(lines):
        line = line.strip()
        if line and not line.startswith("###"):
            return normalize_quotes(line)
    return ""

# STEP 3️: Safely parse list of triples
def safe_parse(text):
    try:
        return set(tuple(x) for x in ast.literal_eval(text))
    except Exception as e:
        print(f"\n⚠️ Parse error:\n{text}\nError: {e}")
        return set()

# STEP 4️: Run evaluation
parsed_preds = [safe_parse(extract_response(p)) for p in predictions]
parsed_truths = [safe_parse(gt) for gt in ground_truths]

tp, fp, fn = 0, 0, 0
for pred_set, truth_set in zip(parsed_preds, parsed_truths):
    tp += len(pred_set & truth_set)  # correct
    fp += len(pred_set - truth_set)  # predicted extra
    fn += len(truth_set - pred_set)  # missed ground truth

precision = tp / (tp + fp + 1e-8)
recall = tp / (tp + fn + 1e-8)
f1 = 2 * precision * recall / (precision + recall + 1e-8)

# STEP 5️: Display results
print("\n Evaluation Metrics:")
print(f" Precision: {precision:.4f}")
print(f" Recall:    {recall:.4f}")
print(f" F1 Score:  {f1:.4f}")


In [None]:
import ast

# ---- STEP 1: Parse Predictions and Ground Truths Safely ----
def safe_parse(text):
    try:
        return set(tuple(triple) for triple in ast.literal_eval(text))
    except Exception as e:
        print(f"Parse error: {text}\nError: {e}")
        return set()

parsed_preds = [safe_parse(p) for p in predictions]
parsed_truths = [safe_parse(gt) for gt in ground_truths]

# ---- STEP 2: Calculate TP, FP, FN ----
tp = fp = fn = 0

for pred_set, truth_set in zip(parsed_preds, parsed_truths):
    tp += len(pred_set & truth_set)        # Correct predictions
    fp += len(pred_set - truth_set)        # Extra predictions
    fn += len(truth_set - pred_set)        # Missed correct triples

# ---- STEP 3: Compute Precision, Recall, F1 ----
precision = tp / (tp + fp + 1e-8)
recall    = tp / (tp + fn + 1e-8)
f1        = 2 * precision * recall / (precision + recall + 1e-8)

# ---- STEP 4: Print Results ----
print("\n Evaluation Metrics:")
print(f" Precision: {precision:.4f}")
print(f Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")


Parse error: Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Extract triples from the sentence.

### Input:
The rarified space is the cavernous outdoor concert location in the middle of the Calgary Stampede grounds that’s known as Nashville North, where people lingering at picnic tables sip Molson Canadian and Budweiser as musicians belt out covers of Tim McGraw and Toby Keith.

### Response:
[['Nashville North', 'part_of', 'Calgary Stampede']]
Error: invalid character '’' (U+2019) (<unknown>, line 7)
Parse error: Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Extract triples from the sentence.

### Input:
Strieck moved to Calgary and 1992 and landed at the airport on a Stampede Saturday and fell in love with her new home then and th

In [None]:
import json

with open("predictions.jsonl", "w") as f:
    for input_text, pred, true in zip(formatted_test, predictions, ground_truths):
        f.write(json.dumps({
            "sentence": input_text["input"],
            "predicted_triples": pred,
            "true_triples": true
        }) + "\n")


In [None]:
# ---------------------------- PART 6: Inference ----------------------------
FastLanguageModel.for_inference(model)

def extract_triples(sentence):
    prompt = alpaca_prompt.format(
        "Extract triples from the sentence.",
        sentence,
        ""
    )
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=128)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example usage
print(extract_triples("Elon Musk is the CEO of SpaceX."))

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Extract triples from the sentence.

### Input:
Elon Musk is the CEO of SpaceX.

### Response:
[['Elon Musk', 'CEO_of', 'SpaceX']]


#STOP_HERE

Next we need to prepare to load a range of quantized language models, including a new 15 trillion token LLama-3 model, optimized for memory efficiency with 4-bit quantization.


In [None]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! Llama 3 is up to 8k
dtype = None
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",
    "unsloth/gemma-7b-it-bnb-4bit",
    "unsloth/gemma-2b-bnb-4bit",
    "unsloth/gemma-2b-it-bnb-4bit",
    "unsloth/llama-3-8b-bnb-4bit",
]

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit", # Llama-3 70b also works (just change the model name)
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

config.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Llama patching release 2024.4
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.25.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/449 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.




---



Next, we integrate LoRA adapters into our model, which allows us to efficiently update just a fraction of the model's parameters, enhancing training speed and reducing computational load.

In [None]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

Unsloth 2024.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


<a name="Data"></a>
### Data Prep
We now use the Alpaca dataset from [yahma](https://huggingface.co/datasets/yahma/alpaca-cleaned), which is a filtered version of 52K of the original [Alpaca dataset](https://crfm.stanford.edu/2023/03/13/alpaca.html). You can replace this code section with your own data prep.

Then, we define a system prompt that formats tasks into instructions, inputs, and responses, and apply it to a dataset to prepare our inputs and outputs for the model, with an EOS token to signal completion.


In [None]:
# this is basically the system prompt
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token # do not forget this part!
def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs       = examples["input"]
    outputs      = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN # without this token generation goes on forever!
        texts.append(text)
    return { "text" : texts, }
pass

from datasets import load_dataset
dataset = load_dataset("yahma/alpaca-cleaned", split = "train")
dataset = dataset.map(formatting_prompts_func, batched = True,)

Downloading readme:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/44.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/51760 [00:00<?, ? examples/s]

Map:   0%|          | 0/51760 [00:00<?, ? examples/s]

<a name="Train"></a>
### Train the model
- We do 60 steps to speed things up, but you can set `num_train_epochs=1` for a full run, and turn off `max_steps=None`.
- At this stage, we're configuring our model's training setup, where we define things like batch size and learning rate, to teach our model effectively with the data we have prepared.

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = None, # increase this to make the model learn "better"
        num_train_epochs=4
        learning_rate = 2e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

  self.pid = os.fork()


Map (num_proc=2):   0%|          | 0/51760 [00:00<?, ? examples/s]

In [None]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.748 GB.
5.668 GB of memory reserved.


In [None]:
# We're now kicking off the actual training of our model, which will spit out some statistics showing us how well it learns
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 51,760 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,1.8146
2,2.2932
3,1.6895
4,1.9524
5,1.6457
6,1.6399
7,1.2177
8,1.2469
9,1.0693
10,1.1739


In [None]:
#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

476.2261 seconds used for training.
7.94 minutes used for training.
Peak reserved memory = 8.982 GB.
Peak reserved memory for training = 3.314 GB.
Peak reserved memory % of max memory = 60.903 %.
Peak reserved memory for training % of max memory = 22.471 %.


<a name="Inference"></a>
### Inference
Let's run the model! You can change the instruction and input - leave the output blank!

In [None]:
FastLanguageModel.for_inference(model)
inputs = tokenizer(
[
    alpaca_prompt.format(
        "List the prime numbers contained within the range.", # instruction
        "1-50", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 128, use_cache = True)
tokenizer.batch_decode(outputs)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


['Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nList the prime numbers contained within the range.\n\n### Input:\n1-50\n\n### Response:\n2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47<|end_of_text|>']

 You can also use a `TextStreamer` for continuous inference - so you can see the generation token by token, instead of waiting the whole time!

In [None]:
FastLanguageModel.for_inference(model)
inputs = tokenizer(
[
    alpaca_prompt.format(
        "Convert these binary numbers to decimal.", # instruction
        "1010, 1101, 1111", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Convert these binary numbers to decimal.

### Input:
1010, 1101, 1111

### Response:
The decimal equivalent of 1010 is 10. The decimal equivalent of 1101 is 13. The decimal equivalent of 1111 is 15.<|end_of_text|>


<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

In [None]:
model.save_pretrained("lora_model") # Local saving
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving

Now if you want to load the LoRA adapters we just saved for inference, set `False` to `True`:

In [None]:
if False:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model)

# alpaca_prompt = You MUST run cells from above!

inputs = tokenizer(
[
    alpaca_prompt.format(
        "What is a famous tall tower in Paris?", # instruction
        "", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


["Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nWhat is a famous tall tower in Paris?\n\n### Input:\n\n\n### Response:\nOne of the most famous tall towers in Paris is the Eiffel Tower. It is a wrought iron tower located on the Champ de Mars in Paris, France. It was built in 1889 as the entrance to the 1889 World's Fair, and it was designed by the French engineers Gustave Eiff"]

You can also use Hugging Face's `AutoModelForPeftCausalLM`. Only use this if you do not have `unsloth` installed. It can be hopelessly slow, since `4bit` model downloading is not supported, and Unsloth's **inference is 2x faster**.

In [None]:
if False:
    # I highly do NOT suggest - use Unsloth if possible
    from peft import AutoPeftModelForCausalLM
    from transformers import AutoTokenizer
    model = AutoPeftModelForCausalLM.from_pretrained(
        "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        load_in_4bit = load_in_4bit,
    )
    tokenizer = AutoTokenizer.from_pretrained("lora_model")

We're preparing to save our trained model in a more compact format and then upload it to a cloud platform, which allows us to use less storage and computational power.

In [None]:
# Merge to 16bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_16bit", token = "")

# Merge to 4bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_4bit", token = "")

# Just LoRA adapters
if False: model.save_pretrained_merged("model", tokenizer, save_method = "lora",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "lora", token = "")

We're ready to compress our model using various quantization methods to make it leaner and then upload it to the cloud for easy sharing and access.

In [None]:
# Save to 8bit Q8_0
if False: model.save_pretrained_gguf("model", tokenizer,)
if False: model.push_to_hub_gguf("hf/model", tokenizer, token = "")

# Save to 16bit GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "f16", token = "")

# Save to q4_k_m GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "q4_k_m", token = "")

Now, use the `model-unsloth.gguf` file or `model-unsloth-Q4_K_M.gguf` file in `llama.cpp` or a UI based system like `GPT4All`. You can install GPT4All by going [here](https://gpt4all.io/index.html).

And we're done! If you have any questions on Unsloth, join their [Discord](https://discord.gg/u54VK8m8tk) channel!