# Installations and setup

In [1]:
%%capture
# Install Unsloth and the evaluation dependencies; %%capture keeps the pip output out of the notebook.
import os, re
if "COLAB_" not in "".join(os.environ.keys()):
    # No environment variable name contains "COLAB_": plain install.
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # v is the leading digits-and-dots part of the torch version; it selects the xformers pin below.
    import torch; v = re.match(r"[0-9\.]{3,}", str(torch.__version__)).group(0)
    xformers = "xformers==" + ("0.0.32.post2" if v == "2.8.0" else "0.0.29.post3")
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth
# The remaining installs run in both branches: pinned transformers and trl, plus the metric packages.
!pip install transformers==4.55.4
!pip install --no-deps trl==0.22.2
!pip install --no-deps evaluate rouge_score

In [2]:
from evaluate import load # Import from evaluate instead of datasets
import numpy as np
from tqdm import tqdm
import json
from unsloth import FastLanguageModel
import torch

# Seed shared by the LoRA setup and both dataset splits further down.
random_state = 3407

# Loading options passed to FastLanguageModel.from_pretrained below.
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# Reference list only: nothing in the visible cells reads `fourbit_models`, and the
# model_name actually loaded below ("unsloth/Meta-Llama-3.1-8B") is not one of its entries.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

# Load the base model and its tokenizer; both names are rebound later by the fine-tuned load cell.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.10.12: Fast Llama patching. Transformers: 4.55.4.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.96G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/235 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

In [5]:
# Rebind `model` to the PEFT-wrapped version of the base model loaded above (LoRA rank 8,
# applied to the projection modules listed in target_modules).
# NOTE(review): no training happens in the visible cells, so the base-model evaluation
# below runs on this wrapped model — confirm that is the intended "base" baseline.
model = FastLanguageModel.get_peft_model(
    model,
    r = 8, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = random_state,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2025.10.12 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


## Dataset Prep

In [6]:
# Two-slot template: the first {} receives the instruction, the second the response
# (see prompt.format(instruction, output) in formatting_prompts_func).
# NOTE(review): the "### Assistant" header has no trailing colon while "### Human:" does;
# confirm it matches the template the fine-tuned checkpoint was trained with before changing it.
prompt = """
### Human:
{}

### Assistant
{}"""

# End-of-sequence string taken from the tokenizer; appended to every formatted example.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    """Render a batch of rows into prompt-formatted strings.

    Args:
        examples: Batched mapping with parallel "instruction" and "output"
            sequences (the shape `Dataset.map(..., batched=True)` passes in).

    Returns:
        A dict with a single "text" key holding one formatted string per row:
        the module-level `prompt` template filled with the instruction and
        output, followed by `EOS_TOKEN`.
    """
    pairs = zip(examples["instruction"], examples["output"])
    # The EOS token must be appended, otherwise generation will go on forever.
    rendered = [prompt.format(question, answer) + EOS_TOKEN for question, answer in pairs]
    return {"text": rendered}

from datasets import load_dataset
# Load the train split and add a "text" column built by formatting_prompts_func.
dataset = load_dataset("azimidokht/recipe-recom", split = "train")
dataset = dataset.map(formatting_prompts_func, batched = True,)

# First split: 80% train, 20% held out (seeded for reproducibility).
split_dataset = dataset.train_test_split(test_size=0.2, shuffle=True, seed=random_state)

train_dataset = split_dataset["train"]
temp_dataset  = split_dataset["test"]

# Second split: halve the held-out 20% into validation and test.
val_test_split = temp_dataset.train_test_split(test_size=0.5, shuffle=True, seed=random_state)

val_dataset  = val_test_split["train"]
test_dataset = val_test_split["test"]


recipe_recom.parquet:   0%|          | 0.00/214k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5149 [00:00<?, ? examples/s]

Map:   0%|          | 0/5149 [00:00<?, ? examples/s]

# Utilities and functions

In [7]:
# Redefinition of the template with the same text as the `prompt` in the dataset-prep cell;
# generate_predictions below reads this module-level name.
prompt = """
### Human:
{}

### Assistant
{}"""

# Metric objects used by evaluate() below.
rouge = load("rouge")
bleu = load("bleu")

# Filled by the base-model evaluation cell and written to ./base-inference.json.
inferenced_data = {}

def generate_predictions(dataset, model, tokenizer, max_new_tokens=128):
    """Generate one response per dataset row and pair it with its reference.

    Args:
        dataset: Iterable of rows exposing "instruction" and "output" keys.
        model: Model to generate with; switched to inference mode via
            `FastLanguageModel.for_inference` before the loop.
        tokenizer: Tokenizer matching `model`.
        max_new_tokens: Upper bound on generated tokens per example.

    Returns:
        A `(predictions, references)` tuple of equal-length lists, where
        `predictions[i]` is the stripped text generated for row `i` and
        `references[i]` is that row's "output" value.
    """
    predictions = []
    references = []
    FastLanguageModel.for_inference(model)
    for example in tqdm(dataset):
        # Leave the response slot of the template blank so the model fills it in.
        prompt_text = prompt.format(example["instruction"], "")
        inputs = tokenizer([prompt_text], return_tensors = "pt").to("cuda")

        outputs = model.generate(**inputs, max_new_tokens = max_new_tokens, use_cache = True)

        # generate() returns the prompt tokens followed by the continuation. Drop the
        # prompt by position instead of splitting the decoded text on "### Assistant\n":
        # the split kept whatever followed the LAST marker, so a continuation that
        # produced another "### Assistant" turn lost the actual answer.
        prompt_length = inputs["input_ids"].shape[1]
        new_tokens = outputs[0][prompt_length:]
        assistant_response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

        predictions.append(assistant_response)
        references.append(example["output"])
    return predictions, references


def evaluate(predictions, references, metrics: list[str]):
    """Score predictions against references with the requested metrics.

    Args:
        predictions: List of generated strings.
        references: List of reference strings, parallel to `predictions`.
        metrics: Names of the metrics to compute; supported values are
            "rouge" and "bleu". A single name given as a plain string is
            treated as a one-element list.

    Returns:
        Dict mapping each requested metric name to the result of the
        corresponding module-level metric object's `compute` call.

    Raises:
        ValueError: If `metrics` is empty or contains an unsupported name.
    """
    if isinstance(metrics, str):
        metrics = [metrics]

    # Validate before computing anything. Previously the `else: raise` hung off the
    # "bleu" check alone, so asking only for "rouge" raised and unknown names were
    # silently ignored whenever "bleu" was also requested.
    supported = ("rouge", "bleu")
    unknown = [name for name in metrics if name not in supported]
    if unknown or not metrics:
        raise ValueError("Invalid metric")

    results = {}
    if "rouge" in metrics:
        results["rouge"] = rouge.compute(predictions=predictions, references=references)
    if "bleu" in metrics:
        # BLEU expects a list of references per prediction.
        results["bleu"] = bleu.compute(predictions=predictions, references=[[ref] for ref in references])

    return results

def store(data, path):
    """Write `data` to `path` as JSON, overwriting any existing file.

    Args:
        data: JSON-serializable object.
        path: Destination file path, opened in text write mode.
    """
    with open(path, mode="w") as out_file:
        json.dump(data, fp=out_file)



Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

# Base model evaluation

In [4]:

# Generate responses for both held-out splits with the currently loaded model,
# persist the raw (predictions, references) pairs, then score them.
print("\nGenerating predictions for the validation dataset...")
val_predictions, val_references = generate_predictions(val_dataset, model, tokenizer)
print("Generating predictions for the test dataset...")
test_predictions, test_references = generate_predictions(test_dataset, model, tokenizer)

# Each split is stored under its own name (the keys used to be swapped, so the
# saved file labelled validation outputs as "test" and vice versa).
inferenced_data["val"] = [val_predictions, val_references]
inferenced_data["test"] = [test_predictions, test_references]

store(inferenced_data, "./base-inference.json")

val_results = evaluate(val_predictions, val_references, metrics=["rouge", "bleu"])
print("Validation Rouge results:", val_results["rouge"])
print("Validation BLEU results:", val_results["bleu"])

test_results = evaluate(test_predictions, test_references, metrics=["rouge", "bleu"])
print("Test Rouge results:", test_results["rouge"])
print("Test BLEU results:", test_results["bleu"])

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

KeyboardInterrupt: 

# Fine-tuned model evaluation

In [8]:
# Load the model and tokenizer from the Hugging Face Hub
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "azimidokht/recipe-recom-fine-tuned",  # your HF repo
    max_seq_length = 2048,  # or the same you used during training
    dtype = None,           # or "float16"/"bfloat16" depending on your GPU
    load_in_4bit = True,    # set to True if you trained/quantized with 4-bit
)

==((====))==  Unsloth 2025.10.12: Fast Llama patching. Transformers: 4.55.4.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


adapter_model.safetensors:   0%|          | 0.00/83.9M [00:00<?, ?B/s]

In [9]:
# Raw generations of the fine-tuned model, keyed by split name.
inferenced_data_tuned = {}

print("\nGenerating predictions for the validation dataset...")
val_predictions, val_references = generate_predictions(val_dataset, model, tokenizer)
print("Generating predictions for the test dataset...")
test_predictions, test_references = generate_predictions(test_dataset, model, tokenizer)

# Each split is stored under its own name (the keys used to be swapped, so the
# saved file labelled validation outputs as "test" and vice versa).
inferenced_data_tuned["val"] = [val_predictions, val_references]
inferenced_data_tuned["test"] = [test_predictions, test_references]

store(inferenced_data_tuned, "./fine-tuned-inference-r8.json")

val_results_tuned = evaluate(val_predictions, val_references, metrics=["rouge", "bleu"])
print("Validation Rouge results:", val_results_tuned["rouge"])
print("Validation BLEU results:", val_results_tuned["bleu"])

test_results_tuned = evaluate(test_predictions, test_references, metrics=["rouge", "bleu"])
print("Test Rouge results:", test_results_tuned["rouge"])
print("Test BLEU results:", test_results_tuned["bleu"])


Generating predictions for the validation dataset...


100%|██████████| 515/515 [18:59<00:00,  2.21s/it]


Generating predictions for the test dataset...


100%|██████████| 515/515 [18:49<00:00,  2.19s/it]


Validation Rouge results: {'rouge1': np.float64(0.40453685922545124), 'rouge2': np.float64(0.14331750751282368), 'rougeL': np.float64(0.3867874845240147), 'rougeLsum': np.float64(0.38656809820506405)}
Validation BLEU results: {'bleu': 0.18616916303170597, 'precisions': [0.578895096213532, 0.33156498673740054, 0.12156583629893239, 0.060829493087557605], 'brevity_penalty': 0.9591448031459188, 'length_ratio': 0.9599570968895245, 'translation_length': 8055, 'reference_length': 8391}
Test Rouge results: {'rouge1': np.float64(0.3961609293564492), 'rouge2': np.float64(0.1340261275115724), 'rougeL': np.float64(0.3781028237338431), 'rougeLsum': np.float64(0.37816379311891846)}
Test BLEU results: {'bleu': 0.1753465742384433, 'precisions': [0.5765054294175715, 0.32468045855843986, 0.11082838563754595, 0.05320933069065406], 'brevity_penalty': 0.9619947380591407, 'length_ratio': 0.9626989783796627, 'translation_length': 8104, 'reference_length': 8418}


In [11]:
# Writes inferenced_data_tuned to the same path the fine-tuned evaluation cell already wrote;
# NOTE(review): looks redundant unless the dict was changed in between — confirm before removing.
store(inferenced_data_tuned, "./fine-tuned-inference-r8.json")