### Installation

In [1]:
%%capture
# Installs Unsloth and its dependencies. `%%capture` must remain the first line
# of the cell; it suppresses the pip output.
import os, re
if "COLAB_" not in "".join(os.environ.keys()):
    # No environment variable name contains "COLAB_": plain install.
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # `v` is the leading run of digits/dots of the installed torch version
    # (e.g. "2.8.0"); it selects which xformers release to pin.
    import torch; v = re.match(r"[0-9\.]{3,}", str(torch.__version__)).group(0)
    xformers = "xformers==" + ("0.0.32.post2" if v == "2.8.0" else "0.0.29.post3")
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth
# The pins below run in both branches, after unsloth has been installed.
!pip install transformers==4.55.4
!pip install --no-deps trl==0.22.2
!pip install --no-deps evaluate rouge_score

### Unsloth

In [2]:
# Load the base model and tokenizer through Unsloth.
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# NOTE: this list is informational only; nothing in this cell reads it, and the
# from_pretrained call below loads "unsloth/Meta-Llama-3.1-8B", which is not in it.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

# Returns the model together with its tokenizer; both are used by later cells.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.10.9: Fast Llama patching. Transformers: 4.55.4.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.96G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/235 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [5]:
# Seed shared by the LoRA initialisation here and by the dataset splits in a later cell.
random_state = 3407
# Wrap the loaded model with LoRA adapters; `model` is rebound to the wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r = 8, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    # Attention projections (q/k/v/o) and MLP projections (gate/up/down).
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = random_state,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2025.10.9 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [6]:
# Two-slot template: the first {} receives the instruction, the second the answer
# (left empty at inference time). The "### Assistant" header has no trailing colon,
# unlike "### Human:"; generate_predictions splits on that exact text, so the two
# must stay in sync.
prompt = """
### Human:
{}

### Assistant
{}"""

# Appended to every training text by formatting_prompts_func.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    """Render a batch of examples into training texts.

    Args:
        examples: batched mapping with parallel "instruction" and "output" lists.

    Returns:
        A dict with a single "text" key holding one string per example: the
        filled-in ``prompt`` template followed by ``EOS_TOKEN``.
    """
    # Must add EOS_TOKEN, otherwise your generation will go on forever!
    rendered = [
        prompt.format(question, answer) + EOS_TOKEN
        for question, answer in zip(examples["instruction"], examples["output"])
    ]
    return {"text": rendered}

# Load the recipe dataset and add the rendered "text" column to every row.
from datasets import load_dataset
dataset = load_dataset("azimidokht/recipe-recom", split = "train")
dataset = dataset.map(formatting_prompts_func, batched = True,)

# First split: 80% train / 20% held out, seeded for reproducibility.
split_dataset = dataset.train_test_split(test_size=0.2, shuffle=True, seed=random_state)

train_dataset = split_dataset["train"]
temp_dataset  = split_dataset["test"]

# Second split: the held-out 20% is halved into validation and test (same seed).
val_test_split = temp_dataset.train_test_split(test_size=0.5, shuffle=True, seed=random_state)

val_dataset  = val_test_split["train"]
test_dataset = val_test_split["test"]


Map:   0%|          | 0/5149 [00:00<?, ? examples/s]

## Base model inference

In [3]:
from evaluate import load # Import from evaluate instead of datasets
import numpy as np
from tqdm import tqdm
import json
import re
from typing import Optional, Dict, Set, Iterable, Tuple, List



# Helpers  that are used by all functions

_NUM_RE = re.compile(r'(?P<num>\d+(?:\.\d+)?)\s*(?P<unit>cal(?:ories)?|g|mg|minute(?:s)?|min)?', re.I)
## This is a regular expression that finds numbers in text (like 400 calories, 20 g, 15 min) and captures both the value and unit.

def _norm(s: str) -> str:
    return re.sub(r'\s+', ' ', s.strip().lower())
## we use this to  Clean a string by making it lowercase and removing extra spaces. This helps with consistent comparisons.

def _extract_numbers_with_units(text: str) -> list[Tuple[float, str]]:
    """Scan ``text`` with ``_NUM_RE`` and return ``(number, unit)`` pairs in order of appearance.

    The unit is lowercased; "minute"/"minutes" become "min" and "cal"/"calories"
    become "cal". A number with no recognised unit gets the empty string.
    """
    canonical = {'minute': 'min', 'minutes': 'min', 'cal': 'cal', 'calories': 'cal'}
    pairs = []
    for match in _NUM_RE.finditer(text):
        raw_unit = (match.group('unit') or '').lower()
        pairs.append((float(match.group('num')), canonical.get(raw_unit, raw_unit)))
    return pairs



def _find_limit(instr: str, keyword: str, default: Optional[float] = None) -> Optional[float]:
    """
    Extract the first number after a keyword like 'under'/'less than' etc.
    """
    instr = _norm(instr)
    # Try to capture expressions like 'under 400 calories', 'less than 30 minutes'
    m = re.search(rf'(?:under|less than|at most|no more than)\s+(\d+(?:\.\d+)?)', instr)
    if m:
        return float(m.group(1))
    return default

##Tries to find a numeric limit in an instruction (for example ‚Äúunder 400 calories‚Äù ‚Üí 400).



def _mentions_any(text: str, words: Iterable[str]) -> bool:
    """Report whether any of ``words`` occurs in the normalised ``text``.

    Matching is a plain, case-insensitive substring test, so a short word such
    as "min" also matches inside longer words (e.g. "vitamin").
    """
    haystack = _norm(text)
    for candidate in words:
        if candidate.lower() in haystack:
            return True
    return False


def _extract_ingredients(instr: str) -> list[str]:
    """
    Extract the ingredients listed after the word 'using' in the instruction.
    For example: 'using chicken, rice, and beans' -> ['chicken', 'rice', 'beans']

    The captured segment runs from the first letter after 'using' for as long as
    the text consists of word characters, whitespace, commas and hyphens, so any
    such text after the list ends up in the last item.

    Returns:
        Normalised (lowercase) ingredient strings, or [] when 'using' is absent.
    """
    instr = _norm(instr)
    # `\b` keeps 'using' from matching inside words such as "focusing" or
    # "housing". The character class is the original one with its redundant
    # members removed (`\w` already covers the letters a, n and d).
    m = re.search(r'\busing\s+([a-z][\w\s,\-]+)', instr)
    if not m:
        return []
    # Split by commas or the standalone word 'and'; drop empty pieces such as
    # the one produced by ", and".
    pieces = re.split(r',|\band\b', m.group(1))
    return [piece.strip() for piece in pieces if piece.strip()]


def _numbers_for_kind(pairs: list[Tuple[float, str]], kind: str) -> list[float]:
    # kind in {'cal','g','mg','min',''} ‚Äì we‚Äôll filter by typical context below
    return [n for n,u in pairs if (kind == '' or u == kind)]
###This function filters extracted numbers by unit (e.g., get only calorie numbers or only minutes).


##### We Build a richer catalog from our HUMMUS dataframe
### We do this because we want to check all the food types

import ast
import pandas as pd
def _parse_list_cell(cell) -> list:
    """Turn one dataframe cell into a list of candidate items.

    Accepts a real container, the string repr of one ("['a', 'b']"), a single
    quoted string ("'vegan'"), or free text, which is split on commas.
    """
    if isinstance(cell, (list, tuple, set)):
        return list(cell)
    raw = str(cell)
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        parsed = None
    if isinstance(parsed, (list, tuple, set, dict)):
        return list(parsed)
    if isinstance(parsed, str):
        # A quoted scalar is one item; iterating it would yield single characters.
        return [parsed]
    # Unparseable text or a non-container literal (e.g. "2020"): comma-split the raw text.
    return [piece.strip() for piece in raw.split(',')]


def build_catalog_from_df(df: pd.DataFrame) -> Dict[str, Set[str]]:
    """Build the lookup sets used by the dataset-aware metrics.

    Args:
        df: recipe dataframe. Columns read when present: 'title',
            'ingredient_food_kg_names' and 'tags'.

    Returns:
        {"titles": ..., "ingredients": ..., "keywords": ...}; every entry is
        trimmed, lowercased and whitespace-collapsed. "keywords" always also
        contains a fixed set of general diet labels (vegan, keto, ...), as a
        safety net for when the dataset's tags do not include them.
    """
    def _clean(value) -> str:
        return re.sub(r'\s+', ' ', str(value).strip().lower())

    def _collect(column: str) -> Set[str]:
        found: Set[str] = set()
        if column not in df.columns:
            return found
        for cell in df[column].dropna():
            for item in _parse_list_cell(cell):
                if item and isinstance(item, str):
                    cleaned = _clean(item)
                    # An empty entry would match every text in substring checks.
                    if cleaned:
                        found.add(cleaned)
        return found

    titles: Set[str] = set()
    if 'title' in df.columns:
        titles = {_clean(t) for t in df['title'].dropna().astype(str)}
        titles.discard('')

    ingredients = _collect('ingredient_food_kg_names')

    # Tags/labels as keywords (includes diet labels like vegan/vegetarian/gluten-free if present)
    keywords = _collect('tags')

    # Also add the common nutrition labels we care about.
    keywords |= {"vegan", "vegetarian", "gluten-free", "low-sodium", "low sodium", "keto", "paleo"}

    return {"titles": titles, "ingredients": ingredients, "keywords": keywords}













# 1) Constraint satisfaction that covers ALL our query types

def compute_constraint_satisfaction(
    predictions: list[str],
    instructions: list[str],
    *,
    calorie_moderate_range: Tuple[float, float] = (300.0, 700.0),  # for "balanced"
    protein_min_for_balanced: float = 15.0
) -> Dict[str, float]:
    """
    Parses each instruction to detect its constraints (calories/time/protein/sodium/
    fiber/ingredients/balanced, including the multi-recipe list variants) and checks
    whether the paired prediction BOTH mentions and satisfies them.

    Args:
        predictions: model outputs, paired positionally with `instructions`.
        instructions: the prompts that produced the predictions.
        calorie_moderate_range: inclusive calorie range accepted for "balanced meal".
        protein_min_for_balanced: minimum protein number accepted for "balanced meal".

    Returns:
        {
          "precision": satisfied / mentioned,
          "recall":    satisfied / total_constraints
        }
        Either value is 0.0 when its denominator is zero.
    """
    total_constraints = 0
    satisfied_constraints = 0
    mentioned_constraints = 0

    for pred, instr in zip(predictions, instructions):
        p_txt = _norm(pred)
        i_txt = _norm(instr)

        nums_pred = _extract_numbers_with_units(p_txt)

        # We track constraints identified in the instruction
        # Each append is a tuple (name, mentioned_bool, satisfied_bool)
        checks: list[Tuple[str, bool, bool]] = []

        # Single-recipe variants

        # Calorie-based: under/around X
        if _mentions_any(i_txt, ["calorie", "calories"]):
            total_constraints += 1
            cal_limit = _find_limit(i_txt, "calorie")
            # Mentioned if prediction includes any calorie-like number.
            # NOTE: _numbers_for_kind(..., '') returns every extracted number, so when
            # no number carries the 'cal' unit this falls back to all numbers.
            cal_nums = _numbers_for_kind(nums_pred, 'cal') or _numbers_for_kind(nums_pred, '')
            mentioned = len(cal_nums) > 0
            satisfied = False
            if mentioned:
                if "around" in i_txt and cal_nums:
                    # around X: allow ±15%
                    # find first number mentioned in instruction that is followed by "cal"
                    im = re.search(r'(\d+(?:\.\d+)?)\s*cal', i_txt)
                    if im:
                        tgt = float(im.group(1))
                        satisfied = any(abs(n - tgt) <= 0.15 * tgt for n in cal_nums)
                elif cal_limit is not None:
                    satisfied = any(n <= cal_limit for n in cal_nums)
                else:
                    # If  there is no explicit limit (rare), we consider it mentioned but not evaluable
                    satisfied = False
            checks.append(("calories", mentioned, satisfied))

        # This is time-based: less than X minutes
        # NOTE(review): "min" is matched as a substring, so instructions containing
        # words such as "vitamin" also enter this branch.
        if _mentions_any(i_txt, ["minutes", "minute", "min"]):
            total_constraints += 1
            t_limit = _find_limit(i_txt, "minute")
            # Mentioned if any minutes are in the prediction
            time_nums = _numbers_for_kind(nums_pred, 'min') or _numbers_for_kind(nums_pred, '')
            mentioned = len(time_nums) > 0
            # Satisfied only when a limit was found and at least one number is within it;
            # `mentioned and False` always evaluates to a falsy value.
            satisfied = any(n <= (t_limit or n) for n in time_nums) if mentioned and t_limit is not None else mentioned and False
            checks.append(("time", mentioned, satisfied))

        # High-protein: at least X g
        if _mentions_any(i_txt, ["high-protein", "protein"]):
            total_constraints += 1
            # It pulls a lower bound from instruction
            m = re.search(r'at least\s+(\d+(?:\.\d+)?)\s*g', i_txt)
            prot_min = float(m.group(1)) if m else None
            prot_nums = _numbers_for_kind(nums_pred, 'g') or _numbers_for_kind(nums_pred, '')
            mentioned = len(prot_nums) > 0
            satisfied = any(n >= prot_min for n in prot_nums) if mentioned and prot_min is not None else mentioned and False
            checks.append(("protein", mentioned, satisfied))

        # Low-sodium: under X mg
        if _mentions_any(i_txt, ["low-sodium", "low sodium", "sodium"]):
            total_constraints += 1
            sod_limit = _find_limit(i_txt, "sodium")
            sod_nums = _numbers_for_kind(nums_pred, 'mg') or _numbers_for_kind(nums_pred, '')
            mentioned = len(sod_nums) > 0
            satisfied = any(n <= sod_limit for n in sod_nums) if mentioned and sod_limit is not None else mentioned and False
            checks.append(("sodium", mentioned, satisfied))

        # Ingredient-based: using one or more ingredients
        if "using" in i_txt:
            ings = _extract_ingredients(instr)
            if ings:
                total_constraints += 1
                # Mentioned if *all* requested ingredients appear in the prediction text;
                # for this constraint "mentioned" and "satisfied" are the same thing.
                mentioned = all(ing in p_txt for ing in ings)
                satisfied = mentioned
                checks.append(("ingredients_list", mentioned, satisfied))


        # High-fiber: at least X g fiber
        # NOTE: uses the same "at least X g" pattern as the protein check, so an
        # instruction naming both nutrients yields the same bound for each.
        if _mentions_any(i_txt, ["fiber", "fibre"]):
            total_constraints += 1
            m = re.search(r'at least\s+(\d+(?:\.\d+)?)\s*g', i_txt)
            fib_min = float(m.group(1)) if m else None
            fiber_nums = _numbers_for_kind(nums_pred, 'g') or _numbers_for_kind(nums_pred, '')
            mentioned = len(fiber_nums) > 0
            satisfied = any(n >= fib_min for n in fiber_nums) if mentioned and fib_min is not None else mentioned and False
            checks.append(("fiber", mentioned, satisfied))

        # Balanced meal: moderate calories AND enough protein
        if "balanced meal" in i_txt:
            total_constraints += 1
            cal_nums = _numbers_for_kind(nums_pred, 'cal') or _numbers_for_kind(nums_pred, '')
            prot_nums = _numbers_for_kind(nums_pred, 'g') or _numbers_for_kind(nums_pred, '')
            mentioned = (len(cal_nums) > 0) and (len(prot_nums) > 0)
            cal_ok = any(calorie_moderate_range[0] <= n <= calorie_moderate_range[1] for n in cal_nums)
            prot_ok = any(n >= protein_min_for_balanced for n in prot_nums)
            satisfied = mentioned and cal_ok and prot_ok
            checks.append(("balanced", mentioned, satisfied))

        #  Multi-recipe variants (we don't enforce exactly three items, but satisfaction
        #  requires at least three numbers and ALL of them within the limit).
        #  These are counted in addition to the single-recipe checks above.
        if "list three" in i_txt or "suggest three" in i_txt:
            # Low-calorie list
            if "low-calorie" in i_txt or "low calorie" in i_txt:
                total_constraints += 1
                cal_limit = _find_limit(i_txt, "calorie")
                cal_nums = _numbers_for_kind(nums_pred, 'cal') or _numbers_for_kind(nums_pred, '')
                mentioned = len(cal_nums) >= 1
                satisfied = all(n <= cal_limit for n in cal_nums) if mentioned and cal_limit is not None and len(cal_nums) >= 3 else mentioned and False
                checks.append(("multi_low_cal", mentioned, satisfied))

            # Low-sodium list
            if "low-sodium" in i_txt or "low sodium" in i_txt:
                total_constraints += 1
                sod_limit = _find_limit(i_txt, "sodium")
                sod_nums = _numbers_for_kind(nums_pred, 'mg') or _numbers_for_kind(nums_pred, '')
                mentioned = len(sod_nums) >= 1
                satisfied = all(n <= sod_limit for n in sod_nums) if mentioned and sod_limit is not None and len(sod_nums) >= 3 else mentioned and False
                checks.append(("multi_low_sodium", mentioned, satisfied))

            # Quick list (the 30-minute limit is fixed by the phrase being matched)
            if "under 30 minutes" in i_txt or "under 30 min" in i_txt:
                total_constraints += 1
                time_nums = _numbers_for_kind(nums_pred, 'min') or _numbers_for_kind(nums_pred, '')
                mentioned = len(time_nums) >= 1
                satisfied = all(n <= 30 for n in time_nums) if mentioned and len(time_nums) >= 3 else mentioned and False
                checks.append(("multi_quick", mentioned, satisfied))

        # Aggregate precision/recall contributions
        for _, mentioned, satisfied in checks:
            if mentioned:
                mentioned_constraints += 1
            if satisfied:
                satisfied_constraints += 1

    precision = (satisfied_constraints / mentioned_constraints) if mentioned_constraints else 0.0
    recall = (satisfied_constraints / total_constraints) if total_constraints else 0.0
    return {"precision": precision, "recall": recall}








def compute_factual_consistency(
    predictions: List[str],
    references: List[str],
    catalog: Optional[Dict[str, Set[str]]] = None,
    *,
    numeric_tolerance: float = 0.15,
    partial_credit_catalog_only: float = 0.25
) -> float:
    """
    Dataset-aware factual consistency, scored over "atoms" (individual checkable claims):
      - Numbers with units (cal, g, mg, min) are compared with the reference within
        a relative tolerance.
      - Keywords come dynamically from the dataset catalog (ingredients, tags/labels,
        titles); there is no hard-coded food list.
      - Ingredient claims like "using X and Y and Z" are checked against the reference.
      - A keyword or ingredient that is valid in the catalog but absent from the
        reference string earns `partial_credit_catalog_only`.
      - A leading "Title - ..." is checked against the reference and the catalog titles.

    Args:
        predictions: model outputs, paired positionally with `references`.
        references: gold outputs.
        catalog: optional, but it helps a lot. Expected keys:
            {
              "titles": set([...]),         # normalized recipe titles
              "ingredients": set([...]),    # normalized ingredient phrases
              "keywords": set([...])        # normalized tags/labels/extra keywords
            }
        numeric_tolerance: allowed relative deviation for numeric matches.
        partial_credit_catalog_only: credit for catalog-only keywords/ingredients.

    Returns:
        consistent_atoms / total_atoms over all pairs, or 0.0 when no atom was found.
    """

    total_atoms = 0.0
    consistent_atoms = 0.0

    # Dynamic keyword universe: union of the catalog's ingredients, keywords and titles
    kw_universe: Set[str] = set()
    if catalog:
        kw_universe |= catalog.get("ingredients", set())
        kw_universe |= catalog.get("keywords", set())
        # Titles can act like labels/keywords too (for exact mentions)
        kw_universe |= catalog.get("titles", set())

    # Plain substring test; it handles multiword phrases but does NOT enforce word
    # boundaries. NOTE(review): an empty string in the catalog would match every text.
    def _in(text: str, phrase: str) -> bool:
        #  (handles multiword)
        return phrase in text

    for pred, ref in zip(predictions, references):
        p = _norm(pred)
        r = _norm(ref)

        # 1) Numeric facts with units
        p_nums = _extract_numbers_with_units(p)
        r_nums = _extract_numbers_with_units(r)
        ref_by_unit = {"cal": [], "g": [], "mg": [], "min": [], "": []}
        for n,u in r_nums:
            ref_by_unit[u if u in ref_by_unit else ""].append(n)

        for n,u in p_nums:
            total_atoms += 1
            # Unit-less reference numbers are valid candidates for any predicted unit.
            candidates = ref_by_unit.get(u, []) + ref_by_unit.get("", [])
            if not candidates:
                # Nothing in the reference to compare against: half credit.
                consistent_atoms += 0.5
            else:
                # Relative tolerance; when the reference value is 0 the tolerance
                # is scaled by max(1.0, n) instead.
                if any(abs(n - rr) <= numeric_tolerance * (rr if rr != 0 else max(1.0, n)) for rr in candidates):
                    consistent_atoms += 1.0

        # 2) Dataset keyword/label consistency
        # Only keywords that actually appear in the prediction are evaluated.
        # This scans the whole keyword universe once per prediction.
        if kw_universe:
            for kw in kw_universe:
                if _in(p, kw):
                    total_atoms += 1
                    if _in(r, kw):
                        consistent_atoms += 1.0
                    else:
                        # Keyword appears in pred and is valid in dataset (catalog),
                        # but not in the reference string -> small partial credit.
                        consistent_atoms += partial_credit_catalog_only


        # 3) Ingredient list "using X, Y, (and Z) etc" taken from the prediction
        ings = _extract_ingredients(pred)
        if ings:
            for ing in ings:
                total_atoms += 1
                if ing in r:
                    consistent_atoms += 1.0
                else:
                    # Not in the reference: partial credit only if the catalog knows it.
                    if catalog and ing in catalog.get("ingredients", set()):
                        consistent_atoms += partial_credit_catalog_only

        # 4) Title plausibility as a factual marker: a leading run of 3-80 characters
        #    that ends at the first '-', ':' or '|'.
        mtitle = re.match(r'^([^\-\:\|]{3,80})\s*[\-\:\|]', pred.strip())
        if mtitle:
            title_norm = _norm(mtitle.group(1))
            total_atoms += 1
            if _in(r, title_norm):
                consistent_atoms += 1.0
            elif catalog and title_norm in catalog.get("titles", set()):
                consistent_atoms += 1.0
            else:
                # unknown title -> tiny partial credit (maybe paraphrase)
                consistent_atoms += 0.25

    return consistent_atoms / total_atoms if total_atoms else 0.0

    # Title: if the prediction starts with something like "Title - ...", the title is
    # checked against the reference text and against the dataset titles.







def compute_hallucination_rate(
    predictions: List[str],
    references: List[str],
    catalog: Optional[Dict[str, Set[str]]] = None,
    *,
    numeric_tolerance: float = 0.20
) -> float:
    """
    Fraction of predictions flagged as hallucinated.

    A prediction is flagged when the first applicable check below fires:

      H1) Numeric contradiction:
          For a unit the reference mentions (cal, g, mg, min, or no unit), the
          prediction gives a value that deviates by more than `numeric_tolerance`
          (relative) from ALL reference values of that unit.

      H2) Title claim implausible:
          The prediction starts with a title-like 'Title - ...' that is neither in
          the reference text nor in catalog['titles'] (if a catalog is provided).

      H3) Ingredient claim implausible:
          The prediction says 'using X and Y ...' and some ingredient is neither in
          the reference nor in catalog['ingredients'] (if a catalog is provided).

    An "unsupported keyword" check (H4) is NOT implemented. The previous version
    looped over catalog['keywords'] without ever flagging anything, which cost a
    full keyword scan per prediction; that no-op loop has been removed. To add H4,
    compare against a keyword list external to the catalog.

    Args:
        predictions: model outputs, paired positionally with `references`.
        references: gold outputs.
        catalog: optional dict with "titles" and "ingredients" sets.
        numeric_tolerance: allowed relative deviation for H1.

    Returns:
        flagged / len(predictions), or 0.0 for an empty prediction list.
    """
    known_titles = catalog.get("titles", set()) if catalog else set()
    known_ingredients = catalog.get("ingredients", set()) if catalog else set()

    def _contradicts_numbers(p: str, r: str) -> bool:
        # H1: group reference numbers by unit, then test every predicted number
        # whose unit the reference also uses.
        ref_by_unit = {"cal": [], "g": [], "mg": [], "min": [], "": []}
        for n, u in _extract_numbers_with_units(r):
            ref_by_unit[u if u in ref_by_unit else ""].append(n)
        for n, u in _extract_numbers_with_units(p):
            ref_vals = ref_by_unit.get(u, [])
            if not ref_vals:  # only enforce when the reference mentions the unit
                continue
            # When the reference value is 0 the tolerance is scaled by max(1.0, n).
            close = any(
                abs(n - rr) <= numeric_tolerance * (rr if rr != 0 else max(1.0, n))
                for rr in ref_vals
            )
            if not close:
                return True
        return False

    def _implausible_title(raw_pred: str, r: str) -> bool:
        # H2: a leading run of 3-80 characters ending at the first '-', ':' or '|'.
        mtitle = re.match(r'^([^\-\:\|]{3,80})\s*[\-\:\|]', raw_pred.strip())
        if not mtitle:
            return False
        title_norm = _norm(mtitle.group(1))
        return not (title_norm in r or title_norm in known_titles)

    def _implausible_ingredient(raw_pred: str, r: str) -> bool:
        # H3: every claimed ingredient must be in the reference or the catalog.
        return any(
            not (ing in r or ing in known_ingredients)
            for ing in _extract_ingredients(raw_pred)
        )

    hallucinated = 0
    for pred, ref in zip(predictions, references):
        p = _norm(pred)
        r = _norm(ref)
        # `or` short-circuits, so the checks run in H1, H2, H3 order and stop at
        # the first one that fires.
        if (
            _contradicts_numbers(p, r)
            or _implausible_title(pred, r)
            or _implausible_ingredient(pred, r)
        ):
            hallucinated += 1

    return hallucinated / len(predictions) if predictions else 0.0









def compute_fluency_human(predictions: list[str]) -> float:
    """
    Heuristic fluency proxy on a 1-5 scale, built from basic text-quality signals.
    Despite the name, no human rating is involved; the sub-scores are hand-set.

    Signals and weights:
       0.25  average sentence length (5-25 words scores best)
       0.25  presence of sentence punctuation (. ! ?)
       0.20  text starts with an uppercase ASCII letter
       0.20  repetition (penalised once a token occurs more than 3 times)
       0.10  balanced parentheses and an even number of double quotes

    With these sub-scores the weighted sum already lies between 2.45 and 4.5, so
    the final clip to [1, 5] is only a safeguard.

    Returns:
        Mean score across predictions, or 0.0 for an empty list (previously this
        produced NaN together with a NumPy RuntimeWarning).
    """
    from collections import Counter  # local import: only this function needs it

    if not predictions:
        return 0.0

    scores = []
    for text in predictions:
        t = text.strip()

        # 1. Sentence segmentation (an empty text counts as one sentence so the
        #    division below is always defined)
        sentences = [s.strip() for s in re.split(r'[.!?]+', t) if s.strip()]
        n_sent = len(sentences) or 1
        words = re.findall(r'\b\w+\b', t)

        # 2. Sentence length balance (too short or too long hurts)
        avg_len = len(words) / n_sent
        if avg_len < 5:
            len_score = 2.5
        elif avg_len > 25:
            len_score = 3.0
        else:
            len_score = 4.5

        # 3. Is punctuation present?
        punct_score = 4.5 if re.search(r'[.!?]', t) else 2.5

        # 4. Capitalization check
        caps_score = 4.5 if re.match(r'^[A-Z]', t) else 3.0

        # 5. Repetition penalty: grows linearly from 0 (most frequent token seen
        #    3 times or fewer) to 1 (8 times or more). Counter replaces the
        #    quadratic list.count() scan.
        max_rep = max(Counter(w.lower() for w in words).values(), default=0)
        rep_penalty = max(0, min(1, (max_rep - 3) / 5))
        rep_score = 4.5 - 3.0 * rep_penalty

        # 6. Balanced punctuation check
        balance_ok = (t.count('(') == t.count(')')) and (t.count('"') % 2 == 0)
        balance_score = 4.5 if balance_ok else 3.0

        # Aggregate (weighted)
        raw = 0.25 * len_score + 0.25 * punct_score + 0.2 * caps_score + 0.2 * rep_score + 0.1 * balance_score
        scores.append(raw)

    # Keep every score inside [1, 5]
    scores = np.clip(scores, 1.0, 5.0)
    return float(np.mean(scores))


















# Metrics: ROUGE and BLEU scorers loaded through `evaluate.load`; the `evaluate`
# function defined in this cell reads them as module-level globals.
rouge = load("rouge")
bleu = load("bleu")

# Container for inference results; it is replaced later when the saved JSON is loaded.
inferenced_data = {}

def generate_predictions(dataset, model, tokenizer, max_new_tokens=128):
    """Generate model outputs for a HF datasets split of dicts with keys: instruction, output.

    Args:
        dataset: iterable of examples exposing "instruction" and "output".
        model: model to generate with; it is switched to Unsloth inference mode.
        tokenizer: tokenizer matching `model`.
        max_new_tokens: generation budget per example.

    Returns:
        (predictions, references, instructions): three parallel lists.
    """
    predictions, references, instructions = [], [], []
    FastLanguageModel.for_inference(model)
    # Send inputs to the device the model actually lives on instead of assuming
    # "cuda"; fall back to the previous hard-coded value if the model exposes no
    # `.device` attribute.
    device = getattr(model, "device", "cuda")
    for example in tqdm(dataset):
        instruction = example["instruction"]
        reference   = example["output"]

        inputs = tokenizer(
            [prompt.format(instruction, "")],
            return_tensors="pt"
        ).to(device)

        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, use_cache=True)
        generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

        # Keep only the text after the first "### Assistant" marker. If the marker
        # is not present in the decoded text, use the full text as the prediction
        # instead of trying to split it.
        if "### Assistant" in generated_text:
            assistant_response = generated_text.split("### Assistant", 1)[-1].strip()
        else:
            assistant_response = generated_text.strip()

        predictions.append(assistant_response)
        references.append(reference)
        instructions.append(instruction)
    return predictions, references, instructions


def evaluate(
    predictions,
    references,
    metrics: list[str],
    instructions=None,
    catalog=None,
):
    """
    Score predictions against references with the selected metrics.

    Args:
        predictions: model outputs.
        references: gold outputs, parallel to `predictions`.
        metrics: any of "rouge", "bleu", "constraint_satisfaction",
            "factual_consistency", "hallucination_rate", "fluency_human".
        instructions: required when "constraint_satisfaction" is requested.
        catalog: output of build_catalog_from_df, used by the dataset-aware
            factual-consistency and hallucination metrics.

    Returns:
        Dict of results. "constraint_satisfaction" contributes two entries,
        "constraint_precision" and "constraint_recall".

    Raises:
        ValueError: for an unknown metric name, or when constraint satisfaction
            is requested without instructions.
    """
    supported = {
        "rouge",
        "bleu",
        "constraint_satisfaction",
        "factual_consistency",
        "hallucination_rate",
        "fluency_human",
    }
    unknown = set(metrics) - supported
    if unknown:
        raise ValueError(f"Unknown metrics requested: {sorted(unknown)}")

    def wanted(name):
        return name in metrics

    scores = {}

    if wanted("rouge"):
        scores["rouge"] = rouge.compute(predictions=predictions, references=references)

    if wanted("bleu"):
        # BLEU takes a list of references per prediction.
        wrapped_refs = [[ref] for ref in references]
        scores["bleu"] = bleu.compute(predictions=predictions, references=wrapped_refs)

    if wanted("constraint_satisfaction"):
        if instructions is None:
            raise ValueError("`instructions` are required for constraint_satisfaction.")
        outcome = compute_constraint_satisfaction(predictions, instructions)
        scores["constraint_precision"] = outcome["precision"]
        scores["constraint_recall"] = outcome["recall"]

    if wanted("factual_consistency"):
        scores["factual_consistency"] = compute_factual_consistency(
            predictions, references, catalog=catalog
        )

    if wanted("hallucination_rate"):
        scores["hallucination_rate"] = compute_hallucination_rate(
            predictions, references, catalog=catalog
        )

    if wanted("fluency_human"):
        scores["fluency_human"] = compute_fluency_human(predictions)

    return scores


def store(data, path):
    """Write `data` to `path` as JSON, overwriting any existing file."""
    sink = open(path, "w")
    with sink:
        json.dump(data, sink)


# We run base inference

# EVALUATE FROM SAVED JSON + REBUILD INSTRUCTIONS IF NEEDED

import json
import pandas as pd

# 1) Load the already-computed predictions/references.
#    Each of "val" and "test" is unpacked as a (predictions, references) pair.
with open("base-inference.json", "r") as f:
    inferenced_data = json.load(f)

val_predictions,  val_references  = inferenced_data["val"]
test_predictions, test_references = inferenced_data["test"]

# Try to load instructions if they were saved; .get() yields None when the key is absent
val_instructions  = inferenced_data.get("val_instructions")
test_instructions = inferenced_data.get("test_instructions")

# 2) If instructions are missing, rebuild them by reproducing the same HF split
#    used originally (seed=3407, 80/20, then 50/50 on the 20%).
#    NOTE: this rebinds random_state, split_dataset, temp_dataset, val_test_split,
#    val_dataset and test_dataset, replacing the values set by the earlier
#    data-preparation cell (here the datasets have no "text" column).
if val_instructions is None or test_instructions is None:
    try:
        from datasets import load_dataset
        random_state = 3407
        ds = load_dataset("azimidokht/recipe-recom", split="train")
        # Reproduce our original split exactly
        split_dataset = ds.train_test_split(test_size=0.2, shuffle=True, seed=random_state)
        temp_dataset  = split_dataset["test"]
        val_test_split = temp_dataset.train_test_split(test_size=0.5, shuffle=True, seed=random_state)
        val_dataset  = val_test_split["train"]
        test_dataset = val_test_split["test"]

        # Rebuild instruction lists in the SAME order as the split
        val_instructions  = [ex["instruction"] for ex in val_dataset]
        test_instructions = [ex["instruction"] for ex in test_dataset]
        print("‚úÖ Rebuilt instructions from the HF dataset; constraint_satisfaction will be computed.")
    except Exception as e:
        # Best effort: any failure (import, download, split) is reported and the
        # instruction lists keep whatever values they had before the attempt.
        print("‚ö†Ô∏è Could not rebuild instructions automatically:", str(e))
        print("Proceeding without constraint_satisfaction for any split that lacks instructions.")

# 3) Build the catalog from our uploaded HUMMUS dataset.
#    The two ingredient columns are read as strings; build_catalog_from_df parses
#    the list-like text itself.
df = pd.read_csv("pp_recipes.csv", low_memory=True,
                 dtype={"ingredient_food_kg_urls":"string",
                        "ingredient_food_kg_names":"string"})
catalog = build_catalog_from_df(df)

# 4) Choose metrics (include constraint_satisfaction only if we have instructions)
val_metrics  = ["rouge","bleu","factual_consistency","hallucination_rate","fluency_human"]
test_metrics = ["rouge","bleu","factual_consistency","hallucination_rate","fluency_human"]
if val_instructions is not None:
    val_metrics.insert(2, "constraint_satisfaction")
if test_instructions is not None:
    test_metrics.insert(2, "constraint_satisfaction")

# 5) Evaluate
val_results = evaluate(
    val_predictions, val_references,
    metrics=val_metrics,
    instructions=val_instructions,
    catalog=catalog,
)
test_results = evaluate(
    test_predictions, test_references,
    metrics=test_metrics,
    instructions=test_instructions,
    catalog=catalog,
)

# 6) Report
print("Validation results:", val_results)
print("Test results:", test_results)
if "constraint_recall" in val_results or "constraint_recall" in test_results:
    print("‚úÖ constraint_satisfaction included for the splits that had instructions.")
else:
    print("‚ÑπÔ∏è constraint_satisfaction skipped (no instructions available).")


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

recipe_recom.parquet:   0%|          | 0.00/214k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5149 [00:00<?, ? examples/s]

✅ Rebuilt instructions from the HF dataset; constraint_satisfaction will be computed.
Validation results: {'rouge': {'rouge1': np.float64(0.0494671003072754), 'rouge2': np.float64(0.0004326626828482697), 'rougeL': np.float64(0.045488738530021194), 'rougeLsum': np.float64(0.04706586888610807)}, 'bleu': {'bleu': 0.0, 'precisions': [0.04628993819929717, 0.0011544963509668906, 8.419989054014229e-05, 0.0], 'brevity_penalty': 1.0, 'length_ratio': 2.94095984794488, 'translation_length': 24757, 'reference_length': 8418}, 'constraint_precision': 0.48097826086956524, 'constraint_recall': 0.2565217391304348, 'factual_consistency': 0.3259123249942621, 'hallucination_rate': 0.516504854368932, 'fluency_human': 3.95331067961165}
Test results: {'rouge': {'rouge1': np.float64(0.046089644166975675), 'rouge2': np.float64(0.0003719212620090258), 'rougeL': np.float64(0.04304621880270281), 'rougeLsum': np.float64(0.044344713848289745)}, 'bleu': {'bleu': 0.0, 'precisions': [0.046011240667729214, 0.00124212

<a name="Train"></a>
### Train the model
Now let's train our model. We do 60 steps to speed things up, but for a full run you can set `num_train_epochs=1` and turn off the step limit with `max_steps=None`. We also support TRL's `DPOTrainer`!

In [7]:
from trl import SFTConfig, SFTTrainer

# Optimisation settings for the short demo run.
sft_config = SFTConfig(
    # 2 sequences per device x 4 accumulation steps per optimizer update.
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    warmup_steps = 5,
    # num_train_epochs = 1, # Set this for 1 full training run.
    max_steps = 60,
    learning_rate = 2e-4,
    logging_steps = 1,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 3407,
    output_dir = "outputs",
    report_to = "none", # Use this for WandB etc
)

# Supervised fine-tuning on the "text" column of the training split.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    packing = False, # Can make training 5x faster for short sequences.
    args = sft_config,
)

Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/4119 [00:00<?, ? examples/s]

In [8]:
# @title Show current memory stats
BYTES_PER_GB = 1024 ** 3

gpu_stats = torch.cuda.get_device_properties(0)
# Peak reserved memory before training; the final-stats cell subtracts this baseline.
start_gpu_memory = round(torch.cuda.max_memory_reserved() / BYTES_PER_GB, 3)
# Total memory of device 0, in the same unit.
max_memory = round(gpu_stats.total_memory / BYTES_PER_GB, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.741 GB.
6.893 GB of memory reserved.


In [10]:
# Run fine-tuning. The returned object is kept because the final-stats cell
# reads trainer_stats.metrics['train_runtime'] from it.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 4,119 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 20,971,520 of 8,051,232,768 (0.26% trained)


Step,Training Loss
1,3.5994
2,3.4859
3,3.8115
4,3.5366
5,3.3291
6,3.0498
7,2.5345
8,1.8928
9,1.9069
10,1.7423


In [11]:
# @title Show final memory and time stats
# Peak reserved memory after training, and the share attributable to training
# (peak minus the baseline recorded in start_gpu_memory before training).
used_memory = round(torch.cuda.max_memory_reserved() / 2**30, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

# Wall-clock training time as reported by the trainer.
train_seconds = trainer_stats.metrics['train_runtime']
train_minutes = round(train_seconds / 60, 2)

print(f"{train_seconds} seconds used for training.")
print(f"{train_minutes} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

187.91 seconds used for training.
3.13 minutes used for training.
Peak reserved memory = 7.131 GB.
Peak reserved memory for training = 0.238 GB.
Peak reserved memory % of max memory = 48.375 %.
Peak reserved memory for training % of max memory = 1.615 %.


<a name="Inference"></a>
### Inference
Let's run the model! You can change the instruction and input - leave the output blank!



In [23]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Fill the prompt template: the instruction goes first, the response slot stays
# empty so the model generates it.
demo_prompt = prompt.format(
    "Suggest  a  healthy  pasta  recipe  under  400  calories.", # instruction
    "", # output - leave this blank for generation!
)
inputs = tokenizer([demo_prompt], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
# Last expression of the cell: the decoded text is shown as the cell output.
tokenizer.batch_decode(outputs)

['<|begin_of_text|>\n### Human:\nSuggest  a  healthy  pasta  recipe  under  400  calories.\n\n### Assistant\nSpinach & Feta Pasta - 390.7 calories, 14.6g protein, ready in 25 minutes.<|end_of_text|>']

## Evaluation - Fine Tuned Model

In [13]:

import json
import pandas as pd

# 1) Load saved predictions & references.
# FIXME(review): this opens "base-inference.json", the same file the base-model
# evaluation cell reads, so the scores reported here presumably describe the
# base model's predictions rather than the fine-tuned model's. Regenerate
# predictions with the trained model, save them to their own file, and load
# that file here.
with open("base-inference.json", "r") as f:
    inferenced_data = json.load(f)

val_predictions,  val_references  = inferenced_data["val"]
test_predictions, test_references = inferenced_data["test"]

# Instructions are optional in the JSON; .get() yields None when they are absent.
val_instructions  = inferenced_data.get("val_instructions")
test_instructions = inferenced_data.get("test_instructions")

# 2) If instructions are missing, rebuild them by reproducing the original HF split
if val_instructions is None or test_instructions is None:
    try:
        from datasets import load_dataset
        random_state = 3407
        ds = load_dataset("azimidokht/recipe-recom", split="train")
        split_dataset = ds.train_test_split(test_size=0.2, shuffle=True, seed=random_state)
        temp_dataset  = split_dataset["test"]
        val_test_split = temp_dataset.train_test_split(test_size=0.5, shuffle=True, seed=random_state)
        val_dataset  = val_test_split["train"]
        test_dataset = val_test_split["test"]

        # Rebuild instruction lists in the SAME order.
        # Only fill in the split(s) that are missing so instructions saved in the
        # JSON are never overwritten by the rebuilt ones.
        if val_instructions is None:
            val_instructions = [ex["instruction"] for ex in val_dataset]
        if test_instructions is None:
            test_instructions = [ex["instruction"] for ex in test_dataset]
        print("✅ Rebuilt instructions from the HF dataset; constraint_satisfaction will be computed.")
    except Exception as e:
        # Deliberate best effort: a failed rebuild only drops
        # constraint_satisfaction, the remaining metrics still run.
        print("⚠️ Could not rebuild instructions automatically:", str(e))
        print("Proceeding without constraint_satisfaction for any split that lacks instructions.")

# 3) Build catalog from our HUMMUS CSV (pp_recipes.csv)
df = pd.read_csv("pp_recipes.csv", low_memory=True,
                 dtype={"ingredient_food_kg_urls":"string",
                        "ingredient_food_kg_names":"string"})
catalog = build_catalog_from_df(df)

# 4) Choose metrics (include constraint_satisfaction only if we have instructions)
val_metrics  = ["rouge","bleu","factual_consistency","hallucination_rate","fluency_human"]
test_metrics = ["rouge","bleu","factual_consistency","hallucination_rate","fluency_human"]
if val_instructions is not None:
    val_metrics.insert(2, "constraint_satisfaction")
if test_instructions is not None:
    test_metrics.insert(2, "constraint_satisfaction")

# 5) Evaluate (same suite as base model)
val_results_tuned = evaluate(
    val_predictions, val_references,
    metrics=val_metrics,
    instructions=val_instructions,
    catalog=catalog,
)
test_results_tuned = evaluate(
    test_predictions, test_references,
    metrics=test_metrics,
    instructions=test_instructions,
    catalog=catalog,
)

# 6) Report
print("Validation results:", val_results_tuned)
print("Test results:", test_results_tuned)
# evaluate() adds "constraint_precision" only when constraint_satisfaction was computed.
if ("constraint_precision" in val_results_tuned) or ("constraint_precision" in test_results_tuned):
    print("✅ constraint_satisfaction included for the splits that had instructions.")
else:
    print("ℹ️ constraint_satisfaction skipped (no instructions available).")



✅ Rebuilt instructions from the HF dataset; constraint_satisfaction will be computed.
Validation results: {'rouge': {'rouge1': np.float64(0.0494671003072754), 'rouge2': np.float64(0.0004326626828482697), 'rougeL': np.float64(0.045488738530021194), 'rougeLsum': np.float64(0.04706586888610807)}, 'bleu': {'bleu': 0.0, 'precisions': [0.04628993819929717, 0.0011544963509668906, 8.419989054014229e-05, 0.0], 'brevity_penalty': 1.0, 'length_ratio': 2.94095984794488, 'translation_length': 24757, 'reference_length': 8418}, 'constraint_precision': 0.48097826086956524, 'constraint_recall': 0.2565217391304348, 'factual_consistency': 0.3223613209583423, 'hallucination_rate': 0.516504854368932, 'fluency_human': 3.95331067961165}
Test results: {'rouge': {'rouge1': np.float64(0.046089644166975675), 'rouge2': np.float64(0.0003719212620090258), 'rougeL': np.float64(0.04304621880270281), 'rougeLsum': np.float64(0.044344713848289745)}, 'bleu': {'bleu': 0.0, 'precisions': [0.046011240667729214, 0.00124212

 You can also use a `TextStreamer` for continuous inference - so you can see the generation token by token, instead of waiting the whole time!

In [17]:
# prompt = Copied from above
from transformers import TextStreamer

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Fill the prompt template: the instruction goes first, the response slot stays
# empty so the model generates it.
demo_prompt = prompt.format(
    "Suggest  a  healthy  pasta  recipe  under  400  calories.", # instruction
    "", # output - leave this blank for generation!
)
inputs = tokenizer([demo_prompt], return_tensors = "pt").to("cuda")

# The streamer prints tokens as they are generated, so the return value is discarded.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

<|begin_of_text|>
### Human:
Suggest  a  healthy  pasta  recipe  under  400  calories.

### Assistant
Pasta with Chicken and Artichokes - 380.8 calories, 37.0g protein, ready in 25 minutes.<|end_of_text|>


<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

In [18]:
# Write the model and the tokenizer to the local "lora_model" directory.
# The tokenizer call is the last expression, so its return value is shown as the cell output.
model.save_pretrained("lora_model")  # Local saving
tokenizer.save_pretrained("lora_model")
# Online alternative (disabled). Supply the token at run time rather than
# committing it to the notebook.
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving
# tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving

('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/tokenizer.json')

Now if you want to load the LoRA adapters we just saved for inference, set `False` to `True`:

In [19]:
# Disabled by default: set the condition to True to reload the adapters saved
# in "lora_model" instead of reusing the model already in memory.
if False:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# `prompt` is the template defined earlier in the notebook; it must already be in scope.
from transformers import TextStreamer

demo_prompt = prompt.format(
    "Suggest  a  healthy  pasta  recipe  under  400  calories.", # instruction
    "", # output - leave this blank for generation!
)
inputs = tokenizer([demo_prompt], return_tensors = "pt").to("cuda")

# The streamer prints tokens as they are generated, so the return value is discarded.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

<|begin_of_text|>
### Human:
Suggest  a  healthy  pasta  recipe  under  400  calories.

### Assistant
Pasta With Chicken, Bacon, and Green Beans - 397.7 calories, 27.2g protein, ready in 35 minutes.<|end_of_text|>


You can also use Hugging Face's `AutoPeftModelForCausalLM`. Only use this if you do not have `unsloth` installed. It can be hopelessly slow, since `4bit` model downloading is not supported, and Unsloth's **inference is 2x faster**.

In [24]:
# Disabled by default: alternative loading path through PEFT/Transformers
# instead of Unsloth. Set the condition to True to use it.
if False:

    from peft import AutoPeftModelForCausalLM
    from transformers import AutoTokenizer
    # Load from the local "lora_model" directory written by the save cell.
    model = AutoPeftModelForCausalLM.from_pretrained(
        "lora_model",
        load_in_4bit = load_in_4bit,
    )
    tokenizer = AutoTokenizer.from_pretrained("lora_model")

### Saving to float16 for VLLM

We also support saving to `float16` directly. Select `merged_16bit` for float16 or `merged_4bit` for int4. We also allow `lora` adapters as a fallback. Use `push_to_hub_merged` to upload to your Hugging Face account! You can go to https://huggingface.co/settings/tokens for your personal tokens.

In [21]:
# Every statement in this cell is disabled with `if False`; change the one you
# need to `if True` to run it. The push_to_hub variants take a Hugging Face
# token: the empty string is a placeholder, and the real token should be
# supplied at run time rather than committed to the notebook.

# Merge to 16bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_16bit", token = "")

# Merge to 4bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_4bit", token = "")

# Just LoRA adapters
if False:
    model.save_pretrained("model")
    tokenizer.save_pretrained("model")
if False:
    model.push_to_hub("hf/model", token = "")
    tokenizer.push_to_hub("hf/model", token = "")


### GGUF / llama.cpp Conversion
To save to `GGUF` / `llama.cpp`, we support it natively now! We clone `llama.cpp` and we default save it to `q8_0`. We allow all methods like `q4_k_m`. Use `save_pretrained_gguf` for local saving and `push_to_hub_gguf` for uploading to HF.

Some supported quant methods (full list on our [Wiki page](https://github.com/unslothai/unsloth/wiki#gguf-quantization-options)):
* `q8_0` - Fast conversion. High resource use, but generally acceptable.
* `q4_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K.
* `q5_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K.

[**NEW**] To finetune and auto export to Ollama, try our [Ollama notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_(8B)-Ollama.ipynb)

In [25]:
# Every statement in this cell is disabled with `if False`; change the one you
# need to `if True` to run it. `token = ""` is a placeholder: supply the real
# token at run time rather than committing it to the notebook.

# Save to 8bit Q8_0
if False: model.save_pretrained_gguf("model", tokenizer,)
# Remember to go to https://huggingface.co/settings/tokens for a token!
# And change hf to your username!
if False: model.push_to_hub_gguf("hf/model", tokenizer, token = "")

# Save to 16bit GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "f16", token = "")

# Save to q4_k_m GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "q4_k_m", token = "")

# Save to multiple GGUF options - much faster if we want multiple!
if False:
    model.push_to_hub_gguf(
        "hf/model", # Change hf to your username!
        tokenizer,
        # A list of methods is passed here, where the calls above pass a single name.
        quantization_method = ["q4_k_m", "q8_0", "q5_k_m",],
        token = "",
    )