In [None]:
import huggingface_hub
hf_token = '...' # put your User Access Tokens here
# ابتدا login کنید
huggingface_hub.login(token=hf_token)

# سپس وضعیت ورود را بررسی کنید
!hf auth whoami

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[1muser: [0m AM-Nateghi


In [73]:
# Load model directly
import torch
import numpy as np
import pandas as pd
import datasets
import evaluate
import bitsandbytes as bnb
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    pipeline,
    Gemma3ForCausalLM,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    DataCollatorWithPadding,
    DataCollatorForLanguageModeling,
    EarlyStoppingCallback,
)

In [74]:
# QLoRa Configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Enables 4-bit quantization
    bnb_4bit_use_double_quant=True,  # Use double quantization for potentially higher accuracy (optional)
    bnb_4bit_quant_type="nf4",  # Quantization type (specifics depend on hardware and library, now, our library is QLoRa)
    bnb_4bit_compute_dtype=torch.bfloat16  # Compute dtype for improved efficiency (optional)
)

In [75]:
ckpt = "google/gemma-3-1b-pt"
tokenizer = AutoTokenizer.from_pretrained(ckpt)
model = AutoModelForCausalLM.from_pretrained(
    ckpt,
    dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=bnb_config,
    attn_implementation="eager",
)

model.config.use_cache = False  # برای gradient checkpointing

In [76]:
prompt = "احساس این جمله را بیان کنید: امروز هوای تهران آفتابی است و من بسیار خوشحال هستم زیرا با دوستانم به پارک می‌روم."
model_inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
inp_len = model_inputs.input_ids.shape[1]

with torch.inference_mode():
    outputs = model.generate(
        **model_inputs,
        max_new_tokens=64,
        do_sample=True,
        top_p=0.8,
        temperature=0.45,
        repetition_penalty=1.2,
        pad_token_id=tokenizer.eos_token_id,
    )
    outputs = outputs[0][inp_len:]

decoded_output = tokenizer.decode(outputs, skip_special_tokens=True)
print(decoded_output)



در حال حاضر، ما در یک مکان هستیم که می خواهیم از آن لذت ببریم. اما آیا شما هم مثل من فکر می‌کنید؟ اگر چه اکنون برایتان سخت‌تر خواهد بود، اما پس‌ازاینکه یاد گرفتید چگونه از فضای باز لذت ببرند، می‌توانید هرگز ن


# Fine Tune The Gemma 3 1B parameters Model

In [77]:
dataset = datasets.load_dataset("knkarthick/dialogsum")


def preporcess_function(examples):
    start_prompt = "Summarize the following dialogue:\n\n"
    end_prompt = "\n\nSummary: "
    inputs = [start_prompt + ex + end_prompt for ex in examples["dialogue"]]
    model_inputs = tokenizer(
        inputs,
        max_length=512,
        padding="max_length",
        truncation=True,
    )

    # Setup the tokenizer for targets
    # with tokenizer.as_target_tokenizer():
    labels = tokenizer(
        examples["summary"],
        max_length=128,
        padding="max_length",
        truncation=True,
    )
    label_ids = []
    for label in labels["input_ids"]:
        label = [
            (l if l != tokenizer.pad_token_id else -100) for l in label
        ]  # we want to ignore pad tokens in the loss
        label_ids.append(label)
    model_inputs["labels"] = label_ids
    
    return model_inputs

In [78]:
tokenized_datasets = dataset.map(preporcess_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(
    ["dialogue", "summary", "id", "topic"]
)
print("Columns: ", tokenized_datasets["train"].column_names)
tokenized_datasets.set_format("torch")
print("\n\nSample data: ", tokenized_datasets["train"][0])

Columns:  ['input_ids', 'attention_mask', 'labels']


Sample data:  {'input_ids': tensor([     0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0

In [79]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
metric = evaluate.combine(["accuracy", "f1", "precision", "recall"])
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)  # Convert probabilities to predicted labels
    return metric.compute(predictions=predictions, references=labels)

In [80]:
model = prepare_model_for_kbit_training(model)

In [81]:
def find_linear_names(model):
    """
    This function identifies all linear layer names within a model that use 4-bit quantization.
    Args:
        model (torch.nn.Module): The PyTorch model to inspect.
    Returns:
        list: A list containing the names of all identified linear layers with 4-bit quantization.
    """
    cls = bnb.nn.Linear4bit  

    # Set to store identified layer names
    lora_module_names = set()

    # Iterate through named modules in the model
    for name, module in model.named_modules():
        # Check if the current module is an instance of the 4-bit linear layer class
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])

        # Special case: remove 'lm_head' if present
        if 'lm_head' in lora_module_names: 
            lora_module_names.remove('lm_head')
    return list(lora_module_names)

# Example usage:
modules = find_linear_names(model)
print(modules)

['up_proj', 'gate_proj', 'o_proj', 'k_proj', 'down_proj', 'v_proj', 'q_proj']


In [82]:
# Lora Settings
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=32,
    lora_alpha=32,
    lora_dropout=.05,
    bias="none",
    target_modules=['v_proj', 'q_proj']
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 2,981,888 || all params: 1,002,867,840 || trainable%: 0.2973


In [84]:
output_dir = "./gemma3-lora-4bit"

training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=2e-4,
    num_train_epochs=3,
    per_device_train_batch_size=4,  # بسته به RAM GPU
    per_device_eval_batch_size=4,
    warmup_ratio=0.05,  # 10% از steps برای warmup
    weight_decay=0.01,
    logging_steps=10,
    eval_strategy="steps",
    save_strategy="steps",
    eval_steps=25,
    save_steps=25,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    # fp16=torch.cuda.is_available()
    gradient_checkpointing=True,  # برای کاهش مصرف حافظه
)

callbacks = [
    EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0)
]

In [85]:
torch.cuda.empty_cache()

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    callbacks=callbacks,
)

trainer.train()

OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU 0 has a total capacity of 11.59 GiB of which 577.12 MiB is free. Including non-PyTorch memory, this process has 10.78 GiB memory in use. Of the allocated memory 8.31 GiB is allocated by PyTorch, and 2.24 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)