### Installation

In [1]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
    !pip install --no-deps unsloth

### Unsloth

In [3]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
     token ="YOUR-API-KEY")

==((====))==  Unsloth 2025.8.5: Fast Llama patching. Transformers: 4.55.0.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [4]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2025.8.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [5]:
from datasets import load_dataset


dataset = load_dataset("Ekinezya2025/ilac-soru-cevap")


print(dataset["train"][0])


Generating train split:   0%|          | 0/310366 [00:00<?, ? examples/s]

{'ilac_adi': '% 0.4 LİDODEKS % 5 DEKSTROZ İÇİNDE İ.V. İNFÜZYON İÇİN ÇÖZELTİ', 'question': '% 0.4 LİDODEKS % 5 DEKSTROZ İÇİNDE İ.V. İNFÜZYON İÇİN ÇÖZELTİ ilacının etkin maddesi nedir?', 'answer': 'Her 100 ml çözelti 0,4 g lidokain hidroklorür içerir.'}


In [6]:
train_dataset = dataset["train"]


subset_dataset = train_dataset.select(range(50_000))

print(subset_dataset)

Dataset({
    features: ['ilac_adi', 'question', 'answer'],
    num_rows: 50000
})


In [7]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    instructions = examples["question"]
    inputs       = [""] * len(instructions)  # boş input
    outputs      = examples["answer"]

    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts }


dataset = subset_dataset.map(formatting_prompts_func, batched = True,)

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

<a name="Train"></a>
### Train the model
Now let's use Huggingface TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). We do 60 steps to speed things up, but you can set `num_train_epochs=1` for a full run, and turn off `max_steps=None`. We also support TRL's `DPOTrainer`!

In [14]:
from trl import SFTConfig, SFTTrainer


trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=1,
    packing=False,
    args=SFTConfig(
        per_device_train_batch_size=8,
        gradient_accumulation_steps=8,
        warmup_ratio=0.1,
        num_train_epochs=1,

        learning_rate=1e-4,
        logging_steps=1,
        optim="paged_adamw_32bit",
        weight_decay=0.0,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        save_steps=200,
        report_to="tensorboard",
        push_to_hub=True,
        hub_token="YOUR-API-KEY",
        hub_model_id="Ekinezya2025/EczaciLlamaModel",
        hub_strategy="end"
    ),
)

In [None]:
# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA A100-SXM4-40GB. Max memory = 39.557 GB.
7.135 GB of memory reserved.


In [10]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1 | Total steps = 782
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 8 x 1) = 64
 "-____-"     Trainable parameters = 41,943,040 of 8,072,204,288 (0.52% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,2.3634
2,2.0738
3,2.2525
4,2.1672
5,2.3494
6,2.4478
7,2.1942
8,2.316
9,2.2234
10,2.289


Step,Training Loss
1,2.3634
2,2.0738
3,2.2525
4,2.1672
5,2.3494
6,2.4478
7,2.1942
8,2.316
9,2.2234
10,2.289


In [11]:
trainer.push_to_hub()

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...t/outputs/adapter_model.safetensors:   0%|          |  558kB /  168MB            

  /content/outputs/tokenizer.json       : 100%|##########| 17.2MB / 17.2MB            

  ...ents.1755159124.261d75fac0de.1372.0:   2%|1         |   227B / 13.8kB            

  ...ents.1755159285.261d75fac0de.1372.1:   2%|1         |   244B / 14.8kB            

  ...nts.1755160377.261d75fac0de.16489.0:   2%|1         | 2.82kB /  171kB            

  /content/outputs/training_args.bin    :   2%|1         |  96.0B / 5.82kB            

CommitInfo(commit_url='https://huggingface.co/Ekinezya2025/EczaciLlamaModel/commit/b82d424e9f3437a06bc93336b09ca38738dda155', commit_message='End of training', commit_description='', oid='b82d424e9f3437a06bc93336b09ca38738dda155', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Ekinezya2025/EczaciLlamaModel', endpoint='https://huggingface.co', repo_type='model', repo_id='Ekinezya2025/EczaciLlamaModel'), pr_revision=None, pr_num=None)

In [None]:
# @title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

441.6894 seconds used for training.
7.36 minutes used for training.
Peak reserved memory = 32.078 GB.
Peak reserved memory for training = 24.943 GB.
Peak reserved memory % of max memory = 81.093 %.
Peak reserved memory for training % of max memory = 63.056 %.
