### Installation

First, let's install the required packages:


In [1]:
%%capture
# %%capture suppresses the (long) pip output of this cell.
import os
# Colab is detected by searching the concatenated environment-variable names for "COLAB_".
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth

# Upgrade timm (without its dependencies) for Gemma 3N.
# NOTE(review): this comment used to say "latest transformers", but only timm is
# upgraded here — add an explicit transformers upgrade if one is actually needed.
!pip install --no-deps --upgrade timm


### Load Gemma Model

Now we'll load the base Gemma model:


In [1]:
from unsloth import FastModel
import torch

# Load the "unsloth/gemma-3n-E2B-it" checkpoint together with its tokenizer.
model, tokenizer = FastModel.from_pretrained(
    model_name = "unsloth/gemma-3n-E2B-it",
    max_seq_length = 2048,    # longer context for detailed conversations
    dtype = None,             # None lets the dtype be auto-detected
    full_finetuning = False,  # LoRA adapters are attached in a later cell instead
)


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.8.1: Fast Gemma3N patching. Transformers: 4.53.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.31.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Gemma3N does not support SDPA - switching to eager!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

### Add LoRA Adapters

We'll add LoRA adapters to efficiently finetune the model while keeping memory usage low:


In [2]:
# Wrap the loaded model with LoRA adapters.
model = FastModel.get_peft_model(
    model,

    # LoRA hyper-parameters
    r = 16,
    lora_alpha = 16,
    # Non-zero dropout: Unsloth warns that only dropout = 0 gets fast patching,
    # so the LoRA matrices are left unpatched at a performance cost.
    lora_dropout = 0.1,
    bias = "none",
    random_state = 3407,

    # Which parts of the network receive adapters
    finetune_vision_layers = False,
    finetune_language_layers = True,
    finetune_attention_modules = True,
    finetune_mlp_modules = True,
)


Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.1.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.


Unsloth: Making `model.base_model.model.model.language_model` require gradients


### Load and Prepare Our Dataset

Now we'll load our trauma assessment dataset and convert it to the format Gemma expects:


In [3]:
from datasets import Dataset
import json

# Load the pre-generated dataset from a JSON Lines file in the working directory.
# Each record is expected to carry a 'messages' list (read by
# transform_to_gemma_format in this cell).
dataset = Dataset.from_json("trauma_assessment_training.jsonl")

# Transform the data to match Gemma's format
def transform_to_gemma_format(example):
    """Re-shape one example's chat messages into Gemma-style conversation turns.

    Every entry of ``example['messages']`` becomes a dict whose role is
    ``"user"`` when the source role is ``"user"`` and ``"model"`` for any
    other role, and whose content is a one-element list holding a text part.

    Returns:
        A dict with a single key, ``"conversations"``, holding the new turns.
    """
    def _as_turn(message):
        # Anything that is not a user turn is attributed to the model.
        speaker = "user" if message['role'] == "user" else "model"
        return {
            "role": speaker,
            "content": [{"type": "text", "text": message['content']}],
        }

    return {"conversations": [_as_turn(message) for message in example['messages']]}

# Apply the transformation to every example (one example per call, not batched).
# NOTE(review): the resulting 'conversations' column is not read by the later
# formatting cell, which uses the original 'messages' column instead — confirm
# which of the two is intended.
dataset = dataset.map(transform_to_gemma_format)
print(f"Loaded and transformed {len(dataset)} training examples")

Loaded and transformed 1440 training examples


### Prepare Chat Template

We'll set up the Gemma chat template and format our conversations accordingly:


In [4]:
from unsloth.chat_templates import get_chat_template
# Replace the tokenizer with the one returned by get_chat_template for the
# "gemma-3" chat template.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "gemma-3",
)

def formatting_prompts_func(examples):
    """Render a batch of conversations to training text with the chat template.

    Reads the ``"messages"`` column of the batch. Each conversation is rendered
    untokenized, without a generation prompt, and with a leading ``<bos>``
    removed. A conversation that raises while being rendered is reported on
    stdout and yields ``None`` so it can be filtered out afterwards.

    Returns:
        ``{"text": [...]}`` with one entry (string or ``None``) per conversation.
    """
    rendered = []
    for conversation in examples["messages"]:
        try:
            # One conversation (a list of messages) is rendered per call.
            prompt = tokenizer.apply_chat_template(
                conversation,
                tokenize=False,
                add_generation_prompt=False,
            )
            rendered.append(prompt.removeprefix('<bos>'))
        except Exception as e:
            print(f"Error formatting conversation: {e}")
            # Keep one entry per input row; None marks an example that failed.
            rendered.append(None)
    return {"text": rendered}

# Render every conversation into a "text" column (batched), then drop the rows
# that formatting_prompts_func marked with None because they failed to format.
dataset = dataset.map(formatting_prompts_func, batched=True).filter(lambda x: x['text'] is not None)
# The count printed here is the number of examples that survived the filter.
print(f"Loaded and transformed {len(dataset)} training examples")

Loaded and transformed 1438 training examples


Only 2 of the 1440 conversations failed to format, so we discard them and continue with the remaining 1438.

### Configure Training

Set up the training configuration with appropriate parameters for our use case:


In [5]:
from trl import SFTTrainer, SFTConfig
import torch._dynamo
# Raise the torch._dynamo cache size limit to 64.
torch._dynamo.config.cache_size_limit = 64  # or higher

# Supervised fine-tuning on the pre-rendered "text" column of `dataset`.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    args = SFTConfig(
        dataset_text_field = "text",
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 4,  # 1 x 4 = total batch size of 4 (see the training log)
        warmup_steps = 10,
        num_train_epochs = 3,  # Multiple epochs for better learning
        learning_rate = 2e-4,
        logging_steps = 1,  # log the training loss at every step
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        report_to = "none",  # no experiment tracker
    ),
)

# Train only on assistant responses
# The two strings below are the turn headers that open user and model turns in
# the rendered text; the trainer must be re-wrapped after it has been built.
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<start_of_turn>user\n",
    response_part = "<start_of_turn>model\n",
)


### Train the Model

Let's start the training process:


In [6]:
# Report the GPU model, its total memory and the peak memory reserved so far.
# Byte counts are divided by 2**30 and rounded to three decimals.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 2**30, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 2**30, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")


GPU = Tesla T4. Max memory = 14.748 GB.
10.988 GB of memory reserved.


In [7]:
# Start training
# The value returned by trainer.train() is kept in `trainer_stats`.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,438 | Num Epochs = 3 | Total steps = 1,080
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 4 x 1) = 4
 "-____-"     Trainable parameters = 21,135,360 of 5,460,573,632 (0.39% trained)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,8.2316
2,7.2232
3,8.1169
4,7.6492
5,6.7153
6,8.2695
7,6.6483
8,7.282
9,7.8531
10,7.6789


### Save the Model

Save the finetuned model locally:

In [8]:
# Save locally
# Saves the merged model (the log reports "Merging weights into 16bit") together
# with the tokenizer under `output_dir`; later cells reuse `output_dir`.
output_dir = "child_trauma_assessment_gemma"
model.save_pretrained_merged(output_dir, tokenizer)


Found HuggingFace hub cache directory: /teamspace/studios/this_studio/.cache/huggingface/hub
Checking cache directory for required files...
Cache check failed: model-00001-of-00003.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Downloading safetensors index for unsloth/gemma-3n-e2b-it...


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Unsloth: Merging weights into 16bit:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/3.08G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:  33%|███▎      | 1/3 [00:21<00:43, 21.54s/it]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:  67%|██████▋   | 2/3 [01:02<00:33, 33.09s/it]

model-00003-of-00003.safetensors:   0%|          | 0.00/2.82G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit: 100%|██████████| 3/3 [01:30<00:00, 30.06s/it]


### Test the Model

Let's test the model with a sample trauma assessment case:


In [9]:
from transformers import TextStreamer

def test_model(prompt, max_new_tokens = 512, temperature = 0.7, top_p = 0.95, top_k = 64):
    """Stream the fine-tuned model's reply to a single user prompt.

    Args:
        prompt: Text of the user message.
        max_new_tokens: Upper bound on the number of generated tokens. The
            default (512) matches the previous hard-coded value; raise it when
            a reply is cut off before it finishes.
        temperature: Sampling temperature forwarded to ``model.generate``.
        top_p: Nucleus-sampling threshold forwarded to ``model.generate``.
        top_k: Top-k cutoff forwarded to ``model.generate``.

    Returns:
        None. The reply is printed by the ``TextStreamer`` while it is
        generated; the prompt itself is skipped in the streamed output.
    """
    messages = [{
        "role": "user",
        "content": [{"type": "text", "text": prompt}]
    }]
    # Render and tokenize the single-turn conversation, ending with the
    # generation prompt so the model continues with its own turn.
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt = True,
        return_tensors = "pt",
        tokenize = True,
        return_dict = True,
    ).to("cuda")

    print("Generating response...")
    _ = model.generate(
        **inputs,
        max_new_tokens = max_new_tokens,
        temperature = temperature,
        top_p = top_p,
        top_k = top_k,
        streamer = TextStreamer(tokenizer, skip_prompt = True),
    )

# Test with a sample case (Arabic prompt)
test_prompt = """أنا قلقان على ابني عمره 8 سنوات. منذ القصف الأخير في منطقتنا في غزة، أصبح يعاني من مشاكل في النوم ويصبح عدواني أحيانا. كان طفلا هادئا من قبل. ماذا علي أن أفعل؟"""

test_model(test_prompt)

# Test with English prompt (same scenario as the Arabic prompt above)
test_prompt_en = """I'm worried about my 8-year-old son. Since the recent bombings in our area in Gaza, he's been having trouble sleeping and gets very aggressive sometimes. He used to be such a calm child. What should I do?"""

test_model(test_prompt_en)


Generating response...
أفهم قلقك تمامًا. ما يصفه عن سلوك عمر قد يكون علامة على الصدمة. من المهم أن نتعمق في هذه المشكلات ونبحث عن مساعدة مناسبة له. إليك بعض الخطوات التي يمكن اتخاذها:

**1. خلق بيئة آمنة:**

*   **الاستقرار:** تأكد من أن عمر يعيش في بيئة مستقرة قدر الإمكان، حتى لو كانت مؤقتة. قد يكون هذا مفيدًا له في التكيف.
*   **الرسم والتعبير:** شجعه على الرسم أو الكتابة عن مشاعره. هذه طرق غير لفظية للتعبير عن الصدمة.
*   **الأنشطة الممتعة:** قم بتشجيعه على ممارسة الأنشطة التي كان يحبها قبل القصف، مثل اللعب في الحديقة أو مشاهدة الرسوم المتحركة.

**2. مراقبة سلوكه:**

*   **التسجيل:** قم بتدوين أي تغييرات في سلوكه، مثل عدد مرات الاستيقاظ في الليل، أو أي تصرفات عدوانية.
*   **التفاعل:** تحدث معه عن مشاعره، حتى لو كان يتردد في ذلك. كن صبورًا وداعمه.
*   **الاستماع:** استمع إليه جيدًا دون مقاطعة، ودعه يعرف أنه ليس وحده.

**3. البحث عن مساعدة متخصصة:**

*   **مقدمي الرعاية الاجتماعية:** يمكنهم تقديم الدعم الأولي والتوجه إلى متخصصين.
*   **أخصائي نفسي:** يمكنه إجراء تقييم شامل لسعفه عمر و

### Push to the Hugging Face Hub

First we push the merged model, then we convert it to GGUF format and push that as well:

In [10]:
import os

# Push the merged model and the tokenizer to the Hugging Face Hub.
# The access token is read from the HF_TOKEN environment variable so that it is
# never stored in the notebook; a missing variable fails fast with a KeyError.
model.push_to_hub_merged(
    "SoufianeDahimi/child_trauma_gemma_finetune",
    tokenizer,
    token = os.environ["HF_TOKEN"],
)

No files have been modified since last commit. Skipping to prevent empty commit.


No files have been modified since last commit. Skipping to prevent empty commit.


Found HuggingFace hub cache directory: /teamspace/studios/this_studio/.cache/huggingface/hub
Checking cache directory for required files...
Cache check failed: model-00001-of-00003.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Downloading safetensors index for unsloth/gemma-3n-e2b-it...


model.safetensors.index.json: 0.00B [00:00, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.
Unsloth: Merging weights into 16bit:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/3.08G [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.
Unsloth: Merging weights into 16bit:  33%|███▎      | 1/3 [00:33<01:06, 33.34s/it]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:  67%|██████▋   | 2/3 [01:48<00:57, 57.86s/it]

model-00003-of-00003.safetensors:   0%|          | 0.00/2.82G [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/2.82G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit: 100%|██████████| 3/3 [02:27<00:00, 49.06s/it]


In [11]:
# Export the model saved under `output_dir` to GGUF. With "fast_quantized" the
# conversion log reports a Q8_0 file. The return value is kept in `metadata`.
metadata = model.save_pretrained_gguf(
    save_directory = output_dir,
    quantization_method = "fast_quantized",
)

Unsloth GGUF:hf-to-gguf:Loading model: child_trauma_assessment_gemma
Unsloth GGUF:hf-to-gguf:Model architecture: Gemma3nForConditionalGeneration
Unsloth GGUF:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
Unsloth GGUF:hf-to-gguf:Exporting model...
Unsloth GGUF:hf-to-gguf:gguf: loading model weight map from 'model.safetensors.index.json'
Unsloth GGUF:hf-to-gguf:gguf: loading model part 'model-00001-of-00003.safetensors'
Unsloth GGUF:hf-to-gguf:altup_proj.weight,                 torch.bfloat16 --> Q8_0, shape = {2048, 2048, 3}
Unsloth GGUF:hf-to-gguf:altup_unembd_proj.weight,          torch.bfloat16 --> Q8_0, shape = {2048, 2048, 3}
Unsloth GGUF:hf-to-gguf:token_embd.weight,                 torch.bfloat16 --> Q8_0, shape = {2048, 262144}
Unsloth GGUF:hf-to-gguf:gguf: loading model part 'model-00002-of-00003.safetensors'
Unsloth GGUF:hf-to-gguf:per_layer_token_embd.weight,       torch.bfloat16 --> Q8_0, shape = {7680, 262144}
Unsloth GGUF:hf-to-gguf:output_norm.weight,   

Unsloth: GGUF conversion:   0%|          | 0/100 [00:00<?, ?it/s]

Unsloth GGUF:hf-to-gguf:Model successfully exported to ./
Unsloth: Converted to child_trauma_assessment_gemma.Q8_0.gguf with size = 4.7G
Unsloth: Successfully saved GGUF to:
child_trauma_assessment_gemma.Q8_0.gguf


In [12]:
import os

# Convert the model saved under `output_dir` to GGUF and push it to the Hub.
# The access token is read from the HF_TOKEN environment variable so that it is
# never stored in the notebook; a missing variable fails fast with a KeyError.
# The return value is kept in `metadata` and reused by the manual upload cell.
metadata = model.push_to_hub_gguf(
    save_directory = output_dir,
    repo_id = "SoufianeDahimi/child_trauma_gemma_finetune-gguf",
    quantization_method = "fast_quantized",
    token = os.environ["HF_TOKEN"],
)

Unsloth GGUF:hf-to-gguf:Loading model: child_trauma_assessment_gemma
Unsloth GGUF:hf-to-gguf:Model architecture: Gemma3nForConditionalGeneration
Unsloth GGUF:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
Unsloth GGUF:hf-to-gguf:Exporting model...
Unsloth GGUF:hf-to-gguf:gguf: loading model weight map from 'model.safetensors.index.json'
Unsloth GGUF:hf-to-gguf:gguf: loading model part 'model-00001-of-00003.safetensors'
Unsloth GGUF:hf-to-gguf:altup_proj.weight,                 torch.bfloat16 --> Q8_0, shape = {2048, 2048, 3}
Unsloth GGUF:hf-to-gguf:altup_unembd_proj.weight,          torch.bfloat16 --> Q8_0, shape = {2048, 2048, 3}
Unsloth GGUF:hf-to-gguf:token_embd.weight,                 torch.bfloat16 --> Q8_0, shape = {2048, 262144}
Unsloth GGUF:hf-to-gguf:gguf: loading model part 'model-00002-of-00003.safetensors'
Unsloth GGUF:hf-to-gguf:per_layer_token_embd.weight,       torch.bfloat16 --> Q8_0, shape = {7680, 262144}
Unsloth GGUF:hf-to-gguf:output_norm.weight,   

Unsloth: GGUF conversion:   0%|          | 0/100 [00:00<?, ?it/s]

Unsloth GGUF:hf-to-gguf:Model successfully exported to ./
Unsloth: Converted to child_trauma_assessment_gemma.Q8_0.gguf with size = 4.7G
Unsloth: Successfully saved GGUF to:
child_trauma_assessment_gemma.Q8_0.gguf


No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.


For some reason the GGUF file was not pushed to the Hub (it was only saved locally), so let's upload it manually.

In [15]:
import os
from huggingface_hub import HfApi, create_repo

# Define paths and parameters
# NOTE(review): assumes metadata[0] is the local path of the GGUF file (the
# recorded output shows a .gguf file name) — confirm against the value returned
# by the GGUF export cell.
file_path = metadata[0]
repo_id = "SoufianeDahimi/" + output_dir + "-GGUF"
# Read the access token from the environment instead of hardcoding it here.
hf_token = os.environ["HF_TOKEN"]

# Raise instead of calling exit(1): inside a notebook, exit() asks the kernel to
# shut down, which would throw away the loaded model and all other state.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"GGUF file not found at {file_path}")

print("path exists")
# Initialize HfApi
api = HfApi(token=hf_token)

# Errors from the Hub calls below are left to propagate so the full traceback
# is visible and the cell stops without terminating the kernel.
create_repo(repo_id=repo_id, token=hf_token, repo_type="model", private=False, exist_ok=True)
print(f"Repository {repo_id} created or already exists.")

# Upload the GGUF file to the Hugging Face Hub
# Only the file name is used inside the repository, so a local directory prefix
# in `file_path` is not mirrored on the Hub; the commit message names the file
# that is actually uploaded.
file_name = os.path.basename(file_path)
api.upload_file(
    path_or_fileobj=file_path,
    path_in_repo=file_name,
    repo_id=repo_id,
    repo_type="model",
    token=hf_token,
    commit_message=f"Upload new version of {file_name}",
)
print(f"Successfully uploaded new version {file_path} to {repo_id}")

path exists
Repository SoufianeDahimi/child_trauma_assessment_gemma-GGUF created or already exists.


  0%|          | 0/1 [00:00<?, ?it/s]

child_trauma_assessment_gemma.Q8_0.gguf:   0%|          | 0.00/4.74G [00:00<?, ?B/s]

Successfully uploaded new version child_trauma_assessment_gemma.Q8_0.gguf to SoufianeDahimi/child_trauma_assessment_gemma-GGUF


In [16]:
# Print the Ollama Modelfile text stored on the tokenizer (a private attribute).
# NOTE(review): presumably attached by Unsloth's get_chat_template — confirm.
print(tokenizer._ollama_modelfile)


FROM {__FILE_LOCATION__}
TEMPLATE """{{- range $i, $_ := .Messages }}
{{- $last := eq (len (slice $.Messages $i)) 1 }}
{{- if or (eq .Role "user") (eq .Role "system") }}<start_of_turn>user
{{ .Content }}<end_of_turn>
{{ if $last }}<start_of_turn>model
{{ end }}
{{- else if eq .Role "assistant" }}<start_of_turn>model
{{ .Content }}{{ if not $last }}<end_of_turn>
{{ end }}
{{- end }}
{{- end }}"""
PARAMETER stop "<end_of_turn>"
PARAMETER stop "<eos>"
PARAMETER temperature 0.1
PARAMETER min_p 0.0
PARAMETER top_k 64
PARAMETER top_p 0.95
PARAMETER num_predict 32768

