In [None]:
!pip -q install --upgrade pip
!pip -q install unsloth
!apt install tree

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tree is already the newest version (2.0.2-1).
0 upgraded, 0 newly installed, 0 to remove and 41 not upgraded.


In [None]:
import torch, os, platform


## GPU Health

In [None]:
# Environment report: interpreter version, PyTorch build, and CUDA visibility.
print("Python:", platform.python_version())
print("Torch:", torch.__version__)

has_cuda = torch.cuda.is_available()
print("CUDA available:", has_cuda)

# Either name the first CUDA device or tell the user how to enable a GPU runtime.
gpu_report = (
    ("GPU:", torch.cuda.get_device_name(0))
    if has_cuda
    else ("⚠️ Sem GPU. Vá em Runtime → Change runtime type → GPU.",)
)
print(*gpu_report)

Python: 3.12.12
Torch: 2.9.1+cu128
CUDA available: True
GPU: Tesla T4


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset and all outputs used by this
# notebook live under /content/drive/MyDrive.
drive.mount("/content/drive")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# NOTE(review): unsloth is imported before datasets/trl — presumably so the patching
# announced in its import banner is applied first; keep this order unless confirmed otherwise.
import unsloth
from unsloth import FastLanguageModel

import gc, os, inspect
from datasets import load_dataset
from trl import SFTTrainer, SFTConfig

# Training data file (.jsonl) stored on the mounted Drive.
DATA_PATH = "/content/drive/MyDrive/Spaced Repetition Project/training_data_formatted.jsonl"
print("Dataset:", DATA_PATH)


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
Dataset: /content/drive/MyDrive/Spaced Repetition Project/training_data_formatted.jsonl


In [None]:
# Base checkpoint passed to FastLanguageModel.from_pretrained.
# NOTE(review): "qwen3:4b-instruct" looks like an Ollama-style "name:tag", and ':' is
# not a valid character in a Hugging Face repo id. The recorded load output also
# reports a Qwen2-architecture model with ~1.56B parameters, so this value was
# probably edited after the last run — confirm the intended repo id before re-running.
MODEL_NAME = "unsloth/qwen3:4b-instruct"
# Directory on Drive that receives trainer checkpoints and the final LoRA adapter.
OUTPUT_DIR = "/content/drive/MyDrive/Spaced Repetition Project/qwen_flashcard_finetuned_lora"

MAX_SEQ_LENGTH = 1024  # token window used for model loading, chunking and SFTConfig
BATCH_SIZE = 2         # per-device train batch size
GRAD_ACC = 4           # gradient accumulation steps
MAX_STEPS = 120        # total number of training steps
LR = 2e-4              # learning rate

def clear_gpu():
    """Free unreferenced Python objects, then release cached CUDA memory.

    Safe to call when no CUDA device is available: the CUDA calls are skipped
    in that case instead of raising.

    Returns:
        None
    """
    # Collect first: objects kept alive only by reference cycles are destroyed
    # here, so the empty_cache() below can release the memory they held.
    gc.collect()
    if torch.cuda.is_available():
        # Let queued GPU work finish before asking the allocator to give memory back.
        torch.cuda.synchronize()
        torch.cuda.empty_cache()

# Load the JSON-lines file at DATA_PATH as a single "train" split.
dataset = load_dataset("json", data_files=DATA_PATH, split="train")
print("Exemplos:", len(dataset))
# Sanity check: preview the first 200 characters of the first record's "text" field.
print(dataset[0]["text"][:200], "...")


Generating train split: 0 examples [00:00, ? examples/s]

Exemplos: 6
<|im_start|>system

Effective learning: Twenty rules of formulating knowledge
language learning flashcards
Dr Piotr Wozniak, February, 1999 (updated)

This article will help you overcome one of the gr ...


In [None]:
# Release GPU memory left over from a previous run of this cell before loading.
clear_gpu()

# Load the base model and its tokenizer through Unsloth, quantized to 4-bit.
# NOTE(review): dtype=None presumably lets Unsloth choose the compute dtype — confirm.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=None,
    load_in_4bit=True,
    device_map="auto",
)

# Training does not need the KV cache. This stays best-effort (a failure must not
# stop the notebook), but the failure is reported instead of being silently dropped.
try:
    model.config.use_cache = False
except Exception as exc:
    print(f"Warning: could not disable use_cache ({exc!r}); continuing.")

# Make sure a pad token exists: fall back to EOS, then UNK, and mirror the id
# into the model config when the config does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token or tokenizer.unk_token
if getattr(model.config, "pad_token_id", None) is None and tokenizer.pad_token_id is not None:
    model.config.pad_token_id = tokenizer.pad_token_id

# Wrap the model with LoRA adapters on the attention and MLP projection layers.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=42,
)
print("Modelo + LoRA prontos.")


==((====))==  Unsloth 2025.12.6: Fast Qwen2 patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.53G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

Unsloth 2025.12.6 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


Modelo + LoRA prontos.


In [None]:
MIN_TOKENS = 64  # windows shorter than this are discarded


def tokenize_and_chunk(batch):
    """Tokenize every text in a batch and cut it into fixed-size token windows.

    Each text is tokenized without special tokens, the tokenizer's EOS id is
    appended when one is defined, and the id sequence is sliced into consecutive
    windows of at most MAX_SEQ_LENGTH tokens. Windows with fewer than MIN_TOKENS
    tokens are dropped.

    Args:
        batch: mapping with a "text" entry holding a list of strings.

    Returns:
        dict with "input_ids" (list of token-id lists) and "attention_mask"
        (all-ones lists of matching lengths), one entry per kept window.
    """
    kept_windows = []
    for sample in batch["text"]:
        token_ids = tokenizer(sample, add_special_tokens=False)["input_ids"]
        eos_id = tokenizer.eos_token_id
        if eos_id is not None:
            token_ids = token_ids + [eos_id]

        # Consecutive, non-overlapping slices; only the last one can be short.
        windows = (
            token_ids[start:start + MAX_SEQ_LENGTH]
            for start in range(0, len(token_ids), MAX_SEQ_LENGTH)
        )
        kept_windows.extend(w for w in windows if len(w) >= MIN_TOKENS)

    return {
        "input_ids": kept_windows,
        "attention_mask": [[1] * len(w) for w in kept_windows],
    }

# Apply tokenize_and_chunk to the whole dataset. The original columns are removed,
# so only the "input_ids" / "attention_mask" columns it returns remain.
chunked = dataset.map(
    tokenize_and_chunk,
    batched=True,
    remove_columns=dataset.column_names,
    num_proc=2,
    desc="tokenize+chunk",
)

# Number of token windows produced (can differ from the number of input records).
print("Chunks:", len(chunked))


tokenize+chunk (num_proc=2):   0%|          | 0/6 [00:00<?, ? examples/s]

Chunks: 66


In [None]:
# Choose the mixed-precision mode from the hardware instead of hard-coding it:
# fp16 on GPUs without bfloat16 support (the T4 of the recorded run reports
# "Bfloat16 = FALSE"), bf16 otherwise. This keeps the trainer flags valid on
# other GPU types as well, since the model itself is loaded with dtype=None.
from unsloth import is_bfloat16_supported

use_bf16 = is_bfloat16_supported()

sft_args = SFTConfig(
    output_dir=OUTPUT_DIR,
    max_seq_length=MAX_SEQ_LENGTH,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACC,
    max_steps=MAX_STEPS,
    warmup_steps=10,
    learning_rate=LR,
    logging_steps=10,
    optim="adamw_8bit",
    fp16=not use_bf16,
    bf16=use_bf16,
    report_to="none",
    save_steps=50,
    save_total_limit=2,

    # The dataset is already tokenized: keep its columns and skip TRL's own preparation.
    remove_unused_columns=False,
    dataset_kwargs={"skip_prepare_dataset": True},
)

trainer_kwargs = dict(model=model, train_dataset=chunked, args=sft_args)

# Pass the tokenizer under whichever keyword this SFTTrainer's __init__ accepts
# ("processing_class" when present, otherwise "tokenizer").
sig = inspect.signature(SFTTrainer.__init__)
if "processing_class" in sig.parameters:
    trainer_kwargs["processing_class"] = tokenizer
else:
    trainer_kwargs["tokenizer"] = tokenizer

trainer = SFTTrainer(**trainer_kwargs)
# Last expression of the cell, so the returned TrainOutput is displayed.
trainer.train()


The model is already on multiple devices. Skipping the move to device specified in `args`.


🦥 Unsloth: Padding-free auto-enabled, enabling faster training.


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 66 | Num Epochs = 14 | Total steps = 120
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 18,464,768 of 1,562,179,072 (1.18% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
10,2.6854
20,2.4046
30,1.958
40,1.3399
50,0.6325
60,0.1769
70,0.0582
80,0.0253
90,0.0148
100,0.0128


TrainOutput(global_step=120, training_loss=0.7771838516617815, metrics={'train_runtime': 561.6604, 'train_samples_per_second': 1.709, 'train_steps_per_second': 0.214, 'total_flos': 6861738352803840.0, 'train_loss': 0.7771838516617815, 'epoch': 13.363636363636363})

In [None]:
# Write the model (the LoRA-wrapped model returned by get_peft_model) and the
# tokenizer into OUTPUT_DIR on Drive.
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("Salvo em:", OUTPUT_DIR)

In [None]:
# This cell repeats the Drive mount and the imports it needs.
# NOTE(review): presumably so it can be run on its own in a fresh session — confirm.
from google.colab import drive
drive.mount("/content/drive")

import torch, gc
# Stop here if no CUDA device is visible.
assert torch.cuda.is_available()
print("GPU:", torch.cuda.get_device_name(0))

import unsloth
from unsloth import FastLanguageModel

# Same Drive directory the training cell saved the adapter to (OUTPUT_DIR there).
ADAPTER_DIR = "/content/drive/MyDrive/Spaced Repetition Project/qwen_flashcard_finetuned_lora"

# Save to local disk first (keeps Drive overhead out of the merge).
MERGED_DIR_LOCAL = "/content/qwen_flashcard_merged_16bit"

def clear():
    """Free unreferenced Python objects, then release cached CUDA memory.

    Returns:
        None
    """
    # Collect first: objects kept alive only by reference cycles are destroyed
    # here, so the empty_cache() below can release the memory they held.
    gc.collect()
    torch.cuda.empty_cache()

# Free cached GPU memory before loading the model again.
clear()

# Loading in 4-bit saves GPU memory; the merge still writes 16-bit weights to disk.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=ADAPTER_DIR,
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
    device_map="auto",
)

# Lower peak memory during the save (Unsloth suggests reducing maximum_memory_usage if the save crashes).
model.save_pretrained_merged(
    MERGED_DIR_LOCAL,
    tokenizer,
    save_method="merged_16bit",
    maximum_memory_usage=0.5,
)

# Make sure the tokenizer/config files sit in the same directory as the merged model.
tokenizer.save_pretrained(MERGED_DIR_LOCAL)

print("Merged 16-bit salvo em:", MERGED_DIR_LOCAL)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
GPU: Tesla T4
==((====))==  Unsloth 2025.12.6: Fast Qwen2 patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.53G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

Unsloth 2025.12.6 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

Found HuggingFace hub cache directory: /root/.cache/huggingface/hub
Checking cache directory for required files...
Cache check failed: model.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Checking cache directory for required files...
Cache check failed: tokenizer.model not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Preparing safetensor model files:   0%|          | 0/1 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files: 100%|██████████| 1/1 [00:47<00:00, 47.09s/it]


Note: tokenizer.model not found (this is OK for non-SentencePiece models)


Unsloth: Merging weights into 16bit: 100%|██████████| 1/1 [00:23<00:00, 23.04s/it]


Unsloth: Merge process complete. Saved to `/content/qwen_flashcard_merged_16bit`
Merged 16-bit salvo em: /content/qwen_flashcard_merged_16bit


In [None]:
# Copy the locally saved merged model to Drive. The trailing slashes make rsync
# copy the directory's contents into the destination directory.
!mkdir -p "/content/drive/MyDrive/Spaced Repetition Project/qwen_flashcard_merged_16bit"
!rsync -a --info=progress2 /content/qwen_flashcard_merged_16bit/ "/content/drive/MyDrive/Spaced Repetition Project/qwen_flashcard_merged_16bit/"


  3,103,347,843 100%  139.32MB/s    0:00:21 (xfr#13, to-chk=0/17)


In [None]:
from google.colab import drive
drive.mount("/content/drive")

# Input: the merged 16-bit model on Drive. Output: directory on Drive for the GGUF file.
MERGED_DIR = "/content/drive/MyDrive/Spaced Repetition Project/qwen_flashcard_merged_16bit"
OUT_DIR = "/content/drive/MyDrive/Spaced Repetition Project/qwen_flashcard_gguf"
# {OUT_DIR} is interpolated by IPython; the quotes protect the spaces in the path.
!mkdir -p "{OUT_DIR}"


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!apt-get -qq update
!apt-get -qq install -y build-essential cmake git python3-pip

!rm -rf llama.cpp
!git clone https://github.com/ggml-org/llama.cpp

!pip -q install -r llama.cpp/requirements.txt

# Converte HF merged -> GGUF f16
!python llama.cpp/convert_hf_to_gguf.py "{MERGED_DIR}" --outfile /content/model-f16.gguf --outtype f16

# Compila só o quantizador
!cmake -S llama.cpp -B llama.cpp/build -DCMAKE_BUILD_TYPE=Release
!cmake --build llama.cpp/build -j 2 --target llama-quantize

# Quantiza para q4_k_m
!./llama.cpp/build/bin/llama-quantize /content/model-f16.gguf /content/model-q4_k_m.gguf q4_k_m

# Copia pro Drive
!cp /content/model-q4_k_m.gguf "{OUT_DIR}/model-q4_k_m.gguf"
!ls -lh "{OUT_DIR}"
print("GGUF pronto em:", OUT_DIR)


W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Selecting previously unselected package javascript-common.
(Reading database ... 121697 files and directories currently installed.)
Preparing to unpack .../0-javascript-common_11+nmu1_all.deb ...
Unpacking javascript-common (11+nmu1) ...
Selecting previously unselected package libjs-underscore.
Preparing to unpack .../1-libjs-underscore_1.13.2~dfsg-2_all.deb ...
Unpacking libjs-underscore (1.13.2~dfsg-2) ...
Selecting previously unselected package libjs-sphinxdoc.
Preparing to unpack .../2-libjs-sphinxdoc_4.3.2-1_all.deb ...
Unpacking libjs-sphinxdoc (4.3.2-1) ...
Selecting previously unselected package python3.10-dev.
Preparing to unpack .../3-python3.10-dev_3.10.12-1~22.04.12_amd64.deb ...
Unpacking python3.10-dev (3.10.12-1~22.04.12) ...
Selecting previously unselected package python3-dev.
Prepari

In [None]:
!tree "/content/drive/MyDrive/Spaced Repetition Project"


[01;34m/content/drive/MyDrive/Spaced Repetition Project[0m
├── [01;34mqwen_flashcard_finetuned_lora[0m
│   ├── [00madapter_config.json[0m
│   ├── [00madapter_model.safetensors[0m
│   ├── [00madded_tokens.json[0m
│   ├── [00mchat_template.jinja[0m
│   ├── [01;34mcheckpoint-100[0m
│   │   ├── [00madapter_config.json[0m
│   │   ├── [00madapter_model.safetensors[0m
│   │   ├── [00madded_tokens.json[0m
│   │   ├── [00mchat_template.jinja[0m
│   │   ├── [00mmerges.txt[0m
│   │   ├── [00moptimizer.pt[0m
│   │   ├── [00mREADME.md[0m
│   │   ├── [00mrng_state.pth[0m
│   │   ├── [00mscaler.pt[0m
│   │   ├── [00mscheduler.pt[0m
│   │   ├── [00mspecial_tokens_map.json[0m
│   │   ├── [00mtokenizer_config.json[0m
│   │   ├── [00mtokenizer.json[0m
│   │   ├── [00mtrainer_state.json[0m
│   │   ├── [00mtraining_args.bin[0m
│   │   └── [00mvocab.json[0m
│   ├── [01;34mcheckpoint-120[0m
│   │   ├── [00madapter_config.json[0m
│   │   ├── [00madapter_model