In [None]:
!pip -q install --upgrade pip
!pip -q install -U unsloth
# Qwen3 precisa de transformers recente (>= 4.51.0)
!pip -q install -U "transformers>=4.51.0" trl datasets accelerate peft bitsandbytes

!apt-get -qq update
!apt-get -qq install -y tree


In [None]:
import torch, os, platform


## GPU Health

In [None]:
# Report interpreter, PyTorch and CUDA status so a missing GPU is noticed
# before any heavy work starts.
print("Python:", platform.python_version())
print("Torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
else:
    print("⚠️ Sem GPU. Vá em Runtime → Change runtime type → GPU.")

# Library versions that matter for Qwen3/Unsloth.
# unsloth is imported before transformers (same order as the training cell)
# because Unsloth asks to be imported first so its patches are applied.
import unsloth
import transformers
print("Transformers:", transformers.__version__)
# Not every unsloth build exposes __version__, hence the fallback value.
print("Unsloth:", getattr(unsloth, "__version__", "unknown"))


In [None]:
# Mount Google Drive at /content/drive; the dataset and output paths used
# by this notebook live under that mount point.
from google.colab import drive
drive.mount("/content/drive")


In [None]:
import unsloth
from unsloth import FastLanguageModel

import gc, os, inspect
from datasets import load_dataset
from trl import SFTTrainer, SFTConfig

# Training data file (.jsonl) inside the project folder on the mounted Drive.
DATA_PATH = "/content/drive/MyDrive/Spaced Repetition Project/training_data_formatted.jsonl"
print(f"Dataset: {DATA_PATH}")


In [None]:
# Base model (Hugging Face / Unsloth)
MODEL_NAME = "unsloth/Qwen3-4B-Instruct-2507-unsloth-bnb-4bit"

# Output directory (LoRA adapters)
OUTPUT_DIR = "/content/drive/MyDrive/Spaced Repetition Project/qwen3_4b_flashcard_finetuned_lora"

# On T4/L4 GPUs, 2048 is usually a good trade-off.
# On out-of-memory errors, lower MAX_SEQ_LENGTH to 1024 or reduce batch size / gradient accumulation.
MAX_SEQ_LENGTH = 2048
BATCH_SIZE = 1  # passed as per_device_train_batch_size
GRAD_ACC = 8  # passed as gradient_accumulation_steps
MAX_STEPS = 120  # passed as max_steps
LR = 2e-4  # passed as learning_rate

def clear_gpu():
    """Drop unreferenced Python objects and release unused cached CUDA memory.

    The garbage collector runs first so that tensors kept alive only by
    reference cycles are destroyed before the CUDA cache is emptied; in the
    opposite order the memory they free would remain cached. The CUDA calls
    are skipped when no GPU is available, so the helper does not raise on a
    CPU-only runtime.

    Returns:
        None.
    """
    gc.collect()
    if torch.cuda.is_available():
        # Let queued GPU work finish before releasing cached blocks.
        torch.cuda.synchronize()
        torch.cuda.empty_cache()

# Read the JSON-lines file as a single "train" split.
dataset = load_dataset("json", split="train", data_files=DATA_PATH)
# Show the number of examples and the first 200 characters of the first "text" field.
print(f"Exemplos: {len(dataset)}")
print(f'{dataset[0]["text"][:200]} ...')


In [None]:
clear_gpu()

# (Optional) Token for gated Hugging Face models.
# - Colab: add an HF_TOKEN entry to the notebook's Secrets.
# Any failure here (import or lookup) simply leaves the token unset.
try:
    from google.colab import userdata
    HF_TOKEN = userdata.get("HF_TOKEN")
except Exception:
    HF_TOKEN = None

# Arguments for the 4-bit base model load; the token is added only when one was found.
load_kwargs = {
    "model_name": MODEL_NAME,
    "max_seq_length": MAX_SEQ_LENGTH,
    "dtype": None,
    "load_in_4bit": True,
    "device_map": "auto",
    "trust_remote_code": True,
}
if HF_TOKEN:
    load_kwargs.update(token=HF_TOKEN)

model, tokenizer = FastLanguageModel.from_pretrained(**load_kwargs)

# Training does not need the KV cache; disabling it is best-effort and any error is ignored.
try:
    setattr(model.config, "use_cache", False)
except Exception:
    pass

# Make sure a pad token exists: fall back to EOS, or to UNK when EOS is empty.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token or tokenizer.unk_token
# Copy the tokenizer's pad id onto the model config when the config has none.
if getattr(model.config, "pad_token_id", None) is None:
    if tokenizer.pad_token_id is not None:
        model.config.pad_token_id = tokenizer.pad_token_id

# Wrap the loaded model with LoRA adapters on the listed projection modules.
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    random_state=42,
    use_gradient_checkpointing="unsloth",
)
print("Modelo + LoRA prontos.")


In [None]:
# Windows with fewer tokens than this are discarded.
MIN_TOKENS = 64

def tokenize_and_chunk(batch):
    """Tokenize every text of a batch and cut it into fixed-size windows.

    Each entry of ``batch["text"]`` is encoded without special tokens, the
    tokenizer's EOS id is appended when one is defined, and the id list is
    sliced into consecutive, non-overlapping windows of at most
    ``MAX_SEQ_LENGTH`` ids. Windows shorter than ``MIN_TOKENS`` are skipped,
    which also drops texts that are shorter than ``MIN_TOKENS`` as a whole.

    Args:
        batch: mapping with a ``"text"`` entry holding an iterable of strings.

    Returns:
        Dict with two parallel lists: ``input_ids`` (the kept windows) and
        ``attention_mask`` (a list of ones for each kept window).
    """
    windows = []
    for text in batch["text"]:
        token_ids = tokenizer(text, add_special_tokens=False)["input_ids"]
        if tokenizer.eos_token_id is not None:
            token_ids = token_ids + [tokenizer.eos_token_id]
        # Slice into back-to-back windows; the last one may be shorter.
        windows.extend(
            token_ids[start:start + MAX_SEQ_LENGTH]
            for start in range(0, len(token_ids), MAX_SEQ_LENGTH)
        )

    kept = [window for window in windows if len(window) >= MIN_TOKENS]
    return {
        "input_ids": kept,
        "attention_mask": [[1] * len(window) for window in kept],
    }

# Replace the original columns with the tokenized windows, using two worker processes.
chunked = dataset.map(
    tokenize_and_chunk,
    batched=True,
    num_proc=2,
    desc="tokenize+chunk",
    remove_columns=dataset.column_names,
)

print(f"Chunks: {len(chunked)}")


In [None]:
# fp16 vs bf16 is chosen automatically (A100/H100 use bf16, a T4 normally fp16).
use_bf16 = torch.cuda.is_bf16_supported() if torch.cuda.is_available() else False

sft_args = SFTConfig(
    output_dir=OUTPUT_DIR,
    report_to="none",

    # sequence length and batch shape
    max_seq_length=MAX_SEQ_LENGTH,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACC,

    # optimisation schedule
    max_steps=MAX_STEPS,
    warmup_steps=10,
    learning_rate=LR,
    optim="adamw_8bit",

    # mixed precision: exactly one of the two flags is enabled
    bf16=use_bf16,
    fp16=not use_bf16,

    # logging and checkpointing
    logging_steps=10,
    save_steps=50,
    save_total_limit=2,

    # the dataset is already tokenized, so keep its columns and skip dataset preparation
    remove_unused_columns=False,
    dataset_kwargs={"skip_prepare_dataset": True},
)

# Hand the tokenizer over under whichever keyword this SFTTrainer.__init__ accepts:
# `processing_class` when its signature has it, `tokenizer` otherwise.
sig = inspect.signature(SFTTrainer.__init__)
trainer_kwargs = {"model": model, "train_dataset": chunked, "args": sft_args}
trainer_kwargs["processing_class" if "processing_class" in sig.parameters else "tokenizer"] = tokenizer

trainer = SFTTrainer(**trainer_kwargs)
trainer.train()


In [None]:
# Write the model and the tokenizer files into the same output directory.
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"Salvo em: {OUTPUT_DIR}")

In [None]:
# Setup for the merge step: mount Drive, require a GPU, import Unsloth and define the paths.
from google.colab import drive
drive.mount("/content/drive")

import torch, gc
# Stop right away when no CUDA device is available.
assert torch.cuda.is_available()
print("GPU:", torch.cuda.get_device_name(0))

import unsloth
from unsloth import FastLanguageModel

# Same location as the OUTPUT_DIR used by the training cells.
ADAPTER_DIR = "/content/drive/MyDrive/Spaced Repetition Project/qwen3_4b_flashcard_finetuned_lora"

# Use the same max_seq_length as in training (adjust if you trained with a different value)
MAX_SEQ_LENGTH = 2048

# Save to local disk first (keeps Drive overhead out of the merge)
MERGED_DIR_LOCAL = "/content/qwen3_4b_flashcard_merged_16bit"

def clear():
    """Drop unreferenced Python objects, then release unused cached CUDA memory.

    The garbage collector runs before the CUDA cache is emptied so that
    memory freed by the collection is released as well instead of staying
    cached until the next call.

    Returns:
        None.
    """
    gc.collect()
    torch.cuda.empty_cache()

clear()

# Loading in 4-bit saves GPU memory; the merge still writes 16-bit weights to disk.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=ADAPTER_DIR,
    load_in_4bit=True,
    dtype=None,
    max_seq_length=MAX_SEQ_LENGTH,
    trust_remote_code=True,
    device_map="auto",
)

# Lowers memory peaks during the save (if it crashes, reduce maximum_memory_usage).
model.save_pretrained_merged(
    MERGED_DIR_LOCAL,
    tokenizer,
    maximum_memory_usage=0.5,
    save_method="merged_16bit",
)

# Make sure the tokenizer/config files sit in the same directory as the merged model.
tokenizer.save_pretrained(MERGED_DIR_LOCAL)

print(f"Merged 16-bit salvo em: {MERGED_DIR_LOCAL}")


In [None]:
# Copy the merged model from local disk into the project folder on Drive.
# The Drive path is quoted because it contains spaces.
!mkdir -p "/content/drive/MyDrive/Spaced Repetition Project/qwen3_4b_flashcard_merged_16bit"
!rsync -a --info=progress2 /content/qwen3_4b_flashcard_merged_16bit/ "/content/drive/MyDrive/Spaced Repetition Project/qwen3_4b_flashcard_merged_16bit/"


In [None]:
# Mount Drive and define where the merged model is read from and where the GGUF file goes.
from google.colab import drive
drive.mount("/content/drive")

MERGED_DIR = "/content/drive/MyDrive/Spaced Repetition Project/qwen3_4b_flashcard_merged_16bit"
OUT_DIR = "/content/drive/MyDrive/Spaced Repetition Project/qwen3_4b_flashcard_gguf"
# Create the output folder; {OUT_DIR} is filled in from the Python variable and quoted because of the spaces.
!mkdir -p "{OUT_DIR}"


In [None]:
# Build tools, git and pip needed to fetch and compile llama.cpp.
!apt-get -qq update
!apt-get -qq install -y build-essential cmake git python3-pip

# Always start from a fresh clone.
!rm -rf llama.cpp
!git clone https://github.com/ggml-org/llama.cpp

# Python requirements of the llama.cpp scripts.
!pip -q install -r llama.cpp/requirements.txt

# Convert the merged HF model to GGUF f16
!python llama.cpp/convert_hf_to_gguf.py "{MERGED_DIR}" --outfile /content/qwen3_4b_model-f16.gguf --outtype f16

# Build only the quantizer
!cmake -S llama.cpp -B llama.cpp/build -DCMAKE_BUILD_TYPE=Release
!cmake --build llama.cpp/build -j 2 --target llama-quantize

# Quantize to q4_k_m
!./llama.cpp/build/bin/llama-quantize /content/qwen3_4b_model-f16.gguf /content/qwen3_4b_model-q4_k_m.gguf q4_k_m

# Copy to Drive
!cp /content/qwen3_4b_model-q4_k_m.gguf "{OUT_DIR}/model-q4_k_m.gguf"
!ls -lh "{OUT_DIR}"
print("GGUF pronto em:", OUT_DIR)


In [None]:
# List the contents of the project folder on Drive.
!tree "/content/drive/MyDrive/Spaced Repetition Project"
