In [None]:
!pip install -q transformers datasets accelerate bitsandbytes peft

[0m

In [None]:
# Mount Google Drive at /content/drive; the training data and the output
# directory used later in this notebook both live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Read the Hugging Face access token from the Colab secret named 'HF_TOKEN'
# so the token never appears in the notebook itself.
from google.colab import userdata
access_token = userdata.get('HF_TOKEN')

In [None]:
from datasets import load_dataset, concatenate_datasets
from transformers import BitsAndBytesConfig, AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType

# Hugging Face Hub id of the base checkpoint; used for both the tokenizer and the model.
model_name = "google/gemma-3n-E4B-it"

In [None]:
def load_and_prepare_datasets(data_files=None):
    """Load JSONL training files and concatenate them into a single dataset.

    Args:
        data_files: Optional path, or iterable of paths, to JSONL files.
            When omitted, the three default Drive files (health, rehab,
            secur) are loaded in that order.

    Returns:
        The result of ``concatenate_datasets`` over the "train" split of
        each file, in the order the paths were given.

    Raises:
        ValueError: If ``data_files`` is provided but contains no paths.
    """
    if data_files is None:
        data_files = (
            "/content/drive/MyDrive/data/health_train.jsonl",
            "/content/drive/MyDrive/data/rehab_train.jsonl",
            "/content/drive/MyDrive/data/secur_train.jsonl",
        )
    # A bare string is a single path, not an iterable of one-character paths.
    paths = [data_files] if isinstance(data_files, str) else list(data_files)
    if not paths:
        raise ValueError("data_files must contain at least one JSONL path")

    # Each file is loaded as its own "train" split, then all splits are joined.
    splits = [
        load_dataset("json", data_files={"train": path})["train"]
        for path in paths
    ]
    return concatenate_datasets(splits)

In [None]:
# Tokenizer that matches the base checkpoint; the token authenticates the Hub download.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    token=access_token,
)

def preprocess(examples):
    """Tokenize a batch of examples and attach causal-LM training labels.

    Intended for ``Dataset.map(..., batched=True)``: ``examples["text"]`` is
    a list of strings.

    Args:
        examples: Batch mapping with a "text" entry holding the raw strings.

    Returns:
        The tokenizer output (sequences padded/truncated to 256 tokens) with
        an added "labels" entry. Labels copy ``input_ids``, except that
        padding positions (attention mask 0) are set to -100 so the loss
        ignores them.
    """
    encoded = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=256)
    # Without a "labels" field the model returns no loss and Trainer cannot train.
    encoded["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(encoded["input_ids"], encoded["attention_mask"])
    ]
    return encoded

In [None]:
# Build the combined training corpus, then tokenize it batch-by-batch.
dataset = load_and_prepare_datasets()
tokenized_dataset = dataset.map(
    function=preprocess,
    batched=True,
)

In [None]:
# Load the base weights in 8-bit via bitsandbytes.
quant_config = BitsAndBytesConfig(load_in_8bit=True)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=access_token,
    torch_dtype="auto",
    quantization_config=quant_config,
    # Place the entire model on GPU 0. The previous map named only
    # "transformer.h.0" (a GPT-2-style submodule path), which leaves every
    # other module without a device assignment.
    device_map={"": 0},
)

# LoRA adapters on the attention and MLP projection layers.
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "down_proj", "up_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)
model = get_peft_model(model, lora_config)

# One epoch, batch size 1, a checkpoint at the end of the epoch, no external loggers.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/output",
    per_device_train_batch_size=1,
    num_train_epochs=1,
    fp16=True,  # True when running on a GPU, False on CPU
    save_strategy="epoch",
    report_to=[]
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

In [None]:
# Run the fine-tuning loop configured by `training_args`.
trainer.train()

In [None]:
!zip -r /content/drive/MyDrive/output_model.zip /content/drive/MyDrive/output_model/