In [11]:
!pip install -q transformers datasets accelerate bitsandbytes peft
!pip install --upgrade transformers peft bitsandbytes

In [12]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
from google.colab import userdata
access_token = userdata.get('HF_TOKEN')

In [14]:
from datasets import load_dataset, concatenate_datasets
from transformers import BitsAndBytesConfig, AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType

model_name = "google/gemma-3n-E4B-it"

In [15]:
def load_and_prepare_datasets():
    ds1 = load_dataset("json", data_files={"train": "/content/drive/MyDrive/data/health_train.jsonl"})["train"]
    ds2 = load_dataset("json", data_files={"train": "/content/drive/MyDrive/data/rehab_train.jsonl"})["train"]
    ds3 = load_dataset("json", data_files={"train": "/content/drive/MyDrive/data/security_train.jsonl"})["train"]
    combined = concatenate_datasets([ds1, ds2, ds3])
    return combined

In [16]:
tokenizer = AutoTokenizer.from_pretrained(model_name, token=access_token)

def preprocess(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=256)

In [17]:
dataset = load_and_prepare_datasets()
tokenized_dataset = dataset.map(preprocess, batched=True)

In [19]:
quant_config = BitsAndBytesConfig(load_in_8bit=True)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=access_token,
    torch_dtype="auto",
    quantization_config=quant_config,
    device_map="auto",
)

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "down_proj", "up_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)
model = get_peft_model(model, lora_config)

training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/output",
    per_device_train_batch_size=1,
    num_train_epochs=1,
    fp16=True,  # GPU 사용 시 True, CPU면 False
    save_strategy="epoch",
    report_to=[]
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [20]:
trainer.train()



RuntimeError: result type Float can't be cast to the desired output type signed char

In [None]:
!zip -r /content/drive/MyDrive/output_model.zip /content/drive/MyDrive/output_model/