accelerate : PyTorch 모델의 학습 속도 향상과 추론 최적화를 위한 라이브러리

peft : Parameter-Efficient Fine-Tuning의 약자로, 대규모 언어 모델을 효율적으로 미세 조정할 수 있는 PEFT 기법 구현

bitsandbytes : 모델 매개변수 양자화로 메모리 사용량 절감

transformers : 다양한 자연어 처리 모델을 쉽게 사용할 수 있는 API 제공


trl : Transformer Reinforcement Learning의 약자로 강화 학습 기반 언어 모델 미세 조정 기술 구현

datasets : 자연어 처리 데이터셋 다운로드 및 전처리 지원


In [None]:
# Install pinned versions of the fine-tuning stack (accelerate, peft, bitsandbytes, transformers, trl, datasets).
pip install -U accelerate==0.29.3 peft==0.10.0 bitsandbytes==0.43.1 transformers==4.40.1 trl==0.8.6 datasets==2.19.0

Collecting accelerate==0.29.3
  Using cached accelerate-0.29.3-py3-none-any.whl.metadata (18 kB)
Collecting transformers==4.40.1
  Using cached transformers-4.40.1-py3-none-any.whl.metadata (137 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers==4.40.1)
  Using cached tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Using cached accelerate-0.29.3-py3-none-any.whl (297 kB)
Using cached transformers-4.40.1-py3-none-any.whl (9.0 MB)
Using cached tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
Installing collected packages: tokenizers, accelerate, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.20.0
    Uninstalling tokenizers-0.20.0:
      Successfully uninstalled tokenizers-0.20.0
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.34.2
    Uninstalling accelerate-0.34.2:
      Successfully uninstalled accelerate-0.34.2
  At

In [None]:
# Install bitsandbytes explicitly from the main PyPI index (no version pin).
pip install -i https://pypi.org/simple/ bitsandbytes

Looking in indexes: https://pypi.org/simple/


In [None]:
# Make sure accelerate and bitsandbytes are installed (no version pin).
!pip install accelerate bitsandbytes




In [None]:
import json

# Load the raw export; the records sit under its top-level "data" key.
with open('new_test.json', 'r', encoding='utf-8') as f:
    json_data = json.load(f)
json_test = json_data['data']

# Keep only the "ko"/"en" fields of the records that carry both of them.
ko_en_list = [
    {"ko": record["ko"], "en": record["en"]}
    for record in json_test
    if 'ko' in record and 'en' in record
]

# Write the pairs out as a bare JSON list (the "data" wrapper is dropped).
with open('ko_en_test.json', 'w', encoding='utf-8') as f:
    json.dump(ko_en_list, f, ensure_ascii=False, indent=4)

import os

import huggingface_hub

# Authenticate with the Hugging Face Hub.
# SECURITY: an access token used to be hard-coded on this line. A token that
# has been written into a notebook must be treated as leaked and revoked.
# The token is now read from the HF_TOKEN environment variable; when it is
# unset, token=None lets login() fall back to its interactive prompt.
huggingface_hub.login(token=os.environ.get("HF_TOKEN"))

from datasets import Dataset

# Build a Dataset from the preprocessed ko/en pairs and publish it to the Hub.
dataset = Dataset.from_json('ko_en_test.json')
dataset.push_to_hub('dawon62/test')

# 데이터 로드 및 모델 설정
import os
import torch
from datasets import load_dataset
import bitsandbytes as bnb
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig
from trl import SFTTrainer

base_model = "Helsinki-NLP/opus-mt-tc-big-en-ko"  # 학습할 모델
hkcode_dataset = "dawon62/test"  # 데이터셋
new_model = "dawon62/hkcode-mt-en-ko"  # 새로운 모델

# CUDA 환경 설정
if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8:
    !pip install -qqq flash-attn
    attn_implementation = "flash_attention_2"
    torch_dtype = torch.bfloat16
else:
    attn_implementation = "eager"
    torch_dtype = torch.float16

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=False,
)

# 데이터셋 로드
dataset = load_dataset(hkcode_dataset, split="train")
print(dataset)

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    device_map={"": 0}
)

model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(
    base_model,
    trust_remote_code=True
)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# LoRA adapter configuration: rank-64 adapters on the attention q/v projections.
peft_params = LoraConfig(
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

# Training hyper-parameters.
# The mixed-precision flags follow the torch_dtype selected during CUDA setup.
# They were hard-coded to fp16=True / bf16=False, which contradicted the
# bfloat16 compute dtype chosen on GPUs with compute capability >= 8.
training_params = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,  # number of passes over the data
    per_device_train_batch_size=2,  # per-device batch size
    gradient_accumulation_steps=1,
    optim="adamw_hf",
    save_steps=50,
    logging_steps=50,
    learning_rate=2e-5,
    weight_decay=0.01,
    fp16=(torch_dtype == torch.float16),
    bf16=(torch_dtype == torch.bfloat16),
    max_grad_norm=1.0,  # gradient clipping threshold
    max_steps=-1,  # -1: let num_train_epochs decide
    warmup_ratio=0.1,
    group_by_length=True,
    lr_scheduler_type="linear",
    report_to="tensorboard",
)

# Supervised fine-tuning trainer.
# NOTE(review): only the "ko" column is used as training text; the "en" side
# of each pair is not referenced here. Confirm this is intended for a
# translation model.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_params,
    dataset_text_field="ko",  # text column; either "ko" or "en"
    max_seq_length=128,
    tokenizer=tokenizer,
    args=training_params,
    packing=True,
)

# Run fine-tuning.
trainer.train()

# Publish the result to the repo named by new_model.
# Push trainer.model (the PEFT-wrapped model, so the LoRA adapter is what gets
# uploaded) rather than the bare `model` object. `model` is the 4-bit base
# module with adapter layers injected into it; uploading that produced a
# checkpoint whose q_proj/v_proj weights were reported as missing when it was
# reloaded in the recorded run.
trainer.model.push_to_hub(new_model)
tokenizer.push_to_hub(new_model)


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


Generating train split: 0 examples [00:00, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Dataset({
    features: ['ko', 'en'],
    num_rows: 31
})


Some weights of MarianForCausalLM were not initialized from the model checkpoint at Helsinki-NLP/opus-mt-tc-big-en-ko and are newly initialized: ['lm_head.weight', 'model.decoder.embed_positions.weight', 'model.decoder.embed_tokens.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[32000]], 'forced_eos_token_id': 2}


model.safetensors:   0%|          | 0.00/205M [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/dawon62/hkcode-mt-en-ko/commit/2e17da287d387415ce63986d048d52c21c6ec211', commit_message='Upload tokenizer', commit_description='', oid='2e17da287d387415ce63986d048d52c21c6ec211', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# NOTE(review): unpinned upgrade. Per the recorded output this replaces the
# accelerate 0.29.3 / tokenizers 0.19.1 installed by the pinned install cell
# with newer releases, so later cells no longer run on the pinned stack.
!pip install --upgrade transformers torch accelerate


Collecting transformers
  Using cached transformers-4.45.1-py3-none-any.whl.metadata (44 kB)
Collecting accelerate
  Using cached accelerate-0.34.2-py3-none-any.whl.metadata (19 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Using cached tokenizers-0.20.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Using cached transformers-4.45.1-py3-none-any.whl (9.9 MB)
Using cached accelerate-0.34.2-py3-none-any.whl (324 kB)
Using cached tokenizers-0.20.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.9 MB)
Installing collected packages: tokenizers, accelerate, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.19.1
    Uninstalling tokenizers-0.19.1:
      Successfully uninstalled tokenizers-0.19.1
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.29.3
    Uninstalling accelerate-0.29.3:
      Successfully uninstalled accelerate-0.29.3
  Attempting uninstall: trans

In [None]:
from transformers import MarianMTModel, MarianTokenizer
from huggingface_hub import login

import os

# Hugging Face login.
# SECURITY: an access token used to be hard-coded here. A token that has been
# written into a notebook must be treated as leaked and revoked. It is now
# read from the HF_TOKEN environment variable; when that is unset,
# token=None lets login() fall back to its interactive prompt.
api_token = os.environ.get("HF_TOKEN")
login(token=api_token)

import torch  # torch.no_grad() is used below but this cell never imported torch

# Hub repo holding the fine-tuned model.
model_name = "dawon62/hkcode-mt-en-ko"

# Load tokenizer and model on CPU.
# device_map is intentionally not passed: a plain from_pretrained() already
# loads on CPU, whereas device_map="cpu" failed in the recorded run with
# "weight is on the meta device" for weights missing from the checkpoint.
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Example input.
# NOTE(review): the repo name says en-ko but this sample sentence is Korean.
# Confirm the intended translation direction.
text = "안녕하세요, 저는 AI 모델입니다."
inputs = tokenizer(text, return_tensors="pt")

# Generate without tracking gradients.
with torch.no_grad():
    outputs = model.generate(**inputs)

# Decode the first generated sequence and print it.
translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(translated_text)


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
Some weights of MarianMTModel were not initialized from the model checkpoint at dawon62/hkcode-mt-en-ko and are newly initialized: ['model.decoder.layers.0.encoder_attn.q_proj.bias', 'model.decoder.layers.0.encoder_attn.q_proj.weight', 'model.decoder.layers.0.encoder_attn.v_proj.bias', 'model.decoder.layers.0.encoder_attn.v_proj.weight', 'model.decoder.layers.0.self_attn.q_proj.bias', 'model.decoder.layers.0.self_attn.q_proj.weight', 'model.decoder.layers.0.self_attn.v_proj.bias', 'model.decoder.layers.0.self_attn.v_proj.weight', 'model.decoder.layers.1.encoder_attn.q_proj.bias', 'model.decoder.layers.1.encoder_attn.q_proj.weight', 'model.decoder.layers.1.encoder_attn.v_proj.bias', 'model.decoder.layers.1.encoder_attn.v_proj.weight', 'model.decoder.layers.1.self_attn.q_proj.bias', 'model.decoder.layers.1.self_attn.q_proj.we

ValueError: weight is on the meta device, we need a `value` to put in on cpu.

In [None]:
import json
import huggingface_hub
from datasets import Dataset
import os
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
)
from peft import LoraConfig
from trl import SFTTrainer

# Load the raw training export; the records sit under its top-level "data" key.
with open('1113_tech_train_set_1195228.json', 'r', encoding='utf-8') as f:
    json_data = json.load(f)
json_test = json_data['data']

# Keep only the "ko"/"en" fields of the records that carry both of them.
ko_en_list = [
    {"ko": record["ko"], "en": record["en"]}
    for record in json_test
    if 'ko' in record and 'en' in record
]

# Write the pairs out as a bare JSON list (the "data" wrapper is dropped).
with open('ko_en_test.json', 'w', encoding='utf-8') as f:
    json.dump(ko_en_list, f, ensure_ascii=False, indent=4)

# Log in to the Hugging Face Hub.
# SECURITY: an access token used to be hard-coded here. A token that has been
# written into a notebook must be treated as leaked and revoked. It is now
# read from the HF_TOKEN environment variable; when that is unset,
# token=None lets login() fall back to its interactive prompt.
huggingface_hub.login(token=os.environ.get("HF_TOKEN"))

# Build a Dataset from the preprocessed ko/en pairs and publish it to the Hub.
dataset = Dataset.from_json('ko_en_test.json')
dataset.push_to_hub('dawon62/test')

# Model selection.
base_model = "Helsinki-NLP/opus-mt-tc-big-en-ko"  # checkpoint to fine-tune

# Load model and tokenizer.
# NOTE(review): base_model is a Marian translation checkpoint. Loading it via
# AutoModelForCausalLM yields MarianForCausalLM, and the recorded run warned
# that lm_head and the decoder embeddings were newly initialized. Confirm
# whether a seq2seq class (e.g. AutoModelForSeq2SeqLM) was intended.
model = AutoModelForCausalLM.from_pretrained(base_model)
tokenizer = AutoTokenizer.from_pretrained(base_model)

# Keep the tokenizer's own pad token when it defines one and only fall back to
# EOS otherwise. Overwriting it unconditionally makes padding and
# end-of-sequence share one id, so a collator that masks padding positions
# also masks every EOS label.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# LoRA adapter configuration: rank-64 adapters on the attention q/v projections.
peft_params = LoraConfig(
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

# Mixed-precision selection, using the same capability check as the earlier
# quantized training cell: bf16 on GPUs with compute capability >= 8, fp16 on
# older CUDA GPUs, full precision without CUDA. bf16=True was hard-coded
# before, which TrainingArguments rejects on GPUs without bf16 support.
cuda_available = torch.cuda.is_available()
use_bf16 = cuda_available and torch.cuda.get_device_capability()[0] >= 8
use_fp16 = cuda_available and not use_bf16

# Training hyper-parameters.
training_params = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,  # number of passes over the data
    per_device_train_batch_size=2,  # per-device batch size
    gradient_accumulation_steps=1,
    optim="adamw_hf",
    save_steps=50,
    logging_steps=50,
    learning_rate=2e-5,
    weight_decay=0.01,
    bf16=use_bf16,
    fp16=use_fp16,
    max_grad_norm=1.0,  # gradient clipping threshold
    max_steps=-1,  # -1: let num_train_epochs decide
    warmup_ratio=0.1,
    group_by_length=True,
    lr_scheduler_type="linear",
    report_to="tensorboard",
)

# Supervised fine-tuning trainer.
# NOTE(review): only the "en" column is used as training text; the "ko" side
# of each pair is not referenced here. The recorded run logged a training loss
# of 0.0 from step 100 onward. Confirm the data setup is what was intended.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_params,
    dataset_text_field="en",  # column used as the input text
    tokenizer=tokenizer,
    args=training_params,
    packing=True,
)

# Run fine-tuning.
trainer.train()

# Upload the trained model to the Hugging Face Hub.
# The LoRA weights are first folded back into the base layers, so the uploaded
# checkpoint has ordinary q_proj/v_proj weights and can be reloaded with a
# plain from_pretrained(). Pushing `model` directly uploaded the module with
# adapter layers still injected, which does not reload cleanly.
model = trainer.model.merge_and_unload()
model.push_to_hub("dawon62/hkcode-mt-en-ko")
tokenizer.push_to_hub("dawon62/hkcode-mt-en-ko")

# Reload the model and tokenizer from the Hub for a smoke test.
# NOTE(review): this reloads through AutoModelForCausalLM, the same class used
# for training. Confirm that is the intended class for a translation model.
loaded_model = AutoModelForCausalLM.from_pretrained('dawon62/hkcode-mt-en-ko')
loaded_tokenizer = AutoTokenizer.from_pretrained('dawon62/hkcode-mt-en-ko')

# Smoke-test helper.
def translate(text):
    """Generate from the reloaded model for *text* and return the decoded string.

    The text is tokenized into PyTorch tensors, handed to ``generate`` with
    gradient tracking disabled, and the first returned sequence is decoded
    with special tokens stripped.
    """
    encoded = loaded_tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        generated = loaded_model.generate(**encoded)
    first_sequence = generated[0]
    return loaded_tokenizer.decode(first_sequence, skip_special_tokens=True)

# Translation smoke test: run one English sentence through translate() and
# print the input and the generated output.
sample_text = "hello, how are you?"
translated_text = translate(sample_text)
print(f"원문: {sample_text}")  # label "원문" = source text
print(f"번역: {translated_text}")  # label "번역" = translation


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


Generating train split: 0 examples [00:00, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1196 [00:00<?, ?ba/s]

Some weights of MarianForCausalLM were not initialized from the model checkpoint at Helsinki-NLP/opus-mt-tc-big-en-ko and are newly initialized: ['lm_head.weight', 'model.decoder.embed_positions.weight', 'model.decoder.embed_tokens.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Generating train split: 0 examples [00:00, ? examples/s]



Step,Training Loss
50,5.24
100,0.0
150,0.0
200,0.0
250,0.0
300,0.0
350,0.0
400,0.0
450,0.0
500,0.0


In [None]:
import json
import huggingface_hub
from datasets import Dataset
from datasets import load_dataset
import os
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
)
from peft import LoraConfig
from trl import SFTTrainer

# Load the raw training export; the records sit under its top-level "data" key.
with open('1113_tech_train_set_1195228.json', 'r', encoding='utf-8') as f:
    json_data = json.load(f)
json_test = json_data['data']

# Keep only the "ko"/"en" fields of the records that carry both of them.
ko_en_list = [
    {"ko": record["ko"], "en": record["en"]}
    for record in json_test
    if 'ko' in record and 'en' in record
]

# Write the pairs out as a bare JSON list (the "data" wrapper is dropped).
with open('ko_en_test.json', 'w', encoding='utf-8') as f:
    json.dump(ko_en_list, f, ensure_ascii=False, indent=4)

# Log in to the Hugging Face Hub.
# SECURITY: an access token used to be hard-coded here. A token that has been
# written into a notebook must be treated as leaked and revoked. It is now
# read from the HF_TOKEN environment variable; when that is unset,
# token=None lets login() fall back to its interactive prompt.
huggingface_hub.login(token=os.environ.get("HF_TOKEN"))

# Publish the preprocessed pairs to the Hub, then load the train split back
# from the Hub repo for training.
dataset = Dataset.from_json('ko_en_test.json')
dataset.push_to_hub('dawon62/test')
hkcode_dataset = "dawon62/test"  # Hub dataset id
dataset = load_dataset(hkcode_dataset, split="train")

# Model selection.
base_model = "Helsinki-NLP/opus-mt-tc-big-en-ko"  # checkpoint to fine-tune

# Load model and tokenizer.
# NOTE(review): base_model is a Marian translation checkpoint. Loading it via
# AutoModelForCausalLM yields MarianForCausalLM, and an earlier recorded run
# warned that lm_head and the decoder embeddings were newly initialized.
# Confirm whether a seq2seq class (e.g. AutoModelForSeq2SeqLM) was intended.
model = AutoModelForCausalLM.from_pretrained(base_model)
tokenizer = AutoTokenizer.from_pretrained(base_model)

# Keep the tokenizer's own pad token when it defines one and only fall back to
# EOS otherwise. Overwriting it unconditionally makes padding and
# end-of-sequence share one id, so a collator that masks padding positions
# also masks every EOS label. This matters here because packing is disabled
# and batches are padded.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# LoRA adapter configuration: rank-64 adapters on the attention q/v projections.
peft_params = LoraConfig(
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

# Mixed-precision selection, using the same capability check as the earlier
# quantized training cell: bf16 on GPUs with compute capability >= 8, fp16 on
# older CUDA GPUs, full precision without CUDA. bf16=True was hard-coded
# before, which TrainingArguments rejects on GPUs without bf16 support.
cuda_available = torch.cuda.is_available()
use_bf16 = cuda_available and torch.cuda.get_device_capability()[0] >= 8
use_fp16 = cuda_available and not use_bf16

# Training hyper-parameters.
training_params = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,  # number of passes over the data
    per_device_train_batch_size=2,  # per-device batch size
    gradient_accumulation_steps=1,
    optim="adamw_hf",
    save_steps=50,
    logging_steps=50,
    learning_rate=2e-5,
    weight_decay=0.01,
    bf16=use_bf16,
    fp16=use_fp16,
    max_grad_norm=1.0,  # gradient clipping threshold
    max_steps=-1,  # -1: let num_train_epochs decide
    warmup_ratio=0.1,
    group_by_length=True,
    lr_scheduler_type="linear",
    report_to="tensorboard",
)

# Supervised fine-tuning trainer (one example per row, no packing).
# NOTE(review): only the "en" column is used as training text; the "ko" side
# of each pair is not referenced here. Confirm this is intended for a
# translation model.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_params,
    dataset_text_field="en",  # column used as the input text
    tokenizer=tokenizer,
    args=training_params,
    packing=False,
)

# Run fine-tuning.
trainer.train()

# Upload the trained model to the Hugging Face Hub.
# The LoRA weights are first folded back into the base layers, so the uploaded
# checkpoint has ordinary q_proj/v_proj weights and can be reloaded with a
# plain from_pretrained(). Pushing `model` directly uploaded the module with
# adapter layers still injected, which does not reload cleanly.
model = trainer.model.merge_and_unload()
model.push_to_hub("dawon62/hkcode-mt-en-ko")
tokenizer.push_to_hub("dawon62/hkcode-mt-en-ko")

# Reload the model and tokenizer from the Hub for a smoke test.
# NOTE(review): this reloads through AutoModelForCausalLM, the same class used
# for training. Confirm that is the intended class for a translation model.
loaded_model = AutoModelForCausalLM.from_pretrained('dawon62/hkcode-mt-en-ko')
loaded_tokenizer = AutoTokenizer.from_pretrained('dawon62/hkcode-mt-en-ko')

# Smoke-test helper.
def translate(text):
    """Generate from the reloaded model for *text* and return the decoded string.

    The text is tokenized into PyTorch tensors, handed to ``generate`` with
    gradient tracking disabled, and the first returned sequence is decoded
    with special tokens stripped.
    """
    encoded = loaded_tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        generated = loaded_model.generate(**encoded)
    first_sequence = generated[0]
    return loaded_tokenizer.decode(first_sequence, skip_special_tokens=True)

# Translation smoke test: run one English sentence through translate() and
# print the input and the generated output.
sample_text = "hello, how are you?"
translated_text = translate(sample_text)
print(f"원문: {sample_text}")  # label "원문" = source text
print(f"번역: {translated_text}")  # label "번역" = translation
