#### 실습은 런팟에서 진행하고 코드만 저장한다.

- requirements.txt 파일 생성 
transformers==4.44.2 
datasets==2.18.0 
accelerate==0.29.3 
evaluate==0.4.1 
bitsandbytes==0.43.1 
huggingface_hub>=0.23.2 
trl==0.8.6 
peft==0.10.0
scikit-learn 
wandb 


pip install -r requirements.txt 


In [None]:
import torch
import transformers
import trl

# Report the installed library versions so the environment can be compared
# with the pinned requirements before any model is loaded.
print(f"PyTorch version: {torch.__version__}")
print(f"Transformers version: {transformers.__version__}")
print(f"TRL version: {trl.__version__}")

# Ask for CUDA availability once and reuse the answer for the report and the guard.
cuda_ready = torch.cuda.is_available()
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    # The CUDA version is only reported when CUDA is actually usable.
    print(f"CUDA version: {torch.version.cuda}")

# Authenticate with the Hugging Face Hub before loading the model.
# The token is read from the HF_TOKEN environment variable so that the real
# key never has to be written into this saved source file. The placeholder
# is only the fallback used when the variable is not set; it is not a valid
# key and must be replaced in that case.
import os

from huggingface_hub import login

login(
    token=os.environ.get("HF_TOKEN", "Your_Huggingface_API_KEY"),
    add_to_git_credential=True,
)



import json 
import torch
from datasets import Dataset, load_dataset
from trl import (setup_chat_format, 
                 DataCollatorForCompletionOnlyLM, 
                 SFTTrainer)
from peft import AutoPeftModelForCausalLM, LoraConfig, PeftConfig 
from transformers import (AutoTokenizer, 
                          AutoModelForCausalLM, 
                          TrainingArguments, 
                          BitsAndBytesConfig, 
                          pipeline, 
                          StoppingCriteria)

# Hub repository id of the base checkpoint used for both model and tokenizer.
model_id = "google/gemma-2-9b-it"

# Load the causal LM weights in bfloat16 with the eager attention implementation;
# device placement is delegated to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    attn_implementation="eager",
    torch_dtype=torch.bfloat16,
    device_map="auto",
    # load_in_8bit=True
)

# Tokenizer that belongs to the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id)


In [None]:
!wget https://raw.githubusercontent.com/MrBananaHuman/CounselGPT/main/total_kor_multiturn_counsel_bot.jsonl

In [None]:
# Read the downloaded corpus: one JSON document per line (JSON Lines).
# The encoding is given explicitly because the data is Korean text and the
# platform default encoding is not guaranteed to be UTF-8; it also matches
# the UTF-8 encoding used when train_dataset.jsonl is written.
with open("./total_kor_multiturn_counsel_bot.jsonl", encoding="utf-8") as f:
    # Blank lines (e.g. a trailing empty line) are skipped instead of crashing json.loads.
    original_jsonl_data = [json.loads(line) for line in f if line.strip()]

# Number of conversations loaded (only displayed when run as its own notebook cell).
len(original_jsonl_data)
# Take one conversation as a sample to prototype the preprocessing on.
test_data = original_jsonl_data[5085]

# Corpus speaker labels -> chat roles ('내담자' = client, '상담사' = counselor).
speaker_dict = {'내담자': 'user', '상담사': 'assistant'}

# Explicit-loop form of the conversion; the comprehension below builds the same list.
tmp = []
for m in test_data:
    tmp.append({'role': speaker_dict[m['speaker']], 'content': m['utterance']})

converted_messages = [{'role': speaker_dict[m['speaker']], 'content': m['utterance']} for m in test_data]

# If the conversation starts with the assistant, drop that first message.
if converted_messages and converted_messages[0]['role'] == 'assistant':
    converted_messages = converted_messages[1:]
# If the conversation ends with user messages, drop all of them.
while converted_messages and converted_messages[-1]['role'] == 'user':
    converted_messages = converted_messages[:-1]

# Merge consecutive messages from the same speaker into a single turn.
merged = []
# Guard: the stripping above can leave nothing, and indexing [0] on an empty
# list would raise IndexError.
if converted_messages:
    current_role = converted_messages[0]['role']
    current_content = converted_messages[0]['content']
    for message in converted_messages[1:]:
        if message['role'] == current_role:
            # Same speaker again: join the utterances with a space.
            current_content = current_content + " " + message['content']
        else:
            # Speaker changed: store the finished turn and start a new one.
            merged.append({'role': current_role, 'content': current_content})
            current_role = message['role']
            current_content = message['content']
    # Store the turn that was still being accumulated when the loop ended.
    merged.append({'role': current_role, 'content': current_content})

# Show the last three converted messages.
converted_messages[-3:]

In [None]:
# Function form of the step-by-step preprocessing logic.
def preprocess_conversation(messages):
    """Convert one raw conversation into cleaned chat turns.

    Each item of ``messages`` is read through its ``'speaker'`` and
    ``'utterance'`` keys; the speaker label is translated to a role with the
    module-level ``speaker_dict``.

    Returns:
        A list of ``{'role': ..., 'content': ...}`` dicts, or an empty list
        when no message is left or the number of merged turns is odd.
    """
    # Translate speaker labels into roles.
    turns = []
    for m in messages:
        turns.append({'role': speaker_dict[m['speaker']], 'content': m['utterance']})

    # A conversation must not open with the assistant: drop that first message.
    if turns and turns[0]['role'] == 'assistant':
        del turns[0]

    # A conversation must not close with the user: drop every trailing user message.
    while turns and turns[-1]['role'] == 'user':
        turns.pop()

    # Collapse runs of messages from the same role into single turns.
    turns = merge_consecutive_messages(turns)

    # Keep only non-empty conversations with an even number of turns.
    if turns and len(turns) % 2 == 0:
        return turns
    return []

In [None]:
# Helper that merges consecutive messages sharing the same role.
def merge_consecutive_messages(messages):
    """Join adjacent messages of the same role into one message.

    Contents of merged messages are concatenated with a single space.
    The input items are not modified; the result is built from new dicts
    that carry only the ``'role'`` and ``'content'`` keys.

    Returns:
        The merged list, or an empty list when ``messages`` is empty.
    """
    if not messages:
        return []

    merged = []
    for message in messages:
        role = message['role']
        if merged and merged[-1]['role'] == role:
            # Same role as the previous turn: extend that turn.
            merged[-1]['content'] += " " + message['content']
        else:
            # First message or a role change: start a new turn.
            merged.append({'role': role, 'content': message['content']})
    return merged

In [None]:
# Applies the per-conversation preprocessing function to the whole dataset.
def transform_to_new_format(original_data):
    """Preprocess every conversation and keep the non-empty results.

    Each element of ``original_data`` is passed to ``preprocess_conversation``;
    conversations for which it returns an empty list are left out.

    Returns:
        A list of preprocessed conversations in their original order.
    """
    processed = (preprocess_conversation(conversation) for conversation in original_data)
    return [conversation for conversation in processed if conversation]

# Preprocess every conversation of the loaded corpus.
result = transform_to_new_format(original_jsonl_data)

# Store the result as JSON Lines: one {"messages": [...]} object per line.
with open("./train_dataset.jsonl", mode="w", encoding="utf-8") as f:
    for conversation in result:
        json_obj = {'messages': conversation}
        # ensure_ascii=False keeps non-ASCII text unescaped in the file.
        f.write(json.dumps(json_obj, ensure_ascii=False) + "\n")

# With ensure_ascii=True, json.dump would instead write every non-ASCII (Korean) character as a \uXXXX escape sequence.

In [None]:
# Print only the last line of the file (tail is a Linux shell command, so it needs the "!" prefix inside a notebook cell).
!tail -n 1 train_dataset.jsonl

In [None]:
# Load the preprocessed JSON Lines file as the "train" split.
dataset = load_dataset(
    "json",
    split="train",
    data_files="./train_dataset.jsonl",
)
# Bare expression: shows the dataset summary when run as the last line of a notebook cell.
dataset

# LoRA adapter settings for causal-LM fine-tuning.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    # Adapter rank and the alpha value that goes with it.
    r=256,
    lora_alpha=128,
    lora_dropout=0.05,
    bias="none",
    # Names of the modules that receive an adapter.
    # NOTE(review): these look like the attention and MLP projection layers
    # of the base model - confirm against the model's module names.
    target_modules=[
        "q_proj", "up_proj", "o_proj", "k_proj",
        "down_proj", "gate_proj", "v_proj",
    ],
)
# Training hyperparameters for the fine-tuning run.
args = TrainingArguments(
    # --- output / bookkeeping ---
    output_dir="./model_output",
    save_strategy="epoch",
    logging_steps=100,
    push_to_hub=False,
    #report_to="wandb",

    # --- schedule ---
    num_train_epochs=1,
    learning_rate=2e-4,
    lr_scheduler_type="constant",
    # NOTE(review): with the plain "constant" schedule the warmup ratio may
    # not be applied ("constant_with_warmup" exists for that) - confirm.
    warmup_ratio=0.03,

    # --- batching: 2 samples per device x 4 accumulation steps ---
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,

    # --- optimizer / numerics ---
    optim="adamw_torch_fused",
    max_grad_norm=0.3,
    bf16=True,
    # NOTE(review): tf32 presumably requires a GPU that supports it - confirm for the target pod.
    tf32=True,
)

# Build the supervised fine-tuning trainer from the model, tokenizer, dataset,
# LoRA configuration and training arguments.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    peft_config=peft_config,
    args=args,
    # Sequence handling: 512-token limit with packing enabled.
    # NOTE(review): packing presumably concatenates short samples up to
    # max_seq_length - confirm in the TRL documentation.
    max_seq_length=512,
    packing=True,
)

In [None]:
#터미널에서 
watch -n 1 nvidia-smi