In [None]:
!pip3 install -q -U transformers
!pip3 install -q -U datasets
!pip3 install -q -U bitsandbytes
!pip3 install -q -U peft
!pip3 install -q -U trl
!pip3 install -q -U accelerate

In [None]:
# Log in to the Hugging Face Hub from inside the notebook.
# NOTE(review): presumably required to download the model loaded further down —
# confirm whether that repository is gated/private.
import huggingface_hub
huggingface_hub.notebook_login()

In [None]:
import os
import json
import argparse
from datetime import datetime
import random
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig, TrainingArguments
import transformers
import torch

from peft import LoraConfig, PeftModel
from trl import SFTTrainer

# Display name printed in front of every bot reply.
CONST_STR_PROGRAM_NAME = "챗봇"
# Model id handed to Chatbot(...) when the counselling bot is created.
CONST_STR_COUNSEL_MODEL = "vvoooo/gemma-2-2b-it-eym-ko"
# When this substring appears in a model answer, conversation() stops the session.
CONST_STR_END_MESSAGE = "수고하셨습니다."

# NOTE(review): not referenced by any of the visible cells — confirm it is still needed.
CONST_PATH_MODEL = "./model"

# Category labels: get_category() searches answers for them, and one of them is
# drawn at random as the opening user message of a session.
CONST_LIST_CATEGORIES = ['가정폭력', '가출경험 및 가출중 정황', '걱정', '교사', '기타 보호자', '미래/진로', '방임', '분노/짜증', '성학대', '수면', '신체손상', '신체학대', '아버지', '어머니', '자해/자살', '정서학대', '즐거움', '친구', '통증', '트라우마', '학교폭력', '행복', '형제자매']

# 4-bit NF4 quantization settings; used as the default `quantization_config` of Chatbot.
# NOTE(review): the compute dtype here is float16 while Chatbot loads the model with
# torch_dtype=torch.bfloat16 — confirm the mismatch is intentional.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)


In [None]:
class Chatbot():
    """Thin wrapper around a causal language model and its tokenizer for chat-style generation."""

    def __init__(self, model_id, quantization_config=bnb_config):
        """Load the model and tokenizer identified by ``model_id``.

        Args:
            model_id: Identifier passed to ``from_pretrained`` for both the
                model and the tokenizer.
            quantization_config: Quantization settings forwarded to the model
                loader; defaults to the module-level ``bnb_config``.
        """
        self.model = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map="auto",
            torch_dtype=torch.bfloat16,
            quantization_config=quantization_config,
        )
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
        # The tokenizer attribute is `padding_side`. The previous assignment to
        # `padding_size` only created an unused attribute and configured nothing.
        self.tokenizer.padding_side = "right"

    def add_sepcial_tokens(self, special_tokens:list=None):
        """Register custom special tokens on the tokenizer and the model.

        The misspelled method name is kept for backward compatibility; the
        correctly spelled ``add_special_tokens`` alias is defined below.

        Args:
            special_tokens: Token strings, each wrapped in ``<`` and ``>``.
                ``None`` (the default) is treated as an empty list.

        Raises:
            AssertionError: If a token is not wrapped in ``<`` ... ``>``.
        """
        # Copy so that neither a shared default nor the caller's list is reused.
        special_tokens = list(special_tokens) if special_tokens else []
        for token in special_tokens:
            # startswith/endswith also reject the empty string cleanly
            # (indexing with [0] would raise IndexError instead).
            assert token.startswith("<") and token.endswith(">"), \
                f"special token must look like '<...>': {token!r}"

        # Define the custom tokens in the mapping format the tokenizer expects.
        token_map = {'additional_special_tokens': special_tokens}

        # Add the custom tokens to the tokenizer and resize the model embeddings
        # so every new token id has an embedding row.
        self.tokenizer.add_special_tokens(token_map)
        self.model.resize_token_embeddings(len(self.tokenizer))

        # Mirror the new tokens into the model configuration.
        self.model.config.update(token_map)

    # Correctly spelled alias for the method above.
    add_special_tokens = add_sepcial_tokens

    def ask_answer(self, chat, friendly:bool=False):
        """Generate the model's reply to a chat history.

        Args:
            chat: Message list accepted by the tokenizer's chat template
                (the callers in this notebook pass dicts with ``role`` and
                ``content`` keys).
            friendly: When True, sample with temperature/top-k/top-p and a
                repetition penalty, capped at 100 new tokens. When False, call
                ``generate`` with only ``max_new_tokens=150``.

        Returns:
            The decoded text of the newly generated tokens, with special
            tokens removed.
        """
        prompt = self.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
        # The chat template already inserted the special tokens, hence add_special_tokens=False.
        input_ids = self.tokenizer.encode(prompt, add_special_tokens=False, return_tensors='pt').to(self.model.device)

        if friendly:
            generation_kwargs = dict(max_new_tokens=100, temperature=0.7, top_k=50, top_p=0.9, repetition_penalty=1.2, do_sample=True)
        else:
            generation_kwargs = dict(max_new_tokens=150)
        outputs = self.model.generate(input_ids=input_ids, **generation_kwargs)

        # Decode only the tokens produced after the prompt. Splitting the full
        # decoded text on "model\n" would cut off any answer that itself
        # contains that substring.
        new_tokens = outputs[0][input_ids.shape[-1]:]
        return self.tokenizer.decode(new_tokens, skip_special_tokens=True)

def get_category(answer):
    """Return the first entry of CONST_LIST_CATEGORIES contained in ``answer``.

    Categories are checked in list order; ``None`` is returned when none of
    them occurs in ``answer``.
    """
    matches = (label for label in CONST_LIST_CATEGORIES if label in answer)
    return next(matches, None)

def conversation(new_chat, counsel_bot):
    """Run an interactive console session with the counselling bot.

    Args:
        new_chat: First user message. The sentinel ``"///"`` ends the session.
        counsel_bot: Object exposing ``ask_answer(chat)`` that returns the
            bot's reply as a string.

    Returns:
        The chat history as a list of ``{"role", "content"}`` dicts. The
        history is returned both when the bot emits CONST_STR_END_MESSAGE and
        when the user quits with ``"///"``.
    """
    chat = []
    while new_chat != "///":
        chat.append({"role":"user", "content":f"{new_chat}"})

        answer = counsel_bot.ask_answer(chat)

        if CONST_STR_END_MESSAGE in answer:
            # The bot signalled the end: print a fixed farewell instead of the
            # raw answer (which is not stored in the history) and stop.
            print(f"\n{CONST_STR_PROGRAM_NAME}: \n", "그렇구나. 너에 대해서 알아가서 너무 좋아. 다음에도 이야기 하자.", '\n')
            return chat

        print(f"\n{CONST_STR_PROGRAM_NAME}: \n", answer, '\n')
        chat.append({"role":"assistant", "content":f"{answer}"})

        print("user: ")
        new_chat = input()

    # The user quit with "///". Previously the function fell off the end here
    # and returned None, discarding the history collected so far.
    return chat

def generate_report(chat, start, end, save:bool=False):
    """Build a counselling report from a chat history and optionally save it as JSON.

    Args:
        chat: Chat history; ``chat[0]["content"]`` is stored in the ``info``
            section and the remaining messages become the report text.
        start: Session start value; also interpolated into the output file
            name. NOTE(review): must be JSON-serializable and valid in a file
            name — confirm what the callers pass.
        end: Session end value, stored in the report as-is.
        save: When True, write the report to ``./result/counsel_log_{start}.json``.

    Returns:
        The report dictionary.

    Raises:
        IndexError: If ``chat`` is empty.
    """
    report = {
        "info": {
            "ID": "0002",  # NOTE(review): hard-coded ID — confirm whether it should vary per session.
            "상담시작": start,
            "상담종료": end,
            "문항": chat[0]["content"]
        },
        "text": chat[1:]
    }

    if save:
        output_path = "./result"
        # exist_ok avoids the FileExistsError that os.mkdir raised on every
        # save after the first one.
        os.makedirs(output_path, exist_ok=True)
        # Write UTF-8 with ensure_ascii=False so the Korean text stays readable
        # in the file instead of being escaped to \uXXXX sequences; json.load
        # yields the same data either way.
        with open(os.path.join(output_path, f"counsel_log_{start}.json"), 'w', encoding="utf-8") as json_file:
            json.dump(report, json_file, ensure_ascii=False)

    return report

In [None]:
# Load the counselling model and tokenizer; Chatbot falls back to the
# module-level bnb_config for quantization because none is passed here.
counsel_bot = Chatbot(CONST_STR_COUNSEL_MODEL)

# Tell the user the bot is ready.
print(f"{CONST_STR_PROGRAM_NAME}이 나타났어요.")

In [None]:
# Draw one category uniformly at random to use as the opening user message.
category = random.choice(CONST_LIST_CATEGORIES)

# Start the interactive session; the returned value is the chat history.
chat = conversation(category, counsel_bot)