### Setup ###

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig, TrainingArguments
import transformers
import torch

from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [None]:
# Log in to the Hugging Face Hub non-interactively by scripting
# `huggingface-cli login` with `expect`.
!sudo apt-get install expect -y # use ‘expect’ to handle interaction
!touch cmd.txt

# Put your Hugging Face token in the `send` line below.
# NOTE(review): the token ends up in plain text in this cell and in cmd.txt;
# do not commit or share either with a real token in place.
context = """
spawn huggingface-cli login
expect "Enter your token (input will not be visible):"
send "###########enteryourtoken############\r"
expect "Add token as git credential? (Y/n)"
send "y\r"
interact
"""
# Write the expect script to disk so the shell command below can run it.
with open('cmd.txt', 'w') as f:
    f.write(context)

!expect cmd.txt

### Preprocessing ###

In [None]:
def generate_chat_prompts(example):
    """Render a batch of conversations as chat-template training strings.

    Args:
        example: Mapping (e.g. a dataset batch) with two parallel sequences:
            'document' - one list of utterances per conversation. Utterances
            alternate user / assistant, starting with the user, and each list
            must hold an odd number of them so that the last one is a user
            turn.
            'summary' - for each conversation, the utterance that follows the
            last one in 'document'; it becomes the final assistant turn.

    Returns:
        list[str]: one string per conversation, rendered by the module-level
        `tokenizer`'s chat template, e.g. for Gemma
        ``<bos><start_of_turn>user\\n{utt1}<end_of_turn>\\n<start_of_turn>model\\n{utt2}
        <end_of_turn>\\n ... <start_of_turn>model\\n{summary}<end_of_turn><eos>``.
        Every string starts with a user turn, ends with a model turn and is
        terminated by the tokenizer's EOS token.

    Raises:
        IndexError: if a conversation has an even number of utterances
            (including zero). The exception type matches what the previous
            implementation raised implicitly for such input.
    """
    # Use the tokenizer's real EOS token. The literal text '<EOS>' is not a
    # special token and would be tokenized as ordinary characters.
    eos_token = tokenizer.eos_token or ""

    output_texts = []
    for i in range(len(example['document'])):
        utterances = example['document'][i]
        if len(utterances) % 2 == 0:
            raise IndexError(
                f"document[{i}] has {len(utterances)} utterances; an odd number "
                "is required so the conversation ends on a user turn"
            )

        # Even positions are user turns, odd positions are assistant turns.
        messages = [
            {"role": "user" if turn % 2 == 0 else "assistant", "content": f"{text}"}
            for turn, text in enumerate(utterances)
        ]
        # The target utterance is the assistant's final reply.
        messages.append({"role": "assistant", "content": f"{example['summary'][i]}"})

        chat_message = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
        # Drop the template's trailing newline (if any) so EOS directly follows
        # the last end-of-turn marker.
        if chat_message.endswith("\n"):
            chat_message = chat_message[:-1]
        output_texts.append(chat_message + eos_token)

    return output_texts

In [None]:
# Smoke test for generate_chat_prompts on a single hand-written conversation:
# three alternating utterances (user, assistant, user) plus the assistant
# reply that should follow them.
# NOTE(review): generate_chat_prompts reads the global `tokenizer`, which is
# only created in the Fine-tuning section further down; on a fresh kernel that
# cell has to be run before this one.
chat = ['hello', "Hello! 👋  How can I help you today? 😊", "what fruit do you like??"]
ex = {"document" : [chat],
      "summary" : ["What about you? What's your favorite fruit? 🍓🍎🍊🍌  😋 "]
     }
generate_chat_prompts(ex)

In [None]:
# One-off generation check: render a short chat with the chat template,
# generate up to 100 new tokens and print the decoded sequence.
# NOTE(review): uses the globals `model` and `tokenizer`, which are only
# created in the Fine-tuning section further down.
chat = [{"role":"user", "content":"hello"},{"role":"assistant", "content":"Hello! 👋  How can I help you today? 😊"}, {"role":"user", "content":"what fruit do you don't like?"}]
prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
# add_special_tokens=False: the prompt string is encoded exactly as rendered above.
inputs = tokenizer.encode(prompt, add_special_tokens=False, return_tensors='pt')
outputs = model.generate(input_ids=inputs.to(model.device), max_new_tokens=100)

# Uncomment one of the next two lines to inspect the intermediate values.
#prompt
#inputs
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

### DATA ###

In [None]:
import pickle

def pickle_save(data, file_name):
    """Pickle `data` into the file '<file_name>.pickle'.

    Args:
        data: Any picklable object.
        file_name: Target path without extension; '.pickle' is appended.

    Returns:
        None
    """
    target_path = "{}.pickle".format(file_name)
    with open(target_path, "wb") as sink:
        pickle.dump(data, sink)
 
def pickle_load(path):
    """Read and return the object pickled in the file at `path`.

    Args:
        path: Full path of the pickle file, extension included.

    Returns:
        The unpickled object.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(path, "rb") as source:
        loaded = pickle.load(source)
    return loaded

In [None]:
# Load the pickled messenger-conversation corpus from the Kaggle input directory.
# NOTE(review): later cells index it as source_dataset[i][...], so it is
# presumably a sequence of conversations, each a sequence of utterances — confirm.
source_dataset = pickle_load("/kaggle/input/data20240923/conversation_messenger_corpus.pkl")

In [None]:
import numpy as np 
from datasets import Dataset

# Build the training set: for each conversation keep a random even-length
# prefix; all but its last utterance become the context ('document') and the
# last utterance becomes the target reply ('summary').
SUBSET_DIVISOR = 20  # only the first 1/SUBSET_DIVISOR of the corpus is used
SEED = 42            # fixed seed so the sampled prefixes are reproducible
rng = np.random.default_rng(SEED)

doc = []
summ = []
for i in range(len(source_dataset) // SUBSET_DIVISOR):
    conversation = source_dataset[i]
    # At least one context utterance plus one reply is needed.
    if len(conversation) < 2:
        continue
    # Number of (user, assistant) pairs to keep. The upper bound is exclusive;
    # max(2, ...) keeps conversations of 2-3 utterances usable (one pair)
    # instead of failing on an empty sampling range.
    n_pairs = int(rng.integers(1, max(2, len(conversation) // 2)))
    data_segmentation = conversation[:2 * n_pairs]
    doc.append(data_segmentation[:-1])
    summ.append(data_segmentation[-1])

dataset = Dataset.from_dict({"document": doc, "summary": summ})

dataset

In [None]:
# Format the whole dataset once and show the first rendered prompt as a sanity check.
prompt_test = generate_chat_prompts(dataset)
prompt_test[0]

In [None]:
# Print the second prompt so its line breaks are rendered rather than shown as escapes.
print(prompt_test[1])

### chatbot ###

In [None]:
# Interactive chat loop: keep generating replies until the user types <EOS>.
# Relies on the globals `model` and `tokenizer`.

# Alternative starting history in which the model speaks first:
#chat = [{"role":"user", "content":""},{"role":"assistant", "content":"can you bring me some drink?"}] # model speaks first
chat = []
print("user: ")
new_chat = input() #new_chat = "<EOS>" 

while new_chat != "<EOS>":
    chat.append({"role":"user", "content":f"{new_chat}"})

    # Render the full history and leave the model's turn open.
    prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer.encode(prompt, add_special_tokens=False, return_tensors='pt')
    outputs = model.generate(input_ids=inputs.to(model.device), max_new_tokens=150)

    # The output starts with the prompt tokens; decode only what was generated
    # after them. Splitting the decoded text on "model\n" would cut off any
    # answer that happens to contain that text itself.
    answer = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True)
    chat.append({"role":"assistant", "content":f"{answer}"})
    print("\nmodel: \n", answer)

    print("user: ")
    new_chat = input()

### Fine-tuning ###

In [None]:
# LoRA adapter settings: low-rank adapters on the attention projections
# (q/k/v/o) and the MLP projections (gate/up/down).
lora_config = LoraConfig(
    r=6,
    lora_alpha = 8,
    lora_dropout = 0.05,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)

# 4-bit NF4 quantization settings for loading the base model.
# NOTE(review): compute dtype is float16 here, while the base model is loaded
# with torch_dtype=torch.bfloat16 in the next cell — confirm the mix is intended.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

In [None]:
# Base checkpoint to fine-tune.
BASE_MODEL = "google/gemma-2-2b-it"

# Load the base model quantized according to bnb_config, plus its tokenizer.
# These two globals are also used by the preprocessing and chatbot cells above.
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, device_map="auto", torch_dtype=torch.bfloat16, quantization_config=bnb_config)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
# Pad on the right for training.
tokenizer.padding_side = 'right'

In [None]:
# The whole sampled dataset is used for training; no validation split is made here.
train_data = dataset

In [None]:
# Supervised fine-tuning with LoRA. Each batch is turned into text by
# generate_chat_prompts (passed as formatting_func).
# NOTE(review): the `tokenizer` configured above is not passed to SFTTrainer,
# so its padding_side setting may not reach the trainer — confirm whether it
# should be passed explicitly for the installed trl version.
trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    max_seq_length=512,
    args=TrainingArguments(
        output_dir="outputs",
        # Training length is set by max_steps; the epoch-based setting is disabled.
        #num_train_epochs = 10,
        max_steps=1000,
        # 1 sample per device x 4 accumulation steps = 4 samples per optimizer step.
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        optim="paged_adamw_8bit",
        learning_rate=2e-4,
        fp16=True,
        logging_steps=100,
        push_to_hub=False,
        report_to='none',
    ),
    peft_config=lora_config,
    formatting_func=generate_chat_prompts,
)

In [None]:
# Run fine-tuning (max_steps=1000 as configured on the trainer).
trainer.train()

In [None]:
# Directory in which the trained LoRA adapter is stored.
ADAPTER_MODEL = "lora_adapter"

# Save the trainer's model to ADAPTER_MODEL; it is reloaded as an adapter in the next cell.
trainer.model.save_pretrained(ADAPTER_MODEL)

In [None]:
# Reload the base model in float16 (no quantization config this time), attach
# the saved adapter and merge it into the base weights.
# NOTE(review): this rebinds the global `model`; the quantized training model
# is still referenced by `trainer` — check memory headroom.
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, device_map='auto', torch_dtype=torch.float16)
model = PeftModel.from_pretrained(model, ADAPTER_MODEL, device_map='auto', torch_dtype=torch.float16)

model = model.merge_and_unload()
# NOTE(review): only the model is saved here, not the tokenizer; the directory
# name ends in "sum-ko" although this notebook trains on next-utterance
# prediction — confirm the intended name.
model.save_pretrained('gemma-2-2b-it-sum-ko')

In [None]:
# Text-generation pipeline around the merged model; up to 512 new tokens per call.
pipe_finetuned = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=512)


In [None]:
#doc = dataset['test']['document'][10]


In [None]:
# Disabled code kept as a bare string literal: a single-turn summarization
# prompt built from `doc`. Nothing in it is executed.
"""messages = [
    {
        "role": "user",
        "content": "다음 글을 요약해주세요:\n\n{}".format(doc)
    }
]
prompt = pipe_finetuned.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)"""

In [None]:
# Hand-written Korean conversation (user, assistant, user) used to test the
# fine-tuned model; the model is expected to produce the next assistant turn.
messages = [
    {
        "role": "user",
        "content": "야, 이번 주말에 뭐 해? 영화 새로 개봉한 거 보러 갈래?"},
    {
        "role": "assistant",
        "content": "좋지! 근데 과제도 좀 해야 해서 시간이 맞을지 모르겠어. 너 이번에 수학 숙제 다 했어?"
    },
    {
        "role": "user",
        "content": "아직 안 했어. 나도 좀 미루고 있었어. 같이 모여서 끝내고 영화 보는 건 어때?"
    }
]
# Render the conversation as text and leave the model's turn open.
prompt = pipe_finetuned.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

In [None]:
# Sample a reply from the fine-tuned model for the rendered prompt.
outputs = pipe_finetuned(
    prompt,
    do_sample=True,
    temperature=0.2,
    top_k=50,
    top_p=0.95,
    # The prompt was rendered by apply_chat_template and already carries the
    # template's special tokens; adding them again would duplicate the BOS
    # token. This matches the add_special_tokens=False used by the other
    # generation cells in this notebook.
    add_special_tokens=False
)
# generated_text contains the prompt followed by the continuation; print only the continuation.
print(outputs[0]["generated_text"][len(prompt):])

In [None]:
# Show the prompt and the generated continuation side by side as a tuple.
prompt, outputs[0]["generated_text"][len(prompt):]

In [None]:
import time

# Sleep in 15-second steps, 3600 times (15 hours in total).
# NOTE(review): presumably here to keep the notebook session alive — confirm it is still needed.
for _ in range(3600):
    time.sleep(15)