In [1]:
import warnings
warnings.filterwarnings("ignore")
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
from trl import setup_chat_format
from transformers import (AutoModelForCausalLM, 
                          AutoTokenizer, 
                          BitsAndBytesConfig, 
                          TrainingArguments, 
                          pipeline,)
from sklearn.model_selection import train_test_split

In [3]:
filename = "./data/law-all-data.csv"
df = pd.read_csv(filename, 
                 encoding="utf-8", encoding_errors="replace")
# df_groupby = df[df['input'] == '가정법률']

In [4]:
print(len(df))

2275


In [5]:
train, test = train_test_split(df,
                               shuffle=True,
                                train_size=1000,
                                test_size=1000, 
                                random_state=32)

In [6]:
eval_idx = [idx for idx in df.index if idx not in list(train.index) + list(test.index)]

In [7]:
X_eval = df[df.index.isin(eval_idx)]

In [8]:
X_train = train.reset_index(drop=True)

In [32]:
train.to_csv("./data/train.csv")
test.to_csv("./data/val.csv")
X_eval.to_csv("./data/test.csv")

In [9]:
def generate_prompt(data_point):
    return f"""
            Below is an instruction that describes a task. Write a response that appropriately completes the request.

            [{data_point["instruction"]}] = {data_point["output"]}
            """.strip()

def generate_test_prompt(data_point):
    return f"""
            Below is an instruction that describes a task. Write a response that appropriately completes the request.

            [{data_point["instruction"]}] = """.strip()

In [10]:
X_train = pd.DataFrame(X_train.apply(generate_prompt, axis=1), 
                       columns=["text"])
X_eval = pd.DataFrame(X_eval.apply(generate_prompt, axis=1), 
                      columns=["text"])

y_true = test['output']
X_test = pd.DataFrame(test.apply(generate_test_prompt, axis=1), columns=["text"])


In [11]:
train_data = Dataset.from_pandas(X_train)
eval_data = Dataset.from_pandas(X_eval)

In [12]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
import difflib

def evaluate(y_pred, y_true):
    # 코사인
    sentences = (y_pred, y_true)
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)
    cos_similar = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
    print(f"Cosine Similarity: {cos_similar[0][0]}")
    
    # 유클리디언
    tfidf_normalized = tfidf_matrix/np.sum(tfidf_matrix)
    euc_d_norm = euclidean_distances(tfidf_normalized[0:1],tfidf_normalized[1:2])
    print(f"Euclidean Similarity: {euc_d_norm[0][0]}")
    
    # 자카드
    input_string = y_true
    answer_string = y_pred
    intersection_cardinality = len(set.intersection(*[set(answer_string), set(input_string)]))
    union_cardinality = len(set.union(*[set(answer_string), set(input_string)]))
    similar = intersection_cardinality / float(union_cardinality)
    print(f"Jaccard Similarity: {similar}")
    
    # SequenceMatcher
    answer_bytes = bytes(answer_string, 'utf-8')
    input_bytes = bytes(input_string, 'utf-8')
    answer_bytes_list = list(answer_bytes)
    input_bytes_list = list(input_bytes)
    sm = difflib.SequenceMatcher(None, answer_bytes_list, input_bytes_list)
    sequenceMatcher = sm.ratio()
    print(f"SequenceMatcher: {sequenceMatcher}")
    

In [13]:
model_name = "../../../../llama2"
compute_dtype = getattr(torch, "float16")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, 
    bnb_4bit_quant_type="nf4", 
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=compute_dtype,
    quantization_config=bnb_config, 
)

model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(model_name, 
                                          trust_remote_code=True,
                                         )
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model, tokenizer = setup_chat_format(model, tokenizer)

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.22it/s]


In [14]:
def predict(test, model, tokenizer):
    y_pred = []
    for i in tqdm(range(len(X_test))):
        prompt = X_test.iloc[i]["text"]
        pipe = pipeline(task="text-generation", 
                        model=model, 
                        tokenizer=tokenizer, 
                        max_new_tokens = 100, 
                        temperature = 0.0,
                       )
        result = pipe(prompt)
        answer = result[0]['generated_text'].split("=")[-1]  
        # answer = result[0]['generated_text'].split("=")[-1]        
        y_pred.append(answer)
        
    return y_pred

In [17]:
y_pred = predict(test, model, tokenizer)

100%|██████████| 15/15 [01:11<00:00,  4.76s/it]


In [18]:
y_pred

[' 아니요\n\n            [친권자가 ����',
 ' 저는 이혼소송 중이다. 상대방이 재산을 마음대로 처분해서 나중에 재산분할을 못 받을까 걱정돼요. \n좋은 방법이 있나요?\n\n           ',
 ' 아이를 파양하려면 어떻게 해야 하나요?\n\n            [재혼한 남편의 아이를 일반양자로 입양하여 제 아이로 키우던 중 남편과 이혼하',
 ' 출국금지 대상이 되지 않습니다.\n\n            [저는 이혼 후 혼자 살고 있는데, 사업상 해외 출장을 자주 다닙니다. 법원으',
 ' 아니요\n\n            [이혼 후 혼자 초등학생 아이를 양육하고 있습니다. 전 배우자가 아이와 같이 살지 않는 경우에도, 이미 제가 지�',
 ' \n[당신이 바람을 피웠어요. \n이혼해 달랬더니 이혼은 절대 안 된다며 완강히 거부합니다. \n전 정말 이혼할 수 �',
 '"col-md-12">\n            <div',
 ' 재판상 이혼 절차는 어떻게 되나요',
 ' [예]\n\n            [예]\n\n            [예]\n\n            [예]\n\n            [예]\n\n            [예]\n\n            [예]\n\n            [예]\n\n            [예]\n\n            [예]\n\n            [예]\n\n            [예]\n\n            [예',
 ' 집을 나가 몇 달씩 안 들어옵니다. 남편을 상대로 이혼을 청구할 수 있나요?\n\n            [집을 나가 몇 달씩 안 들어옵니다. 남��',
 ' 시부모가 우리 결혼생활에 심하게 간섭하고 폭언을 일삼는 것 때문에 이혼하게 되었는데 시부모에게 위자료를 �',
 ' 아내가 재산분할청구를 할 수 있습니다.\n\n            [아내가 바람을 피워서 이혼하게 됐어요. \n아내가 재산을 분할해 달라고 하는데',
 ' [알 수 없어요]\n

In [15]:
peft_config = LoraConfig(
        lora_alpha=16, 
        lora_dropout=0.1,
        r=64,
        bias="none",
        task_type="CAUSAL_LM",
)

training_arguments = TrainingArguments(
    output_dir="logs",                        # directory to save and repository id
    num_train_epochs=30,                       # number of training epochs
    per_device_train_batch_size=8,            # batch size per device during training
    gradient_accumulation_steps=8,            # number of steps before performing a backward/update pass
    gradient_checkpointing=True,              # use gradient checkpointing to save memory
    optim="paged_adamw_32bit",
    save_steps=0,
    logging_steps=25,                         # log every 10 steps
    learning_rate=2e-4,                       # learning rate, based on QLoRA paper
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,                        # max gradient norm based on QLoRA paper
    max_steps=-1,
    warmup_ratio=0.03,                        # warmup ratio based on QLoRA paper
    group_by_length=True,
    lr_scheduler_type="cosine",               # use cosine learning rate scheduler
    report_to="tensorboard",                  # report metrics to tensorboard
    evaluation_strategy="epoch"               # save checkpoint every epoch
)

trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=train_data,
    eval_dataset=eval_data,
    peft_config=peft_config,
    dataset_text_field="text",
    tokenizer=tokenizer,
    max_seq_length=1024,
    packing=True,
    dataset_kwargs={
        "add_special_tokens": False,  # We template with special tokens
        "append_concat_token": False, # No need to add additional separator token
    }
)

In [16]:
# Train model
trainer.train()

# Save trained model
trainer.model.save_pretrained("law-trained-model")

You are using 8-bit optimizers with a version of `bitsandbytes` < 0.41.1. It is recommended to update your version as a major bug has been fixed in 8-bit optimizers.
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
0,No log,0.954572
1,0.976100,0.844885
2,0.976100,0.815865
3,0.831800,0.801071
4,0.785400,0.791939
5,0.785400,0.786956
6,0.754200,0.783189
8,0.730600,0.782751


KeyboardInterrupt: 

In [23]:
y_pred = predict(test, model, tokenizer)
# evaluate(y_true, y_pred)

100%|██████████| 15/15 [01:36<00:00,  6.44s/it]


In [26]:
y_pred[1]

' 이혼소송 중인데 상대방이 재산을 마음대로 처분해서 나중에 재산분할을 못 받을까 걱정돼요. \n좋은 방법이 있나요?\n\n            [처'