In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from tqdm import tqdm, trange
from sklearn.model_selection import train_test_split
from datasets import Dataset
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
from trl import setup_chat_format
from transformers import (AutoModelForCausalLM, 
                          AutoTokenizer, 
                          BitsAndBytesConfig, 
                          TrainingArguments, 
                          pipeline,)

In [3]:
import wandb

# Identity of the Weights & Biases run; passed to wandb.init below as
# entity, project and run name respectively.
user = "crysis"
project = "llm-law-pt"
display_name = "mistral-law-pt(24-02-06)"

# Start the W&B run for this notebook session.
wandb.init(entity=user, project=project, name=display_name)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mcrysis[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
# Load the legal Q&A corpus (columns used downstream: "instruction", "output").
# encoding_errors="replace" substitutes undecodable bytes instead of raising.
# NOTE(review): the rendered frame shows an "Unnamed: 0" column, which suggests
# the CSV was written with its index; consider index_col=0 or usecols — confirm.
filename = "./data/lawdata.csv"
df = pd.read_csv(filename, encoding="utf-8", encoding_errors="replace")
df

Unnamed: 0,instruction,output
0,동거 중인데 갑자기 헤어지자는 통보를 받았어요. 사실혼인 경우에도 위자료를 받을 수...,위자료를 받을 수 있습니다. 사실혼은 부부간 합의 또는 부부 어느 한 쪽의 일방적인...
1,이혼이 취소될 수 있나요?,"재판상 이혼은 재판절차를 거쳐 이혼판결이 선고된 것이므로 취소될 수 없지만, 협의이..."
2,이혼해도 자녀를 만날 수 있나요?,이혼 후 자녀를 직접 양육하지 않는 부모 일방과 자녀는 서로 만나거나 연락할 수 있...
3,이혼한 후에 자녀의 성과 본을 저의 성과 본으로 바꿀 수 있나요?,이혼 후 자녀의 성과 본을 자신의 성과 본으로 바꿀 수 있습니다. ◇ 법원 허가 자...
4,"중학생 딸아이가 학교 숙제로 유언장을 작성했는데, 이 유언장이 법적으로 효력 있는 ...",유언은 의사능력이 있는 17세(유언 적령)에 달한 사람이 할 수 있습니다. 따라서 ...
...,...,...
2270,"시청에 정기적으로 문구류를 납품하는 수의계약에 참여했는데, 수의계약의 계약상대자는 ...","수의계약대상자는 견적제출자의 견적가격과 계약이행능력 등에 따라 결정되며, 원칙적으로..."
2271,물품계약을 체결한 후 물가가 급격히 올라서 계약 당시의 금액으로는 수량을 맞추기 어...,아니요. 물품계약을 체결한 날부터 90일 이상 지난 후 입찰일을 기준일로 하여 품목...
2272,"계약을 체결한 후에는 계약 완료 전이라도 대금을 미리 받을 수 있다고 하던데, 얼마...",계약을 이행하기 전이라도 일정 요건에 해당하면 계약금액의 100분의 70을 초과하지...
2273,물품을 납품하기 직전에 천재지변 등 불가항력의 사유로 대형화재가 발생하여 해당 물품...,"계약상대자의 책임 없이 이행이 지체되는 경우에는 계약기간 연장신청을 할 수 있고, ..."


### Train, Validation, Test Set Split

In [5]:
# Shuffle the corpus and carve out fixed-size train (1800 rows) and test
# (200 rows) subsets. Rows selected for neither remain in df only.
# random_state pins the shuffle so the split is reproducible.
train, test = train_test_split(
    df,
    train_size=1800,
    test_size=200,
    shuffle=True,
    random_state=32,
)

In [6]:
# Validation rows = every row of df that went to neither train nor test.
# The already-assigned labels are collected into a set once, so each
# membership test in the comprehension is O(1) instead of a linear list scan.
_assigned_idx = set(train.index) | set(test.index)
eval_idx = [idx for idx in df.index if idx not in _assigned_idx]

# Explicit copy: a "text" column is added to X_eval in place later
# (generate_prompt), which should write to an independent frame rather than
# to a selection derived from df.
X_eval = df[df.index.isin(eval_idx)].copy()

# Renumber the training rows from 0; the shuffled original labels are dropped.
X_train = train.reset_index(drop=True)

In [7]:
# Report how many rows ended up in each split, one line per split.
print("Train Set:", len(X_train))
print("Validation Set:", len(X_eval))
print("Test Set:", len(test))

Train Set: 1800
Validation Set: 275
Test Set: 200


### Data Preprocessing

In [8]:
# Fixed preamble placed in front of every prompt (training and inference).
_PROMPT_PREAMBLE = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n\n"
)


def _format_prompt(instruction, response=""):
    """Render one prompt string; an empty response leaves the answer slot open."""
    return (
        _PROMPT_PREAMBLE
        + "### Instruction:\n" + str(instruction)
        + "\n### Response:\n" + str(response)
    )


def generate_prompt(df):
    """Add a "text" column holding the full training prompt for every row.

    Each prompt is the preamble, the row's "instruction" and the row's
    "output" (both coerced with ``str``), laid out under the
    "### Instruction:" / "### Response:" markers.

    Args:
        df: DataFrame with "instruction" and "output" columns.

    Returns:
        The same DataFrame object. It is modified in place, not copied.
    """
    # Iterate the two columns directly instead of materialising a Series per
    # row with iterrows(); the produced strings are the same.
    df.loc[:, "text"] = [
        _format_prompt(instruction, response)
        for instruction, response in zip(df["instruction"], df["output"])
    ]
    return df


def generate_test_prompt(df):
    """Add a "text" column holding the inference prompt for every row.

    Same layout as ``generate_prompt`` but the text stops right after
    "### Response:\\n", so the model is left to produce the answer.
    Only the "instruction" column is read.

    Args:
        df: DataFrame with an "instruction" column.

    Returns:
        The same DataFrame object. It is modified in place, not copied.
    """
    df.loc[:, "text"] = [
        _format_prompt(instruction) for instruction in df["instruction"]
    ]
    return df
        

In [9]:
# Attach the full prompt-plus-answer "text" column to the training and
# validation frames.
X_train, X_eval = generate_prompt(X_train), generate_prompt(X_eval)

# Keep the reference answers of the held-out rows, then build their
# answer-less prompts (the text ends right after "### Response:\n").
y_true = test["output"]
X_test = generate_test_prompt(test)

In [10]:
# Wrap the prepared pandas frames as `datasets.Dataset` objects; these are the
# train_dataset / eval_dataset handed to SFTTrainer further down.
train_data = Dataset.from_pandas(X_train)
eval_data = Dataset.from_pandas(X_eval)

### Load Foundation Model

In [11]:
# model_name = "../../../../../llama2/"
# Base checkpoint to fine-tune, and the dtype used for compute (torch.float16).
model_name = "mistralai/Mistral-7B-v0.1"
compute_dtype = getattr(torch, "float16")

# 4-bit NF4 quantisation with double quantisation; compute runs in compute_dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, 
    bnb_4bit_quant_type="nf4", 
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
)

# Load the quantised base model; device_map="auto" delegates device placement
# to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=compute_dtype,
    quantization_config=bnb_config, 
)

# Cache is switched off for training (gradient checkpointing is enabled in the
# training arguments).
model.config.use_cache = False
# NOTE(review): pretraining_tp looks like a Llama config field, presumably a
# leftover from the Llama setup hinted at by the commented-out path above —
# confirm it has any effect for Mistral.
model.config.pretraining_tp = 1

# NOTE(review): trust_remote_code=True permits running code shipped in the
# model repository — confirm it is actually required for this tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_name, 
                                          trust_remote_code=True,
                                         )
# Reuse EOS as the padding token and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# NOTE(review): setup_chat_format presumably installs a chat template and its
# own special tokens on the model/tokenizer, which may supersede the pad token
# set just above; the prompts built by generate_prompt are plain
# "### Instruction / ### Response" text and do not use a chat template —
# confirm this call is wanted.
model, tokenizer = setup_chat_format(model, tokenizer)

Loading checkpoint shards: 100%|██████████| 2/2 [00:06<00:00,  3.45s/it]


In [13]:
# LoRA adapter settings: rank 64, alpha 16, dropout 0.1, no bias parameters,
# causal language-modelling task.
peft_config = LoraConfig(
        lora_alpha=16, 
        lora_dropout=0.1,
        r=64,
        bias="none",
        task_type="CAUSAL_LM",
)

# Trainer hyper-parameters.
# - 8 samples per device x 8 accumulation steps = 64 samples per optimizer step
#   per device.
# - fp16 mixed precision (bf16 explicitly off).
# - max_steps=-1 leaves the run length to num_train_epochs.
# - The logged table of this run shows one evaluation every 5 steps, i.e. at
#   the same cadence as logging_steps.
# NOTE(review): save_steps=0 presumably disables intermediate checkpoints — confirm.
# NOTE(review): warmup_ratio is set together with lr_scheduler_type="constant";
#   presumably the warmup only applies with "constant_with_warmup" — confirm.
# NOTE(review): group_by_length is set together with packing=True below;
#   presumably it has no effect on packed, fixed-length samples — confirm.
training_arguments = TrainingArguments(
    output_dir="./results",                      
    num_train_epochs=4,                     
    per_device_train_batch_size=8,    
    gradient_accumulation_steps=8,         
    gradient_checkpointing=True,           
    optim="paged_adamw_32bit",
    save_steps=0,
    logging_steps=5,                       
    learning_rate=2e-4,                    
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,                     
    max_steps=-1,
    warmup_ratio=0.03,                     
    group_by_length=True,
    lr_scheduler_type="constant",              
    evaluation_strategy="steps"             
)

# Supervised fine-tuning on the "text" column, with samples packed into
# sequences of up to 1024 tokens and LoRA applied through peft_config.
# NOTE(review): both dataset_kwargs flags are switched off, yet the "text"
#   prompts are plain "### Instruction / ### Response" strings that contain no
#   explicit special tokens; presumably these flags were meant for a
#   chat-templated dataset and packed examples end up without a separator —
#   confirm against the TRL documentation.
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=train_data,
    eval_dataset=eval_data,
    peft_config=peft_config,
    dataset_text_field="text",
    tokenizer=tokenizer,
    max_seq_length=1024,
    packing=True,
    dataset_kwargs={
        "add_special_tokens": False,  # do not let the tokenizer add its own special tokens
        "append_concat_token": False, # do not append a separator token between packed examples
    }
)

[codecarbon INFO @ 13:30:28] [setup] RAM Tracking...
[codecarbon INFO @ 13:30:28] [setup] GPU Tracking...
[codecarbon INFO @ 13:30:28] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 13:30:28] [setup] CPU Tracking...
[codecarbon INFO @ 13:30:29] CPU Model on constant consumption mode: AMD Ryzen 9 7950X 16-Core Processor
[codecarbon INFO @ 13:30:29] >>> Tracker's metadata:
[codecarbon INFO @ 13:30:29]   Platform system: Linux-5.15.133.1-microsoft-standard-WSL2-x86_64-with-glibc2.35
[codecarbon INFO @ 13:30:29]   Python version: 3.10.13
[codecarbon INFO @ 13:30:29]   CodeCarbon version: 2.2.3
[codecarbon INFO @ 13:30:29]   Available RAM : 30.928 GB
[codecarbon INFO @ 13:30:29]   CPU count: 32
[codecarbon INFO @ 13:30:29]   CPU model: AMD Ryzen 9 7950X 16-Core Processor
[codecarbon INFO @ 13:30:29]   GPU count: 1
[codecarbon INFO @ 13:30:29]   GPU model: 1 x NVIDIA GeForce RTX 4090


In [14]:
# Cast every sub-module whose qualified name contains "norm" to float32.
# nn.Module.to() converts the module in place, so its return value does not
# need to be bound back to the loop variable.
for name, module in trainer.model.named_modules():
    if "norm" not in name:
        continue
    module.to(torch.float32)

In [15]:
# Run the fine-tuning loop, then write the trained model (presumably just the
# LoRA adapter, since the trainer was built with a peft_config) to a local
# directory.
trainer.train()
# NOTE(review): the directory name says "llama" although model_name is a
# Mistral checkpoint — confirm before renaming, later cells may load this path.
trainer.model.save_pretrained("llama-law-model-0206")

You are using 8-bit optimizers with a version of `bitsandbytes` < 0.41.1. It is recommended to update your version as a major bug has been fixed in 8-bit optimizers.
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
[codecarbon INFO @ 13:31:36] Energy consumed for RAM : 0.000048 kWh. RAM Power : 11.597851753234863 W
[codecarbon INFO @ 13:31:36] Energy consumed for all GPUs : 0.001431 kWh. Total GPU Power : 343.36400000000003 W
[codecarbon INFO @ 13:31:36] Energy consumed for all CPUs : 0.000177 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 13:31:36] 0.001657 kWh of electricity used since the beginning.
[codecarbon INFO @ 13:31:51] Energy consumed for RAM : 0.000097 kWh. RAM Power : 11.597851753234863 W
[codecarbon INFO @ 13:31:51] Energy consumed for all GPUs : 0.002872 kWh. Total GPU Power : 346.666000000000

Step,Training Loss,Validation Loss
5,1.3295,1.247522
10,1.2277,1.187128
15,1.1787,1.153521
20,1.1589,1.132661
25,1.1226,1.117602
30,1.1069,1.106213
35,1.1066,1.095602
40,1.0923,1.087323
45,1.0682,1.081109
50,1.0825,1.075099


[codecarbon INFO @ 13:32:06] Energy consumed for RAM : 0.000145 kWh. RAM Power : 11.597851753234863 W
[codecarbon INFO @ 13:32:06] Energy consumed for all GPUs : 0.004372 kWh. Total GPU Power : 360.015 W
[codecarbon INFO @ 13:32:06] Energy consumed for all CPUs : 0.000531 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 13:32:06] 0.005048 kWh of electricity used since the beginning.
[codecarbon INFO @ 13:32:21] Energy consumed for RAM : 0.000193 kWh. RAM Power : 11.597851753234863 W
[codecarbon INFO @ 13:32:21] Energy consumed for all GPUs : 0.005899 kWh. Total GPU Power : 366.81 W
[codecarbon INFO @ 13:32:21] Energy consumed for all CPUs : 0.000708 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 13:32:21] 0.006801 kWh of electricity used since the beginning.
[codecarbon INFO @ 13:32:36] Energy consumed for RAM : 0.000241 kWh. RAM Power : 11.597851753234863 W
[codecarbon INFO @ 13:32:36] Energy consumed for all GPUs : 0.007407 kWh. Total GPU Power : 361.968 W
[codecarbon INFO @ 13:32: