# Install library

In [None]:
!pip install -q datasets==2.16.0
!pip install -q bitsandbytes
!pip install -q tiktoken
!pip install -q peft
!pip install -q trl
!pip install -q transformers
!pip install -q openpyxl
!pip install -q pandas
!pip install -q scikit-learn
!pip install -q flash-attn
#pip install -q transformers==4.38.1

[0m

# Import library

In [None]:
import json
import torch
import pandas as pd
import numpy as np
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
from accelerate import PartialState
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
from trl import SFTTrainer
from peft import prepare_model_for_kbit_training
from transformers import TrainingArguments

# Hyperparameters

In [None]:
modelpath = "google/gemma-2-9b-it"
lr=2e-4      # learning rate
bs=16            # batch size
bs_eval=16      # batch size for evals
ga_steps=1     # gradient acc. steps
epochs=4
max_length=128      # max. sample length with 24GB VRAM
output_dir="out"

# Remove old model
Because of limited storage, we can't save all models, we need to delete all models in cache by the following code:
- rm -r out: delete out folder (because I save finetuned model in out folder), you can change folder name like (rm -r output_folder). If you doesn't have out folder, this commend do not thing.
- all pretrained huggingface models will auto save in transformers.TRANSFORMERS_CACHE. I use glob.glob to read all folders and use shutil.rmtree to delete them.

In [None]:
!rm -r out

In [None]:
# from transformers import TRANSFORMERS_CACHE
# import glob
# print(TRANSFORMERS_CACHE)
# folders = glob.glob(f"{TRANSFORMERS_CACHE}/*")
# print(folders)

# import shutil
# #shutil.rmtree(TRANSFORMERS_CACHE)
# for folder in folders:
#     if '.txt' not in folder:
#         shutil.rmtree(folder)

# Create Dataset
Download dataset from [kaggle synthetic-vietnamese-students-feedback-corpus](https://www.kaggle.com/datasets/toreleon/synthetic-vietnamese-students-feedback-corpus/data)

We need convert DataFrame to json line (jsonl).

In [None]:
df_train = pd.read_csv("synthetic_train.csv")
df_test = pd.read_csv("synthetic_val.csv")

In [None]:
df_train.head()

Unnamed: 0,sentence,sentiment,topic
0,Đội ngũ bảo trì quá thưa thớt dẫn đến không đả...,negative,facility
1,The university's musical and artistic faciliti...,neutral,facility
2,Phương pháp giảng dạy phù hợp với các đối tượn...,neutral,curriculum
3,Chương trình học giúp tôi trở thành một chuyên...,positive,curriculum
4,Tôi nghĩ rằng chương trình đào tạo có thể có t...,neutral,curriculum


In [None]:
df_test.head()

Unnamed: 0,sentence,sentiment,topic
0,Chất lượng vật chất kém.,negative,facility
1,"Phần mềm học tập quá khó sử dụng, khiến sinh v...",negative,facility
2,Trường tôi thiếu những tiện ích cơ bản như máy...,negative,facility
3,Cần tạo thêm các hoạt động gắn kết giữa sinh v...,neutral,curriculum
4,Họ rất khoan dung và lượng giác trong quan điể...,neutral,others


In [None]:
df_train.sentiment.value_counts()

sentiment
neutral     2724
negative    2711
positive    2709
Name: count, dtype: int64

In [None]:
df_test.sentiment.value_counts()

sentiment
negative    686
positive    680
neutral     670
Name: count, dtype: int64

In [None]:
df_train['len'] = df_train.sentence.apply(lambda x: len(str(x).split()))
df_test['len'] = df_test.sentence.apply(lambda x: len(str(x).split()))

In [None]:
df_train['len'].describe()

count    8144.000000
mean       15.549730
std         5.018764
min         3.000000
25%        12.000000
50%        15.000000
75%        18.000000
max        43.000000
Name: len, dtype: float64

In [None]:
df_test['len'].describe()

count    2036.000000
mean       15.694990
std         5.185957
min         2.000000
25%        12.000000
50%        15.000000
75%        19.000000
max        48.000000
Name: len, dtype: float64

In [None]:
with open('train.jsonl', 'w') as outfile:
    for i, x in df_train.iterrows():
        comment = x['sentence']
        label = x['sentiment']
        #label = 'yes' if label == 'relevance' else 'no'
        data = {
            "input": f'''The sentiment of this comment "{comment}" is''',
            "output": f"{label}"
        }
        json.dump(data, outfile)
        outfile.write('\n')

In [None]:
with open('test.jsonl', 'w') as outfile:
    for i, x in df_test.iterrows():
        comment = x['sentence']
        label = x['sentiment']
        #label = 'yes' if label == 'relevance' else 'no'
        data = {
            "input": f'''The sentiment of this comment "{comment}" is''',
            "output": f"{label}"
        }
        json.dump(data, outfile)
        outfile.write('\n')

In [None]:
data_files = {
    "train": "train.jsonl",
    "validation": "test.jsonl",
}

dataset = load_dataset("json", data_files=data_files)

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'output'],
        num_rows: 8144
    })
    validation: Dataset({
        features: ['input', 'output'],
        num_rows: 2036
    })
})

# Create prompt format
Load tokenizer by AutoTokenizer.from_pretrained:
- We need create and copy YOUR TOKEN from [huggingface](https://huggingface.co/settings/tokens)
- We need use padding_side = 'right' because training library need padding_side = 'right' when training. You can use left padding but you need to make sure the library is not corrupted and check whether performance is affected by left padding!
- If you have 1 prompt like "test thử mô hình" and want to tokenize it, just you tokenizer(prompt, return_tensors="pt"). You can see output of tokenizer in cell below (output includes input_ids (list index of each token in prompt) and attention mask)
- We can use tokenizer.batch_decode to see how tokenizer restore string from token tensor. You can see that it automatically adds the start token "<bos>" at the beginning of the string.
- To train llm, we only need to pass 1 sentence to llm (including input and desired output) without specifying which is the input and which is the output.
- I wrote the function formatting_prompts_func to convert input and output to prompt and tested this function, you can see below.
- When predicting, remove the output part to let the model predict itself. See the predict section below later.
- we only need to predict some next tokens like A. positive, and B. neutral. We don't care what the model says after sentiment. Then we do not need to add <eos token>. If you fine-tune the model with other tasks, maybe you need to add <eos token> at the end of the prompt:
  - use tokenizer.eos_token, tokenizer.eos_token_id to see eos_token of your model and correspond id
  - for ex: eos_token is "<|im_end|>". You need edit prompt like:
    - '''...The correct answer is {output_}.''' --> '''...The correct answer is {output_}. <|im_end|>'''
  - for ex: eos_token is "end_token__". You need edit prompt like:
    - '''...The correct answer is {output_}.''' --> '''...The correct answer is {output_}. end_token__'''

In [None]:
tokenizer = AutoTokenizer.from_pretrained(
    modelpath,
    padding_side="right",
    # add_eos_token=True,
    # add_bos_token=True,
    trust_remote_code=True,
    token = 'YOUR TOKEN HERE'
)

In [None]:
prompt = "test thử mô hình"
tokens = tokenizer(prompt, return_tensors="pt")
print(tokens)
tokenizer.batch_decode(tokenizer.encode(prompt))

{'input_ids': tensor([[     2,   2195, 101869,  34580,  15885]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}


['<bos>', 'test', ' thử', ' mô', ' hình']

In [None]:
def formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example['input'])):
        input_ = example['input'][i]
        output_ = example['output'][i]
        output_ = 'A. Positive' if output_ == 'positive' else 'B. Neutral' if output_ == 'neutral' else 'C. Negative'
        #text = f"### Question: {input__}\n ### Answer: {example['output'][i]}"
        text = f'''{input_}
A. Positive
B. Neutral
C. Negative

The correct answer is {output_}.'''

        output_texts.append(text)
    return output_texts

In [None]:
dataset["validation"]['input'][0], dataset["validation"]['output'][0]

('The sentiment of this comment "Chất lượng vật chất kém." is', 'negative')

In [None]:
formatting_prompts_func({'input': [dataset["validation"]['input'][0]],
                         'output': [dataset["validation"]['output'][0]]})

['The sentiment of this comment "Chất lượng vật chất kém." is\nA. Positive\nB. Neutral\nC. Negative\n\nThe correct answer is C. Negative.']

In [None]:
print(formatting_prompts_func({'input': [dataset["validation"]['input'][0]],
                         'output': [dataset["validation"]['output'][0]]})[0])

The sentiment of this comment "Chất lượng vật chất kém." is
A. Positive
B. Neutral
C. Negative

The correct answer is C. Negative.


# Create model

## Load model
Because we use 4 bit for training, we need use BitsAndBytesConfig:
- load_in_4bit = True: we will load model with 4 bit format.
- bnb_4bit_use_double_quant = True: use double quant (you can search how double quant work)
- bnb_4bit_quant_type = 'nf4': nf4 is the normalized float 4 bit data type. It quantizes floats into 4 bits (you can search about this)
- bnb_4bit_compute_dtype: dtype before quantize (we use bfloat16, if your gpu don't support bfloat16, set it to float32).

We use AutoModelForCausalLM.from_pretrained to load model:
- device_map = 'auto': auto active gpu.
- torch_dtype: use bfloat16, if your gpu don't support bfloat16, set it to float32
- quantization_config: config of quantization (bnb_config above)
- attn_implementation: gemma 2 suggest we use 'eager', if you meet bug, maybe your gpu doesn't support it, you need delete this line.
- token: you huggingface token like tokenizer above.

In [None]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    modelpath,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    #torch_dtype=torch.float32,
    quantization_config=bnb_config,
    attn_implementation="eager",#"flash_attention_2",
    trust_remote_code=True,
    token = 'YOUR TOKEN HERE'
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
model

Gemma2ForCausalLM(
  (model): Gemma2Model(
    (embed_tokens): Embedding(256000, 3584, padding_idx=0)
    (layers): ModuleList(
      (0-41): 42 x Gemma2DecoderLayer(
        (self_attn): Gemma2Attention(
          (q_proj): Linear4bit(in_features=3584, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=3584, out_features=2048, bias=False)
          (v_proj): Linear4bit(in_features=3584, out_features=2048, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=3584, bias=False)
          (rotary_emb): Gemma2RotaryEmbedding()
        )
        (mlp): Gemma2MLP(
          (gate_proj): Linear4bit(in_features=3584, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=3584, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=3584, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): Gemma2RMSNorm()
        (post_attention_layernorm): Gemma2RMSNorm(

## Eval model before training
We use create_prompt function to generate prompt (without output), our model will predict output. We will eval model before finetune. You can see some predict below.


In [None]:
def create_prompt(input_, output_):
    output_ = 'A. Positive' if output_ == 'positive' else 'B. Neutral' if output_ == 'neutral' else 'C. Negative'
        #text = f"### Question: {input__}\n ### Answer: {example['output'][i]}"
    text = f'''{input_}
A. Positive
B. Neutral
C. Negative

The correct answer is'''

    return text

sentence = 'món ăn rất ngon'
input_ = f'''The sentiment of this comment: "{sentence}" is'''
prompt = create_prompt(input_, "")
print(prompt)

inputs = torch.tensor([tokenizer.encode(prompt)])

tokens = model.generate(
    inputs.to(model.device),
    max_new_tokens=50,
    temperature=0.1,
    do_sample=False
)
print(tokenizer.decode(tokens[0], skip_special_tokens=False))

The sentiment of this comment: "món ăn rất ngon" is
A. Positive
B. Neutral
C. Negative

The correct answer is




<bos>The sentiment of this comment: "món ăn rất ngon" is
A. Positive
B. Neutral
C. Negative

The correct answer is **A. Positive**.

Here's why:

* **"món ăn"** means "dish" or "food"
* **"rất ngon"** means "very delicious"


Therefore, the entire phrase translates to "


In [None]:
from datetime import datetime

start = datetime.now()

prediction = []
response = []
accuracy = []
labels = []

for i, x in df_test.iterrows():
    sentence = x['sentence']
    label = x['sentiment']
    input_ = f'''The sentiment of this comment: "{sentence}" is'''
    prompt = create_prompt(input_, label)

    inputs = tokenizer.encode(
        prompt,
        # add_generation_prompt=True,
        return_tensors='pt'
    )

    tokens = model.generate(
        inputs.to(model.device),
        max_new_tokens=5,
        temperature=0.1,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id
    )
    #break

    answer = tokenizer.decode(tokens[0], skip_special_tokens=False).split("The correct answer is ")[-1]
    answer = 'positive' if 'positive' in answer.lower() else 'negative' if 'negative' in answer.lower() else 'neutral'
    prediction.append(answer.lower())
    response.append(tokenizer.decode(tokens[0], skip_special_tokens=False))

    accuracy.append(prediction[-1] == label)
    labels.append(label)

    if i % 100 == 0:
        print(i, np.array(accuracy).sum()/len(prediction)*100, datetime.now() - start, (datetime.now() - start)/len(prediction))

The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


0 100.0 0:00:00.513212 0:00:00.513218
100 70.29702970297029 0:00:53.384344 0:00:00.528558
200 71.14427860696517 0:01:44.991200 0:00:00.522344
300 68.43853820598007 0:02:37.695470 0:00:00.523905
400 69.07730673316709 0:03:30.176605 0:00:00.524131
500 69.66067864271457 0:04:21.950817 0:00:00.522856
600 68.21963394342761 0:05:15.703579 0:00:00.525297
700 67.3323823109843 0:06:08.683002 0:00:00.525939
800 67.9151061173533 0:07:01.045148 0:00:00.525649
900 66.92563817980022 0:07:52.489608 0:00:00.524406
1000 67.33266733266733 0:08:44.748468 0:00:00.524224
1100 67.12079927338783 0:09:36.275146 0:00:00.523411
1200 66.69442131557037 0:10:28.739089 0:00:00.523513
1300 67.1022290545734 0:11:20.626627 0:00:00.523157
1400 67.23768736616702 0:12:14.368536 0:00:00.524175
1500 67.28847435043305 0:13:05.543236 0:00:00.523347
1600 67.64522173641474 0:13:57.517333 0:00:00.523121
1700 67.72486772486772 0:14:49.612645 0:00:00.522994
1800 68.07329261521376 0:15:41.743545 0:00:00.522900
1900 68.016833245660

In [None]:
from sklearn.metrics import classification_report, f1_score
import sklearn

print(sklearn.metrics.classification_report(labels, prediction, digits=4))

              precision    recall  f1-score   support

    negative     0.9601    0.9825    0.9712       686
     neutral     0.8182    0.0403    0.0768       670
    positive     0.5219    0.9985    0.6855       680

    accuracy                         0.6778      2036
   macro avg     0.7667    0.6738    0.5778      2036
weighted avg     0.7671    0.6778    0.5815      2036



In [None]:
for x in response[-10:]:
    print(x)
    print("\n")

<bos>The sentiment of this comment: "The facilities of the university are versatile and helpful." is
A. Positive
B. Neutral
C. Negative

The correct answer is **A. Positive**.


<bos>The sentiment of this comment: "Mấy bạn đó hay đòi hỏi nhưng không bao giờ giúp đỡ người khác." is
A. Positive
B. Neutral
C. Negative

The correct answer is **C. Negative**.


<bos>The sentiment of this comment: "Cậu ấy rất có kỹ năng về sáng tạo và nghệ thuật." is
A. Positive
B. Neutral
C. Negative

The correct answer is **A. Positive**.


<bos>The sentiment of this comment: "Giảng viên này không nhàm chán." is
A. Positive
B. Neutral
C. Negative

The correct answer is **A. Positive**.


<bos>The sentiment of this comment: "Anh ta là một người rất tỉ mỉ và cẩn thận." is
A. Positive
B. Neutral
C. Negative

The correct answer is **A. Positive**.


<bos>The sentiment of this comment: "Giáo viên đưa ra các phương tiện hỗ trợ giảng dạy rất tốt và hiệu quả." is
A. Positive
B. Neutral
C. Negative

The correct ans

## Create peft
- To training model with 4 bit, we need you prepare_model_for_kbit_training.
- For lora training, we need craete LoraConfig:
  - lora_alpha, r, dropout is basic hyperarameter.
  - almost LLM use bias = 'none' when finetune then we set is to None
  - target_modules are list of layer we want to finetune, I set it to 'all-linear', then all linear layers will be finetuned. If you just finetune some layers like q_proj, k_proj, you can pass list ['q_proj', 'k_proj'].
  - modules_to_save is other layers we want to finetune (but don't use lora), in my experience, finetune all embedding layers will make model work betters than I set modules_to_save as list of all embedding layers.

In [None]:
from peft import prepare_model_for_kbit_training

model = prepare_model_for_kbit_training(model)

In [None]:
peft_config = LoraConfig(
    lora_alpha=32,
    lora_dropout=0.1,
    r=8,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules = 'all-linear',
#     target_modules=["q_proj",
#         "k_proj",
#         "v_proj",
#         "o_proj",
#         "gate_proj",
#         "up_proj",
#         "down_proj",
#         "lm_head",],
    modules_to_save=["embed_tokens", "rotary_emb"]
                     #"input_layernorm", "post_attention_layernorm", "norm"]
)
model = get_peft_model(model, peft_config)

model.print_trainable_parameters() #

trainable params: 944,513,024 || all params: 10,186,219,008 || trainable%: 9.2725


In [None]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Gemma2ForCausalLM(
      (model): Gemma2Model(
        (embed_tokens): ModulesToSaveWrapper(
          (original_module): Embedding(256000, 3584, padding_idx=0)
          (modules_to_save): ModuleDict(
            (default): Embedding(256000, 3584, padding_idx=0)
          )
        )
        (layers): ModuleList(
          (0-41): 42 x Gemma2DecoderLayer(
            (self_attn): Gemma2Attention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3584, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3584, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
         

## Create TrainingArguments
If your model doesn't support bfloat16, set bf16 = False and turn on fb16 = True.

In [None]:
print(len(df_train)//bs//ga_steps*epochs//4)

509


In [None]:
save_step = len(df_train)//bs//ga_steps*epochs//4
print(save_step)

509


In [None]:
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=epochs,
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs_eval,
    evaluation_strategy="steps",
    eval_steps=save_step,
    gradient_accumulation_steps=ga_steps,
    optim="paged_adamw_32bit",
    save_steps=save_step,
    save_strategy="steps",
    logging_steps=save_step,
    learning_rate=lr,
    weight_decay=0.001,
    fp16=False,
    bf16=True,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="none",
    save_total_limit=1,
    #load_best_model_at_end=True
)



## Create trainer

In [None]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    peft_config=peft_config,
    max_seq_length= 128,
    #dataset_text_field=["input", "output"],
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
    formatting_func = formatting_prompts_func,
    #data_collator=collator
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/8144 [00:00<?, ? examples/s]

Map:   0%|          | 0/2036 [00:00<?, ? examples/s]

# Train

In [None]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
509,0.7419,0.721467
1018,0.6263,0.726759
1527,0.5366,0.755013
2036,0.4677,0.816325



Cannot access gated repo for url https://huggingface.co/google/gemma-2-9b-it/resolve/main/config.json.
Access to model google/gemma-2-9b-it is restricted. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in google/gemma-2-9b-it.

Cannot access gated repo for url https://huggingface.co/google/gemma-2-9b-it/resolve/main/config.json.
Access to model google/gemma-2-9b-it is restricted. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in google/gemma-2-9b-it.

Cannot access gated repo for url https://huggingface.co/google/gemma-2-9b-it/resolve/main/config.json.
Access to model google/gemma-2-9b-it is restricted. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in google/gemma-2-9b-it.

Cannot access gated repo for url https://huggingface.co/google/gemma-2-9b-it/resolve/main/config.json.
Access to model google/gemma-2-9b-it is restricted. You must be a

TrainOutput(global_step=2036, training_loss=0.593114227115054, metrics={'train_runtime': 12543.9762, 'train_samples_per_second': 2.597, 'train_steps_per_second': 0.162, 'total_flos': 7.691184737678131e+16, 'train_loss': 0.593114227115054, 'epoch': 4.0})

# Eval
- If you save the model to the output_dir folder, the training code above will automatically save the model to output_dir/checkpoint-{save_step} (in my case: out/checkpoint-2036). We will load the model from this folder:
  - I calculate _id = save_step * num_epoch = 509 * 4 = 2036
  - My model will saved at "{output_dir}/checkpoint-{_id}"
- I use del model, gc.collect() and torch.cuda.empty_cache() to release trained model (save gpu memory).
- We use PeftConfig.from_pretrained to lead peft config (this is the same as peft config at training)
- we load pretrained model like before.
- We load the tokenizer in this folder by passing the saved folder to AutoTokenizer.from_pretrained()
- We use PeftModel.from_pretrained(model, peft_model_id) to merge pre-trained model with finetuned lora model (note that when we use peft lora, training code only saves lora model, this saves our storage and saving time)

In [None]:
save_step = len(df_train)//bs//ga_steps*epochs//4
print(save_step)

509


In [None]:
import gc

del model
gc.collect()
torch.cuda.empty_cache()

In [None]:
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

1016

_id = save_step*4

peft_model_id = f"{output_dir}/checkpoint-{_id}"

config = PeftConfig.from_pretrained(peft_model_id)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    modelpath,
    device_map="auto",
    #torch_dtype=torch.float16,
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
    attn_implementation="eager",#"flash_attention_2",
    trust_remote_code=True,
    token = 'YOUR TOKEN HERE'
)

tokenizer = AutoTokenizer.from_pretrained(f"{output_dir}/checkpoint-{_id}",
                                          trust_remote_code=True,
                                          padding_side='left',
                                          token='YOUR TOKEN HERE')

# Load the Lora model
model = PeftModel.from_pretrained(model, peft_model_id)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
def create_prompt(input_, output_):
    output_ = 'A. Positive' if output_ == 'positive' else 'B. Neutral' if output_ == 'neutral' else 'C. Negative'
        #text = f"### Question: {input__}\n ### Answer: {example['output'][i]}"
    text = f'''{input_}
A. Positive
B. Neutral
C. Negative

The correct answer is'''

    return text

In [None]:
sentence = 'món ăn rất ngon'
input_ = f'''The sentiment of this comment: "{sentence}" is'''
prompt = create_prompt(input_, "")
print(prompt)

The sentiment of this comment: "món ăn rất ngon" is
A. Positive
B. Neutral
C. Negative

The correct answer is


In [None]:
inputs = torch.tensor([tokenizer.encode(prompt)])
inputs

tensor([[     2,    651,  25627,    576,    736,   4986, 235292,    664,  92020,
          28644,  31085,  60774, 235281,    603,    108, 235280, 235265,  40695,
            108, 235305, 235265,  62407,    108, 235288, 235265,  48314,    109,
            651,   5112,   3448,    603]])

In [None]:
tokens = model.generate(
    inputs.to(model.device),
    max_new_tokens=50,
    temperature=0.1,
    do_sample=True
)

In [None]:
print(tokenizer.decode(tokens[0], skip_special_tokens=False))

<bos>The sentiment of this comment: "món ăn rất ngon" is
A. Positive
B. Neutral
C. Negative

The correct answer is B. Neutral. Here's why:

* A. Positive: The comment "Giảng viên này không có khả năng giải thích rõ ràng." is
* B. Neutral: The comment "Giảng viên này không có khả năng giải


In [None]:
tokens = model.generate(
    inputs.to(model.device),
    max_new_tokens=3,
    temperature=0.1,
    do_sample=False
)
print(tokenizer.decode(tokens[0], skip_special_tokens=False))

<bos>The sentiment of this comment: "món ăn rất ngon" is
A. Positive
B. Neutral
C. Negative

The correct answer is B. Neutral


In [None]:
from datetime import datetime

start = datetime.now()

prediction = []
response = []
accuracy = []
labels = []

for i, x in df_test.iterrows():
    sentence = x['sentence']
    label = x['sentiment']
    input_ = f'''The sentiment of this comment: "{sentence}" is'''
    prompt = create_prompt(input_, label)

    inputs = tokenizer.encode(
        prompt,
        # add_generation_prompt=True,
        return_tensors='pt'
    )

    tokens = model.generate(
        inputs.to(model.device),
        max_new_tokens=3,
        temperature=0.1,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id
    )
    #break

    answer = tokenizer.decode(tokens[0], skip_special_tokens=False).split("The correct answer is ")[-1]
    answer = 'positive' if 'positive' in answer.lower() else 'negative' if 'negative' in answer.lower() else 'neutral'
    prediction.append(answer.lower())
    response.append(tokenizer.decode(tokens[0], skip_special_tokens=False))

    accuracy.append(prediction[-1] == label)
    labels.append(label)

    if i % 100 == 0:
        print(i, np.array(accuracy).sum()/len(prediction)*100, datetime.now() - start, (datetime.now() - start)/len(prediction))

The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


0 100.0 0:00:00.443912 0:00:00.443917
100 88.11881188118812 0:00:42.964489 0:00:00.425391
200 86.06965174129353 0:01:24.758574 0:00:00.421684
300 85.71428571428571 0:02:07.577986 0:00:00.423847
400 86.0349127182045 0:02:50.612307 0:00:00.425467
500 86.62674650698602 0:03:34.146929 0:00:00.427439
600 86.18968386023295 0:04:16.161983 0:00:00.426226
700 85.87731811697576 0:04:58.100646 0:00:00.425251
800 85.76779026217228 0:05:40.632437 0:00:00.425259
900 86.12652608213097 0:06:23.042674 0:00:00.425131
1000 86.41358641358642 0:07:05.603754 0:00:00.425179
1100 86.64850136239782 0:07:47.328529 0:00:00.424458
1200 85.76186511240633 0:08:29.767013 0:00:00.424452
1300 85.7033051498847 0:09:11.901914 0:00:00.424214
1400 85.86723768736617 0:09:53.374604 0:00:00.423536
1500 85.74283810792804 0:10:34.943599 0:00:00.423014
1600 85.75890068707058 0:11:17.012999 0:00:00.422869
1700 85.94944150499705 0:11:58.463356 0:00:00.422377
1800 86.1188228761799 0:12:40.272811 0:00:00.422139
1900 86.007364544976

In [None]:
from sklearn.metrics import classification_report, f1_score
import sklearn

print(sklearn.metrics.classification_report(labels, prediction, digits=4))

              precision    recall  f1-score   support

    negative     0.9827    0.9913    0.9869       686
     neutral     0.8221    0.7313    0.7741       670
    positive     0.7754    0.8529    0.8123       680

    accuracy                         0.8595      2036
   macro avg     0.8601    0.8585    0.8578      2036
weighted avg     0.8606    0.8595    0.8586      2036



# Check results
- I print all sentences have wrong prediction and see that almost cases is bug.
- Conclusion, dataset is not clean than finetune LLM can't better than finetune roberta (~89..90%), because roberta will overfit even in test dataset.
- In additionally, when I print example: "món ăn này rất ngon", we can see that model before finetune work better. Model after finetune only think about school (because finetune dataset is about school) and it don't know about food review. Than I think we only need finetune for special cases and finetune dataset need be clean and large enough.

In [None]:
df_test['predict'] = prediction

In [None]:
for i, row in df_test.iterrows():
    if row['predict'] != row['sentiment']:
        print(row['sentence'], row['predict'], row['sentiment'])

Tôi có thể ứng dụng các kiến thức có được từ chương trình đào tạo vào công việc của mình. neutral positive
Giảng viên rất tài năng và có nhiều kinh nghiệm trong công tác giảng dạy. positive neutral
Được học tập với các giáo viên giàu kinh nghiệm và thực tiễn. positive neutral
Nhà hàng và các cửa hàng tiện lợi ở gần trường rất đa dạng và phong phú. neutral positive
Chị ấy rất giỏi quản lý thời gian và luôn hoàn thành công việc đúng tiến độ. positive neutral
Cô ấy là một giáo viên rất thông minh và chuyên nghiệp. neutral positive
Anh ấy có một tầm nhìn đỉnh cao và khả năng giải quyết vấn đề tốt. positive neutral
Có nhiều hoạt động ngoại khóa và phong phú cho sinh viên. neutral positive
Khu vực đặt máy bán thức uống rất tiện lợi cho sinh viên. neutral positive
Sân cỏ của trường được tu bổ và định hướng riêng cho các hoạt động thể thao. positive neutral
Chương trình học giúp tôi cảm thấy mình có mục tiêu rõ ràng hơn. neutral positive
Phòng học được trang bị đầy đủ tiện nghi giúp sinh viên 