# Install library

In [None]:
!pip install -q datasets==2.16.0
!pip install -q bitsandbytes
!pip install -q tiktoken
!pip install -q peft
!pip install -q trl
!pip install -q transformers
!pip install -q openpyxl
!pip install -q pandas
!pip install -q scikit-learn
!pip install -q flash-attn
#pip install -q transformers==4.38.1

[0m

# Import library

In [None]:
import json
import torch
import pandas as pd
import numpy as np
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
from accelerate import PartialState
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
from trl import SFTTrainer
from peft import prepare_model_for_kbit_training
from transformers import TrainingArguments

# Hyperparameters

In [None]:
modelpath = "google/gemma-2b-it"
lr=2e-4      # learning rate
bs=16            # batch size
bs_eval=16      # batch size for evals
ga_steps=1     # gradient acc. steps
epochs=4
max_length=128      # max. sample length with 24GB VRAM
output_dir="out"

# Remove old model
Because of limited storage, we can't save all models, we need to delete all models in cache by the following code:
- rm -r out: delete out folder (because I save finetuned model in out folder), you can change folder name like (rm -r output_folder). If you doesn't have out folder, this commend do not thing.
- all pretrained huggingface models will auto save in transformers.TRANSFORMERS_CACHE. I use glob.glob to read all folders and use shutil.rmtree to delete them.

In [None]:
!rm -r out

In [None]:
# from transformers import TRANSFORMERS_CACHE
# print(TRANSFORMERS_CACHE)

# import shutil
# shutil.rmtree(TRANSFORMERS_CACHE)

# Create Dataset
Download dataset from [kaggle synthetic-vietnamese-students-feedback-corpus](https://www.kaggle.com/datasets/toreleon/synthetic-vietnamese-students-feedback-corpus/data)

We need convert DataFrame to json line (jsonl).

In [None]:
df_train = pd.read_csv("synthetic_train.csv")
df_test = pd.read_csv("synthetic_val.csv")

In [None]:
df_train.head()

Unnamed: 0,sentence,sentiment,topic
0,Đội ngũ bảo trì quá thưa thớt dẫn đến không đả...,negative,facility
1,The university's musical and artistic faciliti...,neutral,facility
2,Phương pháp giảng dạy phù hợp với các đối tượn...,neutral,curriculum
3,Chương trình học giúp tôi trở thành một chuyên...,positive,curriculum
4,Tôi nghĩ rằng chương trình đào tạo có thể có t...,neutral,curriculum


In [None]:
df_test.head()

Unnamed: 0,sentence,sentiment,topic
0,Chất lượng vật chất kém.,negative,facility
1,"Phần mềm học tập quá khó sử dụng, khiến sinh v...",negative,facility
2,Trường tôi thiếu những tiện ích cơ bản như máy...,negative,facility
3,Cần tạo thêm các hoạt động gắn kết giữa sinh v...,neutral,curriculum
4,Họ rất khoan dung và lượng giác trong quan điể...,neutral,others


In [None]:
df_train.sentiment.value_counts()

sentiment
neutral     2724
negative    2711
positive    2709
Name: count, dtype: int64

In [None]:
df_test.sentiment.value_counts()

sentiment
negative    686
positive    680
neutral     670
Name: count, dtype: int64

In [None]:
df_train['len'] = df_train.sentence.apply(lambda x: len(str(x).split()))
df_test['len'] = df_test.sentence.apply(lambda x: len(str(x).split()))

In [None]:
df_train['len'].describe()

count    8144.000000
mean       15.549730
std         5.018764
min         3.000000
25%        12.000000
50%        15.000000
75%        18.000000
max        43.000000
Name: len, dtype: float64

In [None]:
df_test['len'].describe()

count    2036.000000
mean       15.694990
std         5.185957
min         2.000000
25%        12.000000
50%        15.000000
75%        19.000000
max        48.000000
Name: len, dtype: float64

In [None]:
with open('train.jsonl', 'w') as outfile:
    for i, x in df_train.iterrows():
        comment = x['sentence']
        label = x['sentiment']
        #label = 'yes' if label == 'relevance' else 'no'
        data = {
            "input": f'''The sentiment of this comment "{comment}" is''',
            "output": f"{label}"
        }
        json.dump(data, outfile)
        outfile.write('\n')

In [None]:
with open('test.jsonl', 'w') as outfile:
    for i, x in df_test.iterrows():
        comment = x['sentence']
        label = x['sentiment']
        #label = 'yes' if label == 'relevance' else 'no'
        data = {
            "input": f'''The sentiment of this comment "{comment}" is''',
            "output": f"{label}"
        }
        json.dump(data, outfile)
        outfile.write('\n')

In [None]:
data_files = {
    "train": "train.jsonl",
    "validation": "test.jsonl",
}

dataset = load_dataset("json", data_files=data_files)

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'output'],
        num_rows: 8144
    })
    validation: Dataset({
        features: ['input', 'output'],
        num_rows: 2036
    })
})

# Create prompt format
Load tokenizer by AutoTokenizer.from_pretrained:
- We need create and copy YOUR TOKEN from [huggingface](https://huggingface.co/settings/tokens)
- We need use padding_side = 'right' because training library need padding_side = 'right' when training. You can use left padding but you need to make sure the library is not corrupted and check whether performance is affected by left padding!
- If you have 1 prompt like "test thử mô hình" and want to tokenize it, just you tokenizer(prompt, return_tensors="pt"). You can see output of tokenizer in cell below (output includes input_ids (list index of each token in prompt) and attention mask)
- We can use tokenizer.batch_decode to see how tokenizer restore string from token tensor. You can see that it automatically adds the start token "<bos>" at the beginning of the string.
- To train llm, we only need to pass 1 sentence to llm (including input and desired output) without specifying which is the input and which is the output.
- I wrote the function formatting_prompts_func to convert input and output to prompt and tested this function, you can see below.
- When predicting, remove the output part to let the model predict itself. See the predict section below later.
- we only need to predict some next tokens like A. positive, and B. neutral. We don't care what the model says after sentiment. Then we do not need to add <eos token>. If you fine-tune the model with other tasks, maybe you need to add <eos token> at the end of the prompt:
  - use tokenizer.eos_token, tokenizer.eos_token_id to see eos_token of your model and correspond id
  - for ex: eos_token is "<|im_end|>". You need edit prompt like:
    - '''...The correct answer is {output_}.''' --> '''...The correct answer is {output_}. <|im_end|>'''
  - for ex: eos_token is "end_token__". You need edit prompt like:
    - '''...The correct answer is {output_}.''' --> '''...The correct answer is {output_}. end_token__'''

In [None]:
tokenizer = AutoTokenizer.from_pretrained(
    modelpath,
    padding_side="right",
    # add_eos_token=True,
    # add_bos_token=True,
    trust_remote_code=True,
    token = 'YOUR TOKEN HERE'
)

tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

In [None]:
prompt = "test thử mô hình"
tokens = tokenizer(prompt, return_tensors="pt")
print(tokens)
tokenizer.batch_decode(tokenizer.encode(prompt))

{'input_ids': tensor([[     2,   2195, 101869,  34580,  15885]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}


['<bos>', 'test', ' thử', ' mô', ' hình']

In [None]:
def formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example['input'])):
        input_ = example['input'][i]
        output_ = example['output'][i]
        output_ = 'A. Positive' if output_ == 'positive' else 'B. Neutral' if output_ == 'neutral' else 'C. Negative'
        #text = f"### Question: {input__}\n ### Answer: {example['output'][i]}"
        text = f'''{input_}
A. Positive
B. Neutral
C. Negative

The correct answer is {output_}.'''

        output_texts.append(text)
    return output_texts

In [None]:
dataset["validation"]['input'][0], dataset["validation"]['output'][0]

('The sentiment of this comment "Chất lượng vật chất kém." is', 'negative')

In [None]:
formatting_prompts_func({'input': [dataset["validation"]['input'][0]],
                         'output': [dataset["validation"]['output'][0]]})

['The sentiment of this comment "Chất lượng vật chất kém." is\nA. Positive\nB. Neutral\nC. Negative\n\nThe correct answer is C. Negative.']

In [None]:
print(formatting_prompts_func({'input': [dataset["validation"]['input'][0]],
                         'output': [dataset["validation"]['output'][0]]})[0])

The sentiment of this comment "Chất lượng vật chất kém." is
A. Positive
B. Neutral
C. Negative

The correct answer is C. Negative.


# Create model

## Load model
Because we use 4 bit for training, we need use BitsAndBytesConfig:
- load_in_4bit = True: we will load model with 4 bit format.
- bnb_4bit_use_double_quant = True: use double quant (you can search how double quant work)
- bnb_4bit_quant_type = 'nf4': nf4 is the normalized float 4 bit data type. It quantizes floats into 4 bits (you can search about this)
- bnb_4bit_compute_dtype: dtype before quantize (we use bfloat16, if your gpu don't support bfloat16, set it to float32).

We use AutoModelForCausalLM.from_pretrained to load model:
- device_map = 'auto': auto active gpu.
- torch_dtype: use bfloat16, if your gpu don't support bfloat16, set it to float32
- quantization_config: config of quantization (bnb_config above)
- attn_implementation: you can use 'flash_attention_2', if you meet bug, maybe your gpu doesn't support it, you need delete this line.
- token: you huggingface token like tokenizer above.

In [None]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    modelpath,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    #torch_dtype=torch.float32,
    quantization_config=bnb_config,
    attn_implementation="flash_attention_2",
    trust_remote_code=True,
    token = 'YOUR TOKEN HERE'
)

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [None]:
model

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaFlashAttention2(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear4bit(in_features=16384, out_features=2048, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): GemmaRMSNorm()
        (post_attention_layernorm): GemmaRMSNorm()
  

## Eval model before training
We use create_prompt function to generate prompt (without output), our model will predict output. We will eval model before finetune. You can see some predict below.

You can see that pretrained model can't clarify neutral class!

In [None]:
def create_prompt(input_, output_):
    output_ = 'A. Positive' if output_ == 'positive' else 'B. Neutral' if output_ == 'neutral' else 'C. Negative'
        #text = f"### Question: {input__}\n ### Answer: {example['output'][i]}"
    text = f'''{input_}
A. Positive
B. Neutral
C. Negative

The correct answer is'''

    return text

sentence = 'món ăn rất ngon'
input_ = f'''The sentiment of this comment: "{sentence}" is'''
prompt = create_prompt(input_, "")
print(prompt)

inputs = torch.tensor([tokenizer.encode(prompt)])

tokens = model.generate(
    inputs.to(model.device),
    max_new_tokens=50,
    temperature=0.1,
    do_sample=False
)
print(tokenizer.decode(tokens[0], skip_special_tokens=False))

The sentiment of this comment: "món ăn rất ngon" is
A. Positive
B. Neutral
C. Negative

The correct answer is


The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.


<bos>The sentiment of this comment: "món ăn rất ngon" is
A. Positive
B. Neutral
C. Negative

The correct answer is A. Positive.

"Mon ăn rất ngon" means "This food is very delicious".<eos>


In [None]:
from datetime import datetime

start = datetime.now()

prediction = []
response = []
accuracy = []
labels = []

for i, x in df_test.iterrows():
    sentence = x['sentence']
    label = x['sentiment']
    input_ = f'''The sentiment of this comment: "{sentence}" is'''
    prompt = create_prompt(input_, label)

    inputs = tokenizer.encode(
        prompt,
        # add_generation_prompt=True,
        return_tensors='pt'
    )

    tokens = model.generate(
        inputs.to(model.device),
        max_new_tokens=3,
        temperature=0.1,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id
    )
    #break

    answer = tokenizer.decode(tokens[0], skip_special_tokens=False).split("The correct answer is ")[-1]
    answer = 'positive' if 'positive' in answer.lower() else 'negative' if 'negative' in answer.lower() else 'neutral'
    prediction.append(answer.lower())
    response.append(tokenizer.decode(tokens[0], skip_special_tokens=False))

    accuracy.append(prediction[-1] == label)
    labels.append(label)

    if i % 100 == 0:
        print(i, np.array(accuracy).sum()/len(prediction)*100, datetime.now() - start, (datetime.now() - start)/len(prediction))

The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


0 100.0 0:00:00.094615 0:00:00.094618
100 64.35643564356435 0:00:09.207557 0:00:00.091164
200 66.16915422885572 0:00:18.205571 0:00:00.090575
300 64.11960132890366 0:00:27.193370 0:00:00.090343
400 65.5860349127182 0:00:36.242059 0:00:00.090379
500 66.46706586826348 0:00:45.228362 0:00:00.090276
600 64.72545757071548 0:00:54.239705 0:00:00.090249
700 63.7660485021398 0:01:03.279808 0:00:00.090271
800 64.54431960049938 0:01:12.326279 0:00:00.090295
900 63.4850166481687 0:01:21.344465 0:00:00.090282
1000 63.73626373626373 0:01:30.366041 0:00:00.090276
1100 63.21525885558583 0:01:39.423528 0:00:00.090303
1200 62.781015820149875 0:01:48.456868 0:00:00.090305
1300 63.18216756341276 0:01:57.472199 0:00:00.090294
1400 63.24054246966453 0:02:06.496202 0:00:00.090290
1500 63.49100599600267 0:02:15.501607 0:00:00.090274
1600 63.897564022485945 0:02:24.539199 0:00:00.090281
1700 63.78600823045267 0:02:33.549107 0:00:00.090270
1800 63.96446418656302 0:02:42.632562 0:00:00.090301
1900 63.9663335086

In [None]:
from sklearn.metrics import classification_report, f1_score
import sklearn

print(sklearn.metrics.classification_report(labels, prediction, digits=4))

              precision    recall  f1-score   support

    negative     0.9670    0.8965    0.9304       686
     neutral     0.0000    0.0000    0.0000       670
    positive     0.4857    1.0000    0.6538       680

    accuracy                         0.6361      2036
   macro avg     0.4842    0.6322    0.5281      2036
weighted avg     0.4880    0.6361    0.5319      2036



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
for x in response[-10:]:
    print(x)
    print("\n")

<bos>The sentiment of this comment: "The facilities of the university are versatile and helpful." is
A. Positive
B. Neutral
C. Negative

The correct answer is A. Positive


<bos>The sentiment of this comment: "Mấy bạn đó hay đòi hỏi nhưng không bao giờ giúp đỡ người khác." is
A. Positive
B. Neutral
C. Negative

The correct answer is C. Negative


<bos>The sentiment of this comment: "Cậu ấy rất có kỹ năng về sáng tạo và nghệ thuật." is
A. Positive
B. Neutral
C. Negative

The correct answer is A. Positive


<bos>The sentiment of this comment: "Giảng viên này không nhàm chán." is
A. Positive
B. Neutral
C. Negative

The correct answer is A. Positive


<bos>The sentiment of this comment: "Anh ta là một người rất tỉ mỉ và cẩn thận." is
A. Positive
B. Neutral
C. Negative

The correct answer is A. Positive


<bos>The sentiment of this comment: "Giáo viên đưa ra các phương tiện hỗ trợ giảng dạy rất tốt và hiệu quả." is
A. Positive
B. Neutral
C. Negative

The correct answer is A. Positive


<bos

## Create peft
- To training model with 4 bit, we need you prepare_model_for_kbit_training.
- For lora training, we need craete LoraConfig:
  - lora_alpha, r, dropout is basic hyperarameter.
  - almost LLM use bias = 'none' when finetune then we set is to None
  - target_modules are list of layer we want to finetune, I set it to 'all-linear', then all linear layers will be finetuned. If you just finetune some layers like q_proj, k_proj, you can pass list ['q_proj', 'k_proj'].
  - modules_to_save is other layers we want to finetune (but don't use lora), in my experience, finetune all embedding layers will make model work betters than I set modules_to_save as list of all embedding layers.

In [None]:
from peft import prepare_model_for_kbit_training

model = prepare_model_for_kbit_training(model)

In [None]:
peft_config = LoraConfig(
    lora_alpha=32,
    lora_dropout=0.1,
    r=8,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules = 'all-linear',
#     target_modules=["q_proj",
#         "k_proj",
#         "v_proj",
#         "o_proj",
#         "gate_proj",
#         "up_proj",
#         "down_proj",
#         "lm_head",],
    modules_to_save=["embed_tokens", "rotary_emb"]
                     #"input_layernorm", "post_attention_layernorm", "norm"]
)
model = get_peft_model(model, peft_config)

model.print_trainable_parameters() #

trainable params: 534,093,824 || all params: 3,040,266,240 || trainable%: 17.5673


## Create TrainingArguments
If your model doesn't support bfloat16, set bf16 = False and turn on fb16 = True.

In [None]:
print(len(df_train)//bs//ga_steps*epochs//4)

509


In [None]:
save_step = len(df_train)//bs//ga_steps*epochs//4
print(save_step)

509


In [None]:
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=epochs,
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs_eval,
    evaluation_strategy="steps",
    eval_steps=save_step,
    gradient_accumulation_steps=ga_steps,
    optim="paged_adamw_32bit",
    save_steps=save_step,
    save_strategy="steps",
    logging_steps=save_step,
    learning_rate=lr,
    weight_decay=0.001,
    fp16=False,
    bf16=True,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="none",
    save_total_limit=1,
    #load_best_model_at_end=True
)



## Create trainer

In [None]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    peft_config=peft_config,
    max_seq_length= 128,
    #dataset_text_field=["input", "output"],
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
    formatting_func = formatting_prompts_func,
    #data_collator=collator
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/8144 [00:00<?, ? examples/s]

Map:   0%|          | 0/2036 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


# Train

In [None]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
509,0.8359,0.775727
1018,0.6789,0.783178
1527,0.5823,0.805292
2036,0.5042,0.857617



Cannot access gated repo for url https://huggingface.co/google/gemma-2b-it/resolve/main/config.json.
Access to model google/gemma-2b-it is restricted. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in google/gemma-2b-it.

Cannot access gated repo for url https://huggingface.co/google/gemma-2b-it/resolve/main/config.json.
Access to model google/gemma-2b-it is restricted. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in google/gemma-2b-it.

Cannot access gated repo for url https://huggingface.co/google/gemma-2b-it/resolve/main/config.json.
Access to model google/gemma-2b-it is restricted. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in google/gemma-2b-it.

Cannot access gated repo for url https://huggingface.co/google/gemma-2b-it/resolve/main/config.json.
Access to model google/gemma-2b-it is restricted. You must be authenticated to access

TrainOutput(global_step=2036, training_loss=0.6503350908020858, metrics={'train_runtime': 740.7535, 'train_samples_per_second': 43.977, 'train_steps_per_second': 2.749, 'total_flos': 1.834279790247936e+16, 'train_loss': 0.6503350908020858, 'epoch': 4.0})

# Eval
- If you save the model to the output_dir folder, the training code above will automatically save the model to output_dir/checkpoint-{save_step} (in my case: out/checkpoint-2036). We will load the model from this folder:
  - I calculate _id = save_step * num_epoch = 509 * 4 = 2036
  - My model will saved at "{output_dir}/checkpoint-{_id}"
- I use del model, gc.collect() and torch.cuda.empty_cache() to release trained model (save gpu memory).
- We use PeftConfig.from_pretrained to lead peft config (this is the same as peft config at training)
- we load pretrained model like before.
- We load the tokenizer in this folder by passing the saved folder to AutoTokenizer.from_pretrained()
- We use PeftModel.from_pretrained(model, peft_model_id) to merge pre-trained model with finetuned lora model (note that when we use peft lora, training code only saves lora model, this saves our storage and saving time)

In [None]:
save_step = len(df_train)//bs//ga_steps*epochs//4
print(save_step)

509


In [None]:
import gc

del model
gc.collect()
torch.cuda.empty_cache()

In [None]:
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

_id = save_step*4

peft_model_id = f"{output_dir}/checkpoint-{_id}"

config = PeftConfig.from_pretrained(peft_model_id)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    modelpath,
    device_map="auto",
    #torch_dtype=torch.float16,
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
    attn_implementation="flash_attention_2",
    trust_remote_code=True,
    token = 'YOUR TOKEN HERE'
)

tokenizer = AutoTokenizer.from_pretrained(f"{output_dir}/checkpoint-{_id}",
                                          trust_remote_code=True,
                                          padding_side='left',
                                          token='YOUR TOKEN HERE')

# Load the Lora model
model = PeftModel.from_pretrained(model, peft_model_id)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
def create_prompt(input_, output_):
    output_ = 'A. Positive' if output_ == 'positive' else 'B. Neutral' if output_ == 'neutral' else 'C. Negative'
        #text = f"### Question: {input__}\n ### Answer: {example['output'][i]}"
    text = f'''{input_}
A. Positive
B. Neutral
C. Negative

The correct answer is'''

    return text

In [None]:
sentence = 'món ăn rất ngon'
input_ = f'''The sentiment of this comment: "{sentence}" is'''
prompt = create_prompt(input_, "")
print(prompt)

The sentiment of this comment: "món ăn rất ngon" is
A. Positive
B. Neutral
C. Negative

The correct answer is


In [None]:
inputs = torch.tensor([tokenizer.encode(prompt)])
inputs

tensor([[     2,    651,  25627,    576,    736,   4986, 235292,    664,  92020,
          28644,  31085,  60774, 235281,    603,    108, 235280, 235265,  40695,
            108, 235305, 235265,  62407,    108, 235288, 235265,  48314,    109,
            651,   5112,   3448,    603]])

In [None]:
tokens = model.generate(
    inputs.to(model.device),
    max_new_tokens=50,
    temperature=0.1,
    do_sample=True
)

In [None]:
print(tokenizer.decode(tokens[0], skip_special_tokens=False))

<bos>The sentiment of this comment: "món ăn rất ngon" is
A. Positive
B. Neutral
C. Negative

The correct answer is B. Neutral.

The neutral sentiment of this comment "Không có khu vực để sinh viên thư giãn sau giờ học." is a middle ground between positive and negative.<eos>


In [None]:
tokens = model.generate(
    inputs.to(model.device),
    max_new_tokens=3,
    temperature=0.1,
    do_sample=False
)
print(tokenizer.decode(tokens[0], skip_special_tokens=False))

<bos>The sentiment of this comment: "món ăn rất ngon" is
A. Positive
B. Neutral
C. Negative

The correct answer is B. Neutral




In [None]:
from datetime import datetime

start = datetime.now()

prediction = []
response = []
accuracy = []
labels = []

for i, x in df_test.iterrows():
    sentence = x['sentence']
    label = x['sentiment']
    input_ = f'''The sentiment of this comment: "{sentence}" is'''
    prompt = create_prompt(input_, label)

    inputs = tokenizer.encode(
        prompt,
        # add_generation_prompt=True,
        return_tensors='pt'
    )

    tokens = model.generate(
        inputs.to(model.device),
        max_new_tokens=3,
        temperature=0.1,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id
    )
    #break

    answer = tokenizer.decode(tokens[0], skip_special_tokens=False).split("The correct answer is ")[-1]
    answer = 'positive' if 'positive' in answer.lower() else 'negative' if 'negative' in answer.lower() else 'neutral'
    prediction.append(answer.lower())
    response.append(tokenizer.decode(tokens[0], skip_special_tokens=False))

    accuracy.append(prediction[-1] == label)
    labels.append(label)

    if i % 100 == 0:
        print(i, np.array(accuracy).sum()/len(prediction)*100, datetime.now() - start, (datetime.now() - start)/len(prediction))

0 100.0 0:00:00.122506 0:00:00.122509
100 88.11881188118812 0:00:12.185316 0:00:00.120647
200 84.07960199004975 0:00:24.150237 0:00:00.120150
300 84.38538205980066 0:00:36.177115 0:00:00.120190
400 84.53865336658353 0:00:48.170303 0:00:00.120125
500 84.83033932135729 0:01:00.213893 0:00:00.120187
600 85.19134775374376 0:01:12.246645 0:00:00.120211
700 85.30670470756063 0:01:24.269723 0:00:00.120214
800 85.39325842696628 0:01:36.232939 0:00:00.120141
900 86.01553829078802 0:01:48.269853 0:00:00.120166
1000 86.11388611388612 0:02:00.242135 0:00:00.120122
1100 86.37602179836512 0:02:12.229190 0:00:00.120099
1200 85.67860116569526 0:02:24.258395 0:00:00.120115
1300 85.7033051498847 0:02:36.222706 0:00:00.120079
1400 86.0813704496788 0:02:48.214244 0:00:00.120067
1500 85.67621585609594 0:03:00.188641 0:00:00.120046
1600 85.8213616489694 0:03:12.124168 0:00:00.120003
1700 86.0082304526749 0:03:24.116642 0:00:00.119998
1800 86.17434758467517 0:03:36.194998 0:00:00.120042
1900 86.2177801157285

In [None]:
from sklearn.metrics import classification_report, f1_score
import sklearn

print(sklearn.metrics.classification_report(labels, prediction, digits=4))

              precision    recall  f1-score   support

    negative     0.9742    0.9898    0.9819       686
     neutral     0.7877    0.8030    0.7953       670
    positive     0.8171    0.7882    0.8024       680

    accuracy                         0.8610      2036
   macro avg     0.8596    0.8603    0.8599      2036
weighted avg     0.8603    0.8610    0.8605      2036



# Check results
- I print all sentences have wrong prediction and see that almost cases is bug.
- Conclusion, dataset is not clean than finetune LLM can't better than finetune roberta (~89..90%), because roberta will overfit even in test dataset.
- In additionally, when I print example: "món ăn này rất ngon", we can see that model before finetune work better. Model after finetune only think about school (because finetune dataset is about school) and it don't know about food review. Than I think we only need finetune for special cases and finetune dataset need be clean and large enough.

In [None]:
df_test['predict'] = prediction

In [None]:
for i, row in df_test.iterrows():
    if row['predict'] != row['sentiment']:
        print(row['sentence'], row['predict'], row['sentiment'])

Ngôn ngữ tiếng Anh được sử dụng trong nhiều môn học như là ngôn ngữ chính. positive neutral
Tôi có thể ứng dụng các kiến thức có được từ chương trình đào tạo vào công việc của mình. neutral positive
Trường này giúp tốt cho việc thực hành và phát triển kỹ năng. neutral positive
Giảng viên rất tài năng và có nhiều kinh nghiệm trong công tác giảng dạy. positive neutral
Được học tập với các giáo viên giàu kinh nghiệm và thực tiễn. positive neutral
Nhà hàng và các cửa hàng tiện lợi ở gần trường rất đa dạng và phong phú. neutral positive
Cô ấy là một giáo viên rất thông minh và chuyên nghiệp. neutral positive
Anh ấy có một tầm nhìn đỉnh cao và khả năng giải quyết vấn đề tốt. positive neutral
Có nhiều hoạt động ngoại khóa và phong phú cho sinh viên. neutral positive
Giảng viên đầy năng lượng và sáng tạo trong việc truyền đạt kiến thức. neutral positive
Sân cỏ của trường được tu bổ và định hướng riêng cho các hoạt động thể thao. positive neutral
Phòng học được trang bị đầy đủ tiện nghi giúp si