[Link to the training notebook script](https://www.kaggle.com/code/ethanyee2706/project-01-anains)

# **I &nbsp;&nbsp;&nbsp; Import Libraries**

In [None]:
# Model loading, LoRA fine-tuning and training utilities
import torch
from torch.utils.data import Dataset
from peft import LoraConfig, get_peft_model, TaskType, PeftModel
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

import numpy as np # Manipulate matrices
import pandas as pd # Manipulate and analyze data

# Plotting
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report# Visual models evaluation

from data_preprocessor import DataPreprocessor

# **II &nbsp;&nbsp;&nbsp; Load Data**

In [None]:
# Cleaned CSV splits, given relative to the current working directory.
train_path = "../../data/input/cleaned/train/train.csv"
val_path = "../../data/input/cleaned/val/val.csv"
test_path = "../../data/input/cleaned/test/test.csv"

In [None]:
# Read the train, test and validation splits into DataFrames, in that order.
train_df, test_df, val_df = map(pd.read_csv, (train_path, test_path, val_path))

## **1 &nbsp;&nbsp;&nbsp; Encode Dataset**

In [None]:
# Tokenizer for the `vinai/phobert-base` checkpoint. A fast implementation is
# requested and execution of repository-provided code stays disabled.
tokenizer = AutoTokenizer.from_pretrained(
    "vinai/phobert-base",
    trust_remote_code=False,
    use_fast=True,
)

In [None]:
class PhoBERTSADataset(Dataset):
    """Dataset that tokenizes one text per item for sentiment classification.

    Args:
        df: DataFrame exposing a ``Text`` column (cast to ``str``) and a
            ``Label`` column (cast to ``int``).
        tokenizer: Callable invoked once per item. It must return a mapping
            holding ``input_ids`` and ``attention_mask`` tensors with a
            leading batch dimension of 1; ``token_type_ids`` is optional.
        max_length: Length every sequence is padded or truncated to.
    """

    # Keys every tokenizer output is expected to contain.
    _REQUIRED_KEYS = ("input_ids", "attention_mask")
    # Keys forwarded only when the tokenizer actually emits them; indexing
    # them unconditionally raises KeyError for tokenizers that omit them.
    _OPTIONAL_KEYS = ("token_type_ids",)

    def __init__(self, df, tokenizer, max_length: int = 256):
        self.texts = df.Text.astype(str).to_list()
        self.labels = df.Label.astype(int).to_list()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self) -> int:
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, index: int) -> dict:
        """Tokenize the example at ``index``.

        Returns:
            A dict with ``input_ids`` and ``attention_mask`` (batch dimension
            squeezed away), ``token_type_ids`` when the tokenizer provides it,
            and ``label`` as a ``torch.long`` tensor of shape ``(1,)``.
        """
        encoding = self.tokenizer(
            self.texts[index],
            add_special_tokens=True,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )

        # return_tensors="pt" yields a leading batch dimension of size 1;
        # drop it so the collator can stack the items itself.
        item = {key: encoding[key].squeeze(0) for key in self._REQUIRED_KEYS}
        item.update(
            (key, encoding[key].squeeze(0))
            for key in self._OPTIONAL_KEYS
            if key in encoding
        )
        # The label stays wrapped in a length-1 tensor.
        item["label"] = torch.tensor([self.labels[index]], dtype=torch.long)
        return item


In [None]:
# Wrap each split in the dataset class; sequences are padded/truncated to 256.
train_ds = PhoBERTSADataset(train_df, tokenizer, max_length=256)
val_ds = PhoBERTSADataset(val_df, tokenizer, max_length=256)
test_ds = PhoBERTSADataset(test_df, tokenizer, max_length=256)

# **III &nbsp;&nbsp;&nbsp; Model**

## **1 &nbsp;&nbsp;&nbsp; Load Model**

In [None]:
# Load `vinai/phobert-base` with a sequence-classification head for 3 labels.
model = AutoModelForSequenceClassification.from_pretrained(
    "vinai/phobert-base",
    trust_remote_code=False,
    num_labels=3,
)
print(model)  # print the model's repr for inspection

In [None]:
# LoRA settings for sequence classification: r=16, alpha=16, no adapter
# dropout, no bias training, targeting the modules listed in target_modules.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    target_modules=["query", "key", "value", "dense"],
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
)

# Wrap the base model with the adapters and report the trainable parameters.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

## **2 &nbsp;&nbsp;&nbsp; Train Model**

In [None]:
# Training configuration: evaluate once per epoch and log the loss at every
# step (logging_steps=1) so the per-step loss curve can be plotted afterwards.
training_args = TrainingArguments(
    output_dir="fine-tuned_PhoBERT",

    eval_strategy="epoch",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=4,

    learning_rate=2e-4,
    weight_decay=0.01,

    num_train_epochs=10,
    warmup_ratio=0.05,
    lr_scheduler_type="linear",
    logging_steps=1,

    seed=3407,
    dataloader_num_workers=2,

    # The dataset already returns exactly the keys that are fed to the model.
    remove_unused_columns=False,
    report_to="none",
)

# NOTE(review): recent transformers releases appear to deprecate `tokenizer=`
# in favour of `processing_class=` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
)

trainer_stats = trainer.train()
logs = trainer.state.log_history

# visual_loss: every logged training loss, in log order.
# val_loss:    one entry per evaluation.
# train_loss:  the most recent training loss at the time of each evaluation,
#              so it always has the same length as val_loss and both curves
#              can share one x-axis.
# Assumes a step's training-loss entry is appended to the log before that
# step's evaluation entry — TODO confirm for the installed transformers version.
visual_loss = []
val_loss = []
train_loss = []
for log in logs:
    if "eval_loss" in log:
        val_loss.append(log["eval_loss"])
        # NaN keeps the two lists aligned if no training loss was logged yet.
        train_loss.append(visual_loss[-1] if visual_loss else float("nan"))
    elif "loss" in log:
        visual_loss.append(log["loss"])

## **3 &nbsp;&nbsp;&nbsp; Evaluate Model**

In [None]:
# Run the fine-tuned model over the test split.
predictions = trainer.predict(test_ds)
# Ground-truth labels as returned by the trainer.
y_true = predictions.label_ids
# Predicted class = index of the largest score along axis 1.
y_pred = np.argmax(predictions.predictions, axis=1)

In [None]:
# Test-set metrics: printed per-class report (4 decimals), overall accuracy,
# and the confusion matrix used by the heat map.
print(classification_report(y_true, y_pred, digits=4))
acc = accuracy_score(y_true, y_pred)
cm = confusion_matrix(y_true, y_pred)

In [None]:
# Constants consumed by the plotting cells.
num_classes = 3     # number of tick labels on each confusion-matrix axis
eval_iters = 150    # spacing between points on the train/validation loss plot
train_iters = 1500  # upper bound of the x-axis ranges of the loss plots

In [None]:
# Annotated heat map of the confusion matrix with integer cell labels.
sns.heatmap(cm, cmap="Blues", annot=True, fmt="d", linewidths=0.5, cbar=True)

plt.title("Confusion Matrix", fontsize=14)
plt.xlabel("Predicted", fontsize=12)
plt.ylabel("True", fontsize=12)

# Ticks are offset by 0.5 so each class label sits at the centre of its cell.
plt.xticks(ticks=np.arange(num_classes) + 0.5, labels=range(num_classes))
plt.yticks(ticks=np.arange(num_classes) + 0.5, labels=range(num_classes), rotation=0)

plt.tight_layout()
# NOTE(review): the file name says "lstm" although this notebook evaluates
# PhoBERT and the other figures go to fine-tuned_PhoBERT/ — confirm the name.
plt.savefig("lstmcm.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Train vs. validation loss on a shared x-axis: one point every `eval_iters`
# iterations, starting at iteration 149.
steps = list(range(149, train_iters, eval_iters))
plt.figure(figsize=(8, 4))

for series, series_label in ((train_loss, "Train Loss"), (val_loss, "Validation Loss")):
    plt.plot(steps, series, linewidth=1.5, label=series_label)

plt.title("Training and Validation Loss")
plt.xlabel("Iteration")
plt.ylabel("Loss")
plt.legend()
plt.tight_layout()
plt.savefig("fine-tuned_PhoBERT/berttrainvalloss.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Per-step training loss: one x value for each of the `train_iters` iterations.
steps = list(range(train_iters))
plt.figure(figsize=(10, 4))

plt.plot(steps, visual_loss, label="Train Loss", linewidth=1)

plt.title("Training Loss")
plt.xlabel("Iteration")
plt.ylabel("Loss")
plt.legend()
plt.tight_layout()
plt.savefig("fine-tuned_PhoBERT/berttrainloss.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Sample sentence for a quick manual check of the fine-tuned model.
input_text = "Sản phẩm rất tệ"

# Apply the project's preprocessing, then tokenize to fixed-length tensors
# (padded/truncated to 256 tokens, returned as PyTorch tensors).
preprocessor = DataPreprocessor()
preprocessed_text = preprocessor.preprocess(input_text)
encoded_text = tokenizer(
    preprocessed_text,
    return_tensors="pt",
    max_length=256,
    padding="max_length",
    truncation=True,
    add_special_tokens=True,
)

def predict_sentiment(text):
    """Predict the sentiment class for already-tokenized input.

    Args:
        text: Tokenizer output (not a raw string). It must offer ``.to(device)``
            and unpack into the keyword arguments the module-level ``model``
            accepts.

    Returns:
        A CPU tensor of predicted class indices: 0-dimensional for a single
        example, shape ``(batch,)`` for several.
    """
    model.eval()
    # Run on whatever device the model lives on. After Trainer.train() that
    # may be a GPU, and forcing the inputs to CPU would cause a device mismatch.
    device = next(model.parameters()).device
    text = text.to(device)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        logits = model(**text).logits
    # Arg-max over the class dimension; without `dim` the index is taken over
    # the flattened tensor, which is wrong for more than one example.
    pred = torch.argmax(logits, dim=-1).squeeze().cpu()
    return pred


result = predict_sentiment(encoded_text)
print(result.item())

## **4 &nbsp;&nbsp;&nbsp; Save Model**

In [None]:
# Write the model and then the tokenizer into the same output directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained("fine-tuned_PhoBERT")