In [1]:
import os
import warnings
import pandas as pd
import numpy as np
import torch
from transformers import PhobertTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, f1_score, classification_report
import logging
import torch.nn as nn

# Quiet the notebook: silence Python warnings and show only transformers errors.
warnings.filterwarnings("ignore")
logging.getLogger("transformers").setLevel(logging.ERROR)

# Process-wide environment switches, set once for the whole session.
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is normally a debugging aid — confirm it is still wanted.
os.environ.update({
    'TF_CPP_MIN_LOG_LEVEL': '3',
    'CUDA_LAUNCH_BLOCKING': '1',
    'WANDB_DISABLED': 'true',
})

In [2]:
# ====== FILE PATHS ======
TRAIN_FILE = r"/content/Depression1287_train.xlsx"
VALID_FILE = r"/content/Depression1287_valid.xlsx"
TEST_FILE  = r"/content/Depression1287_test.xlsx"

# ====== READ DATA ======
# Load the three Excel splits in train / valid / test order.
train_df, valid_df, test_df = (
    pd.read_excel(split_path)
    for split_path in (TRAIN_FILE, VALID_FILE, TEST_FILE)
)


# ====== LABEL ENCODING (0..3) ======
label_order = ["1-B√¨nh th∆∞·ªùng", "2-Nh·∫π", "3-V·ª´a", "4-N·∫∑ng"]
label2id = {lbl: i for i, lbl in enumerate(label_order)}
id2label = {i: lbl for lbl, i in label2id.items()}


def _encode_labels(df, split_name):
    """Convert the "Label" column of ``df`` into integer class ids.

    Args:
        df: DataFrame holding a "Label" column.
        split_name: Name of the split, used only in the error message.

    Returns:
        An int64 Series of class ids that keeps the index of ``df``.

    Raises:
        ValueError: if any label is missing from ``label2id``. A plain
            ``.map`` would turn such rows into NaN without any error.
    """
    encoded = df["Label"].astype(str).map(label2id)
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = sorted(df.loc[unmapped, "Label"].astype(str).unique())
        raise ValueError(
            f"{split_name}: {int(unmapped.sum())} row(s) have labels outside "
            f"label_order: {unknown}"
        )
    return encoded.astype("int64")


# Map labels to ids (fails fast on unknown labels)
y_train = _encode_labels(train_df, "train")
y_valid = _encode_labels(valid_df, "valid")
y_test  = _encode_labels(test_df, "test")

# Extract the raw text of each split
Xtrain_text = train_df["Content"].astype(str).tolist()
Xvalid_text = valid_df["Content"].astype(str).tolist()
Xtest_text  = test_df["Content"].astype(str).tolist()

In [3]:
# 'balanced' class weights computed from the training labels only.
class_weights = compute_class_weight(
    class_weight='balanced', classes=np.unique(y_train), y=y_train.to_numpy()
)

# Position -> weight lookup, printed for a quick visual check.
class_weights_dict = dict(enumerate(class_weights))
print(class_weights_dict)

# Float tensor copy of the weights, later handed to the trainer.
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32)

class WeightedTrainer(Trainer):
    """Trainer that computes the loss with (optionally) class-weighted cross-entropy."""

    def __init__(self, class_weights=None, *args, **kwargs):
        """Create the trainer.

        Args:
            class_weights: Tensor of per-class weights given to
                ``nn.CrossEntropyLoss``, or None for an unweighted loss.
            *args, **kwargs: Forwarded unchanged to ``Trainer.__init__``.
                ``class_weights`` is the first positional parameter, so the
                regular Trainer arguments should be passed by keyword.
        """
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute cross-entropy between the model logits and ``inputs["labels"]``.

        Args:
            model: Model called with ``input_ids`` and ``attention_mask``.
            inputs: Batch mapping with "input_ids", "attention_mask" and "labels".
            return_outputs: When True, return ``(loss, outputs)`` instead of the loss.
            **kwargs: Accepted and ignored.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is True.
        """
        labels = inputs.get("labels")
        outputs = model(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"]
        )
        logits = outputs.logits

        # Take device and class count from the logits, not from `model`:
        # a wrapped model (e.g. nn.DataParallel) has no `.device` / `.config`.
        weight = None
        if self.class_weights is not None:
            weight = self.class_weights.to(logits.device)

        loss_fct = nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits.view(-1, logits.size(-1)),
                        labels.view(-1))

        return (loss, outputs) if return_outputs else loss

{0: np.float64(0.36883223684210525), 1: np.float64(1.180263157894737), 2: np.float64(2.7347560975609757), 3: np.float64(13.191176470588236)}


In [4]:
# ====== TOKENIZER ======
MODEL_NAME = "vinai/phobert-base"
tokenizer = PhobertTokenizer.from_pretrained(MODEL_NAME)


def tokenize_function(texts):
    """Tokenize ``texts`` with the module-level ``tokenizer``.

    Padding and truncation are enabled, with ``max_length`` set to 256.
    """
    tokenizer_options = dict(padding=True, truncation=True, max_length=256)
    return tokenizer(texts, **tokenizer_options)


print("üîÑ ƒêang tokenize d·ªØ li·ªáu...")
# Encode the splits one after another: train, then valid, then test.
train_encodings, valid_encodings, test_encodings = map(
    tokenize_function, (Xtrain_text, Xvalid_text, Xtest_text)
)

vocab.txt: 0.00B [00:00, ?B/s]

bpe.codes: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

üîÑ ƒêang tokenize d·ªØ li·ªáu...


In [5]:
# Cell 5: Build the PyTorch Dataset
class DepressionDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer class labels.

    Each item is a dict with "input_ids", "attention_mask" and "labels" tensors.
    """

    def __init__(self, encodings, labels):
        """Store the encodings and labels after basic validation.

        Args:
            encodings: Mapping with "input_ids" and "attention_mask" entries,
                each indexable by row position.
            labels: Class ids as a pandas Series, list, or other array-like.
                Rows are addressed by position, never by Series index label.

        Raises:
            ValueError: if the number of labels differs from the number of
                encoded rows, or if any label is missing (NaN/None).
        """
        n_rows = len(encodings['input_ids'])
        if n_rows != len(labels):
            raise ValueError(
                f"encodings have {n_rows} rows but {len(labels)} labels were given"
            )

        # Positional array copy, so any array-like works and a Series with a
        # non-default index is still read by position.
        label_array = np.asarray(labels)
        if pd.isna(label_array).any():
            raise ValueError("labels contain missing values (NaN/None)")

        self.encodings = encodings
        self.labels = labels
        self._label_ids = label_array

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the tensors of the example at position ``idx``."""
        item = {
            'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
            'attention_mask': torch.tensor(self.encodings['attention_mask'][idx])
        }
        item["labels"] = torch.tensor(int(self._label_ids[idx]), dtype=torch.long)
        return item

# Wrap each split's encodings and labels in a DepressionDataset (train, valid, test).
train_dataset, valid_dataset, test_dataset = (
    DepressionDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, y_train),
        (valid_encodings, y_valid),
        (test_encodings, y_test),
    )
)

In [6]:
# Cell 6: Initialize the model
# num_labels is derived from label2id so the classification head always
# matches the label maps passed alongside it.
model = RobertaForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

In [7]:
def compute_metrics(p):
    """Return accuracy and weighted F1 for an evaluation result.

    Args:
        p: Object that unpacks into ``(predictions, labels)``; the predicted
            class is the argmax of ``predictions`` along axis 1.

    Returns:
        Dict with keys 'accuracy' and 'f1'.
    """
    scores, gold = p
    predicted = np.argmax(scores, axis=1)
    return {
        'accuracy': accuracy_score(gold, predicted),
        'f1': f1_score(gold, predicted, average='weighted'),
    }

# Training configuration: evaluate and checkpoint once per epoch, then reload
# the checkpoint with the best validation F1 when training finishes.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=25,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    eval_strategy="epoch",
    save_strategy="epoch",
    # Without a limit, one checkpoint per epoch (25 in total) stays on disk.
    # The best checkpoint is still kept because load_best_model_at_end=True.
    save_total_limit=2,
    logging_dir='./logs',
    logging_steps=15,
    learning_rate = 3e-5,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    report_to="none"
)

In [8]:
# Assemble the class-weighted trainer and start fine-tuning.
# NOTE(review): newer transformers releases name the `tokenizer` argument
# `processing_class` — confirm against the installed version.
trainer = WeightedTrainer(
    class_weights=class_weights_tensor,
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
)

# Keep the value returned by train() for later inspection.
train_history = trainer.train()

model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

{'loss': 1.4883, 'grad_norm': 5.4980244636535645, 'learning_rate': 2.9705263157894738e-05, 'epoch': 0.2631578947368421}
{'loss': 1.3887, 'grad_norm': 4.891634941101074, 'learning_rate': 2.9389473684210528e-05, 'epoch': 0.5263157894736842}
{'loss': 1.3202, 'grad_norm': 2.083486795425415, 'learning_rate': 2.9073684210526314e-05, 'epoch': 0.7894736842105263}
{'eval_loss': 1.684542179107666, 'eval_accuracy': 0.36153846153846153, 'eval_f1': 0.336804542774692, 'eval_runtime': 1.695, 'eval_samples_per_second': 76.696, 'eval_steps_per_second': 1.77, 'epoch': 1.0}
{'loss': 1.3495, 'grad_norm': 6.504144191741943, 'learning_rate': 2.8757894736842107e-05, 'epoch': 1.0526315789473684}
{'loss': 1.4556, 'grad_norm': 3.075618028640747, 'learning_rate': 2.8442105263157897e-05, 'epoch': 1.3157894736842106}
{'loss': 1.3664, 'grad_norm': 3.5294387340545654, 'learning_rate': 2.8126315789473684e-05, 'epoch': 1.5789473684210527}
{'loss': 1.3196, 'grad_norm': 3.8392770290374756, 'learning_rate': 2.78105263157

In [9]:
# Evaluate on the held-out test split and print a per-class report.
test_results = trainer.evaluate(test_dataset)
predictions = trainer.predict(test_dataset)
predicted_labels = np.argmax(predictions.predictions, axis=1)
true_labels = y_test.values

# Pass the full label set explicitly: without `labels=`, the report raises
# when a class is missing from both the truth and the predictions, because
# target_names would no longer match the classes found.
class_ids = sorted(id2label)
print(classification_report(
    true_labels,
    predicted_labels,
    labels=class_ids,
    target_names=[id2label[i] for i in class_ids],
))

{'eval_loss': 1.9451850652694702, 'eval_accuracy': 0.6692307692307692, 'eval_f1': 0.6513640238704177, 'eval_runtime': 3.7153, 'eval_samples_per_second': 69.981, 'eval_steps_per_second': 1.346, 'epoch': 25.0}
               precision    recall  f1-score   support

1-B√¨nh th∆∞·ªùng       0.76      0.85      0.80       174
        2-Nh·∫π       0.42      0.27      0.33        56
        3-V·ª´a       0.33      0.38      0.35        24
       4-N·∫∑ng       0.67      0.33      0.44         6

     accuracy                           0.67       260
    macro avg       0.54      0.46      0.48       260
 weighted avg       0.65      0.67      0.65       260

