In [1]:
from google.colab import drive
# Mount Google Drive inside the Colab runtime at /content/gdrive.
drive.mount('/content/gdrive')
# Make the project folder on Drive the notebook's working directory.
%cd '/content/gdrive/MyDrive/Machine-Learning'

Mounted at /content/gdrive
/content/gdrive/MyDrive/Machine-Learning


In [2]:
!pip install transformers datasets torch


Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.

In [None]:
import os
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
from phobert_model import PhoBERTModel
from preprocessing import clean_text

# CUDA debugging flags, exported through the process environment.
os.environ.update({
    "CUDA_LAUNCH_BLOCKING": "1",
    "TORCH_USE_CUDA_DSA": "1",
})

# Root folder of the project on the mounted Google Drive. This only defines the
# path used to build data/result locations; it does not change the working directory.
project_path = "/content/gdrive/MyDrive/Machine-Learning"

def preprocess_dataset(file_path, valid_labels=(0, 1, 2), output_path=None):
    """Clean a labelled CSV so it only contains complete rows with valid labels.

    Steps:
      1. Read ``file_path`` and drop every row containing a missing value.
      2. Parse the ``label`` column as numbers; anything that cannot be parsed,
         or that is not exactly one of ``valid_labels``, marks the row invalid.
         (Fractional labels such as ``1.5`` are rejected instead of being
         silently truncated to a valid class.)
      3. Drop invalid rows, store ``label`` as ``int`` and write the result.

    Args:
        file_path: Path of the CSV to clean. Must contain a ``label`` column.
        valid_labels: Iterable of accepted integer label values.
        output_path: Where to write the cleaned CSV. ``None`` (default)
            overwrites ``file_path`` in place, which is destructive.

    Returns:
        The path the cleaned CSV was written to.
    """
    df = pd.read_csv(file_path).dropna()

    # Coerce instead of astype(int): bad cells become NaN rather than raising,
    # and non-integer values are not rounded into a valid class.
    numeric_labels = pd.to_numeric(df["label"], errors="coerce")
    is_valid = numeric_labels.isin(list(valid_labels))

    if not is_valid.all():
        invalid_values = df.loc[~is_valid, "label"].unique()
        print(f"🚨 Dataset {file_path} có label không hợp lệ: {invalid_values}")
        print("✅ Fixed labels.")

    cleaned = df.loc[is_valid].assign(label=numeric_labels[is_valid].astype(int))

    destination = file_path if output_path is None else output_path
    cleaned.to_csv(destination, index=False)
    return destination

def fine_tune_phobert(use_cpu=True, max_length=256):
    """Fine-tune ``vinai/phobert-base`` as a 3-class sequence classifier.

    Cleans ``data/train.csv`` and ``data/test.csv`` under ``project_path``
    with ``preprocess_dataset``, tokenizes the ``comment`` column, trains
    with the Hugging Face ``Trainer`` and saves the model together with its
    tokenizer to ``{project_path}/sentiment_phobert``.

    Args:
        use_cpu: Forwarded to ``TrainingArguments(use_cpu=...)``. The default
            ``True`` keeps the original behaviour of training on the CPU;
            pass ``False`` to let the Trainer pick an available accelerator.
        max_length: Length every comment is padded/truncated to when tokenized.
    """
    model_path = "vinai/phobert-base"
    # The slow (Python) tokenizer is requested explicitly.
    tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
    # num_labels=3 matches the label set {0, 1, 2} enforced by preprocess_dataset.
    model = RobertaForSequenceClassification.from_pretrained(
        model_path, num_labels=3, ignore_mismatched_sizes=True
    )

    # Clean both CSV files before loading them.
    train_path = preprocess_dataset(f"{project_path}/data/train.csv")
    test_path = preprocess_dataset(f"{project_path}/data/test.csv")

    dataset = load_dataset("csv", data_files={"train": train_path, "test": test_path})

    # Sanity-check the training labels.
    labels = np.array(dataset["train"]["label"])
    print("✅ NaN in dataset:", np.isnan(labels).any())
    print("✅ Unique labels:", np.unique(labels))

    # NOTE(review): tokenizer.vocab_size may not count added special tokens —
    # confirm this bound against the model's embedding size.
    vocab_size = tokenizer.vocab_size
    print("📌 PhoBERT vocab size:", vocab_size)

    def preprocess_function(examples):
        """Tokenize a batch of comments and warn about out-of-vocabulary ids."""
        tokens = tokenizer(
            examples["comment"], padding="max_length", truncation=True, max_length=max_length
        )
        # The id check runs here, inside the batched map, so the data is
        # tokenized once instead of a second time in a per-sample loop.
        for i, token_list in enumerate(tokens["input_ids"]):
            if max(token_list) >= vocab_size:
                print(f"🚨 Lỗi: Input {examples['comment'][i]} có token ngoài vocab!")
        return tokens

    tokenized_datasets = dataset.map(preprocess_function, batched=True)

    training_args = TrainingArguments(
        output_dir=f"{project_path}/results",
        per_device_train_batch_size=2,  # small batches to limit memory use
        per_device_eval_batch_size=2,
        gradient_accumulation_steps=16,  # 2 * 16 = effective batch of 32 per device
        num_train_epochs=3,
        weight_decay=0.01,
        report_to="none",
        use_cpu=use_cpu,  # True forces CPU training
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
    )

    print("🚀 Starting training...")
    trainer.train()

    # Save the model to Google Drive, and the tokenizer next to it so the
    # directory can be reloaded on its own for inference.
    save_dir = f"{project_path}/sentiment_phobert"
    trainer.save_model(save_dir)
    tokenizer.save_pretrained(save_dir)
    print("✅ Model saved successfully!")

def main():
    """Run fine-tuning, then print a prediction for every row of the test CSV."""
    fine_tune_phobert()

    # Reload the checkpoint that fine-tuning just wrote, via the project wrapper.
    predictor = PhoBERTModel(f"{project_path}/sentiment_phobert")

    # Test split used for the prediction printout.
    test_frame = pd.read_csv(f"{project_path}/data/test.csv")

    for _, record in test_frame.iterrows():
        cleaned = clean_text(record['comment'])
        sentiment, confidence, scores = predictor.predict(cleaned)

        print(f"Text: {cleaned}")
        print(f"Predicted Sentiment: {sentiment}, Confidence: {confidence:.2f}")
        print(f"Scores: {scores}\n")


# Entry point: runs when this code is executed as the main module.
if __name__ == "__main__":
    main()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.13M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

✅ NaN in dataset: False
✅ Unique labels: [0 1]
📌 PhoBERT vocab size: 64000


Map:   0%|          | 0/12870 [00:00<?, ? examples/s]

Map:   0%|          | 0/3217 [00:00<?, ? examples/s]

🚀 Starting training...


Step,Training Loss
