In [1]:
# Mount Google Drive and make the project folder the working directory.
# NOTE(review): presumably phobert_model.py, preprocessing.py and data/ live in
# this folder, which is why later imports and paths resolve — confirm.
from google.colab import drive
drive.mount('/content/gdrive')
%cd '/content/gdrive/MyDrive/Machine-Learning'

Mounted at /content/gdrive
/content/gdrive/MyDrive/Machine-Learning


In [2]:
!pip install transformers datasets torch


Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.

In [None]:
import os
import pandas as pd
import torch
from transformers import AutoTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
from phobert_model import PhoBERTModel
from preprocessing import clean_text
import logging

# Enable CUDA debugging
# NOTE(review): these look like temporary debugging aids; CUDA_LAUNCH_BLOCKING=1
# presumably forces synchronous kernel launches and slows training — confirm
# and remove once the CUDA error being chased is resolved.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
os.environ['TORCH_USE_CUDA_DSA'] = "1"

# Set up logging
# INFO-level root logging with timestamp and level prefix; `logger` is shared
# by every function in this cell.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# Project path on Google Drive
project_path = "/content/gdrive/MyDrive/Machine-Learning"
# Directory the fine-tuned model is saved to and reloaded from.
model_save_path = f"{project_path}/sentiment_phobert"
# Text file holding the number of training rows already processed
# (read by load_checkpoint, written by save_checkpoint).
checkpoint_file = f"{project_path}/checkpoint.txt"

def load_checkpoint():
    """Return the number of training rows already processed.

    Reads ``checkpoint_file`` and parses its content as an integer.

    Returns:
        int: The stored row count, or 0 when the file does not exist or does
        not contain a valid integer (for example an empty file left behind by
        an interrupted write). In the latter case a warning is logged.
    """
    if not os.path.exists(checkpoint_file):
        return 0

    with open(checkpoint_file, "r") as f:
        raw = f.read().strip()

    try:
        return int(raw)
    except ValueError:
        # An empty or garbled checkpoint should not abort the whole run;
        # restart the row counter and tell the user why.
        logger.warning(f"Ignoring unreadable checkpoint in {checkpoint_file}: {raw!r}. Starting from row 0.")
        return 0

def save_checkpoint(processed_rows):
    """Overwrite the checkpoint file with the given processed-row count.

    Args:
        processed_rows: Value to store; it is written as ``str(processed_rows)``.
    """
    with open(checkpoint_file, "w") as handle:
        payload = str(processed_rows)
        handle.write(payload)

def preprocess_dataframe(df):
    """Return a cleaned copy of ``df`` restricted to the labels 0, 1 and 2.

    Rows containing any missing value are dropped, the ``label`` column is
    cast to ``int``, and rows whose label is outside the valid set are removed
    after logging a warning. The input frame itself is not modified.

    Args:
        df: DataFrame with at least a ``label`` column.

    Returns:
        The cleaned DataFrame (original index values are kept).
    """
    valid_labels = [0, 1, 2]

    cleaned = df.dropna().copy()
    cleaned["label"] = cleaned["label"].astype(int)

    invalid_labels = set(cleaned["label"]).difference(valid_labels)
    if not invalid_labels:
        return cleaned

    logger.warning(f"Found invalid labels: {invalid_labels}. Filtering to valid labels {valid_labels}.")
    keep_mask = cleaned["label"].isin(valid_labels)
    return cleaned[keep_mask]

def process_chunk(chunk, tokenizer):
    """Clean one DataFrame chunk and tokenize its ``comment`` column.

    The chunk is passed through ``preprocess_dataframe``, converted to a
    ``datasets.Dataset`` and tokenized with padding/truncation to 256 tokens.
    The raw ``comment`` column is removed from the result; other columns
    (such as ``label``) are kept.

    Args:
        chunk: pandas DataFrame with ``comment`` and ``label`` columns.
        tokenizer: Callable tokenizer applied to batches of comments.

    Returns:
        The tokenized ``datasets.Dataset``.
    """
    chunk = preprocess_dataframe(chunk)
    # preserve_index=False: once rows have been dropped the pandas index is no
    # longer a plain range, and from_pandas would otherwise store it as an
    # extra "__index_level_0__" column in the dataset.
    dataset = Dataset.from_pandas(chunk, preserve_index=False)

    def preprocess_function(examples):
        # Fixed-length padding so every example has the same sequence length.
        return tokenizer(
            examples["comment"],
            padding="max_length",
            truncation=True,
            max_length=256,
        )

    # Map directly on the Dataset; wrapping it in a single-split DatasetDict
    # only to unwrap it again added nothing.
    return dataset.map(
        preprocess_function,
        batched=True,
        desc="Tokenizing chunk",
        remove_columns=["comment"],
    )

def fine_tune_phobert(train_path=f"{project_path}/data/train.csv", test_path=f"{project_path}/data/test.csv", chunk_size=1000):
    """Fine-tune PhoBERT as a 3-label classifier on a CSV file, one chunk at a time.

    The training CSV is streamed in chunks of ``chunk_size`` rows. After each
    trained chunk the model and tokenizer are written to ``model_save_path``
    and the cumulative row count is stored with ``save_checkpoint``, so a later
    call skips chunks that were already consumed. When all chunks are done the
    model is evaluated on the test CSV and saved once more.

    Args:
        train_path: CSV with training rows (``comment`` and ``label`` columns are used).
        test_path: CSV with evaluation rows, same columns.
        chunk_size: Number of CSV rows per chunk. Resuming assumes the same
            value as the run that wrote the checkpoint, because chunks are
            skipped by comparing cumulative chunk sizes with the stored count.
    """
    processed_rows = load_checkpoint()
    logger.info(f"Starting from {processed_rows} processed rows.")

    base_model_path = "vinai/phobert-base"

    # Load model (pre-trained if exists, else from base PhoBERT)
    if os.path.exists(model_save_path):
        logger.info(f"Loading pre-trained model from {model_save_path}...")
        try:
            tokenizer = AutoTokenizer.from_pretrained(model_save_path, use_fast=True)
        except (OSError, ValueError, TypeError) as err:
            # Saves written before the tokenizer was stored alongside the model
            # contain no tokenizer files; fine-tuning does not change the
            # vocabulary, so the base tokenizer is a valid substitute.
            logger.warning(f"Could not load tokenizer from {model_save_path} ({err}); using {base_model_path} tokenizer instead.")
            tokenizer = AutoTokenizer.from_pretrained(base_model_path, use_fast=True)
        model = RobertaForSequenceClassification.from_pretrained(model_save_path, num_labels=3)
    else:
        logger.info("No pre-trained model found, starting from vinai/phobert-base...")
        tokenizer = AutoTokenizer.from_pretrained(base_model_path, use_fast=True)
        model = RobertaForSequenceClassification.from_pretrained(base_model_path, num_labels=3, ignore_mismatched_sizes=True)

    # Freeze encoder layers 0-2; all other parameters stay trainable.
    for name, param in model.named_parameters():
        if "roberta.encoder.layer" in name and int(name.split(".")[3]) < 3:
            param.requires_grad = False

    num_workers = min(4, os.cpu_count() or 1)
    batch_size = 16 if torch.cuda.is_available() else 4
    training_args = TrainingArguments(
        output_dir=f"{project_path}/results",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=1,
        weight_decay=0.01,
        logging_steps=10,
        save_strategy="no",
        report_to="none",
        no_cuda=not torch.cuda.is_available(),
        fp16=torch.cuda.is_available(),
        dataloader_num_workers=num_workers,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
    )

    def save_artifacts():
        # The Trainer was built without a tokenizer, so save_model() writes the
        # model only. Store the tokenizer next to it so that model_save_path
        # can be reloaded on its own (resume branch above, inference code).
        trainer.save_model(model_save_path)
        tokenizer.save_pretrained(model_save_path)

    logger.info("Starting training over chunks...")
    total_rows = 0

    # NOTE(review): train() is invoked once per chunk; whether the optimizer and
    # LR schedule restart on every call depends on the installed transformers
    # version — confirm this matches the intended training schedule.
    for chunk_count, chunk in enumerate(pd.read_csv(train_path, chunksize=chunk_size), start=1):
        chunk_start = total_rows
        total_rows += len(chunk)
        if total_rows <= processed_rows:
            logger.info(f"Skipping chunk {chunk_count} - already processed.")
            continue

        logger.info(f"Processing chunk {chunk_count} (rows {chunk_start} to {total_rows})...")
        tokenized_train = process_chunk(chunk, tokenizer)

        if len(tokenized_train) == 0:
            # Every row was dropped during cleaning: nothing to train on, but
            # the rows still count as consumed so a resume does not revisit them.
            logger.warning(f"Chunk {chunk_count} has no usable rows after preprocessing; skipping training for it.")
            save_checkpoint(total_rows)
            continue

        trainer.train_dataset = tokenized_train
        trainer.train()

        save_artifacts()
        save_checkpoint(total_rows)
        logger.info(f"Checkpoint saved after chunk {chunk_count} at {model_save_path}, processed {total_rows} rows.")

    # process_chunk already runs preprocess_dataframe, so the raw frame is passed as is.
    tokenized_test = process_chunk(pd.read_csv(test_path), tokenizer)
    trainer.eval_dataset = tokenized_test
    logger.info("Evaluating model on test set...")
    eval_results = trainer.evaluate()
    logger.info(f"Evaluation results: {eval_results}")

    save_artifacts()
    logger.info(f"Final model saved to {model_save_path}!")

def predict_sentiment(model_path=model_save_path, test_path=f"{project_path}/data/test.csv", num_samples=5):
    """Log predictions of the saved model for a random sample of test comments.

    Args:
        model_path: Location handed to ``PhoBERTModel`` to load the model from.
        test_path: CSV file whose ``comment`` column supplies the texts.
        num_samples: Maximum number of rows to sample. If the file has fewer
            rows, all of them are used.

    Each sampled comment is passed through ``clean_text`` and then
    ``analyzer.predict``, whose three return values (sentiment, confidence,
    scores) are written to the log.
    """
    analyzer = PhoBERTModel(model_path=model_path)
    test_df = pd.read_csv(test_path)

    # DataFrame.sample raises ValueError when asked for more rows than exist
    # (sampling without replacement), so cap the request at the frame size.
    sample_size = min(num_samples, len(test_df))
    sampled = test_df.sample(sample_size, random_state=42)

    for _, row in sampled.iterrows():
        text = clean_text(row["comment"])
        sentiment, confidence, scores = analyzer.predict(text)
        logger.info(f"Text: {text}")
        logger.info(f"Predicted Sentiment: {sentiment}, Confidence: {confidence:.2f}")
        logger.info(f"Scores: {scores}")

def main():
    """Run fine-tuning followed by sample predictions, logging any failure.

    Exceptions are caught so the notebook cell finishes cleanly, and they are
    logged at ERROR level together with the full traceback.
    """
    try:
        fine_tune_phobert()
        predict_sentiment()
    except Exception as e:
        # logger.exception emits the same ERROR message as before and appends
        # the traceback, which str(e) alone discards.
        logger.exception(f"An error occurred: {str(e)}")

# Entry point: inside a notebook cell __name__ is "__main__", so executing the
# cell runs the whole pipeline (fine-tuning, then sample predictions).
if __name__ == "__main__":
    main()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.13M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

✅ NaN in dataset: False
✅ Unique labels: [0 1]
📌 PhoBERT vocab size: 64000


Map:   0%|          | 0/12870 [00:00<?, ? examples/s]

Map:   0%|          | 0/3217 [00:00<?, ? examples/s]

🚀 Starting training...


Step,Training Loss
