In [None]:
# Mount Google Drive so the project folder is reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive')
# Make the project folder the working directory.
# NOTE(review): presumably this is also what makes the project-local modules
# (phobert_model, preprocessing) importable in later cells — confirm.
%cd '/content/gdrive/MyDrive/Machine-Learning'

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/MyDrive/Machine-Learning


In [None]:
pip install torch transformers datasets pandas scikit-learn psutil nlpaug

Found existing installation: torch 2.6.0+cu118
Uninstalling torch-2.6.0+cu118:
  Successfully uninstalled torch-2.6.0+cu118
Found existing installation: torchvision 0.21.0+cu118
Uninstalling torchvision-0.21.0+cu118:
  Successfully uninstalled torchvision-0.21.0+cu118
Found existing installation: torchaudio 2.6.0+cu118
Uninstalling torchaudio-2.6.0+cu118:
  Successfully uninstalled torchaudio-2.6.0+cu118
Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting torch
  Using cached https://download.pytorch.org/whl/cu118/torch-2.6.0%2Bcu118-cp311-cp311-linux_x86_64.whl.metadata (27 kB)
Collecting torchvision
  Using cached https://download.pytorch.org/whl/cu118/torchvision-0.21.0%2Bcu118-cp311-cp311-linux_x86_64.whl.metadata (6.1 kB)
Collecting torchaudio
  Using cached https://download.pytorch.org/whl/cu118/torchaudio-2.6.0%2Bcu118-cp311-cp311-linux_x86_64.whl.metadata (6.6 kB)
Using cached https://download.pytorch.org/whl/cu118/torch-2.6.0%2Bcu118-cp311-cp311-linux_x86_64.

In [None]:
import torch
import os
import pandas as pd
from transformers import AutoTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from phobert_model import PhoBERTModel
from preprocessing import clean_text
import logging
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import psutil  # Used to monitor system resources
import signal  # Used to handle stop/interrupt requests

# Enable CUDA debugging
# NOTE(review): these debugging flags presumably slow training down — confirm
# whether they should stay enabled for full runs.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
os.environ['TORCH_USE_CUDA_DSA'] = "1"

# Set up logging (INFO level, timestamped messages) and the module-wide logger.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# Project path on Google Drive
project_path = "/content/gdrive/MyDrive/Machine-Learning"
# Directory the fine-tuned model and tokenizer are saved to / reloaded from.
model_save_path = f"{project_path}/sentiment_phobert"
# Text file holding the number of training rows already processed (used to resume).
checkpoint_file = f"{project_path}/checkpoint.txt"
# Token length every comment is padded/truncated to during tokenization.
MAX_LENGTH = 256
# Early stopping: number of consecutive chunks without a validation-loss
# improvement that are tolerated before the chunk loop stops.
PATIENCE = 3  # For early stopping

# Flag controlling whether training should stop; set to True by the SIGINT handler.
stop_training = False

def signal_handler(sig, frame):
    """Handle a user interrupt (Ctrl+C) by requesting a graceful stop.

    Both arguments are supplied by the ``signal`` module and are not used; the
    handler only raises the module-level ``stop_training`` flag and logs the request.
    """
    global stop_training
    stop_training = True
    logger.info("Received interrupt signal. Stopping training gracefully...")

# Route SIGINT to the graceful-stop handler instead of the default handler.
signal.signal(signal.SIGINT, signal_handler)

def load_checkpoint():
    """Return the number of training rows already processed.

    Reads the integer stored in ``checkpoint_file`` so training can resume after
    the rows that were already consumed.

    Returns:
        int: The stored row count, or 0 when the checkpoint file is missing,
        unreadable, empty, or does not contain an integer. Negative values are
        clamped to 0.
    """
    if not os.path.exists(checkpoint_file):
        return 0
    try:
        with open(checkpoint_file, "r") as f:
            processed_rows = int(f.read().strip())
    except (OSError, ValueError) as exc:
        # An empty or corrupt checkpoint (e.g. left behind by an interrupted write)
        # should not abort the run; start over from the first row instead.
        logger.warning(f"Could not read checkpoint {checkpoint_file}: {exc}. Starting from row 0.")
        return 0
    return max(processed_rows, 0)

def save_checkpoint(processed_rows):
    """Persist the number of processed rows to ``checkpoint_file``.

    The value is first written to a temporary sibling file and then moved over
    the real checkpoint with ``os.replace``, so an interruption in the middle of
    the write cannot leave a truncated or empty checkpoint behind.

    Args:
        processed_rows: Row count to store; written as ``str(processed_rows)``.
    """
    tmp_path = f"{checkpoint_file}.tmp"
    with open(tmp_path, "w") as f:
        f.write(str(processed_rows))
    os.replace(tmp_path, checkpoint_file)

def preprocess_dataframe(df):
    """Clean a raw dataframe and make sure only valid rows remain.

    Steps: drop rows containing any missing value, cast ``label`` to int, drop
    rows whose ``comment`` is blank after stripping, and keep only labels 0, 1, 2
    (logging a warning when other labels are encountered).

    Returns:
        A new dataframe; the input is not modified.

    Raises:
        ValueError: If no rows survive the cleaning.
    """
    allowed = [0, 1, 2]

    cleaned = df.dropna().copy()
    cleaned["label"] = cleaned["label"].astype(int)

    has_text = cleaned["comment"].str.strip().str.len() > 0
    cleaned = cleaned[has_text]

    unexpected = set(cleaned["label"]) - set(allowed)
    if unexpected:
        logger.warning(f"Found invalid labels: {unexpected}. Filtering to valid labels {allowed}.")
        cleaned = cleaned[cleaned["label"].isin(allowed)]

    if cleaned.shape[0] == 0:
        logger.error("Dataframe is empty after preprocessing!")
        raise ValueError("No valid data after preprocessing.")
    return cleaned

def process_chunk(chunk, tokenizer):
    """Clean one dataframe chunk and return it as a tokenized dataset.

    The chunk is passed through ``preprocess_dataframe``, converted to a
    ``Dataset``, tokenized in batches (padded/truncated to ``MAX_LENGTH``), and
    formatted so that ``input_ids``, ``attention_mask`` and ``label`` are exposed
    as torch tensors. The raw ``comment`` column is removed after tokenization.
    """
    cleaned = preprocess_dataframe(chunk)
    logger.info(f"Chunk size after preprocessing: {len(cleaned)} rows")

    tokenizer_options = dict(
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors="pt",
        add_special_tokens=True,
        return_token_type_ids=False,
    )

    def tokenize_batch(batch):
        # Called by Dataset.map with a batch of rows.
        return tokenizer(batch["comment"], **tokenizer_options)

    encoded = Dataset.from_pandas(cleaned).map(
        tokenize_batch,
        batched=True,
        desc="Tokenizing chunk",
        remove_columns=["comment"],
    )
    encoded.set_format("torch", columns=["input_ids", "attention_mask", "label"])
    return encoded

def split_and_prepare_initial_data(train_path, test_path):
    """Create the initial train/validation split and write all splits to disk.

    Both CSVs are cleaned with ``preprocess_dataframe``. The training data is then
    split 80/20 (stratified on ``label``, ``random_state=42``) and the three
    resulting frames are written under ``{project_path}/data``.

    Returns:
        tuple: ``(train_split_path, val_split_path, test_split_path)``.
    """
    full_train = preprocess_dataframe(pd.read_csv(train_path))
    held_out_test = preprocess_dataframe(pd.read_csv(test_path))

    train_part, val_part = train_test_split(
        full_train,
        test_size=0.2,
        stratify=full_train["label"],
        random_state=42,
    )
    logger.info(f"Train size: {len(train_part)}, Validation size: {len(val_part)}, Test size: {len(held_out_test)}")

    # Destination path -> frame, in the order the paths are returned.
    outputs = {
        f"{project_path}/data/train_split.csv": train_part,
        f"{project_path}/data/val_split.csv": val_part,
        f"{project_path}/data/test_split.csv": held_out_test,
    }
    for destination, frame in outputs.items():
        frame.to_csv(destination, index=False)

    return tuple(outputs)

def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the predicted class is the argmax over the last axis).

    Returns:
        dict: ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # zero_division=0 yields the same scores as the default but without the
    # undefined-metric warning emitted whenever some class is never predicted
    # (common on small chunks early in training).
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

def load_model_and_tokenizer(model_save_path, base_model="vinai/phobert-base"):
    """Load the model and tokenizer from a saved directory, or start from the base model.

    A saved model is used only when ``model_save_path`` contains both
    ``tokenizer_config.json`` and ``config.json`` as regular files; otherwise the
    ``base_model`` checkpoint is loaded with a 3-class classification head.

    Args:
        model_save_path: Directory a previous run may have saved the model to.
        base_model: Model identifier to fall back to.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    required_files = ("tokenizer_config.json", "config.json")
    # isfile() is False for a missing directory and for a path that is not a
    # directory at all, so no separate existence check (or listdir) is needed.
    has_saved_model = all(
        os.path.isfile(os.path.join(model_save_path, name)) for name in required_files
    )
    if has_saved_model:
        logger.info(f"Loading pre-trained model from {model_save_path}...")
        tokenizer = AutoTokenizer.from_pretrained(model_save_path, use_fast=True)
        model = RobertaForSequenceClassification.from_pretrained(model_save_path, num_labels=3)
    else:
        # Report the checkpoint actually used rather than a hard-coded name.
        logger.info(f"No valid pre-trained model found, starting from {base_model}...")
        tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=True)
        model = RobertaForSequenceClassification.from_pretrained(base_model, num_labels=3, ignore_mismatched_sizes=True)
    return model, tokenizer

def log_resources():
    """Log current CPU utilisation and RAM usage of the host.

    CPU usage is measured by ``psutil.cpu_percent(interval=1)``; memory figures
    come from ``psutil.virtual_memory()`` and are reported in GB (bytes / 1024**3).
    """
    bytes_per_gb = 1024**3
    mem = psutil.virtual_memory()
    cpu = psutil.cpu_percent(interval=1)
    used_gb = mem.used / bytes_per_gb
    total_gb = mem.total / bytes_per_gb
    logger.info(f"System Resources - CPU Usage: {cpu}%, Memory Used: {mem.percent}% "
                f"({used_gb:.2f} GB / {total_gb:.2f} GB)")

def train_chunk(trainer, chunk, tokenizer, chunk_count, total_rows, model_save_path, manual_mode=False):
    """Train on a single chunk, save progress, and return the validation loss.

    Args:
        trainer: Trainer whose ``train_dataset`` is replaced by the tokenized chunk.
        chunk: Raw dataframe chunk to train on.
        tokenizer: Tokenizer used for the chunk and saved next to the model.
        chunk_count: 1-based chunk number, used in log messages and the prompt.
        total_rows: Number of rows read so far including this chunk; stored as
            the checkpoint once the chunk has been trained.
        model_save_path: Directory the model and tokenizer are saved to.
        manual_mode: When True, ask for confirmation before training the chunk.

    Returns:
        The ``eval_loss`` from validation after this chunk, or ``None`` when the
        chunk was declined by the user, failed with an error, or a stop was
        requested via ``stop_training``.
    """
    val_loss = None
    try:
        approved = True
        if manual_mode:
            # Ask before tokenizing so a declined chunk costs no preprocessing work.
            user_input = input(f"Train chunk {chunk_count} (rows {total_rows - len(chunk)} to {total_rows})? (y/n): ")
            approved = user_input.lower() == 'y'

        if not approved:
            logger.info(f"Skipping chunk {chunk_count} as per user request.")
        else:
            trainer.train_dataset = process_chunk(chunk, tokenizer)
            trainer.train()
            trainer.save_model(model_save_path)
            tokenizer.save_pretrained(model_save_path)
            save_checkpoint(total_rows)
            logger.info(f"Checkpoint saved after chunk {chunk_count} at {model_save_path}, processed {total_rows} rows.")

            val_results = trainer.evaluate()
            logger.info(f"Validation results after chunk {chunk_count}: {val_results}")
            log_resources()  # Log resource usage after every chunk
            val_loss = val_results['eval_loss']
    except Exception as e:
        # Best effort: a failing chunk is logged (with traceback) and skipped.
        logger.error(f"Error in chunk {chunk_count}: {str(e)}. Skipping to next chunk.", exc_info=True)
        val_loss = None

    # Checked after the try block rather than via `return` inside `finally`,
    # which would silently swallow any exception still propagating.
    if stop_training:
        logger.info("Training stopped by user.")
        return None
    return val_loss

def fine_tune_phobert(train_path=f"{project_path}/data/train_data.csv",
                     test_path=f"{project_path}/data/test.csv",
                     chunk_size=2000,
                     manual_mode=False,
                     reset_session=False):
    """Fine-tune the PhoBERT classifier chunk by chunk with resume support.

    The training split is streamed in chunks of ``chunk_size`` rows. After each
    chunk the model, tokenizer and processed-row checkpoint are saved and the
    validation loss is used for early stopping across chunks (``PATIENCE``).
    Unless a stop was requested, the model is finally evaluated on the test
    split and saved once more.

    Args:
        train_path: CSV with the raw training data (used to build the splits).
        test_path: CSV with the raw test data.
        chunk_size: Number of rows read from the training split per chunk.
        manual_mode: When True, ask for confirmation before each chunk.
        reset_session: When True, ignore the stored checkpoint (start at row 0)
            and rebuild the train/validation/test split files.
    """
    global stop_training
    # Clear a stop request left over from an earlier, interrupted run in the same
    # kernel; otherwise this call would skip all training and the final evaluation.
    stop_training = False

    processed_rows = 0 if reset_session else load_checkpoint()
    logger.info(f"Starting from {processed_rows} processed rows (reset_session={reset_session}).")

    logger.info(f"CUDA available: {torch.cuda.is_available()}")
    if torch.cuda.is_available():
        logger.info(f"Device: {torch.cuda.current_device()} - {torch.cuda.get_device_name(0)}")

    model, tokenizer = load_model_and_tokenizer(model_save_path)

    train_split_path = f"{project_path}/data/train_split.csv"
    val_split_path = f"{project_path}/data/val_split.csv"
    test_split_path = f"{project_path}/data/test_split.csv"

    # Build the split files on first use, or rebuild them when a reset is requested.
    if not os.path.exists(train_split_path) or reset_session:
        train_split_path, val_split_path, test_split_path = split_and_prepare_initial_data(train_path, test_path)

    val_dataset = process_chunk(pd.read_csv(val_split_path), tokenizer)
    test_dataset = process_chunk(pd.read_csv(test_split_path), tokenizer)

    # Freeze encoder layer 0: parameter names look like
    # "roberta.encoder.layer.<index>....", so element 3 of the split is the index.
    for name, param in model.named_parameters():
        if "roberta.encoder.layer" in name and int(name.split(".")[3]) < 1:
            param.requires_grad = False

    num_workers = min(4, os.cpu_count() or 1)
    batch_size = 16 if torch.cuda.is_available() else 4
    training_args = TrainingArguments(
        output_dir=f"{project_path}/results",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=1,
        weight_decay=0.01,
        learning_rate=2e-5,
        logging_steps=10,
        save_strategy="no",
        evaluation_strategy="steps",
        eval_steps=50,
        report_to="none",
        no_cuda=not torch.cuda.is_available(),
        fp16=torch.cuda.is_available(),
        dataloader_num_workers=num_workers,
    )

    # The train dataset is assigned per chunk by train_chunk.
    trainer = Trainer(
        model=model,
        args=training_args,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )

    logger.info("Starting training over chunks...")
    chunk_count = 0
    total_rows = 0
    best_val_loss = float('inf')
    wait = 0  # consecutive chunks without a validation-loss improvement

    for chunk in pd.read_csv(train_split_path, chunksize=chunk_size):
        total_rows += len(chunk)
        # Resume support: chunks fully covered by the checkpoint are skipped.
        if total_rows <= processed_rows:
            logger.info(f"Skipping chunk {chunk_count + 1} - already processed.")
            chunk_count += 1
            continue

        if stop_training:
            break

        chunk_count += 1
        logger.info(f"Processing chunk {chunk_count} (rows {total_rows - len(chunk)} to {total_rows})...")
        val_loss = train_chunk(trainer, chunk, tokenizer, chunk_count, total_rows, model_save_path, manual_mode)
        # None means the chunk was skipped, failed, or stopped; it does not count
        # towards early stopping.
        if val_loss is not None:
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                wait = 0
            else:
                wait += 1
                if wait >= PATIENCE:
                    logger.info("Early stopping triggered.")
                    break

    # Final test evaluation and save are skipped when the user asked to stop.
    if not stop_training:
        trainer.eval_dataset = test_dataset
        logger.info("Evaluating model on test set...")
        eval_results = trainer.evaluate()
        logger.info(f"Test set evaluation results: {eval_results}")

        trainer.save_model(model_save_path)
        tokenizer.save_pretrained(model_save_path)
        logger.info(f"Final model and tokenizer saved to {model_save_path}!")

def predict_sentiment(model_path=model_save_path, test_path=f"{project_path}/data/test.csv", num_samples=5):
    """Log sentiment predictions for a random sample of the test data.

    Args:
        model_path: Location of the fine-tuned model handed to ``PhoBERTModel``.
        test_path: CSV providing ``comment`` and ``label`` columns.
        num_samples: Maximum number of rows to sample (``random_state=42``). If
            the file has fewer rows, all of them are used.
    """
    analyzer = PhoBERTModel(model_path=model_path)
    test_df = pd.read_csv(test_path)
    # DataFrame.sample raises ValueError when asked for more rows than exist
    # (sampling without replacement), so clamp the request to the file size.
    sample_size = min(num_samples, len(test_df))
    df = test_df.sample(sample_size, random_state=42)

    for _, row in df.iterrows():
        text = clean_text(row["comment"])
        sentiment, confidence, scores = analyzer.predict(text)
        logger.info(f"Text: {text}")
        logger.info(f"True Label: {row['label']}")
        logger.info(f"Predicted Sentiment: {sentiment}, Confidence: {confidence:.2f}")
        logger.info(f"Scores: {scores}")

def main():
    """Run fine-tuning followed by sample predictions.

    Any exception is logged and then re-raised to the caller.
    """
    # Set manual_mode to True to confirm each chunk by hand; False runs unattended.
    training_options = dict(chunk_size=2000, manual_mode=False, reset_session=False)
    try:
        fine_tune_phobert(**training_options)
        predict_sentiment()
    except Exception as err:
        logger.error(f"An error occurred: {str(err)}")
        raise

# Entry point: start the full train-then-predict pipeline when this code is
# executed as the main module.
if __name__ == "__main__":
    main()

Tokenizing chunk:   0%|          | 0/1020 [00:00<?, ? examples/s]

Tokenizing chunk:   0%|          | 0/3217 [00:00<?, ? examples/s]



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Import smoke test: checks that every dependency of the training code can be
# imported, including the project-local modules `phobert_model` and `preprocessing`.
# NOTE(review): presumably those two modules live in the project folder on Drive,
# so this cell needs that folder as the working directory / on sys.path — confirm.
import torch
import os
import pandas as pd
from transformers import AutoTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
from phobert_model import PhoBERTModel
from preprocessing import clean_text
import logging
from sklearn.model_selection import train_test_split

# Reached only if none of the imports above raised.
print("All imports successful")

ModuleNotFoundError: No module named 'phobert_model'