In [None]:
!pip install -q -U torch torchvision torchaudio transformers datasets accelerate scikit-learn pandas

In [None]:
import inspect
import os
import shutil

import numpy as np
import pandas as pd
import torch
from datasets import Dataset, concatenate_datasets
from sklearn.metrics import accuracy_score, f1_score, classification_report
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    TrainingArguments, 
    Trainer,
    DataCollatorWithPadding
)

# Device configuration: use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

In [None]:
MODEL_NAME = "vinai/phobert-large"
OUTPUT_DIR = "./phobert-large-hallu-finetuned"

# Root folder that contains the pre-processed data.
# Expected layout: /kaggle/input/processed-hal/processed_data_3labels/{dataset_name}/train.csv
DATA_ROOT = "/kaggle/input/processed-hal/processed_data_3labels/"

# Mapping from the label names to integer class ids, and its inverse.
# The inverse is derived from label2id so the two tables cannot drift apart.
label2id = dict(zip(("Entailment", "Intrinsic-Hal", "Extrinsic-Hal"), range(3)))
id2label = {class_id: name for name, class_id in label2id.items()}

def load_and_map_data(dataset_name, split):
    """Load one split of one dataset and convert its string labels to integer ids.

    Reads ``<DATA_ROOT>/<dataset_name>/<split>.csv``. The CSV must provide the columns
    ``sentence1``, ``sentence2`` and ``label``; ``label`` values are looked up in the
    module-level ``label2id`` mapping after surrounding whitespace is stripped.

    Rows with a missing sentence/label, or with a label that is not a key of
    ``label2id``, are dropped (a warning with the number of dropped unknown-label
    rows is printed).

    Args:
        dataset_name: Name of the dataset sub-folder under ``DATA_ROOT``.
        split: Split name; used as the CSV file stem (e.g. ``"train"``, ``"dev"``).

    Returns:
        A DataFrame with columns ``sentence1`` (str), ``sentence2`` (str) and
        ``labels`` (int) and a fresh 0..n-1 index, or ``None`` when the CSV file
        does not exist.

    Raises:
        KeyError: If the CSV lacks one of the required columns.
    """
    # Build the path of the CSV file.
    file_path = os.path.join(DATA_ROOT, dataset_name, f"{split}.csv")

    if not os.path.exists(file_path):
        print(f"‚ö†Ô∏è Kh√¥ng t√¨m th·∫•y file: {file_path}")
        return None

    print(f"üîÑ ƒêang ƒë·ªçc {dataset_name} - {split}...")
    df = pd.read_csv(file_path)

    # Fail with an explicit message instead of an opaque KeyError further down.
    required = ['sentence1', 'sentence2', 'label']
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"{file_path} is missing required column(s): {missing}")

    # Drop incomplete rows; .copy() so the column assignments below act on an
    # independent frame.
    df = df.dropna(subset=required).copy()

    # The tokenizer needs plain strings; a column that pandas parsed as numeric
    # would otherwise break tokenization later.
    df['sentence1'] = df['sentence1'].astype(str)
    df['sentence2'] = df['sentence2'].astype(str)

    # The CSV already stores the label names, so they only have to be mapped to ids.
    # Stripping whitespace keeps rows such as "Entailment " from being discarded.
    df['labels'] = df['label'].astype(str).str.strip().map(label2id)

    # Labels that are not in label2id became NaN during the mapping: report and drop them.
    n_unknown = int(df['labels'].isnull().sum())
    if n_unknown:
        print(f"‚ö†Ô∏è C·∫£nh b√°o: C√≥ {n_unknown} d√≤ng nh√£n l·∫°, s·∫Ω b·ªã b·ªè qua.")
        df = df.dropna(subset=['labels'])

    df['labels'] = df['labels'].astype(int)

    # Reset the index so the dropped rows leave no gaps in it.
    return df[['sentence1', 'sentence2', 'labels']].reset_index(drop=True)

# --- Load and merge the three datasets ---
dfs_train = []
dfs_dev = []

# Names of the sub-folders (one per dataset) under DATA_ROOT
datasets_list = ["ViMedNLI", "ViANLI", "ViNLI"]

for ds in datasets_list:
    train_part = load_and_map_data(ds, "train")
    dev_part = load_and_map_data(ds, "dev")
    # load_and_map_data returns None when the CSV file is missing
    if train_part is not None:
        dfs_train.append(train_part)
    if dev_part is not None:
        dfs_dev.append(dev_part)

if not dfs_train:
    raise ValueError("‚ùå L·ªñI: Kh√¥ng load ƒë∆∞·ª£c d·ªØ li·ªáu n√†o! Ki·ªÉm tra l·∫°i ƒë∆∞·ªùng d·∫´n DATA_ROOT.")

# Without this guard pd.concat([]) would fail below with the unhelpful
# "No objects to concatenate".
if not dfs_dev:
    raise ValueError(
        "No dev split could be loaded: expected a dev.csv in at least one dataset "
        "folder under DATA_ROOT."
    )

# Merge the per-dataset DataFrames
full_train_df = pd.concat(dfs_train, ignore_index=True)
full_dev_df = pd.concat(dfs_dev, ignore_index=True)

# Convert to HuggingFace Datasets; preserve_index=False keeps the pandas index
# from being stored as an extra "__index_level_0__" column.
train_dataset = Dataset.from_pandas(full_train_df, preserve_index=False)
dev_dataset = Dataset.from_pandas(full_dev_df, preserve_index=False)

print("-" * 40)
print(f"‚úÖ T·ªïng m·∫´u Train: {len(train_dataset)}")
print(f"‚úÖ T·ªïng m·∫´u Dev: {len(dev_dataset)}")
print(f"‚úÖ Ph√¢n b·ªë nh√£n Train: {full_train_df['labels'].value_counts().to_dict()}")
print(f"   (0: Entailment, 1: Intrinsic-Hal, 2: Extrinsic-Hal)")
print("-" * 40)


In [None]:
# Load the tokenizer published with the MODEL_NAME checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def preprocess_function(examples):
    """Tokenize the ``sentence1``/``sentence2`` entries of ``examples`` as text pairs.

    Inputs are truncated to at most 256 tokens and left unpadded (``padding=False``).
    Returns whatever the module-level ``tokenizer`` returns for the pair.
    """
    first_texts = examples["sentence1"]
    second_texts = examples["sentence2"]
    encode_options = {"truncation": True, "padding": False, "max_length": 256}
    return tokenizer(first_texts, second_texts, **encode_options)

print("‚è≥ ƒêang Tokenize...")
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_dev = dev_dataset.map(preprocess_function, batched=True)


def _drop_raw_columns(dataset):
    """Return ``dataset`` without the raw text columns (and the pandas index column, if any).

    The presence of ``__index_level_0__`` is checked on the dataset being processed,
    because ``remove_columns`` fails when asked to drop a column that is not there.
    """
    cols_to_remove = ['sentence1', 'sentence2']
    if '__index_level_0__' in dataset.column_names:
        cols_to_remove.append('__index_level_0__')
    return dataset.remove_columns(cols_to_remove)


# Remove the columns that are no longer needed after tokenization
tokenized_train = _drop_raw_columns(tokenized_train)
tokenized_dev = _drop_raw_columns(tokenized_dev)

In [None]:
# Load the MODEL_NAME checkpoint with a 3-class sequence-classification head and
# move it to the selected device.
classifier_options = {
    "num_labels": 3,  # 3 classes
    "id2label": id2label,
    "label2id": label2id,
}
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, **classifier_options)
model = model.to(device)

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for an evaluation pass.

    ``eval_pred`` unpacks into ``(predictions, labels)``; the predicted class of each
    row is the argmax of ``predictions`` along axis 1.

    Returns:
        A dict with the keys ``"accuracy"`` and ``"f1_macro"``.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)
    return {
        "accuracy": accuracy_score(gold, predicted),
        "f1_macro": f1_score(gold, predicted, average='macro'),
    }

# Collator that pads each batch with the tokenizer; needed because tokenization above
# was done with padding=False.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=7,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,   # effective train batch size per device: 8 * 2 = 16

    # --- Disk-saving checkpoint configuration ---
    eval_strategy="epoch",           # evaluate after every epoch
    save_strategy="epoch",           # save a checkpoint after every epoch
    # Limit the number of checkpoints kept on disk. With load_best_model_at_end=True the
    # best checkpoint is always retained, so up to two may exist (best + most recent).
    save_total_limit=1,
    load_best_model_at_end=True,     # reload the best checkpoint when training ends
    # --------------------------------------------

    metric_for_best_model="f1_macro",
    greater_is_better=True,          # a higher macro-F1 is better
    weight_decay=0.01,
    report_to="none",
    # Mixed precision is only usable on a GPU; enabling it unconditionally makes
    # TrainingArguments raise when the notebook runs on CPU.
    fp16=torch.cuda.is_available(),
    logging_steps=100
)

# Newer transformers releases take the tokenizer through `processing_class`; the old
# `tokenizer` argument is deprecated and removed in v5. The notebook installs with
# `-U`, so pick whichever keyword the installed Trainer accepts.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_dev,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

# Delete whatever an earlier run left in OUTPUT_DIR to free disk space immediately,
# then make sure an empty output directory exists whether or not one was there before.
# NOTE: this permanently removes previously saved checkpoints in OUTPUT_DIR.
if os.path.exists(OUTPUT_DIR):
    shutil.rmtree(OUTPUT_DIR)
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("üöÄ B·∫Øt ƒë·∫ßu Training PhoBERT-Large (3-Class)...")
trainer.train()

In [None]:
# Save the model and the tokenizer
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"‚úÖ Model ƒë√£ l∆∞u t·∫°i: {OUTPUT_DIR}")

# Detailed evaluation on the dev set
print("\n--- Classification Report (Dev Set) ---")
preds_output = trainer.predict(tokenized_dev)
y_preds = np.argmax(preds_output.predictions, axis=1)
y_true = preds_output.label_ids

target_names = ["Entailment", "Intrinsic-Hal", "Extrinsic-Hal"]
# Pass the class ids explicitly: without `labels`, classification_report infers the
# classes from the data and raises when one of the three classes appears in neither
# y_true nor y_preds (the class count no longer matches target_names).
report = classification_report(
    y_true,
    y_preds,
    labels=list(range(len(target_names))),
    target_names=target_names,
    digits=4,
)
print(report)

# Save the per-example true/predicted class ids to CSV
df_res = pd.DataFrame({"True": y_true, "Pred": y_preds})
df_res.to_csv("phobert_3class_results.csv", index=False)