In [None]:
!pip install -q -U torch torchvision torchaudio transformers datasets accelerate scikit-learn pandas

import os
import torch
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification, 
    TrainingArguments, Trainer, DataCollatorWithPadding
)
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Experiment identifiers: the dataset folder to read and the checkpoint to fine-tune.
DATASET_NAME = "ViMedNLI"
MODEL_NAME = "vinai/phobert-large"
# Output directory is named after the lower-cased dataset name.
OUTPUT_DIR = "./phobert-large-" + DATASET_NAME.lower()
# Change this path if needed
DATA_ROOT = "/kaggle/input/processed-hal/processed_data_3labels/"

# --- DATA PROCESSING ---
# Class names listed in id order; both lookup tables are derived from this
# single sequence so they cannot drift apart.
_LABEL_NAMES = ("Entailment", "Intrinsic-Hal", "Extrinsic-Hal")
label2id = {name: idx for idx, name in enumerate(_LABEL_NAMES)}
id2label = dict(enumerate(_LABEL_NAMES))

def load_data(split):
    """Load one CSV split of the configured dataset as a Hugging Face ``Dataset``.

    Reads ``<DATA_ROOT>/<DATASET_NAME>/<split>.csv``, drops rows where
    ``sentence1``, ``sentence2`` or ``label`` is missing, and adds an integer
    ``labels`` column by looking each ``label`` value up in ``label2id``.

    Args:
        split: File stem of the split to load, e.g. ``"train"`` or ``"dev"``.

    Returns:
        A ``datasets.Dataset`` with the original CSV columns plus ``labels``.

    Raises:
        ValueError: If a ``label`` value is not a key of ``label2id``.
    """
    path = os.path.join(DATA_ROOT, DATASET_NAME, f"{split}.csv")
    df = pd.read_csv(path)
    # Reset the index so the gaps left by dropped rows are not carried along.
    df = df.dropna(subset=['sentence1', 'sentence2', 'label']).reset_index(drop=True)

    # Map text labels to integer ids
    label_ids = df['label'].map(label2id)
    unknown = label_ids.isna()
    if unknown.any():
        # An unmapped label would become NaN and turn the whole column into
        # floats, so fail loudly here instead of passing bad labels downstream.
        bad_values = sorted(df.loc[unknown, 'label'].astype(str).unique())
        raise ValueError(
            f"{path}: {int(unknown.sum())} row(s) have labels not in label2id: {bad_values}"
        )
    df['labels'] = label_ids.astype('int64')

    # preserve_index=False keeps the pandas index out of the Dataset columns.
    return Dataset.from_pandas(df, preserve_index=False)

# Announce the load, then read both splits (train first, then dev).
print(f"‚è≥ ƒêang load d·ªØ li·ªáu {DATASET_NAME}...")
train_dataset, dev_dataset = (load_data(split_name) for split_name in ("train", "dev"))
print(f"‚úÖ Train: {len(train_dataset)} | Dev: {len(dev_dataset)}")

# --- TOKENIZER ---
# Tokenizer that matches the pretrained checkpoint named in MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
def preprocess(examples):
    """Tokenize a batch of sentence pairs with the module-level ``tokenizer``.

    Args:
        examples: Mapping that provides ``"sentence1"`` and ``"sentence2"``
            entries for the batch.

    Returns:
        The tokenizer output for the pairs, truncated to at most 256 tokens
        and not padded.
    """
    first_sentences = examples["sentence1"]
    second_sentences = examples["sentence2"]
    encode_options = {"truncation": True, "padding": False, "max_length": 256}
    return tokenizer(first_sentences, second_sentences, **encode_options)

# Tokenize both splits in batches.
tokenized_train = train_dataset.map(preprocess, batched=True)
tokenized_dev = dev_dataset.map(preprocess, batched=True)

# --- MODEL & METRICS ---
# Classification head sized from the label map; the model is placed on the GPU
# when one is available and on the CPU otherwise, instead of failing on a
# hard-coded "cuda" device.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=len(label2id), id2label=id2label, label2id=label2id
).to("cuda" if torch.cuda.is_available() else "cpu")

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(scores, labels)``; the predicted class of each row
            is the argmax of ``scores`` along axis 1.

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1_macro"``.
    """
    scores, gold = eval_pred
    predicted = np.asarray(scores).argmax(axis=1)
    metrics = {}
    metrics["accuracy"] = accuracy_score(gold, predicted)
    metrics["f1_macro"] = f1_score(gold, predicted, average='macro')
    return metrics

# --- TRAINING (FIX FOR THE DISK-FULL ERROR) ---
import inspect

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=7,              # Train thoroughly for 7 epochs
    learning_rate=2e-5,
    per_device_train_batch_size=8,   # Small batch for the Large model
    gradient_accumulation_steps=2,   # 8 x 2 = 16 samples per optimizer step per device
    eval_strategy="epoch",
    save_strategy="epoch",
    # IMPORTANT: cap the number of checkpoints kept on disk.
    # NOTE(review): with load_best_model_at_end the best checkpoint is retained,
    # possibly alongside the most recent one - confirm against the docs.
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    # Mixed precision is enabled only when a CUDA device is present, so the
    # script still runs on a CPU-only machine.
    fp16=torch.cuda.is_available(),
    report_to="none"
)

# Newer transformers releases take the tokenizer as `processing_class=` and
# deprecate `tokenizer=`; pass whichever keyword this installation accepts.
_tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model, args=training_args,
    train_dataset=tokenized_train, eval_dataset=tokenized_dev,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

print("üöÄ B·∫Øt ƒë·∫ßu Training...")
trainer.train()

# --- SAVE & EVALUATE ---
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"‚úÖ ƒê√£ l∆∞u model t·∫°i {OUTPUT_DIR}")

# Take gold labels from the same prediction output as the scores so the two
# are aligned row for row.
dev_output = trainer.predict(tokenized_dev)
preds = np.argmax(dev_output.predictions, axis=1)

# Pass the label ids explicitly: names are then tied to ids rather than to dict
# order, and the report still works if a class is absent from the dev split.
report_label_ids = sorted(id2label)
print(classification_report(
    dev_output.label_ids, preds,
    labels=report_label_ids,
    target_names=[id2label[i] for i in report_label_ids],
    digits=4,
))