# Load LLM

In [None]:
!pip install -U transformers



In [None]:
!pip install -U bitsandbytes
!pip install accelerate peft

Collecting bitsandbytes
  Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl (60.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.48.1


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Hugging Face Hub id of the chat model used for every LLM step in this notebook.
model_id = "Qwen/Qwen3-4B-Instruct-2507"


# Step 1: 4-bit quantization settings for QLoRA
# (NF4 weights, double quantization, bfloat16 compute dtype).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="bfloat16"
)

# Step 2: load the tokenizer and the quantized model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto"
)

# Step 3: prepare the quantized model for QLoRA (k-bit) training
model = prepare_model_for_kbit_training(model)

# Step 4: LoRA configuration
# NOTE(review): `lora_config` is reassigned later in the notebook for the RoBERTa
# classifier, so this name does not keep the Qwen settings after that cell runs.
lora_config = LoraConfig(
    r=64,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],  # commonly targeted for LLaMA/Qwen
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

# Wrap the model with LoRA adapters.
# NOTE(review): no training step for this model is visible in the notebook; it is only
# used through generate() below — confirm the adapters are actually needed.
model = get_peft_model(model, lora_config)

# Smoke-test inference, same as before
messages = [
    {"role": "user", "content": "Who are you?"},
]

# Render the conversation with the model's chat template and move the tensors
# to the device the model lives on.
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
# Slice off the prompt tokens so only the newly generated continuation is printed.
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/99.6M [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/238 [00:00<?, ?B/s]

I am Qwen, a large-scale language model developed by Alibaba Cloud's Tongyi Lab. I can assist you with a wide range of tasks, including answering questions, writing stories, creating documents,


In [None]:
# Second smoke test: a single-turn Vietnamese prompt, run through the same
# template -> generate -> decode steps as the English test.
messages = [
    {"role": "user", "content": "Thủ đô của việt nam là gì?"}
]

inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
# Decode only the tokens generated after the prompt. Special tokens are not
# skipped here, so the end-of-turn marker shows up in the printed text.
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))

Thủ đô của Việt Nam là **Hà Nội**.<|im_end|>


In [None]:
def chat(model, tokenizer, messages, max_new_tokens=100):
    """Generate the assistant reply for a chat-format conversation.

    Args:
        model: Causal language model exposing ``.device`` and ``.generate``.
        tokenizer: Tokenizer exposing ``apply_chat_template`` and ``decode``.
        messages: List of ``{"role": ..., "content": ...}`` dicts, oldest first.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The newly generated text only: prompt tokens are sliced off and special
        tokens (e.g. the ``<|im_end|>`` end-of-turn marker) are removed, so
        callers no longer have to strip them by hand.
    """
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # generate() returns prompt + continuation; keep only the continuation.
    prompt_len = inputs["input_ids"].shape[-1]
    return tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

In [None]:
# Multi-turn demo: the history must alternate user / assistant turns.
messages = [{"role": "user", "content": "What is the capital of France?"}]
reply = chat(model, tokenizer, messages)
print(reply)

# Record the assistant's answer before asking the follow-up; otherwise the model sees
# two consecutive user turns and answers the first question again.
# The end-of-turn marker is removed because the chat template adds its own.
messages.append({"role": "assistant", "content": reply.replace("<|im_end|>", "").strip()})
messages.append({"role": "user", "content": "And the capital of Germany?"})
print(chat(model, tokenizer, messages))

The capital of France is Paris.<|im_end|>
The capital of France is Paris.  
The capital of Germany is Berlin.<|im_end|>


# Split original sentences + Extract TERM

In [None]:
import re

def split_sentence_with_terms_llm(sentence, model, tokenizer, max_new_tokens=300):
    """Split a review sentence into clauses and extract each clause's term using only the LLM.

    Args:
        sentence: The original review sentence.
        model: Chat model forwarded to ``chat``.
        tokenizer: Tokenizer forwarded to ``chat``.
        max_new_tokens: Generation budget for the LLM answer.

    Returns:
        A list of dicts, one per non-empty answer line:
        ``[{"clause": ..., "term": ..., "sentence_original": sentence}, ...]``.
        A line without a term (or without the ``| Term:`` separator) inherits the
        most recent non-empty term.
    """
    prompt = (
        "You are an expert linguist working on Aspect-Based Sentiment Analysis (ABSA).\n"
        "Your task is to split the following review sentence into smaller clauses and identify the aspect/term discussed in each clause.\n\n"

        "==================== STRICT RULES ====================\n"
        "1️. DO NOT add, remove, translate, explain, or modify ANY words, symbols, or punctuation in the original sentence.\n"
        "   • Every clause must be a **continuous substring** of the original sentence.\n"
        "   • The output must cover **all parts of the sentence** — no content should be ignored or missing.\n"
        "2️. Only split the sentence where it makes sense semantically — typically around conjunctions ('and', 'but', 'while', 'although', etc.) "
        "or when the opinion changes.\n"
        # The two bullets below used to lack a trailing newline, which glued rule 3
        # onto the end of the previous bullet in the rendered prompt.
        "   •Do NOT split phrases that grammatically or logically belong to the same subject.\n"
        "   • If a descriptive phrase does not have a clear term in the sentence, keep it as a separate clause but leave Term blank.\n"
        "3️. Keep the exact original wording and order in each clause. Do NOT reorder, paraphrase, or summarize.\n"
        "4️. Each clause must express a clear **opinion or evaluative meaning**, either explicit (e.g., 'dirty', 'perfect') or implicit "
        "(e.g., 'gave us many tips' implies helpfulness, 'helped us with departure' implies good service).\n"
        "5️. Do NOT separate adverbs (e.g., 'really', 'very', 'so', 'too', 'quite', 'extremely', 'absolutely', "
        "'rather', 'fairly', 'pretty', 'incredibly', 'particularly', 'deeply', 'highly') from the words they modify.\n"
        "6️. Keep negative or limiting words such as 'nothing', 'none', 'nobody', 'no one', 'nowhere', 'never', "
        "'hardly', 'barely', 'scarcely', 'without', 'no', 'not' **inside the same clause** — they must not be removed or separated.\n"
        "7️. Identify the **TERM** being discussed in each clause.\n"
        "   • TERM: the main aspect or entity being described (e.g., 'staff', 'room', 'hotel').\n"
        "   • If no clear term appears, leave it blank.\n"
        "8️. Avoid creating meaningless or redundant clauses.\n"
        "9️. If multiple terms appear in the same clause, separate them with commas.\n"
        "10️. If a clause refers to the same entity as a previous one but does not repeat it explicitly, "
        "**propagate the term from the previous clause**.\n\n"

        "==================== COVERAGE REQUIREMENT ====================\n"
        " Every part of the original sentence must appear in at least one clause.\n"
        " Do NOT skip, shorten, or drop any meaningful phrase, even if it lacks an explicit sentiment word.\n"
        " Clauses that describe actions, experiences, or behaviors with clear positive/negative implications "
        "must be included (e.g., 'gave us many tips', 'helped us with departure').\n\n"

        "==================== OUTPUT FORMAT ====================\n"
        "Clause: <clause text> | Term: <term1,term2,...>\n\n"

        "==================== EXAMPLES ====================\n"
        "Input: The apartment was fully furnished, great facilities, everything was cleaned and well prepared.\n"
        "Output:\n"
        "Clause: The apartment was fully furnished | Term: apartment\n"
        "Clause: great facilities | Term: facilities\n"
        "Clause: everything was cleaned and well prepared | Term: room,facility\n\n"

        "Input: diny was really helpful, he gave us many tips and helped us with departure.\n"
        "Output:\n"
        "Clause: diny was really helpful | Term: staff\n"
        "Clause: he gave us many tips | Term: staff\n"
        "Clause: helped us with departure | Term: staff\n\n"

        "Input: i can definitely recommend it!.\n"
        "Output:\n"
        "Clause: i can definitely recommend it! | Term: \n\n"

        "==================== RESPONSE INSTRUCTION ====================\n"
        "Respond ONLY with the clauses and terms exactly in the format shown above.\n"
        "Do NOT include any explanation, reasoning, or commentary.\n"
        "Do NOT include quotation marks, markdown, or extra text.\n\n"

        f"Now process this sentence WITHOUT changing any words:\n{sentence}"
    )

    messages = [{"role": "user", "content": prompt}]
    response = chat(model, tokenizer, messages, max_new_tokens=max_new_tokens)
    # Remove the end-of-turn marker up front: otherwise a blank final term parses as
    # the literal "<|im_end|>" and is never replaced by the propagated term.
    response = response.replace("<|im_end|>", "").strip()

    # --- Parse the answer line by line ---
    result = []
    last_term = ""
    for line in response.split("\n"):
        line = line.strip()
        if not line:
            continue
        if "| Term:" in line:
            # rpartition splits on the LAST separator, so a clause that itself contains
            # "| Term:" no longer breaks the two-value unpacking.
            clause_text, _, term = line.rpartition("| Term:")
            clause_text = clause_text.replace("Clause:", "").strip()
            term = term.strip()
            if term == "":
                term = last_term  # propagate the previous term
            else:
                last_term = term
        else:
            # Line did not follow the requested format: keep it verbatim as a clause.
            clause_text = line
            term = last_term
        result.append({"clause": clause_text, "term": term, "sentence_original": sentence})

    return result

## Term Refinement & Normalization

In [None]:
def split_and_term_extraction(sentence):
    """Split ``sentence`` into clauses and keep only terms that literally occur in it.

    Uses the notebook-level ``model`` and ``tokenizer`` for the LLM call.

    Args:
        sentence: The original review sentence.

    Returns:
        The clause dicts from ``split_sentence_with_terms_llm`` with:
        * the ``<|im_end|>`` end-of-turn marker removed from every clause text and
          term (previously only the last clause's term was cleaned, so the marker
          could survive inside the clause text);
        * clauses that consisted of nothing but that marker dropped;
        * ``term`` reduced to the comma-separated terms found in
          ``sentence_original`` (case-insensitive substring match).
    """
    clauses = split_sentence_with_terms_llm(sentence, model=model, tokenizer=tokenizer)

    refined = []
    for c in clauses:
        if "clause" in c:
            c["clause"] = c["clause"].replace("<|im_end|>", "").strip()
            if not c["clause"]:
                continue  # the line held only the end-of-turn marker

        raw_terms = c.get("term", "").replace("<|im_end|>", "").split(",")
        haystack = c["sentence_original"].lower()
        # Keep only terms that appear in the original sentence.
        c["term"] = ",".join(
            t.strip() for t in raw_terms if t.strip() and t.strip().lower() in haystack
        )
        refined.append(c)
    return refined

In [None]:
sentence = "very friendly and helpful host"

In [None]:
clauses=split_and_term_extraction(sentence)

In [None]:
clauses

[{'clause': 'very friendly and helpful host',
  'term': 'host',
  'sentence_original': 'very friendly and helpful host'}]

# Extract OPINION

In [None]:
import re

def extract_opinions_only_from_clauses(clauses, model, tokenizer, max_new_tokens=200):
    """Extract the opinion expressions of each clause with the LLM.

    Args:
        clauses: Iterable of dicts with a ``"clause"`` key and optional ``"term"``
            and ``"sentence_original"`` keys.
        model: Chat model forwarded to ``chat``.
        tokenizer: Tokenizer forwarded to ``chat``.
        max_new_tokens: Generation budget per clause.

    Returns:
        A new list of shallow copies of the input dicts, each with an added
        ``"opinion"`` key: a ``", "``-joined string of the LLM-proposed opinions
        that occur in ``sentence_original`` as whole words/phrases
        (case-insensitive), or ``""`` when none survive.
    """
    final_clauses = []

    for c in clauses:
        clause_text = c["clause"]
        term = c.get("term", "")
        sentence_original = c.get("sentence_original", "")
        prompt = f"""
You are an expert in Aspect-Based Sentiment Analysis (ABSA).

Task: Extract all **opinion expressions** about the aspect/term "{term}" from the following clause.

Strict rules:
1. Only extract opinion words or short opinion phrases that appear **exactly** in the clause.
2. Extract only opinions that clearly describe or evaluate the main term "{term}".
3. Do **NOT** paraphrase, translate, or invent new words.
4. Do **NOT** include explanations, reasoning, or labels.
5. If there is no clear opinion, output an empty string.
6. Output format: comma-separated list — e.g., "very helpful, friendly".

Clause:
"{clause_text}"

Answer:
"""
        messages = [{"role": "user", "content": prompt}]
        opinion_text = chat(model, tokenizer, messages, max_new_tokens=max_new_tokens).strip()

        # Clean the raw answer: drop the end-of-turn marker and flatten newlines.
        opinion_text = (
            opinion_text.replace("<|im_end|>", "")
            .replace("\n", " ")
            .strip()
        )

        # Normalise the comma-separated list. The prompt's format example is quoted, so
        # the model may echo double quotes; peel them off so they cannot block a match.
        opinions = [o.strip().strip('"').strip() for o in opinion_text.split(",")]
        opinions = [o for o in opinions if o]

        # Keep only opinions that literally occur in the original sentence.
        # * re.escape: the opinion is LLM text, not a pattern — characters such as
        #   "(", "?" or "+" previously raised re.error or changed the match.
        # * (?<!\w) / (?!\w): same as \b for opinions that start and end with a word
        #   character, but also works when the opinion ends in punctuation ("good!").
        valid_opinions = [
            o
            for o in opinions
            if re.search(rf"(?<!\w){re.escape(o)}(?!\w)", sentence_original, re.IGNORECASE)
        ]

        new_c = c.copy()
        new_c["opinion"] = ", ".join(valid_opinions) if valid_opinions else ""
        final_clauses.append(new_c)

    return final_clauses



In [None]:
clauses = extract_opinions_only_from_clauses(clauses, model, tokenizer)

In [None]:
clauses

[{'clause': 'very friendly and helpful host',
  'term': 'host',
  'sentence_original': 'very friendly and helpful host',
  'opinion': 'friendly, helpful'}]

# Semantic Verification (YES/NO)


In [None]:
# def verify(clauses, model, tokenizer, max_new_tokens=100):
#     final_clauses = []
#     for c in clauses:
#         clause_text = c.get("clause", "")
#         term = c.get("term", "")
#         opinion_text = c.get("opinion", "").strip()
#         new_c = c.copy()

#         if not opinion_text:
#             new_c["opinion_validated"] = ""
#             final_clauses.append(new_c)
#             continue

#         # --- tách từng opinion ---
#         opinions = [op.strip() for op in opinion_text.split(",") if op.strip()]
#         validated = []

#         for op in opinions:
#             prompt = f"""
# You are an expert in Aspect-Based Sentiment Analysis (ABSA).

# Check whether the opinion describes the term meaningfully in this clause.

# Even short phrases (like "friendly host") count as valid.

# Term: "{term}"
# Opinion: "{op}"
# Clause: "{clause_text}"

# Answer only YES or NO.
# """
#             messages = [{"role": "user", "content": prompt}]
#             result = chat(model, tokenizer, messages, max_new_tokens=20).strip().upper()
#             result = result.replace("<|im_end|>", "").strip()

#             if "YES" in result:
#                 validated.append(op)

#         new_c["opinion_validated"] = ", ".join(validated)
#         final_clauses.append(new_c)

#     return final_clauses



In [None]:
# clauses= verify(clauses, model, tokenizer)


# Fine-tune RoBERTa-base

In [None]:
# ============================================================
# === FINETUNE CATEGORY CLASSIFIER (LoRA) TỪ FILE CSV =======
# ============================================================

!pip install -q transformers datasets peft accelerate bitsandbytes scikit-learn

import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model, TaskType
from datasets import Dataset
from sklearn.metrics import accuracy_score, classification_report

# ====================== CONFIGURATION ======================
# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f" Using device: {device}")

# ====================== LOAD DATA ======================
# NOTE(review): `goal_path` is relative to the working directory while `llm_path`
# is an absolute Colab path — confirm both files are where the kernel expects them.
goal_path = "goal.csv"
llm_path = "/content/sample_qa.csv"

goal_df = pd.read_csv(goal_path)
llm_predict_df = pd.read_csv(llm_path)


# Make sure both frames carry the columns used below.
# NOTE(review): the second message mentions 'llm_predict.csv' although the file
# actually read is `llm_path`.
assert {"clause", "category"}.issubset(goal_df.columns), "goal.csv thiếu cột 'clause' hoặc 'category'"
assert {"clause", "category"}.issubset(llm_predict_df.columns), "llm_predict.csv thiếu cột 'clause' hoặc 'category'"

print(f" goal_df: {len(goal_df)} mẫu | llm_predict_df: {len(llm_predict_df)} mẫu")
print(goal_df.head(2))
print(llm_predict_df.head(2))

# ====================== PREPARE DATA ======================
# goal_df = ground truth
# llm_predict_df = predictions of the previous model (may be right or wrong)
# NOTE(review): the "test" labels are therefore model predictions, not ground truth,
# so the accuracies reported below measure agreement with that model. Also verify the
# two files do not contain the same clauses, which would leak train data into test.
train_df = goal_df.copy()
test_df = llm_predict_df.copy()

# ====================== TOKENIZER + ENCODING ======================
model_name = "roberta-base"
tokenizer_cat = AutoTokenizer.from_pretrained(model_name)

# Label vocabulary is derived from the training (goal) categories only, sorted so the
# id assignment is deterministic. A category that appears only in the test frame is
# absent from `label2id`.
label_list = sorted(goal_df["category"].unique().tolist())
label2id = {l: i for i, l in enumerate(label_list)}
id2label = {i: l for l, i in label2id.items()}

def encode_fn(batch):
    """Tokenize one example's clause and attach its integer class label.

    Despite the parameter name, ``batch`` is a single row here: ``batch["category"]``
    is used directly as a key of ``label2id``.

    Returns:
        The tokenizer output (padded/truncated to 128 tokens) with an extra
        ``"labels"`` entry holding the category id.
    """
    tokenizer_kwargs = dict(truncation=True, padding="max_length", max_length=128)
    encoded = tokenizer_cat(batch["clause"], **tokenizer_kwargs)
    encoded["labels"] = label2id[batch["category"]]
    return encoded

# Convert both frames to Hugging Face datasets and encode them row by row
# (map is called without batched=True, so encode_fn receives one example at a time).
train_ds = Dataset.from_pandas(train_df)
test_ds = Dataset.from_pandas(test_df)
train_ds = train_ds.map(encode_fn)
test_ds = test_ds.map(encode_fn)
# Expose only the model inputs and labels, as torch tensors.
train_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
test_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# ====================== HÀM ĐÁNH GIÁ ======================
def evaluate_model(model, dataset):
    """Run ``model`` over ``dataset`` one example at a time and report its quality.

    Prints a per-class classification report and returns the overall accuracy.
    Uses the notebook-level ``device`` and ``label_list``.

    Args:
        model: Sequence classifier; called with ``input_ids`` / ``attention_mask``
            and expected to return an object with a ``.logits`` tensor.
        dataset: Indexable collection of dicts holding ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.

    Returns:
        Accuracy as returned by ``accuracy_score``.
    """
    model.eval()
    model.to(device)
    preds, trues = [], []
    for i in range(len(dataset)):
        sample = dataset[i]  # fetch (and format) the row once instead of twice
        inputs = {
            k: v.unsqueeze(0).to(device)  # add the batch dimension
            for k, v in sample.items()
            if k in ["input_ids", "attention_mask"]
        }
        with torch.no_grad():
            outputs = model(**inputs)
            pred = torch.argmax(outputs.logits, dim=1).item()
        preds.append(pred)
        trues.append(sample["labels"].item())

    # Pass the full id range explicitly: without `labels`, classification_report infers
    # the classes from the data and raises when that count differs from `target_names`
    # (i.e. whenever some category is neither present nor predicted).
    print(classification_report(
        trues,
        preds,
        labels=list(range(len(label_list))),
        target_names=label_list,
        zero_division=0,
    ))
    return accuracy_score(trues, preds)

# ====================== BASE MODEL (not fine-tuned) ======================
print("\n Đánh giá trước fine-tune ...")

# Baseline: roberta-base with a freshly initialised classification head, so this
# "before" score reflects an untrained classifier rather than a meaningful model.
model_pre = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    ignore_mismatched_sizes=True
).to(device)

acc_before = evaluate_model(model_pre, test_ds)
print(f" Accuracy before finetune: {acc_before:.2f}")

# ====================== LoRA CONFIGURATION ======================
print("\n Fine-tuning với LoRA ...")

# NOTE(review): this reassigns `lora_config`, which earlier held the Qwen LoRA settings.
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["query", "value"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_CLS
)

# A second, independent copy of the base model is loaded for fine-tuning so the
# baseline above stays untouched.
model_cat = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    ignore_mismatched_sizes=True
)
model_cat = get_peft_model(model_cat, lora_config)
model_cat.to(device)
model_cat.print_trainable_parameters()

# ====================== HUẤN LUYỆN ======================
args = TrainingArguments(
    output_dir="./roberta_lora_goal",
    per_device_train_batch_size=8,       # larger batch size keeps gradients more stable
    gradient_accumulation_steps=4,       # accumulate gradients -> effective batch of 8×4=32
    num_train_epochs=80,                 # train longer (raised from 50 to 80)
    learning_rate=1.5e-4,                # slightly lower LR to limit overfitting over more epochs
    warmup_ratio=0.1,                    # warmup stabilises the first steps
    lr_scheduler_type="cosine",          # smoother decay schedule
    weight_decay=0.05,                   # slightly stronger regularisation against overfitting
    save_strategy="no",                  # no intermediate checkpoints
    logging_steps=10,
    fp16=torch.cuda.is_available(),      # mixed precision only when a GPU is present
    # Disable experiment trackers: with the default, Trainer picks up the installed
    # wandb package and blocks on an interactive "Paste an API key" prompt.
    report_to="none",
)

# NOTE(review): `eval_dataset` is supplied but no evaluation strategy is configured,
# so it is not used during training; evaluation happens explicitly afterwards.
trainer = Trainer(
    model=model_cat,
    args=args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    tokenizer=tokenizer_cat,
)

trainer.train()

# ====================== EVALUATION AFTER FINE-TUNING ======================
print("\n Đánh giá sau fine-tune ...")
acc_after = evaluate_model(model_cat, test_ds)
print(f" Accuracy after finetune: {acc_after:.2f}")
# Difference between the two accuracies, shown in percentage points.
print(f" Cải thiện: {(acc_after - acc_before)*100:.1f}% accuracy tăng thêm")



 Using device: cuda
 goal_df: 309 mẫu | llm_predict_df: 309 mẫu
   entity_id   review_id                          clause  \
0    3000584  9672019_R1  it's easy to get to the castle   
1    3000584  9672019_R2         loves the big screen tv   

                                   sentence_original           term opinion  \
0  it's near the subway station. it's easy to get...         castle    easy   
1  the owner is enthusiastic, sweet, fully furnis...  big screen tv   loves   

   category                                           top3_sim  \
0   Amenity  [('Facility', 0.2345069795846939), ('Amenity',...   
1  Facility  [('Facility', 0.14196377992630005), ('Amenity'...   

                                         score_table  polarity  \
0       category     score\n0     Amenity  0.3660...  Positive   
1       category     score\n0     Amenity  0.3927...  Positive   

                                     polarity_scores  
0  {'Positive': 0.9592183232307434, 'Negative': 0...  
1  {'Posi

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/309 [00:00<?, ? examples/s]

Map:   0%|          | 0/309 [00:00<?, ? examples/s]


 Đánh giá trước fine-tune ...


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


              precision    recall  f1-score   support

     Amenity       0.03      1.00      0.05         8
    Branding       0.00      0.00      0.00         9
  Experience       0.00      0.00      0.00         7
    Facility       0.00      0.00      0.00         4
     Loyalty       0.00      0.00      0.00         4
     Service       0.00      0.00      0.00       277

    accuracy                           0.03       309
   macro avg       0.00      0.17      0.01       309
weighted avg       0.00      0.03      0.00       309

 Accuracy before finetune: 0.03

 Fine-tuning với LoRA ...


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 890,118 || all params: 125,540,364 || trainable%: 0.7090


  trainer = Trainer(
  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

# Extract CATEGORY

In [None]:
# ====================== PREDICT CATEGORY FOR CLAUSES ======================
# Adds a "category" key to every dict in the notebook-level `clauses` list, in place.

model_cat.eval()
model_cat.to(device)

for c in clauses:
    text = str(c.get("clause", "")).strip()
    if text == "":
        # Nothing to classify.
        c["category"] = "Unknown"
        continue

    # NOTE: use tokenizer_cat (the RoBERTa tokenizer), not the Qwen `tokenizer`
    inputs = tokenizer_cat(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=128
    ).to(device)

    try:
        with torch.no_grad():
            outputs = model_cat(**inputs)
            pred_id = torch.argmax(outputs.logits, dim=1).item()
            c["category"] = id2label[pred_id]
    except Exception as e:
        # On a GPU (or any other) error, print it for debugging and fall back to Unknown
        print(f" Lỗi khi xử lý clause='{text}': {e}")
        c["category"] = "Unknown"

print(clauses)


# Extract POLARITY

In [None]:
from transformers import pipeline
from tqdm import tqdm

In [None]:
from transformers import pipeline
from tqdm import tqdm

# --- Initialise the sentiment pipeline ---
# NOTE(review): the checkpoint name suggests an aspect-based sentiment model, but only
# the clause text is passed to it below (no aspect) — confirm this is the intended usage.
polarity_classifier = pipeline(
    "text-classification",
    model="yangheng/deberta-v3-base-absa-v1.1",
    top_k=None,   # return the score of every label; the best one is picked afterwards
    truncation=True
)

# --- Hàm phát hiện polarity ---
def detect_polarity(clauses):
    """Label every clause dict with its sentiment polarity, in place.

    For each item the notebook-level ``polarity_classifier`` is run on the stripped
    ``"clause"`` text and the highest-scoring label is stored as ``"polarity"``
    (capitalised) with its score rounded to 4 decimals as ``"polarity_score"``.
    Empty clauses, and clauses for which classification raises, get
    ``"Neutral"`` / ``0.0`` (the error is printed, not re-raised).

    Returns:
        A list containing the same (mutated) dict objects, in input order.
    """
    results = []
    for item in tqdm(clauses, desc="Detecting sentiment polarity (clauses)"):
        clause = str(item.get("clause", "")).strip()

        # Fallback values, used for empty clauses and on any classifier failure.
        label, score = "Neutral", 0.0

        if clause != "":
            try:
                scores = polarity_classifier(clause)
                # Some pipelines return a list nested in a list: unwrap the outer one.
                if isinstance(scores, list) and isinstance(scores[0], list):
                    scores = scores[0]
                best = max(scores, key=lambda x: x["score"])
                label, score = best["label"].capitalize(), round(best["score"], 4)
            except Exception as e:
                print(f" Lỗi khi xử lý câu '{clause}': {e}")
                label, score = "Neutral", 0.0

        item["polarity"] = label
        item["polarity_score"] = score
        results.append(item)
    return results


In [None]:
# --- Call the function ---
# detect_polarity mutates the dicts in `clauses`, so `clauses` itself is updated too.
clauses_with_polarity = detect_polarity(clauses)

# --- Print only the polarity ---
for c in clauses_with_polarity:
    print(f"{c['clause']} → {c['polarity']}")

In [None]:
clauses

# ABSA pipeline

In [None]:
def _refine_clause_terms(clauses):
    """Remove the chat end-of-turn marker and keep only terms found in the sentence.

    Applies the same term filter as ``split_and_term_extraction`` (case-insensitive
    substring match against ``sentence_original``) without relying on any global.
    """
    for c in clauses:
        if "clause" in c:
            c["clause"] = c["clause"].replace("<|im_end|>", "").strip()
        haystack = c["sentence_original"].lower()
        candidates = (t.strip() for t in c.get("term", "").replace("<|im_end|>", "").split(","))
        c["term"] = ",".join(t for t in candidates if t and t.lower() in haystack)
    return clauses


def absa_pipeline(sentences, model_qwen, tokenizer_qwen, model_cat, tokenizer_cat, id2label, device):
    """Run the full ABSA workflow on one or more sentences.

    Steps:
      1. Split each sentence into clauses and extract terms (Qwen).
      2. Extract opinions per clause (Qwen).
      3. Predict a category per clause (fine-tuned RoBERTa).
      4. Predict a polarity per clause (notebook-level ``polarity_classifier``).

    Args:
        sentences: A single string or an iterable of strings.
        model_qwen, tokenizer_qwen: Chat LLM and tokenizer used for steps 1 and 2.
            Step 1 previously went through ``split_and_term_extraction``, which
            silently used the global ``model`` / ``tokenizer`` instead of these.
        model_cat, tokenizer_cat: Category classifier and its tokenizer.
        id2label: Mapping from predicted class id to category name.
        device: Device the classifier and its inputs are moved to.

    Returns:
        A ``pandas.DataFrame`` with one row per clause and (when present) the columns
        sentence_original, clause, term, opinion, category, polarity, polarity_score.
    """
    if isinstance(sentences, str):
        sentences = [sentences]

    all_clauses = []

    # === Step 1: clause splitting and opinion extraction with Qwen ===
    for sentence in tqdm(sentences, desc="Extracting clauses & opinions"):
        clauses = split_sentence_with_terms_llm(sentence, model=model_qwen, tokenizer=tokenizer_qwen)
        clauses = _refine_clause_terms(clauses)
        clauses = extract_opinions_only_from_clauses(clauses, model_qwen, tokenizer_qwen)
        # clauses = verify(clauses, model_qwen, tokenizer_qwen)
        for c in clauses:
            c["sentence_original"] = sentence
        all_clauses.extend(clauses)

    # === Step 2: predict CATEGORY with the fine-tuned model ===
    model_cat.eval()
    model_cat.to(device)

    for c in tqdm(all_clauses, desc="Predicting category"):
        text = str(c.get("clause", "")).strip()
        if text == "":
            c["category"] = "Unknown"
            continue

        inputs = tokenizer_cat(
            text,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=128
        ).to(device)

        with torch.no_grad():
            outputs = model_cat(**inputs)
            pred_id = torch.argmax(outputs.logits, dim=1).item()
            c["category"] = id2label[pred_id]

    # === Step 3: predict POLARITY ===
    for c in tqdm(all_clauses, desc="Predicting polarity"):
        clause_text = str(c.get("clause", "")).strip()
        if clause_text == "":
            c["polarity"] = "Neutral"
            c["polarity_score"] = 0.0
            continue
        try:
            res = polarity_classifier(clause_text)
            # Some pipelines return a list nested in a list: unwrap the outer one.
            if isinstance(res, list) and isinstance(res[0], list):
                res = res[0]
            top = max(res, key=lambda x: x["score"])
            c["polarity"] = top["label"].capitalize()
            c["polarity_score"] = round(top["score"], 4)
        except Exception as e:
            # Best effort: report the failure and fall back to a neutral label.
            print(f" Lỗi khi xử lý '{clause_text}': {e}")
            c["polarity"] = "Neutral"
            c["polarity_score"] = 0.0

    # === Step 4: export as a DataFrame with a stable column order ===
    df = pd.DataFrame(all_clauses)
    cols = ["sentence_original", "clause", "term", "opinion", "category", "polarity", "polarity_score"]
    df = df[[c for c in cols if c in df.columns]]
    return df


In [None]:
import pandas as pd
sample=pd.read_csv('/content/sample.csv')

In [None]:
sample=sample["sentence_original"].unique()[:100]

In [None]:
len(sample)

In [None]:
sample=list(sample)

In [None]:
# Run the whole pipeline on the sampled sentences.
df_results = absa_pipeline(
    sample,
    model, tokenizer,   # used for split/normalize (Qwen)
    model_cat, tokenizer_cat,     # used for category classification
    id2label,
    device
)

print(df_results)

In [None]:
import pandas as pd
from google.colab import files
from io import StringIO

# Write the results straight to disk. "utf-8-sig" adds a BOM so spreadsheet tools
# detect the encoding (the clauses may contain non-ASCII text); the previous
# StringIO round trip produced the same file with extra steps.
output_csv = "df_results.csv"
df_results.to_csv(output_csv, index=False, encoding="utf-8-sig")

# Trigger a browser download of the saved file (Colab only).
files.download(output_csv)