In [2]:
!pip install --upgrade transformers huggingface-hub

Collecting huggingface-hub
  Using cached huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Using cached huggingface_hub-0.36.0-py3-none-any.whl (566 kB)
Installing collected packages: huggingface-hub
  Attempting uninstall: huggingface-hub
    Found existing installation: huggingface_hub 1.2.3
    Uninstalling huggingface_hub-1.2.3:
      Successfully uninstalled huggingface_hub-1.2.3
Successfully installed huggingface-hub-0.36.0


In [2]:
!pip install --force datasets==3.6.0

Collecting datasets==3.6.0
  Using cached datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting filelock (from datasets==3.6.0)
  Using cached filelock-3.20.1-py3-none-any.whl.metadata (2.1 kB)
Collecting numpy>=1.17 (from datasets==3.6.0)
  Using cached numpy-2.4.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (6.6 kB)
Collecting pyarrow>=15.0.0 (from datasets==3.6.0)
  Using cached pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets==3.6.0)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting pandas (from datasets==3.6.0)
  Using cached pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
Collecting requests>=2.32.2 (from datasets==3.6.0)
  Using cached requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting tqdm>=4.66.3 (from datasets==3.6.0)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting xxhash (from datas

In [2]:
from datasets import load_dataset
print("Đang tải dataset UIT-VSFC từ Hugging Face...")
dataset = load_dataset("uitnlp/vietnamese_students_feedback",trust_remote_code=True)

print("Cấu trúc dataset:", dataset)


Đang tải dataset UIT-VSFC từ Hugging Face...
Cấu trúc dataset: DatasetDict({
    train: Dataset({
        features: ['sentence', 'sentiment', 'topic'],
        num_rows: 11426
    })
    validation: Dataset({
        features: ['sentence', 'sentiment', 'topic'],
        num_rows: 1583
    })
    test: Dataset({
        features: ['sentence', 'sentiment', 'topic'],
        num_rows: 3166
    })
})


In [3]:
import torch
import numpy as np
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# ======================================================
# 1. LOAD MODEL SimCSE (VoVanPhuc)
# ======================================================
MODEL_NAME = "VoVanPhuc/sup-SimCSE-VietNamese-phobert-base"
device = 'cuda' if torch.cuda.is_available() else 'cpu'

print(f"--> Đang load model: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(device)
model.eval()

# ======================================================
# 2. HÀM EMBEDDING
# ======================================================
def generate_embeddings(data_split, model, tokenizer, batch_size=32):
    texts = data_split['sentence']
    labels = data_split['sentiment']

    all_embeddings = []

    print(f"Đang embedding {len(texts)} câu trên {device}...")

    for i in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[i : i + batch_size]

        inputs = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=256,
            return_tensors="pt"
        ).to(device)

        with torch.no_grad():
            outputs = model(**inputs)
            cls_embeddings = outputs.last_hidden_state[:, 0, :]
            all_embeddings.append(cls_embeddings.cpu().numpy())

    return np.vstack(all_embeddings), np.array(labels)

# ======================================================
# 3. CHẠY QUY TRÌNH
# ======================================================
print("\n--- Xử lý tập TRAIN ---")
X_train, y_train = generate_embeddings(dataset['train'], model, tokenizer)

print("\n--- Xử lý tập TEST ---")
X_test, y_test = generate_embeddings(dataset['test'], model, tokenizer)

print(f"\nKích thước X_train: {X_train.shape}")
print(f"\nKích thước X_test: {X_test.shape}")

print("\nĐang training Classifier...")
clf = LogisticRegression(max_iter=3000, solver='lbfgs')
clf.fit(X_train, y_train)

print("\nKẾT QUẢ ĐÁNH GIÁ TRÊN TẬP TEST:")
y_pred = clf.predict(X_test)
target_names = ['Tiêu cực (0)', 'Trung tính (1)', 'Tích cực (2)']
print(classification_report(y_test, y_pred, target_names=target_names, digits=4))

--> Đang load model: VoVanPhuc/sup-SimCSE-VietNamese-phobert-base

--- Xử lý tập TRAIN ---
Đang embedding 11426 câu trên cuda...


100%|██████████| 358/358 [00:27<00:00, 13.23it/s]



--- Xử lý tập TEST ---
Đang embedding 3166 câu trên cuda...


100%|██████████| 99/99 [00:07<00:00, 12.83it/s]



Kích thước X_train: (11426, 768)

Kích thước X_test: (3166, 768)

Đang training Classifier...

KẾT QUẢ ĐÁNH GIÁ TRÊN TẬP TEST:
                precision    recall  f1-score   support

  Tiêu cực (0)     0.9079    0.9446    0.9259      1409
Trung tính (1)     0.5800    0.3473    0.4345       167
  Tích cực (2)     0.9256    0.9314    0.9285      1590

      accuracy                         0.9065      3166
     macro avg     0.8045    0.7411    0.7630      3166
  weighted avg     0.8995    0.9065    0.9013      3166



In [4]:
import torch
import numpy as np
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# ======================================================
# 1. LOAD MODEL SimCSE (VoVanPhuc)
# ======================================================
MODEL_NAME = "dangvantuan/vietnamese-embedding"
device = 'cuda' if torch.cuda.is_available() else 'cpu'

print(f"--> Đang load model: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(device)
model.eval()

# ======================================================
# 2. HÀM EMBEDDING
# ======================================================
def generate_embeddings(data_split, model, tokenizer, batch_size=32):
    texts = data_split['sentence']
    labels = data_split['sentiment']

    all_embeddings = []

    print(f"Đang embedding {len(texts)} câu trên {device}...")

    for i in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[i : i + batch_size]

        inputs = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=256,
            return_tensors="pt"
        ).to(device)

        with torch.no_grad():
            outputs = model(**inputs)
            cls_embeddings = outputs.last_hidden_state[:, 0, :]
            all_embeddings.append(cls_embeddings.cpu().numpy())

    return np.vstack(all_embeddings), np.array(labels)

# ======================================================
# 3. CHẠY QUY TRÌNH
# ======================================================
print("\n--- Xử lý tập TRAIN ---")
X_train, y_train = generate_embeddings(dataset['train'], model, tokenizer)

print("\n--- Xử lý tập TEST ---")
X_test, y_test = generate_embeddings(dataset['test'], model, tokenizer)

print(f"\nKích thước X_train: {X_train.shape}")
print(f"\nKích thước X_test: {X_test.shape}")

print("\nĐang training Classifier...")
clf = LogisticRegression(max_iter=3000, solver='lbfgs')
clf.fit(X_train, y_train)

print("\nKẾT QUẢ ĐÁNH GIÁ TRÊN TẬP TEST:")
y_pred = clf.predict(X_test)
target_names = ['Tiêu cực (0)', 'Trung tính (1)', 'Tích cực (2)']
print(classification_report(y_test, y_pred, target_names=target_names, digits=4))

--> Đang load model: dangvantuan/vietnamese-embedding


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

bpe.codes: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/22.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/965 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/753 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/540M [00:00<?, ?B/s]


--- Xử lý tập TRAIN ---
Đang embedding 11426 câu trên cuda...


100%|██████████| 358/358 [00:30<00:00, 11.76it/s]



--- Xử lý tập TEST ---
Đang embedding 3166 câu trên cuda...


100%|██████████| 99/99 [00:08<00:00, 12.29it/s]



Kích thước X_train: (11426, 768)

Kích thước X_test: (3166, 768)

Đang training Classifier...

KẾT QUẢ ĐÁNH GIÁ TRÊN TẬP TEST:
                precision    recall  f1-score   support

  Tiêu cực (0)     0.9065    0.9354    0.9207      1409
Trung tính (1)     0.5521    0.3174    0.4030       167
  Tích cực (2)     0.9171    0.9321    0.9245      1590

      accuracy                         0.9011      3166
     macro avg     0.7919    0.7283    0.7494      3166
  weighted avg     0.8931    0.9011    0.8953      3166



In [5]:
import torch
import numpy as np
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# ======================================================
# 1. LOAD MODEL SimCSE (VoVanPhuc)
# ======================================================
MODEL_NAME = "vinai/phobert-base-v2"
device = 'cuda' if torch.cuda.is_available() else 'cpu'

print(f"--> Đang load model: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(device)
model.eval()

# ======================================================
# 2. HÀM EMBEDDING
# ======================================================
def generate_embeddings(data_split, model, tokenizer, batch_size=32):
    texts = data_split['sentence']
    labels = data_split['sentiment']

    all_embeddings = []

    print(f"Đang embedding {len(texts)} câu trên {device}...")

    for i in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[i : i + batch_size]

        inputs = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=256,
            return_tensors="pt"
        ).to(device)

        with torch.no_grad():
            outputs = model(**inputs)
            cls_embeddings = outputs.last_hidden_state[:, 0, :]
            all_embeddings.append(cls_embeddings.cpu().numpy())

    return np.vstack(all_embeddings), np.array(labels)

# ======================================================
# 3. CHẠY QUY TRÌNH
# ======================================================
print("\n--- Xử lý tập TRAIN ---")
X_train, y_train = generate_embeddings(dataset['train'], model, tokenizer)

print("\n--- Xử lý tập TEST ---")
X_test, y_test = generate_embeddings(dataset['test'], model, tokenizer)

print(f"\nKích thước X_train: {X_train.shape}")
print(f"\nKích thước X_test: {X_test.shape}")

print("\nĐang training Classifier...")
clf = LogisticRegression(max_iter=3000, solver='lbfgs')
clf.fit(X_train, y_train)

print("\nKẾT QUẢ ĐÁNH GIÁ TRÊN TẬP TEST:")
y_pred = clf.predict(X_test)
target_names = ['Tiêu cực (0)', 'Trung tính (1)', 'Tích cực (2)']
print(classification_report(y_test, y_pred, target_names=target_names, digits=4))

--> Đang load model: vinai/phobert-base-v2


config.json:   0%|          | 0.00/678 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

bpe.codes: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/540M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/540M [00:00<?, ?B/s]


--- Xử lý tập TRAIN ---
Đang embedding 11426 câu trên cuda...



  0%|          | 0/358 [00:00<?, ?it/s][A
  0%|          | 1/358 [00:00<00:41,  8.62it/s][A
  1%|          | 2/358 [00:00<00:38,  9.27it/s][A
  1%|          | 4/358 [00:00<00:32, 11.05it/s][A
  2%|▏         | 6/358 [00:00<00:28, 12.32it/s][A
  2%|▏         | 8/358 [00:00<00:27, 12.68it/s][A
  3%|▎         | 10/358 [00:00<00:29, 11.73it/s][A
  3%|▎         | 12/358 [00:01<00:28, 12.33it/s][A
  4%|▍         | 14/358 [00:01<00:28, 11.94it/s][A
  4%|▍         | 16/358 [00:01<00:32, 10.55it/s][A
  5%|▌         | 18/358 [00:01<00:31, 10.92it/s][A
  6%|▌         | 20/358 [00:01<00:29, 11.29it/s][A
  6%|▌         | 22/358 [00:01<00:31, 10.64it/s][A
  7%|▋         | 24/358 [00:02<00:29, 11.47it/s][A
  7%|▋         | 26/358 [00:02<00:28, 11.66it/s][A
  8%|▊         | 28/358 [00:02<00:28, 11.52it/s][A
  8%|▊         | 30/358 [00:02<00:28, 11.68it/s][A
  9%|▉         | 32/358 [00:02<00:32, 10.04it/s][A
  9%|▉         | 34/358 [00:03<00:32,  9.97it/s][A
 10%|█         | 36/358 [


--- Xử lý tập TEST ---
Đang embedding 3166 câu trên cuda...


100%|██████████| 99/99 [00:08<00:00, 11.86it/s]



Kích thước X_train: (11426, 768)

Kích thước X_test: (3166, 768)

Đang training Classifier...

KẾT QUẢ ĐÁNH GIÁ TRÊN TẬP TEST:
                precision    recall  f1-score   support

  Tiêu cực (0)     0.9015    0.9290    0.9151      1409
Trung tính (1)     0.5300    0.3174    0.3970       167
  Tích cực (2)     0.9176    0.9314    0.9245      1590

      accuracy                         0.8980      3166
     macro avg     0.7830    0.7259    0.7455      3166
  weighted avg     0.8900    0.8980    0.8925      3166



In [6]:
import torch
import numpy as np
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# ======================================================
# 1. LOAD MODEL SimCSE (VoVanPhuc)
# ======================================================
MODEL_NAME = "vinai/phobert-large"
device = 'cuda' if torch.cuda.is_available() else 'cpu'

print(f"--> Đang load model: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(device)
model.eval()

# ======================================================
# 2. HÀM EMBEDDING
# ======================================================
def generate_embeddings(data_split, model, tokenizer, batch_size=32):
    texts = data_split['sentence']
    labels = data_split['sentiment']

    all_embeddings = []

    print(f"Đang embedding {len(texts)} câu trên {device}...")

    for i in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[i : i + batch_size]

        inputs = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=256,
            return_tensors="pt"
        ).to(device)

        with torch.no_grad():
            outputs = model(**inputs)
            cls_embeddings = outputs.last_hidden_state[:, 0, :]
            all_embeddings.append(cls_embeddings.cpu().numpy())

    return np.vstack(all_embeddings), np.array(labels)

# ======================================================
# 3. CHẠY QUY TRÌNH
# ======================================================
print("\n--- Xử lý tập TRAIN ---")
X_train, y_train = generate_embeddings(dataset['train'], model, tokenizer)

print("\n--- Xử lý tập TEST ---")
X_test, y_test = generate_embeddings(dataset['test'], model, tokenizer)

print(f"\nKích thước X_train: {X_train.shape}")
print(f"\nKích thước X_test: {X_test.shape}")

print("\nĐang training Classifier...")
clf = LogisticRegression(max_iter=3000, solver='lbfgs')
clf.fit(X_train, y_train)

print("\nKẾT QUẢ ĐÁNH GIÁ TRÊN TẬP TEST:")
y_pred = clf.predict(X_test)
target_names = ['Tiêu cực (0)', 'Trung tính (1)', 'Tích cực (2)']
print(classification_report(y_test, y_pred, target_names=target_names, digits=4))

--> Đang load model: vinai/phobert-large


config.json:   0%|          | 0.00/558 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

bpe.codes: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.48G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.48G [00:00<?, ?B/s]


--- Xử lý tập TRAIN ---
Đang embedding 11426 câu trên cuda...



  0%|          | 0/358 [00:00<?, ?it/s][A
  0%|          | 1/358 [00:00<02:06,  2.82it/s][A
  1%|          | 2/358 [00:00<01:34,  3.77it/s][A
  1%|          | 3/358 [00:00<01:26,  4.08it/s][A
  1%|          | 4/358 [00:00<01:19,  4.45it/s][A
  1%|▏         | 5/358 [00:01<01:17,  4.54it/s][A
  2%|▏         | 6/358 [00:01<01:23,  4.22it/s][A
  2%|▏         | 7/358 [00:01<01:23,  4.21it/s][A
  2%|▏         | 8/358 [00:01<01:26,  4.06it/s][A
  3%|▎         | 9/358 [00:02<01:35,  3.67it/s][A
  3%|▎         | 10/358 [00:02<01:41,  3.41it/s][A
  3%|▎         | 11/358 [00:02<01:28,  3.91it/s][A
  3%|▎         | 12/358 [00:03<01:34,  3.65it/s][A
  4%|▎         | 13/358 [00:03<01:32,  3.72it/s][A
  4%|▍         | 14/358 [00:03<01:31,  3.74it/s][A
  4%|▍         | 15/358 [00:03<01:30,  3.80it/s][A
  4%|▍         | 16/358 [00:04<01:52,  3.04it/s][A
  5%|▍         | 17/358 [00:04<01:42,  3.32it/s][A
  5%|▌         | 18/358 [00:04<01:44,  3.25it/s][A
  5%|▌         | 19/358 [00:0


--- Xử lý tập TEST ---
Đang embedding 3166 câu trên cuda...


100%|██████████| 99/99 [00:27<00:00,  3.61it/s]



Kích thước X_train: (11426, 1024)

Kích thước X_test: (3166, 1024)

Đang training Classifier...

KẾT QUẢ ĐÁNH GIÁ TRÊN TẬP TEST:
                precision    recall  f1-score   support

  Tiêu cực (0)     0.8303    0.9099    0.8683      1409
Trung tính (1)     0.5833    0.0419    0.0782       167
  Tích cực (2)     0.8770    0.8881    0.8825      1590

      accuracy                         0.8531      3166
     macro avg     0.7636    0.6133    0.6097      3166
  weighted avg     0.8407    0.8531    0.8337      3166



In [7]:
!pip install -q angle-emb mlflow

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m69.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m82.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m66.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.2/194.2 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m512.3/512.3 kB[0m [31m34.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m147.8/147.8 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [8]:
import mlflow
from angle_emb import AnglE
import torch
import os

remote_server_uri = "https://dagshub.com/phamnguyentuong205/aoe-tpp.mlflow"
mlflow.set_tracking_uri(remote_server_uri)

print(f" Đang kết nối tới server: {remote_server_uri}")

RUN_ID = "3c29435e31d7407585d31e381f1f2ed0"
ARTIFACT_PATH = "model_sts_completev2"

print(f"Đang tải model từ Run ID: {RUN_ID}...")

try:
    local_model_path = mlflow.artifacts.download_artifacts(
        run_id=RUN_ID,
        artifact_path=ARTIFACT_PATH
    )
    print(f"Đã tải xong! Model nằm tại: {local_model_path}")
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    angle = AnglE.from_pretrained(
        local_model_path,
        pooling_strategy='cls',
        is_llm=False
    ).to(device)

    print("Model đã sẵn sàng!")

    # Test thử
    vec = angle.encode("Test model load từ DagsHub", to_numpy=True)
    print("Shape vector:", vec.shape)

except Exception as e:
    print("\nLỖI KẾT NỐI:")
    print(e)

 Đang kết nối tới server: https://dagshub.com/phamnguyentuong205/aoe-tpp.mlflow
Đang tải model từ Run ID: 3c29435e31d7407585d31e381f1f2ed0...


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Đã tải xong! Model nằm tại: /tmp/tmprb985t4v/model_sts_completev2
Model đã sẵn sàng!
Shape vector: (1, 768)


In [9]:

def generate_embeddings(data_split, model, batch_size=64):
    """
    Hàm này nhận vào một tập dữ liệu (train/test),
    lấy cột 'sentence' để encode thành vector,
    và lấy cột 'sentiment' làm nhãn.
    """
    texts = data_split['sentence']     # Lấy danh sách câu
    labels = data_split['sentiment']   # Lấy nhãn (0, 1, 2)

    embeddings = []

    print(f"Đang embedding {len(texts)} câu...")
    for i in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[i : i + batch_size]
        # Encode bằng AnglE
        vecs = model.encode(batch_texts, to_numpy=True)
        embeddings.append(vecs)

    return np.vstack(embeddings), np.array(labels)


print("\n--- Xử lý tập TRAIN ---")
X_train, y_train = generate_embeddings(dataset['train'], angle)

print("\n--- Xử lý tập TEST ---")
X_test, y_test = generate_embeddings(dataset['test'], angle)

print(f"\nKích thước X_train: {X_train.shape}")
print(f"Kích thước X_test: {X_test.shape}")

print("\nĐang training Classifier...")
clf = LogisticRegression(max_iter=2000, solver='lbfgs')
clf.fit(X_train, y_train)

print("\nKẾT QUẢ ĐÁNH GIÁ TRÊN TẬP TEST:")
y_pred = clf.predict(X_test)

target_names = ['Tiêu cực (0)', 'Trung tính (1)', 'Tích cực (2)']
print(classification_report(y_test, y_pred, target_names=target_names))



--- Xử lý tập TRAIN ---
Đang embedding 11426 câu...


100%|██████████| 179/179 [00:38<00:00,  4.67it/s]



--- Xử lý tập TEST ---
Đang embedding 3166 câu...


100%|██████████| 50/50 [00:11<00:00,  4.43it/s]



Kích thước X_train: (11426, 768)
Kích thước X_test: (3166, 768)

Đang training Classifier...

KẾT QUẢ ĐÁNH GIÁ TRÊN TẬP TEST:
                precision    recall  f1-score   support

  Tiêu cực (0)       0.92      0.96      0.94      1409
Trung tính (1)       0.65      0.34      0.45       167
  Tích cực (2)       0.93      0.94      0.94      1590

      accuracy                           0.92      3166
     macro avg       0.83      0.75      0.77      3166
  weighted avg       0.91      0.92      0.91      3166

