In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings("ignore")

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
    make_scorer,
)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

import optuna

from utils.other import parse_label, matrix_labels

---

# Read file


In [4]:
df_train = pd.read_csv("../data/processed/train.csv", encoding="utf-8")
df_val = pd.read_csv("../data/processed/val.csv", encoding="utf-8")

In [5]:
df_train.head()

Unnamed: 0,comment,label
0,hrv g xài máy thế_hệ máy sohc ok_hóng awd,{PERFORMANCE#Positive};
1,vios đi ngon đi khỏe nhé_duy nội thất_hơi xấu ...,{PERFORMANCE#Positive};{INTERIOR#Negative};
2,đẹp,{EXTERIOR#Positive};
3,công_nhận hãng xe trung_quốc làm_đẹp chê ngoại...,{BRAND#Positive};{EXTERIOR#Positive};
4,giá rẻ trang_bị rẻ an_toàn tiêu_chuẩn bảo_hành...,{FEATURES#Positive};{COST#Positive};{BRAND#Neu...


In [6]:
df_val.head()

Unnamed: 0,comment,label
0,xe giá accent nổi_bật phân_khúc,{BRAND#Positive};
1,yên ngựa xấu taplo tạm,{INTERIOR#Negative};
2,óp sần miên_man tội hàng tàu,{BRAND#Negative};{INTERIOR#Positive};
3,thiết_kế kushaq đậm_chất châu_âu_sắc nét đường...,{EXTERIOR#Positive};
4,cọc xe yaris_vi an_toàn ga đạp nhầm phanh tự_đ...,{FEATURES#Positive};{COST#Positive};


---

# Labels

## Train

In [7]:
matrix_labels_train , mlb_train = matrix_labels(df_train[["label"]])

In [8]:
matrix_labels_train.head()

Unnamed: 0,BRAND#Negative,BRAND#Neutral,BRAND#Positive,COST#Negative,COST#Neutral,COST#Positive,EXTERIOR#Negative,EXTERIOR#Neutral,EXTERIOR#Positive,FEATURES#Negative,FEATURES#Neutral,FEATURES#Positive,INTERIOR#Negative,INTERIOR#Neutral,INTERIOR#Positive,PERFORMANCE#Negative,PERFORMANCE#Neutral,PERFORMANCE#Positive
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0


In [9]:
print(f"Number of labels: {len(mlb_train.classes_)}")
print(f"Labels: {mlb_train.classes_}")

Number of labels: 18
Labels: ['BRAND#Negative' 'BRAND#Neutral' 'BRAND#Positive' 'COST#Negative'
 'COST#Neutral' 'COST#Positive' 'EXTERIOR#Negative' 'EXTERIOR#Neutral'
 'EXTERIOR#Positive' 'FEATURES#Negative' 'FEATURES#Neutral'
 'FEATURES#Positive' 'INTERIOR#Negative' 'INTERIOR#Neutral'
 'INTERIOR#Positive' 'PERFORMANCE#Negative' 'PERFORMANCE#Neutral'
 'PERFORMANCE#Positive']


## Val

In [10]:
matrix_labels_val , mlb_val = matrix_labels(df_val[["label"]])

In [11]:
matrix_labels_val.head()

Unnamed: 0,BRAND#Negative,BRAND#Neutral,BRAND#Positive,COST#Negative,COST#Neutral,COST#Positive,EXTERIOR#Negative,EXTERIOR#Neutral,EXTERIOR#Positive,FEATURES#Negative,FEATURES#Neutral,FEATURES#Positive,INTERIOR#Negative,INTERIOR#Neutral,INTERIOR#Positive,PERFORMANCE#Negative,PERFORMANCE#Neutral,PERFORMANCE#Positive
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0


In [12]:
print(f"Number of labels: {len(mlb_val.classes_)}")
print(f"Labels: {mlb_val.classes_}")

Number of labels: 18
Labels: ['BRAND#Negative' 'BRAND#Neutral' 'BRAND#Positive' 'COST#Negative'
 'COST#Neutral' 'COST#Positive' 'EXTERIOR#Negative' 'EXTERIOR#Neutral'
 'EXTERIOR#Positive' 'FEATURES#Negative' 'FEATURES#Neutral'
 'FEATURES#Positive' 'INTERIOR#Negative' 'INTERIOR#Neutral'
 'INTERIOR#Positive' 'PERFORMANCE#Negative' 'PERFORMANCE#Neutral'
 'PERFORMANCE#Positive']


---

# Train, val split


In [13]:
# Train
X_train = df_train[["comment"]]
y_train = matrix_labels_train

# Validation
X_val = df_val[["comment"]]
y_val = matrix_labels_val

In [14]:
pd.concat([X_train, pd.DataFrame(y_train, columns=mlb_train.classes_)], axis=1).head()

Unnamed: 0,comment,BRAND#Negative,BRAND#Neutral,BRAND#Positive,COST#Negative,COST#Neutral,COST#Positive,EXTERIOR#Negative,EXTERIOR#Neutral,EXTERIOR#Positive,FEATURES#Negative,FEATURES#Neutral,FEATURES#Positive,INTERIOR#Negative,INTERIOR#Neutral,INTERIOR#Positive,PERFORMANCE#Negative,PERFORMANCE#Neutral,PERFORMANCE#Positive
0,hrv g xài máy thế_hệ máy sohc ok_hóng awd,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,vios đi ngon đi khỏe nhé_duy nội thất_hơi xấu ...,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
2,đẹp,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
3,công_nhận hãng xe trung_quốc làm_đẹp chê ngoại...,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4,giá rẻ trang_bị rẻ an_toàn tiêu_chuẩn bảo_hành...,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0


In [15]:
pd.concat([X_val, pd.DataFrame(y_val, columns=mlb_val.classes_)], axis=1).head()

Unnamed: 0,comment,BRAND#Negative,BRAND#Neutral,BRAND#Positive,COST#Negative,COST#Neutral,COST#Positive,EXTERIOR#Negative,EXTERIOR#Neutral,EXTERIOR#Positive,FEATURES#Negative,FEATURES#Neutral,FEATURES#Positive,INTERIOR#Negative,INTERIOR#Neutral,INTERIOR#Positive,PERFORMANCE#Negative,PERFORMANCE#Neutral,PERFORMANCE#Positive
0,xe giá accent nổi_bật phân_khúc,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,yên ngựa xấu taplo tạm,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
2,óp sần miên_man tội hàng tàu,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3,thiết_kế kushaq đậm_chất châu_âu_sắc nét đường...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4,cọc xe yaris_vi an_toàn ga đạp nhầm phanh tự_đ...,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0


---

# Vectorize


In [16]:
vec = TfidfVectorizer(
    analyzer="char",
    min_df=3,
    max_df=0.95,
    ngram_range=(3, 5),
    sublinear_tf=True,
    max_features=30000,
)

In [17]:
# Fit TF-IDF on training and transform train/test
X_train_vec = vec.fit_transform(X_train["comment"])
X_val_vec = vec.transform(X_val["comment"])

In [18]:
# To dense arrays
X_train_vec = X_train_vec.toarray()
X_val_vec = X_val_vec.toarray()

In [19]:
print(f"Train shape: {X_train_vec.shape}")
print(f"Test shape: {X_val_vec.shape}")
print(f"Vocabulary size: {len(vec.get_feature_names_out())}")

Train shape: (2599, 22167)
Test shape: (1000, 22167)
Vocabulary size: 22167


In [20]:
print("Number of labels train: ", y_train.shape[1])
print("Number of labels test: ", y_val.shape[1])

Number of labels train:  18
Number of labels test:  18


In [21]:
n_features = X_train_vec.shape[1]
n_labels = y_train.shape[1]

---

# FNN


## Model


In [22]:
# Build model function
def build_model(input_dim, output_dim, params):
    model = Sequential()

    model.add(Dense(params["hidden1"], activation="relu", input_shape=(input_dim,)))
    model.add(Dropout(params["dropout1"]))

    if params["n_layers"] == 2:
        model.add(Dense(params["hidden2"], activation="relu"))
        model.add(Dropout(params["dropout2"]))

    model.add(Dense(output_dim, activation="sigmoid"))  # multi-label

    model.compile(
        optimizer=Adam(learning_rate=params["lr"]), loss="binary_crossentropy"
    )

    return model

In [23]:
# Objective function for Optuna
def objective(trial):

    params = {
        "n_layers": trial.suggest_int("n_layers", 1, 2),
        "hidden1": trial.suggest_categorical("hidden1", [256, 512, 768]),
        "hidden2": trial.suggest_categorical("hidden2", [128, 256]),
        "dropout1": trial.suggest_float("dropout1", 0.3, 0.6),
        "dropout2": trial.suggest_float("dropout2", 0.2, 0.5),
        "lr": trial.suggest_float("lr", 1e-4, 5e-3, log=True),
        "batch_size": trial.suggest_categorical("batch_size", [8, 16, 32, 64]),
    }

    model = build_model(input_dim=n_features, output_dim=n_labels, params=params)

    early_stop = EarlyStopping(
        monitor="val_loss", patience=3, restore_best_weights=True
    )

    # Use precomputed numeric vectors for training/validation
    model.fit(
        X_train_vec,
        y_train.values,
        validation_data=(X_val_vec, y_val.values),
        epochs=20,
        batch_size=params["batch_size"],
        callbacks=[early_stop],
        verbose=0,
    )

    y_val_pred = (model.predict(X_val_vec) > 0.5).astype(int)

    f1 = f1_score(y_val.values, y_val_pred, average="micro")

    return f1

In [24]:
# Optuna study
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)

[I 2025-12-16 14:45:18,407] A new study created in memory with name: no-name-f8bd63e5-7afa-4f7a-aa44-2147758eded0


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step


[I 2025-12-16 14:45:53,994] Trial 0 finished with value: 0.006983240223463687 and parameters: {'n_layers': 2, 'hidden1': 256, 'hidden2': 256, 'dropout1': 0.5537005698474007, 'dropout2': 0.4500716526519456, 'lr': 0.00010448369348982414, 'batch_size': 64}. Best is trial 0 with value: 0.006983240223463687.


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step


[I 2025-12-16 14:46:59,632] Trial 1 finished with value: 0.39049064238745573 and parameters: {'n_layers': 1, 'hidden1': 512, 'hidden2': 256, 'dropout1': 0.4371011697307655, 'dropout2': 0.20397519634513342, 'lr': 0.00034883238842732187, 'batch_size': 64}. Best is trial 1 with value: 0.39049064238745573.


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step


[I 2025-12-16 14:47:11,570] Trial 2 finished with value: 0.41256038647342996 and parameters: {'n_layers': 2, 'hidden1': 256, 'hidden2': 128, 'dropout1': 0.5465679862141792, 'dropout2': 0.29902485629136216, 'lr': 0.003802549423345089, 'batch_size': 64}. Best is trial 2 with value: 0.41256038647342996.


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step


[I 2025-12-16 14:48:46,580] Trial 3 finished with value: 0.4169921875 and parameters: {'n_layers': 2, 'hidden1': 512, 'hidden2': 256, 'dropout1': 0.35322259582459703, 'dropout2': 0.30666004348169595, 'lr': 0.00026986652256187697, 'batch_size': 16}. Best is trial 3 with value: 0.4169921875.


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step


[I 2025-12-16 14:50:43,785] Trial 4 finished with value: 0.3831967213114754 and parameters: {'n_layers': 1, 'hidden1': 256, 'hidden2': 256, 'dropout1': 0.4968580681006085, 'dropout2': 0.2117860729732673, 'lr': 0.00039444000325287494, 'batch_size': 8}. Best is trial 3 with value: 0.4169921875.


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step


[I 2025-12-16 14:51:15,972] Trial 5 finished with value: 0.41993226898887276 and parameters: {'n_layers': 1, 'hidden1': 768, 'hidden2': 256, 'dropout1': 0.5050425629317097, 'dropout2': 0.22783879036020685, 'lr': 0.0019218074465379154, 'batch_size': 64}. Best is trial 5 with value: 0.41993226898887276.


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step


[I 2025-12-16 14:51:29,411] Trial 6 finished with value: 0.4134520276953511 and parameters: {'n_layers': 1, 'hidden1': 256, 'hidden2': 128, 'dropout1': 0.4932731063771117, 'dropout2': 0.464568437350045, 'lr': 0.0023714064121028427, 'batch_size': 64}. Best is trial 5 with value: 0.41993226898887276.


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step


[I 2025-12-16 14:52:41,919] Trial 7 finished with value: 0.4253875968992248 and parameters: {'n_layers': 1, 'hidden1': 512, 'hidden2': 256, 'dropout1': 0.4713172339188805, 'dropout2': 0.3921773897669977, 'lr': 0.001187862682036086, 'batch_size': 16}. Best is trial 7 with value: 0.4253875968992248.


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step


[I 2025-12-16 14:54:33,602] Trial 8 finished with value: 0.413589364844904 and parameters: {'n_layers': 1, 'hidden1': 256, 'hidden2': 256, 'dropout1': 0.3037866594427533, 'dropout2': 0.4298063359425335, 'lr': 0.0004322559110749043, 'batch_size': 8}. Best is trial 7 with value: 0.4253875968992248.


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step


[I 2025-12-16 14:55:38,726] Trial 9 finished with value: 0.40220661985957873 and parameters: {'n_layers': 1, 'hidden1': 512, 'hidden2': 256, 'dropout1': 0.5137307114506352, 'dropout2': 0.21563815701659514, 'lr': 0.0005405555100435523, 'batch_size': 32}. Best is trial 7 with value: 0.4253875968992248.


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step


[I 2025-12-16 14:57:23,953] Trial 10 finished with value: 0.4569388665774208 and parameters: {'n_layers': 2, 'hidden1': 768, 'hidden2': 128, 'dropout1': 0.4185888270111561, 'dropout2': 0.38549472711399385, 'lr': 0.0011252250393771477, 'batch_size': 16}. Best is trial 10 with value: 0.4569388665774208.


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step


[I 2025-12-16 14:58:55,179] Trial 11 finished with value: 0.3514774494556765 and parameters: {'n_layers': 2, 'hidden1': 768, 'hidden2': 128, 'dropout1': 0.42208387070587555, 'dropout2': 0.38099787006966684, 'lr': 0.0011066505876369146, 'batch_size': 16}. Best is trial 10 with value: 0.4569388665774208.


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step


[I 2025-12-16 15:00:27,665] Trial 12 finished with value: 0.37253912165572944 and parameters: {'n_layers': 2, 'hidden1': 768, 'hidden2': 128, 'dropout1': 0.39412648675175854, 'dropout2': 0.3792414234340175, 'lr': 0.001073315999672655, 'batch_size': 16}. Best is trial 10 with value: 0.4569388665774208.


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step


[I 2025-12-16 15:01:58,343] Trial 13 finished with value: 0.391 and parameters: {'n_layers': 2, 'hidden1': 768, 'hidden2': 128, 'dropout1': 0.37445960528924416, 'dropout2': 0.40743595574035096, 'lr': 0.001006743341459293, 'batch_size': 16}. Best is trial 10 with value: 0.4569388665774208.


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step


[I 2025-12-16 15:03:06,632] Trial 14 finished with value: 0.43412322274881515 and parameters: {'n_layers': 1, 'hidden1': 512, 'hidden2': 128, 'dropout1': 0.5939951812272832, 'dropout2': 0.49522617560303994, 'lr': 0.0016922818366178086, 'batch_size': 16}. Best is trial 10 with value: 0.4569388665774208.


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step


[I 2025-12-16 15:03:59,570] Trial 15 finished with value: 0.4457557875624149 and parameters: {'n_layers': 2, 'hidden1': 768, 'hidden2': 128, 'dropout1': 0.5903773399042189, 'dropout2': 0.49401763106283014, 'lr': 0.004632847086578437, 'batch_size': 32}. Best is trial 10 with value: 0.4569388665774208.


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step


[I 2025-12-16 15:04:43,772] Trial 16 finished with value: 0.4202967927237913 and parameters: {'n_layers': 2, 'hidden1': 768, 'hidden2': 128, 'dropout1': 0.5965889962800784, 'dropout2': 0.3399389543525108, 'lr': 0.004902722173806423, 'batch_size': 32}. Best is trial 10 with value: 0.4569388665774208.


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step


[I 2025-12-16 15:05:28,105] Trial 17 finished with value: 0.36855538540071464 and parameters: {'n_layers': 2, 'hidden1': 768, 'hidden2': 128, 'dropout1': 0.4062267681903371, 'dropout2': 0.49131888665154877, 'lr': 0.0029250008548104985, 'batch_size': 32}. Best is trial 10 with value: 0.4569388665774208.


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step


[I 2025-12-16 15:07:24,927] Trial 18 finished with value: 0.3863751906456533 and parameters: {'n_layers': 2, 'hidden1': 768, 'hidden2': 128, 'dropout1': 0.4528047637969429, 'dropout2': 0.2720504582844143, 'lr': 0.00018897890495099925, 'batch_size': 32}. Best is trial 10 with value: 0.4569388665774208.


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step


[I 2025-12-16 15:08:30,176] Trial 19 finished with value: 0.4175506268081003 and parameters: {'n_layers': 2, 'hidden1': 768, 'hidden2': 128, 'dropout1': 0.35411411153778377, 'dropout2': 0.35888748881011673, 'lr': 0.0006536774263256457, 'batch_size': 32}. Best is trial 10 with value: 0.4569388665774208.


In [25]:
print("Best F1-micro:", study.best_value)
print("Best trial:", study.best_trial.number)

Best F1-micro: 0.4569388665774208
Best trial: 10


In [26]:
best_params = study.best_params

print("Best params:")
for k, v in best_params.items():
    print(f"  {k}: {v}")

Best params:
  n_layers: 2
  hidden1: 768
  hidden2: 128
  dropout1: 0.4185888270111561
  dropout2: 0.38549472711399385
  lr: 0.0011252250393771477
  batch_size: 16


In [27]:
# Build final model with best hyperparameters
model = build_model(n_features, n_labels, best_params)

In [28]:
# Fit final model
model.fit(
    X_train_vec, y_train, epochs=20, batch_size=best_params["batch_size"], verbose=1
)

Epoch 1/20
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 105ms/step - loss: 0.3059
Epoch 2/20
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 109ms/step - loss: 0.1877
Epoch 3/20
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 110ms/step - loss: 0.1261
Epoch 4/20
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 109ms/step - loss: 0.0817
Epoch 5/20
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 105ms/step - loss: 0.0559
Epoch 6/20
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 104ms/step - loss: 0.0441
Epoch 7/20
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 104ms/step - loss: 0.0328
Epoch 8/20
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 108ms/step - loss: 0.0277
Epoch 9/20
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 111ms/step - loss: 0.0250
Epoch 10/20
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[

<keras.src.callbacks.history.History at 0x1ada5d44c40>

## Eval


In [29]:
# Predict
y_pred = (model.predict(X_val_vec) > 0.5).astype(int)

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step


In [30]:
# Compute metrics
metrics = {
    "precision_micro": precision_score(
        y_val.values, y_pred, average="micro", zero_division=0
    ),
    "recall_micro": recall_score(
        y_val.values, y_pred, average="micro", zero_division=0
    ),
    "f1_micro": f1_score(y_val.values, y_pred, average="micro", zero_division=0),
    "precision_macro": precision_score(
        y_val.values, y_pred, average="macro", zero_division=0
    ),
    "recall_macro": recall_score(
        y_val.values, y_pred, average="macro", zero_division=0
    ),
    "f1_macro": f1_score(y_val.values, y_pred, average="macro", zero_division=0),
}

matrix_metrics = pd.DataFrame.from_dict(metrics, orient="index", columns=["Score"])

In [31]:
matrix_metrics.round(4)

Unnamed: 0,Score
precision_micro,0.5515
recall_micro,0.4095
f1_micro,0.47
precision_macro,0.3741
recall_macro,0.2775
f1_macro,0.3127


In [32]:
print(classification_report(y_val, y_pred, target_names=y_train.columns))

                      precision    recall  f1-score   support

      BRAND#Negative       0.37      0.33      0.35        96
       BRAND#Neutral       0.00      0.00      0.00        19
      BRAND#Positive       0.57      0.31      0.40       142
       COST#Negative       0.53      0.42      0.47       120
        COST#Neutral       0.00      0.00      0.00        14
       COST#Positive       0.55      0.52      0.53       124
   EXTERIOR#Negative       0.57      0.39      0.46       109
    EXTERIOR#Neutral       0.00      0.00      0.00        18
   EXTERIOR#Positive       0.66      0.61      0.63       193
   FEATURES#Negative       0.57      0.44      0.50        72
    FEATURES#Neutral       0.00      0.00      0.00         7
   FEATURES#Positive       0.67      0.53      0.59        87
   INTERIOR#Negative       0.74      0.26      0.38        66
    INTERIOR#Neutral       0.00      0.00      0.00         9
   INTERIOR#Positive       0.47      0.35      0.40       102
PERFORM

## Test


In [33]:
df_test = pd.read_csv("../data/raw/val.csv")
df_test = df_test.iloc[-6:-1,:]
df_test

Unnamed: 0,brand,model,version,year,segment,comment,label
994,mitsubishi,xforce,ultimate,2024,suv,Máy yếu so với cùng phân khúc nha,{PERFORMANCE#Negative};
995,toyota,vios,,2025,sedan,Dùng khung gầm daihatsu là k thích lắm,{EXTERIOR#Negative};
996,huyndai,accent,,2024,sedan,Accent này mình thấy đầu giống stargetzer . Nó...,{EXTERIOR#Negative};
997,skoda,slavia,style,2025,sedan,"Xe hạng B options full hơn xe Trung ,Hàn nữa.",{FEATURES#Positive};
998,skoda,kushaq,style,2025,suv,"Tiêu hao nhiên liệu thấp, khoảng 6l/100km.",{PERFORMANCE#Positive};


In [34]:
# Try prediction on some samples
samples = df_test["comment"].tolist()
samples_vec = vec.transform(samples).toarray()

# Use the trained final model for predictions
preds = (model.predict(samples_vec) > 0.5).astype(int)


def decode_labels(pred_row, classes):
    return [cls for cls, val in zip(classes, pred_row) if val == 1]


for i, (text, pred_row) in enumerate(zip(samples, preds)):
    labels = decode_labels(pred_row, y_train.columns.tolist())
    print(f"Sample {i+1}:")
    print(f"\tText: {text}")
    print(f"\tPredicted labels: {labels}\n")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
Sample 1:
	Text: Máy yếu so với cùng phân khúc nha
	Predicted labels: ['PERFORMANCE#Negative']

Sample 2:
	Text: Dùng khung gầm daihatsu là k thích lắm
	Predicted labels: []

Sample 3:
	Text: Accent này mình thấy đầu giống stargetzer . Nói chung theo thẩm mỹ mình thấy xấu.
	Predicted labels: ['EXTERIOR#Negative']

Sample 4:
	Text: Xe hạng B options full hơn xe Trung ,Hàn nữa.
	Predicted labels: ['COST#Negative']

Sample 5:
	Text: Tiêu hao nhiên liệu thấp, khoảng 6l/100km.
	Predicted labels: []

