In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings("ignore")

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
    make_scorer,
)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

import optuna

from utils.other import parse_label, matrix_labels

---

# Read file


In [2]:
df_train = pd.read_csv("../data/processed/train.csv", encoding="utf-8")
df_val = pd.read_csv("../data/processed/val.csv", encoding="utf-8")

In [3]:
df_train.head()

Unnamed: 0,comment,label
0,đuôi dạng coupe đẹp hẳn,{EXTERIOR#Positive};
1,đèn xấu,{EXTERIOR#Negative};
2,yc xăng nội_thất ok xforce chạy ga êm ồn xforc...,{EXTERIOR#Positive};{PERFORMANCE#Negative};{IN...
3,đi hài_lòng bốc ngon âm_rẻ tiết_kiệm xăng_lít ...,{PERFORMANCE#Positive};{COST#Positive};
4,bệ tì_tay màn_hình kết khai đồ trung_nhập indo,{INTERIOR#Positive};


In [4]:
df_val.head()

Unnamed: 0,comment,label
0,mông ok đấy,{EXTERIOR#Positive};
1,đi thử độ êm_khung gầm yc ngon tăng_tốc êm_mượ...,{PERFORMANCE#Positive};
2,chê trung_quốc đi xe trung_quốc xe trung_quốc ...,{BRAND#Positive};
3,định mua tết đồ_đạc thay_thế,{BRAND#Negative};
4,xe 500 t cặp đèn_pha led trăm củ,{COST#Negative};


---

# Labels

## Train

In [5]:
matrix_labels_train , mlb_train = matrix_labels(df_train[["label"]])

In [6]:
matrix_labels_train.head()

Unnamed: 0,BRAND#Negative,BRAND#Neutral,BRAND#Positive,COST#Negative,COST#Neutral,COST#Positive,EXTERIOR#Negative,EXTERIOR#Neutral,EXTERIOR#Positive,FEATURES#Negative,FEATURES#Neutral,FEATURES#Positive,INTERIOR#Negative,INTERIOR#Neutral,INTERIOR#Positive,PERFORMANCE#Negative,PERFORMANCE#Neutral,PERFORMANCE#Positive
0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,1,0,0
3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [7]:
print(f"Number of labels: {len(mlb_train.classes_)}")
print(f"Labels: {mlb_train.classes_}")

Number of labels: 18
Labels: ['BRAND#Negative' 'BRAND#Neutral' 'BRAND#Positive' 'COST#Negative'
 'COST#Neutral' 'COST#Positive' 'EXTERIOR#Negative' 'EXTERIOR#Neutral'
 'EXTERIOR#Positive' 'FEATURES#Negative' 'FEATURES#Neutral'
 'FEATURES#Positive' 'INTERIOR#Negative' 'INTERIOR#Neutral'
 'INTERIOR#Positive' 'PERFORMANCE#Negative' 'PERFORMANCE#Neutral'
 'PERFORMANCE#Positive']


## Val

In [8]:
matrix_labels_val , mlb_val = matrix_labels(df_val[["label"]])

In [9]:
matrix_labels_val.head()

Unnamed: 0,BRAND#Negative,BRAND#Neutral,BRAND#Positive,COST#Negative,COST#Neutral,COST#Positive,EXTERIOR#Negative,EXTERIOR#Neutral,EXTERIOR#Positive,FEATURES#Negative,FEATURES#Neutral,FEATURES#Positive,INTERIOR#Negative,INTERIOR#Neutral,INTERIOR#Positive,PERFORMANCE#Negative,PERFORMANCE#Neutral,PERFORMANCE#Positive
0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [10]:
print(f"Number of labels: {len(mlb_val.classes_)}")
print(f"Labels: {mlb_val.classes_}")

Number of labels: 18
Labels: ['BRAND#Negative' 'BRAND#Neutral' 'BRAND#Positive' 'COST#Negative'
 'COST#Neutral' 'COST#Positive' 'EXTERIOR#Negative' 'EXTERIOR#Neutral'
 'EXTERIOR#Positive' 'FEATURES#Negative' 'FEATURES#Neutral'
 'FEATURES#Positive' 'INTERIOR#Negative' 'INTERIOR#Neutral'
 'INTERIOR#Positive' 'PERFORMANCE#Negative' 'PERFORMANCE#Neutral'
 'PERFORMANCE#Positive']


---

# Train, val split


In [11]:
# Train
X_train = df_train[["comment"]]
y_train = matrix_labels_train

# Validation
X_val = df_val[["comment"]]
y_val = matrix_labels_val

In [12]:
pd.concat([X_train, pd.DataFrame(y_train, columns=mlb_train.classes_)], axis=1).head()

Unnamed: 0,comment,BRAND#Negative,BRAND#Neutral,BRAND#Positive,COST#Negative,COST#Neutral,COST#Positive,EXTERIOR#Negative,EXTERIOR#Neutral,EXTERIOR#Positive,FEATURES#Negative,FEATURES#Neutral,FEATURES#Positive,INTERIOR#Negative,INTERIOR#Neutral,INTERIOR#Positive,PERFORMANCE#Negative,PERFORMANCE#Neutral,PERFORMANCE#Positive
0,đuôi dạng coupe đẹp hẳn,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1,đèn xấu,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,yc xăng nội_thất ok xforce chạy ga êm ồn xforc...,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,1,0,0
3,đi hài_lòng bốc ngon âm_rẻ tiết_kiệm xăng_lít ...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
4,bệ tì_tay màn_hình kết khai đồ trung_nhập indo,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [13]:
pd.concat([X_val, pd.DataFrame(y_val, columns=mlb_val.classes_)], axis=1).head()

Unnamed: 0,comment,BRAND#Negative,BRAND#Neutral,BRAND#Positive,COST#Negative,COST#Neutral,COST#Positive,EXTERIOR#Negative,EXTERIOR#Neutral,EXTERIOR#Positive,FEATURES#Negative,FEATURES#Neutral,FEATURES#Positive,INTERIOR#Negative,INTERIOR#Neutral,INTERIOR#Positive,PERFORMANCE#Negative,PERFORMANCE#Neutral,PERFORMANCE#Positive
0,mông ok đấy,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1,đi thử độ êm_khung gầm yc ngon tăng_tốc êm_mượ...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,chê trung_quốc đi xe trung_quốc xe trung_quốc ...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,định mua tết đồ_đạc thay_thế,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,xe 500 t cặp đèn_pha led trăm củ,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


---

# Vectorize


In [14]:
vec = TfidfVectorizer(
    analyzer="char",
    min_df=3,
    max_df=0.95,
    ngram_range=(3, 5),
    sublinear_tf=True,
    max_features=30000,
)

In [15]:
# Fit TF-IDF on training and transform train/test
X_train_vec = vec.fit_transform(X_train["comment"])
X_val_vec = vec.transform(X_val["comment"])

In [16]:
# To dense arrays
X_train_vec = X_train_vec.toarray()
X_val_vec = X_val_vec.toarray()

In [17]:
print(f"Train shape: {X_train_vec.shape}")
print(f"Test shape: {X_val_vec.shape}")
print(f"Vocabulary size: {len(vec.get_feature_names_out())}")

Train shape: (1403, 13912)
Test shape: (500, 13912)
Vocabulary size: 13912


In [None]:
print("Number of labels train: ", y_train.shape[1])
print("Number of labels test: ", y_val.shape[1])

Number of labels train 18
Number of labels test 18


In [19]:
n_features = X_train_vec.shape[1]
n_labels = y_train.shape[1]

---

# FNN


## Model


In [20]:
# Build model function
def build_model(input_dim, output_dim, params):
    model = Sequential()

    model.add(Dense(params["hidden1"], activation="relu", input_shape=(input_dim,)))
    model.add(Dropout(params["dropout1"]))

    if params["n_layers"] == 2:
        model.add(Dense(params["hidden2"], activation="relu"))
        model.add(Dropout(params["dropout2"]))

    model.add(Dense(output_dim, activation="sigmoid"))  # multi-label

    model.compile(
        optimizer=Adam(learning_rate=params["lr"]), loss="binary_crossentropy"
    )

    return model

In [21]:
# Objective function for Optuna
def objective(trial):

    params = {
        "n_layers": trial.suggest_int("n_layers", 1, 2),
        "hidden1": trial.suggest_categorical("hidden1", [256, 512, 768]),
        "hidden2": trial.suggest_categorical("hidden2", [128, 256]),
        "dropout1": trial.suggest_float("dropout1", 0.3, 0.6),
        "dropout2": trial.suggest_float("dropout2", 0.2, 0.5),
        "lr": trial.suggest_float("lr", 1e-4, 5e-3, log=True),
        "batch_size": trial.suggest_categorical("batch_size", [8, 16, 32, 64]),
    }

    model = build_model(input_dim=n_features, output_dim=n_labels, params=params)

    early_stop = EarlyStopping(
        monitor="val_loss", patience=3, restore_best_weights=True
    )

    # Use precomputed numeric vectors for training/validation
    model.fit(
        X_train_vec,
        y_train.values,
        validation_data=(X_val_vec, y_val.values),
        epochs=20,
        batch_size=params["batch_size"],
        callbacks=[early_stop],
        verbose=0,
    )

    y_val_pred = (model.predict(X_val_vec) > 0.5).astype(int)

    f1 = f1_score(y_val.values, y_val_pred, average="micro")

    return f1

In [None]:
# Optuna study
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)

[I 2025-12-15 15:08:13,632] A new study created in memory with name: no-name-f03a1433-f662-4001-bcd8-b9f997e2543f


[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 


[I 2025-12-15 15:08:46,862] Trial 0 finished with value: 0.3686440677966102 and parameters: {'n_layers': 1, 'hidden1': 256, 'hidden2': 128, 'dropout1': 0.42657009262555295, 'dropout2': 0.4528015757622376, 'lr': 0.00042185515581382035, 'batch_size': 16}. Best is trial 0 with value: 0.3686440677966102.


[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step


[I 2025-12-15 15:09:49,490] Trial 1 finished with value: 0.40357852882703776 and parameters: {'n_layers': 2, 'hidden1': 512, 'hidden2': 128, 'dropout1': 0.5389911682114602, 'dropout2': 0.4238850853944395, 'lr': 0.0004940599624758721, 'batch_size': 8}. Best is trial 1 with value: 0.40357852882703776.


[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step


[I 2025-12-15 15:09:58,943] Trial 2 finished with value: 0.40118577075098816 and parameters: {'n_layers': 2, 'hidden1': 256, 'hidden2': 256, 'dropout1': 0.3850563577493842, 'dropout2': 0.20364514432665104, 'lr': 0.0013515287797965995, 'batch_size': 32}. Best is trial 1 with value: 0.40357852882703776.


[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step


[I 2025-12-15 15:10:11,743] Trial 3 finished with value: 0.3655685441020191 and parameters: {'n_layers': 1, 'hidden1': 256, 'hidden2': 256, 'dropout1': 0.47250234542099445, 'dropout2': 0.4345108552773319, 'lr': 0.0014026415261873379, 'batch_size': 32}. Best is trial 1 with value: 0.40357852882703776.


[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step


[I 2025-12-15 15:10:25,000] Trial 4 finished with value: 0.3863863863863864 and parameters: {'n_layers': 2, 'hidden1': 256, 'hidden2': 256, 'dropout1': 0.44730403825125975, 'dropout2': 0.36454189123631164, 'lr': 0.004286014733242724, 'batch_size': 16}. Best is trial 1 with value: 0.40357852882703776.


[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step


[I 2025-12-15 15:10:37,412] Trial 5 finished with value: 0.41941747572815535 and parameters: {'n_layers': 1, 'hidden1': 768, 'hidden2': 256, 'dropout1': 0.5365802954123201, 'dropout2': 0.21311011237484953, 'lr': 0.004981002961959438, 'batch_size': 64}. Best is trial 5 with value: 0.41941747572815535.


[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step


[I 2025-12-15 15:11:00,892] Trial 6 finished with value: 0.4542056074766355 and parameters: {'n_layers': 1, 'hidden1': 512, 'hidden2': 256, 'dropout1': 0.47717368644427627, 'dropout2': 0.22384524323150884, 'lr': 0.003982323348828108, 'batch_size': 16}. Best is trial 6 with value: 0.4542056074766355.


[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step


[I 2025-12-15 15:11:27,860] Trial 7 finished with value: 0.0 and parameters: {'n_layers': 2, 'hidden1': 512, 'hidden2': 128, 'dropout1': 0.5202149079052921, 'dropout2': 0.34687792572914533, 'lr': 0.00011415765848363277, 'batch_size': 64}. Best is trial 6 with value: 0.4542056074766355.


[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step


[I 2025-12-15 15:11:54,032] Trial 8 finished with value: 0.025034770514603615 and parameters: {'n_layers': 1, 'hidden1': 512, 'hidden2': 256, 'dropout1': 0.5158090032875771, 'dropout2': 0.27848818161119515, 'lr': 0.00015745460580320463, 'batch_size': 64}. Best is trial 6 with value: 0.4542056074766355.


[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step


[I 2025-12-15 15:12:21,157] Trial 9 finished with value: 0.40122199592668023 and parameters: {'n_layers': 1, 'hidden1': 256, 'hidden2': 256, 'dropout1': 0.5271225048128141, 'dropout2': 0.26879319208463714, 'lr': 0.0017821009899595358, 'batch_size': 8}. Best is trial 6 with value: 0.4542056074766355.


In [23]:
print("Best F1-micro:", study.best_value)
print("Best trial:", study.best_trial.number)

Best F1-micro: 0.4542056074766355
Best trial: 6


In [24]:
best_params = study.best_params

print("Best params:")
for k, v in best_params.items():
    print(f"  {k}: {v}")

Best params:
  n_layers: 1
  hidden1: 512
  hidden2: 256
  dropout1: 0.47717368644427627
  dropout2: 0.22384524323150884
  lr: 0.003982323348828108
  batch_size: 16


In [25]:
# Build final model with best hyperparameters
model = build_model(n_features, n_labels, best_params)

In [26]:
# Fit final model
model.fit(
    X_train_vec, y_train, epochs=20, batch_size=best_params["batch_size"], verbose=1
)

Epoch 1/20
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 47ms/step - loss: 0.2947
Epoch 2/20
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 49ms/step - loss: 0.1523
Epoch 3/20
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 50ms/step - loss: 0.0817
Epoch 4/20
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 50ms/step - loss: 0.0435
Epoch 5/20
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 51ms/step - loss: 0.0276
Epoch 6/20
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 49ms/step - loss: 0.0178
Epoch 7/20
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 49ms/step - loss: 0.0143
Epoch 8/20
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 54ms/step - loss: 0.0122
Epoch 9/20
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 53ms/step - loss: 0.0108
Epoch 10/20
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 54ms/step - loss: 0.0111

<keras.src.callbacks.history.History at 0x212450dc5e0>

## Eval


In [27]:
# Predict
y_pred = (model.predict(X_val_vec) > 0.5).astype(int)

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step


In [28]:
# Compute metrics
metrics = {
    "precision_micro": precision_score(
        y_val.values, y_pred, average="micro", zero_division=0
    ),
    "recall_micro": recall_score(
        y_val.values, y_pred, average="micro", zero_division=0
    ),
    "f1_micro": f1_score(y_val.values, y_pred, average="micro", zero_division=0),
    "precision_macro": precision_score(
        y_val.values, y_pred, average="macro", zero_division=0
    ),
    "recall_macro": recall_score(
        y_val.values, y_pred, average="macro", zero_division=0
    ),
    "f1_macro": f1_score(y_val.values, y_pred, average="macro", zero_division=0),
}

matrix_metrics = pd.DataFrame.from_dict(metrics, orient="index", columns=["Score"])

In [29]:
matrix_metrics.round(4)

Unnamed: 0,Score
precision_micro,0.6076
recall_micro,0.3822
f1_micro,0.4693
precision_macro,0.4192
recall_macro,0.269
f1_macro,0.3225


In [30]:
print(classification_report(y_val, y_pred, target_names=y_train.columns))

                      precision    recall  f1-score   support

      BRAND#Negative       0.53      0.30      0.38        63
       BRAND#Neutral       0.00      0.00      0.00        10
      BRAND#Positive       0.46      0.29      0.35        77
       COST#Negative       0.76      0.49      0.60        59
        COST#Neutral       0.00      0.00      0.00        10
       COST#Positive       0.67      0.42      0.52        52
   EXTERIOR#Negative       0.59      0.52      0.55        63
    EXTERIOR#Neutral       0.33      0.09      0.14        11
   EXTERIOR#Positive       0.78      0.54      0.64        95
   FEATURES#Negative       0.64      0.55      0.59        38
    FEATURES#Neutral       0.00      0.00      0.00         6
   FEATURES#Positive       0.58      0.19      0.29        36
   INTERIOR#Negative       0.55      0.35      0.43        34
    INTERIOR#Neutral       0.00      0.00      0.00         5
   INTERIOR#Positive       0.44      0.26      0.32        47
PERFORM

## Test


In [31]:
df_test = pd.read_csv("../data/raw/val.csv")
df_test = df_test.iloc[-6:-1,:]
df_test

Unnamed: 0,brand,model,version,year,segment,comment,label
494,hyundai,creta,n line,2025,suv,Bỏ cái viền trắng kéo từ nóc xe xuống cột c đi...,{EXTERIOR#Negative};
495,mitsubishi,xforce,ultimate,2024,suv,XF mà có động cơ khỏe hơn tí nữa thì đúng là k...,{PERFORMANCE#Neutral};
496,huyndai,accent,,2024,sedan,Nhìn lạ nhưng phải đi mới biet,{EXTERIOR#Neutral};{INTERIOR#Neutral};
497,kia,seltos,facelift,2024,suv,Xe Hàn lại đắt giá hơn xe Nhật nhiều...,{BRAND#Negative};{COST#Negative};
498,hyundai,creta,n line,2025,suv,Cụm điều hòa nhìn chán thật đấy. Nút siêu rối ...,{INTERIOR#Negative};


In [32]:
# Try prediction on some samples
samples = df_test["comment"].tolist()
samples_vec = vec.transform(samples).toarray()

# Use the trained final model for predictions
preds = (model.predict(samples_vec) > 0.5).astype(int)


def decode_labels(pred_row, classes):
    return [cls for cls, val in zip(classes, pred_row) if val == 1]


for i, (text, pred_row) in enumerate(zip(samples, preds)):
    labels = decode_labels(pred_row, y_train.columns.tolist())
    print(f"Sample {i+1}:")
    print(f"\tText: {text}")
    print(f"\tPredicted labels: {labels}\n")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
Sample 1:
	Text: Bỏ cái viền trắng kéo từ nóc xe xuống cột c đi , trông tởm như bản cũ . Bản này trông còn được, bản cũ nhìn xấu thậm tệ .
	Predicted labels: ['EXTERIOR#Negative']

Sample 2:
	Text: XF mà có động cơ khỏe hơn tí nữa thì đúng là ko còn gì để chê.
	Predicted labels: ['PERFORMANCE#Positive']

Sample 3:
	Text: Nhìn lạ nhưng phải đi mới biet
	Predicted labels: []

Sample 4:
	Text: Xe Hàn lại đắt giá hơn xe Nhật nhiều...
	Predicted labels: ['COST#Negative']

Sample 5:
	Text: Cụm điều hòa nhìn chán thật đấy. Nút siêu rối rắm, hoàn thiện nhìn bị rẻ tiền, không ăn nhập gì với cụm màn đôi bên trên cả.
	Predicted labels: []

