In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings("ignore")

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
    make_scorer,
)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

import optuna

from utils.other import parse_label

---

# Read file


In [2]:
df = pd.read_csv("../data/processed/train_preprocessed.csv")

In [3]:
df.head()

Unnamed: 0,comment,label
0,mẫu đẹp hiện_đại mặt đồng_hồ trung_tâm âm mặt ...,{EXTERIOR#Positive};{INTERIOR#Negative};
1,creta tầm giá xforce chút phiên_bản rẻ thiết_k...,{COST#Neutral};{EXTERIOR#Neutral};
2,giá cx5 dl mặc_dù rộng say cx5,{COST#Positive};
3,giá đợi chương_trình khuyến_mại,{COST#Negative};
4,creta xe mượt_mà hyundai phân_khúc khung gầm t...,{EXTERIOR#Positive};{INTERIOR#Positive};{PERFO...


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 908 entries, 0 to 907
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   comment  908 non-null    object
 1   label    908 non-null    object
dtypes: object(2)
memory usage: 14.3+ KB


---

# Label


In [5]:
mlb = MultiLabelBinarizer()

In [6]:
# Parse labels to lists
df["parsed_labels"] = df["label"].apply(parse_label)

In [7]:
df.head()

Unnamed: 0,comment,label,parsed_labels
0,mẫu đẹp hiện_đại mặt đồng_hồ trung_tâm âm mặt ...,{EXTERIOR#Positive};{INTERIOR#Negative};,"[EXTERIOR#Positive, INTERIOR#Negative]"
1,creta tầm giá xforce chút phiên_bản rẻ thiết_k...,{COST#Neutral};{EXTERIOR#Neutral};,"[COST#Neutral, EXTERIOR#Neutral]"
2,giá cx5 dl mặc_dù rộng say cx5,{COST#Positive};,[COST#Positive]
3,giá đợi chương_trình khuyến_mại,{COST#Negative};,[COST#Negative]
4,creta xe mượt_mà hyundai phân_khúc khung gầm t...,{EXTERIOR#Positive};{INTERIOR#Positive};{PERFO...,"[EXTERIOR#Positive, INTERIOR#Positive, PERFORM..."


In [8]:
# Binary matrix for multi-label classification
matrix_label = mlb.fit_transform(df["parsed_labels"])

# To dataframe
y = pd.DataFrame(matrix_label, columns=mlb.classes_)

In [9]:
y.head()

Unnamed: 0,BRAND#Negative,BRAND#Neutral,BRAND#Positive,COST#Negative,COST#Neutral,COST#Positive,EXTERIOR#Negative,EXTERIOR#Neutral,EXTERIOR#Positive,FEATURES#Negative,FEATURES#Neutral,FEATURES#Positive,INTERIOR#Negative,INTERIOR#Neutral,INTERIOR#Positive,PERFORMANCE#Negative,PERFORMANCE#Neutral,PERFORMANCE#Positive
0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0
1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1


In [10]:
X = df.drop(columns=["label", "parsed_labels"]).copy()

In [11]:
X.head()

Unnamed: 0,comment
0,mẫu đẹp hiện_đại mặt đồng_hồ trung_tâm âm mặt ...
1,creta tầm giá xforce chút phiên_bản rẻ thiết_k...
2,giá cx5 dl mặc_dù rộng say cx5
3,giá đợi chương_trình khuyến_mại
4,creta xe mượt_mà hyundai phân_khúc khung gầm t...


---

# Train, val split


In [12]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, random_state=42
)

---

# Vectorize


In [13]:
vec = TfidfVectorizer(
    min_df=3,
    max_df=0.95,
    ngram_range=(1, 2),
    sublinear_tf=True,
    max_features=30000,
)

In [14]:
# Fit TF-IDF on training and transform train/test
X_train_vec = vec.fit_transform(X_train["comment"])
X_val_vec = vec.transform(X_val["comment"])

In [15]:
# To dense arrays
X_train_vec = X_train_vec.toarray()
X_val_vec = X_val_vec.toarray()

In [16]:
print(f"Train shape: {X_train_vec.shape}")
print(f"Test shape: {X_val_vec.shape}")
print(f"Vocabulary size: {len(vec.get_feature_names_out())}")

Train shape: (635, 696)
Test shape: (273, 696)
Vocabulary size: 696


In [17]:
print("Number of labels:", y.shape[1])
print("Number of labels train", y_train.shape[1])
print("Number of labels test", y_val.shape[1])

Number of labels: 18
Number of labels train 18
Number of labels test 18


In [18]:
n_features = X_train_vec.shape[1]
n_labels = y_train.shape[1]

---

# FNN


## Model


In [19]:
# Build model function
def build_model(input_dim, output_dim, params):
    model = Sequential()

    model.add(Dense(params["hidden1"], activation="relu", input_shape=(input_dim,)))
    model.add(Dropout(params["dropout1"]))

    if params["n_layers"] == 2:
        model.add(Dense(params["hidden2"], activation="relu"))
        model.add(Dropout(params["dropout2"]))

    model.add(Dense(output_dim, activation="sigmoid"))  # multi-label

    model.compile(
        optimizer=Adam(learning_rate=params["lr"]), loss="binary_crossentropy"
    )

    return model

In [20]:
# Objective function for Optuna
def objective(trial):

    params = {
        "n_layers": trial.suggest_int("n_layers", 1, 2),
        "hidden1": trial.suggest_categorical("hidden1", [256, 512, 768]),
        "hidden2": trial.suggest_categorical("hidden2", [128, 256]),
        "dropout1": trial.suggest_float("dropout1", 0.3, 0.6),
        "dropout2": trial.suggest_float("dropout2", 0.2, 0.5),
        "lr": trial.suggest_float("lr", 1e-4, 5e-3, log=True),
        "batch_size": trial.suggest_categorical("batch_size", [8, 16, 32, 64]),
    }

    model = build_model(input_dim=n_features, output_dim=n_labels, params=params)

    early_stop = EarlyStopping(
        monitor="val_loss", patience=3, restore_best_weights=True
    )

    # Use precomputed numeric vectors for training/validation
    model.fit(
        X_train_vec,
        y_train.values,
        validation_data=(X_val_vec, y_val.values),
        epochs=20,
        batch_size=params["batch_size"],
        callbacks=[early_stop],
        verbose=0,
    )

    y_val_pred = (model.predict(X_val_vec) > 0.5).astype(int)

    f1 = f1_score(y_val.values, y_val_pred, average="micro")

    return f1

In [21]:
# Optuna study
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10)

[I 2025-12-14 15:21:59,365] A new study created in memory with name: no-name-fa903507-c36f-473a-b6fc-88f9e389e9e8


[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 


[I 2025-12-14 15:22:03,426] Trial 0 finished with value: 0.3327239488117002 and parameters: {'n_layers': 2, 'hidden1': 512, 'hidden2': 128, 'dropout1': 0.5056625147724054, 'dropout2': 0.28416456889148844, 'lr': 0.0030782105213128345, 'batch_size': 64}. Best is trial 0 with value: 0.3327239488117002.


[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step


[I 2025-12-14 15:22:12,553] Trial 1 finished with value: 0.3563636363636364 and parameters: {'n_layers': 2, 'hidden1': 512, 'hidden2': 256, 'dropout1': 0.5932024941742347, 'dropout2': 0.3948496710708027, 'lr': 0.00036044367173250486, 'batch_size': 8}. Best is trial 1 with value: 0.3563636363636364.


[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 


[I 2025-12-14 15:22:17,639] Trial 2 finished with value: 0.2845691382765531 and parameters: {'n_layers': 1, 'hidden1': 768, 'hidden2': 128, 'dropout1': 0.4554314918595179, 'dropout2': 0.22928026452470018, 'lr': 0.0005324602388849972, 'batch_size': 32}. Best is trial 1 with value: 0.3563636363636364.


[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 


[I 2025-12-14 15:22:21,544] Trial 3 finished with value: 0.0 and parameters: {'n_layers': 1, 'hidden1': 256, 'hidden2': 256, 'dropout1': 0.49870961916227974, 'dropout2': 0.2515133274905639, 'lr': 0.0002074181528223323, 'batch_size': 64}. Best is trial 1 with value: 0.3563636363636364.


[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 


[I 2025-12-14 15:22:24,543] Trial 4 finished with value: 0.4073455759599332 and parameters: {'n_layers': 1, 'hidden1': 512, 'hidden2': 128, 'dropout1': 0.39219018283420026, 'dropout2': 0.45984381882744013, 'lr': 0.003779037727269396, 'batch_size': 8}. Best is trial 4 with value: 0.4073455759599332.


[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 


[I 2025-12-14 15:22:32,622] Trial 5 finished with value: 0.3056092843326886 and parameters: {'n_layers': 2, 'hidden1': 256, 'hidden2': 128, 'dropout1': 0.44516746067654434, 'dropout2': 0.4058159814301403, 'lr': 0.00032749545287243266, 'batch_size': 8}. Best is trial 4 with value: 0.4073455759599332.


[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 


[I 2025-12-14 15:22:34,316] Trial 6 finished with value: 0.32209737827715357 and parameters: {'n_layers': 2, 'hidden1': 512, 'hidden2': 128, 'dropout1': 0.43818591381580185, 'dropout2': 0.2287629383320654, 'lr': 0.004277485061267101, 'batch_size': 64}. Best is trial 4 with value: 0.4073455759599332.


[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 


[I 2025-12-14 15:22:36,522] Trial 7 finished with value: 0.3473491773308958 and parameters: {'n_layers': 1, 'hidden1': 512, 'hidden2': 256, 'dropout1': 0.532131951096319, 'dropout2': 0.46854262368093963, 'lr': 0.003096747656733511, 'batch_size': 8}. Best is trial 4 with value: 0.4073455759599332.


[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 


[I 2025-12-14 15:22:39,068] Trial 8 finished with value: 0.37545126353790614 and parameters: {'n_layers': 1, 'hidden1': 512, 'hidden2': 128, 'dropout1': 0.38907909305295274, 'dropout2': 0.3317614937762633, 'lr': 0.0014355763741719606, 'batch_size': 16}. Best is trial 4 with value: 0.4073455759599332.


[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 


[I 2025-12-14 15:22:40,615] Trial 9 finished with value: 0.3903281519861831 and parameters: {'n_layers': 1, 'hidden1': 512, 'hidden2': 128, 'dropout1': 0.43634887783326815, 'dropout2': 0.4454533222200874, 'lr': 0.004534024422407322, 'batch_size': 32}. Best is trial 4 with value: 0.4073455759599332.


In [22]:
print("Best F1-micro:", study.best_value)
print("Best params:")
for k, v in study.best_params.items():
    print(f"  {k}: {v}")

Best F1-micro: 0.4073455759599332
Best params:
  n_layers: 1
  hidden1: 512
  hidden2: 128
  dropout1: 0.39219018283420026
  dropout2: 0.45984381882744013
  lr: 0.003779037727269396
  batch_size: 8


In [23]:
best_params = study.best_params
best_params

{'n_layers': 1,
 'hidden1': 512,
 'hidden2': 128,
 'dropout1': 0.39219018283420026,
 'dropout2': 0.45984381882744013,
 'lr': 0.003779037727269396,
 'batch_size': 8}

In [24]:
# Build final model with best hyperparameters
model = build_model(n_features, n_labels, best_params)

In [25]:
model.fit(
    X_train_vec, y_train, epochs=20, batch_size=best_params["batch_size"], verbose=1
)

Epoch 1/20
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.3198
Epoch 2/20
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.1946
Epoch 3/20
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.1362
Epoch 4/20
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0922
Epoch 5/20
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0616
Epoch 6/20
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0417
Epoch 7/20
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0307
Epoch 8/20
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0229
Epoch 9/20
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0193
Epoch 10/20
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0158
Epoch 11/

<keras.src.callbacks.history.History at 0x256018159c0>

## Eval


In [26]:
# Predict
y_pred = (model.predict(X_val_vec) > 0.5).astype(int)

[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 


In [27]:
# Compute metrics
metrics = {
    "precision_micro": precision_score(
        y_val.values, y_pred, average="micro", zero_division=0
    ),
    "recall_micro": recall_score(
        y_val.values, y_pred, average="micro", zero_division=0
    ),
    "f1_micro": f1_score(y_val.values, y_pred, average="micro", zero_division=0),
    "precision_macro": precision_score(
        y_val.values, y_pred, average="macro", zero_division=0
    ),
    "recall_macro": recall_score(
        y_val.values, y_pred, average="macro", zero_division=0
    ),
    "f1_macro": f1_score(y_val.values, y_pred, average="macro", zero_division=0),
}

matrix_metrics = pd.DataFrame.from_dict(metrics, orient="index", columns=["Score"])

In [28]:
matrix_metrics.round(4)

Unnamed: 0,Score
precision_micro,0.5068
recall_micro,0.3704
f1_micro,0.428
precision_macro,0.3409
recall_macro,0.2445
f1_macro,0.277


In [29]:
print(classification_report(y_val, y_pred, target_names=mlb.classes_))

                      precision    recall  f1-score   support

      BRAND#Negative       0.41      0.29      0.34        42
       BRAND#Neutral       0.00      0.00      0.00         2
      BRAND#Positive       0.29      0.20      0.24        20
       COST#Negative       0.52      0.57      0.54        46
        COST#Neutral       0.00      0.00      0.00         4
       COST#Positive       0.34      0.41      0.38        29
   EXTERIOR#Negative       0.57      0.32      0.41        37
    EXTERIOR#Neutral       0.00      0.00      0.00         3
   EXTERIOR#Positive       0.57      0.50      0.53        52
   FEATURES#Negative       0.56      0.17      0.26        30
    FEATURES#Neutral       0.00      0.00      0.00         1
   FEATURES#Positive       0.40      0.38      0.39        21
   INTERIOR#Negative       0.50      0.35      0.41        17
    INTERIOR#Neutral       0.00      0.00      0.00         1
   INTERIOR#Positive       0.62      0.28      0.38        29
PERFORM

## Test


In [30]:
df_test = pd.read_csv("../data/raw/train.csv")
df_test = df_test.iloc[700:705,:]

In [31]:
# Try prediction on some samples
samples = df_test["comment"].tolist()
samples_vec = vec.transform(samples).toarray()

# Use the trained final model for predictions
preds = (model.predict(samples_vec) > 0.5).astype(int)


def decode_labels(pred_row, classes):
    return [cls for cls, val in zip(classes, pred_row) if val == 1]


for i, (text, pred_row) in enumerate(zip(samples, preds)):
    labels = decode_labels(pred_row, y.columns.tolist())
    print(f"Sample {i+1}:")
    print(f"\tText: {text}")
    print(f"\tPredicted labels: {labels}\n")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
Sample 1:
	Text: Xe điện không trạm sạc xài tốt ở miền Nam, chính xác là miền Tây, từ Sài Gòn đến Cần Thơ chỉ 150 km. Trạm sạc 7kw /11kw thì rất nhiều quán cà phê ở các tỉnh miền Tây sẵn sàng lắp đặt cho khách hàng sạc. Dân miền Tây hào phóng, sẵn sàng giúp đỡ người dưng. Chạy xe gần hết điện bí quá tấp vô nhà dân bên đường xin sạc nhờ 1-2 tiếng. Người ta còn không lấy tiền điện nữa. Miền Bắc thì tui không rõ.
	Predicted labels: ['FEATURES#Positive', 'PERFORMANCE#Positive']

Sample 2:
	Text: Điểm sướng nhất của phanh tái sinh khi đổ đèo là giảm sử dụng phanh chân, nếu quen thì gần như không phải dùng phanh chân mấy. Còn thêm được mấy % pin thì được thôi, k quan trọng lắm.
	Predicted labels: ['FEATURES#Positive']

Sample 3:
	Text: Atto3 đi đầm chắc, êm ái, không gian rộng rãi, thật tuyệt vời .
	Predicted labels: ['EXTERIOR#Positive', 'PERFORMANCE#Positive']

Sample 4:
	Text: con này bên ngoài đẹp vãi luôn đấy
	Predi