In [50]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings("ignore")

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    make_scorer,
)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

import optuna

---

# Read file


In [51]:
df_train = pd.read_csv("../data/processed/train.csv", encoding="utf-8")
df_val = pd.read_csv("../data/processed/val.csv", encoding="utf-8")

In [52]:
df_train.head()

Unnamed: 0,text,label
0,vf6 xe gia_đình ok việt_nam chủ_yếu mua xe gia...,negative
1,xe_điện ồn xe xăng sai sai,negative
2,form đẹp ác,positive
3,thể_thao đấy,positive
4,xe vinfast chạy dịch_vụ phổ_biến sạc miễn_phí ...,negative


In [53]:
df_val.head()

Unnamed: 0,text,label
0,đại_lý dường_như đóng_cửa rào_cảm dân ta dè mu...,negative
1,base trần_truồng vòng tua_máy 48 x tiệm cận xe...,negative
2,cross chạy ngon,positive
3,khoang lái bố_cục đẹp,positive
4,đầu xe xấu,negative


---

# Labels

## Train

In [54]:
matrix_labels_train = df_train["label"]

In [55]:
matrix_labels_train.head()

0    negative
1    negative
2    positive
3    positive
4    negative
Name: label, dtype: object

In [56]:
print(f"Number of labels: {len(matrix_labels_train.unique())}")
print(f"Labels: {matrix_labels_train.unique()}")

Number of labels: 3
Labels: ['negative' 'positive' 'neutral']


## Val

In [57]:
matrix_labels_val = df_val["label"]

In [58]:
matrix_labels_val.head()

0    negative
1    negative
2    positive
3    positive
4    negative
Name: label, dtype: object

In [59]:
print(f"Number of labels: {len(matrix_labels_val.unique())}")
print(f"Labels: {matrix_labels_val.unique()}")

Number of labels: 3
Labels: ['negative' 'positive' 'neutral']


---

# Train, val split


In [60]:
# Train
X_train = df_train[["text"]]
y_train = matrix_labels_train

# Validation
X_val = df_val[["text"]]
y_val = matrix_labels_val

In [61]:
# Encode labels
le= LabelEncoder()
y_train = le.fit_transform(y_train).astype(np.int64)
y_val = le.transform(y_val).astype(np.int64)

---

# Vectorize


In [62]:
vec = TfidfVectorizer(
    analyzer="char",
    min_df=3,
    max_df=0.95,
    ngram_range=(3, 5),
    sublinear_tf=True,
    max_features=30000,
)

In [63]:
# Fit TF-IDF on training and transform train/test
X_train_vec = vec.fit_transform(X_train["text"])
X_val_vec = vec.transform(X_val["text"])

In [64]:
# To dense arrays
X_train_vec = X_train_vec.toarray().astype(np.float32)
X_val_vec = X_val_vec.toarray().astype(np.float32)

In [65]:
print(f"Train shape: {X_train_vec.shape}")
print(f"Test shape: {X_val_vec.shape}")
print(f"Vocabulary size: {len(vec.get_feature_names_out())}")

Train shape: (3593, 28464)
Test shape: (1040, 28464)
Vocabulary size: 28464


In [66]:
print("Number of classes (train): ", len(le.classes_))
print("Number of classes (val): ", len(le.classes_))

Number of classes (train):  3
Number of classes (val):  3


In [67]:
n_features = X_train_vec.shape[1]
n_classes = len(le.classes_)

---

# FNN


## Model


In [68]:
# Build model function
def build_model(input_dim, output_dim, params):
    model = Sequential()

    model.add(Dense(params["hidden1"], activation="relu", input_shape=(input_dim,)))
    model.add(Dropout(params["dropout1"]))

    if params["n_layers"] == 2:
        model.add(Dense(params["hidden2"], activation="relu"))
        model.add(Dropout(params["dropout2"]))

    # Single-label multiclass: softmax + sparse categorical crossentropy
    model.add(Dense(output_dim, activation="softmax"))

    model.compile(
        optimizer=Adam(learning_rate=params["lr"]),
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )

    return model

In [69]:
# Objective function for Optuna
def objective(trial):

    params = {
        "n_layers": trial.suggest_int("n_layers", 1, 2),
        "hidden1": trial.suggest_categorical("hidden1", [256, 512, 768]),
        "hidden2": trial.suggest_categorical("hidden2", [128, 256]),
        "dropout1": trial.suggest_float("dropout1", 0.3, 0.6),
        "dropout2": trial.suggest_float("dropout2", 0.2, 0.5),
        "lr": trial.suggest_float("lr", 1e-4, 5e-3, log=True),
        "batch_size": trial.suggest_categorical("batch_size", [8, 16, 32, 64]),
    }

    model = build_model(input_dim=n_features, output_dim=n_classes, params=params)

    early_stop = EarlyStopping(
        monitor="val_loss", patience=3, restore_best_weights=True
    )

    # Use precomputed numeric vectors for training/validation
    model.fit(
        X_train_vec,
        y_train,
        validation_data=(X_val_vec, y_val),
        epochs=10,
        batch_size=params["batch_size"],
        callbacks=[early_stop],
        verbose=0,
    )

    y_val_prob = model.predict(X_val_vec)
    y_val_pred = np.argmax(y_val_prob, axis=1)

    f1 = f1_score(y_val, y_val_pred, average="macro")

    return f1

In [70]:
# Optuna study
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10)

[I 2025-12-17 15:56:28,483] A new study created in memory with name: no-name-87ff2add-90c0-4a92-85bc-381f590a1c14


[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step


[I 2025-12-17 15:57:49,228] Trial 0 finished with value: 0.47435553698842536 and parameters: {'n_layers': 2, 'hidden1': 768, 'hidden2': 256, 'dropout1': 0.4533452534345658, 'dropout2': 0.4802488627705372, 'lr': 0.0002224999174163321, 'batch_size': 32}. Best is trial 0 with value: 0.47435553698842536.


[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step


[I 2025-12-17 15:59:20,158] Trial 1 finished with value: 0.47018680867810464 and parameters: {'n_layers': 2, 'hidden1': 256, 'hidden2': 256, 'dropout1': 0.4299660083876867, 'dropout2': 0.2381217076017978, 'lr': 0.0005552925302748689, 'batch_size': 8}. Best is trial 0 with value: 0.47435553698842536.


[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step


[I 2025-12-17 15:59:57,909] Trial 2 finished with value: 0.48567758554791335 and parameters: {'n_layers': 1, 'hidden1': 256, 'hidden2': 128, 'dropout1': 0.3215464052345593, 'dropout2': 0.31867535636587146, 'lr': 0.0003107196614361961, 'batch_size': 32}. Best is trial 2 with value: 0.48567758554791335.


[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step


[I 2025-12-17 16:01:19,409] Trial 3 finished with value: 0.48285878658429954 and parameters: {'n_layers': 1, 'hidden1': 512, 'hidden2': 128, 'dropout1': 0.3629106415625857, 'dropout2': 0.29623759603389527, 'lr': 0.0020702135071893417, 'batch_size': 16}. Best is trial 2 with value: 0.48567758554791335.


[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step


[I 2025-12-17 16:01:31,551] Trial 4 finished with value: 0.47971287881684654 and parameters: {'n_layers': 2, 'hidden1': 256, 'hidden2': 128, 'dropout1': 0.4666012556615381, 'dropout2': 0.2784187957247404, 'lr': 0.0016683220730975483, 'batch_size': 64}. Best is trial 2 with value: 0.48567758554791335.


[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step


[I 2025-12-17 16:02:22,992] Trial 5 finished with value: 0.4891066639840074 and parameters: {'n_layers': 1, 'hidden1': 512, 'hidden2': 128, 'dropout1': 0.315962304834481, 'dropout2': 0.4267260985406795, 'lr': 0.00033627140005213743, 'batch_size': 32}. Best is trial 5 with value: 0.4891066639840074.


[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step


[I 2025-12-17 16:08:13,415] Trial 6 finished with value: 0.48054048321574955 and parameters: {'n_layers': 1, 'hidden1': 768, 'hidden2': 256, 'dropout1': 0.5544219970099996, 'dropout2': 0.3605257210492224, 'lr': 0.0001267341818150112, 'batch_size': 8}. Best is trial 5 with value: 0.4891066639840074.


[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step


[I 2025-12-17 16:10:20,883] Trial 7 finished with value: 0.4841619242670965 and parameters: {'n_layers': 2, 'hidden1': 768, 'hidden2': 256, 'dropout1': 0.4091308538210318, 'dropout2': 0.29428531084160403, 'lr': 0.0006031422455147199, 'batch_size': 16}. Best is trial 5 with value: 0.4891066639840074.


[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step


[I 2025-12-17 16:10:59,256] Trial 8 finished with value: 0.48251971308073677 and parameters: {'n_layers': 2, 'hidden1': 512, 'hidden2': 256, 'dropout1': 0.3344845376899689, 'dropout2': 0.2837931892404266, 'lr': 0.000154349011025825, 'batch_size': 64}. Best is trial 5 with value: 0.4891066639840074.


[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step


[I 2025-12-17 16:11:44,348] Trial 9 finished with value: 0.4774964099331022 and parameters: {'n_layers': 1, 'hidden1': 512, 'hidden2': 128, 'dropout1': 0.5084606605559054, 'dropout2': 0.3153202833023761, 'lr': 0.0015308205441287462, 'batch_size': 32}. Best is trial 5 with value: 0.4891066639840074.


In [71]:
print("Best F1-macro:", study.best_value)
print("Best trial:", study.best_trial.number)

Best F1-macro: 0.4891066639840074
Best trial: 5


In [72]:
best_params = study.best_params

print("Best params:")
for k, v in best_params.items():
    print(f"  {k}: {v}")

Best params:
  n_layers: 1
  hidden1: 512
  hidden2: 128
  dropout1: 0.315962304834481
  dropout2: 0.4267260985406795
  lr: 0.00033627140005213743
  batch_size: 32


In [73]:
# Build final model with best hyperparameters
model = build_model(n_features, n_classes, best_params)

In [74]:
# Fit final model
model.fit(
    X_train_vec, y_train, epochs=10, batch_size=best_params["batch_size"], verbose=1
)

Epoch 1/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 94ms/step - accuracy: 0.5578 - loss: 0.9671
Epoch 2/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 97ms/step - accuracy: 0.7598 - loss: 0.7085
Epoch 3/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 91ms/step - accuracy: 0.8163 - loss: 0.4955
Epoch 4/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 93ms/step - accuracy: 0.8945 - loss: 0.3379
Epoch 5/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 93ms/step - accuracy: 0.9438 - loss: 0.2262
Epoch 6/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 90ms/step - accuracy: 0.9738 - loss: 0.1511
Epoch 7/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 89ms/step - accuracy: 0.9889 - loss: 0.1015
Epoch 8/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 94ms/step - accuracy: 0.9928 - loss: 0.0703
Epoch 9/10
[1m113/113[

<keras.src.callbacks.history.History at 0x1fd928e01c0>

## Eval


In [75]:
# Predict
y_prob = model.predict(X_val_vec)
y_pred = np.argmax(y_prob, axis=1)

[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step


In [76]:
# Compute metrics
metrics = {
    "accuracy_score": accuracy_score(y_val, y_pred),
    "precision_macro": precision_score(y_val, y_pred, average="macro", zero_division=0),
    "recall_macro": recall_score(y_val, y_pred, average="macro", zero_division=0),
    "f1_macro": f1_score(y_val, y_pred, average="macro", zero_division=0),
}

matrix_metrics = pd.DataFrame.from_dict(metrics, orient="index", columns=["score"])

In [77]:
matrix_metrics.round(4)

Unnamed: 0,score
accuracy_score,0.6625
precision_macro,0.5318
recall_macro,0.5174
f1_macro,0.5173


In [78]:
print(classification_report(y_val, y_pred, target_names=le.classes_, zero_division=0))

              precision    recall  f1-score   support

    negative       0.65      0.68      0.66       410
     neutral       0.24      0.11      0.15       105
    positive       0.72      0.76      0.74       525

    accuracy                           0.66      1040
   macro avg       0.53      0.52      0.52      1040
weighted avg       0.64      0.66      0.65      1040



## Test


In [79]:
df_test = pd.read_csv("../data/raw/val.csv")
df_test = df_test.iloc[-6:-1,:]
df_test

Unnamed: 0,brand,model,version,year,segment,comment,label
1034,huyndai,accent,,2024,sedan,Xe này xấu hơn phom 2023,negative
1035,skoda,kushaq,style,2025,suv,"Ghế lái chỉnh điện, tiện lợi.",positive
1036,honda,city,1.5 rs,2024,sedan,"City là nhất rồi, từ mẫu mã đẹp, vận hành mạnh...",positive
1037,vinfast,vf 6,,2024,suv,Nhờ tipcar chuyển lời hộ đến đội ngũ Vinfast r...,negative
1038,nissan,almera,,2024,sedan,"sai lầm của nissan khi xuống 3 máy , đi tầm 1-...",negative


In [80]:
# Try prediction on some samples
samples = df_test["comment"].tolist()
samples_vec = vec.transform(samples).toarray()

# Use the trained final model for predictions
probs = model.predict(samples_vec)
preds = np.argmax(probs, axis=1)

for i, (text, pred_idx) in enumerate(zip(samples, preds)):
    label_name = le.inverse_transform([pred_idx])[0]
    confidence = probs[i][pred_idx]
    print(f"Sample {i+1}:")
    print(f"\tText: {text}")
    print(f"\tPredicted label: {label_name}")
    print(f"\tConfidence: {confidence:.4f}\n")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
Sample 1:
	Text: Xe này xấu hơn phom 2023
	Predicted label: negative
	Confidence: 0.7272

Sample 2:
	Text: Ghế lái chỉnh điện, tiện lợi.
	Predicted label: positive
	Confidence: 0.9951

Sample 3:
	Text: City là nhất rồi, từ mẫu mã đẹp, vận hành mạnh mẽ, cảm giác lái tốt, tính năng ok. Hơn hẳn so với vios và accent
	Predicted label: positive
	Confidence: 0.9912

Sample 4:
	Text: Nhờ tipcar chuyển lời hộ đến đội ngũ Vinfast rằng nên để màn hình ở vị trí trung tâm, quay thẳng theo dọc xe, ko nên để màn hình nghiêng về người lái, nhìn ko cân đối, gây xấu tổng thể cái xe. Mấy cái xe điện đều để vậy xấu, ko đẹp bằng mấy cái xe xăng của Vin, hoặc 1 số xe hãng khác. Mong vin thay đổi để phát triển tốt hơn.
	Predicted label: negative
	Confidence: 0.9972

Sample 5:
	Text: sai lầm của nissan khi xuống 3 máy , đi tầm 1-2 năm máy i3 sẽ cho thấy rõ nhược điểm của nó
	Predicted label: negative
	Confidence: 0.9937

