In [33]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings("ignore")

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    make_scorer,
)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

import optuna

---

# Read file


In [34]:
df_train = pd.read_csv("../data/processed/train.csv", encoding="utf-8")
df_val = pd.read_csv("../data/processed/val.csv", encoding="utf-8")

In [35]:
df_train.head()

Unnamed: 0,comment,label,text
0,vf6 xe cho cá_nhân dùng chứ gia_đình dùng thì ...,negative,vf6 xe gia_đình ok việt_nam chủ_yếu mua xe gia...
1,xe_điện mà ồn hơn xe xăng có gì đó sai sai,negative,xe_điện ồn xe xăng sai sai
2,form này đẹp ác,positive,form đẹp ác
3,tôi thấy nó thể_thao đấy chứ,positive,thể_thao đấy
4,hiện_tại xe vinfast chạy dịch_vụ khá phổ_biến ...,negative,xe vinfast chạy dịch_vụ phổ_biến sạc miễn_phí ...


In [36]:
df_val.head()

Unnamed: 0,comment,label,text
0,đại_lý quá ít và dường_như sắp đóng_cửa là rào...,negative,đại_lý dường_như đóng_cửa rào_cảm dân ta dè mu...
1,bản base con này khá trần_truồng đến vòng tua_...,negative,base trần_truồng vòng tua_máy 48 x tiệm cận xe...
2,cross chạy ngon hơn,positive,cross chạy ngon
3,khoang lái bố_cục nhìn đẹp,positive,khoang lái bố_cục đẹp
4,đầu xe xấu quá,negative,đầu xe xấu


---

# Labels

## Train

In [37]:
matrix_labels_train = df_train["label"]

In [38]:
matrix_labels_train.head()

0    negative
1    negative
2    positive
3    positive
4    negative
Name: label, dtype: object

In [39]:
print(f"Number of labels: {len(matrix_labels_train.unique())}")
print(f"Labels: {matrix_labels_train.unique()}")

Number of labels: 3
Labels: ['negative' 'positive' 'neutral']


## Val

In [40]:
matrix_labels_val = df_val["label"]

In [41]:
matrix_labels_val.head()

0    negative
1    negative
2    positive
3    positive
4    negative
Name: label, dtype: object

In [42]:
print(f"Number of labels: {len(matrix_labels_val.unique())}")
print(f"Labels: {matrix_labels_val.unique()}")

Number of labels: 3
Labels: ['negative' 'positive' 'neutral']


---

# Train, val split


In [None]:
# Train
X_train = df_train[["comment"]]
y_train = matrix_labels_train

# Validation
X_val = df_val[["comment"]]
y_val = matrix_labels_val

In [44]:
# Encode labels
le= LabelEncoder()
y_train = le.fit_transform(y_train).astype(np.int64)
y_val = le.transform(y_val).astype(np.int64)

---

# Vectorize


In [45]:
vec = TfidfVectorizer(
    analyzer="char",
    min_df=3,
    max_df=0.95,
    ngram_range=(3, 5),
)

In [None]:
# Fit TF-IDF on training and transform train/test
X_train_vec = vec.fit_transform(X_train["comment"])
X_val_vec = vec.transform(X_val["comment"])

In [47]:
# To dense arrays
X_train_vec = X_train_vec.toarray().astype(np.float32)
X_val_vec = X_val_vec.toarray().astype(np.float32)

In [48]:
print(f"Train shape: {X_train_vec.shape}")
print(f"Test shape: {X_val_vec.shape}")
print(f"Vocabulary size: {len(vec.get_feature_names_out())}")

Train shape: (3592, 28463)
Test shape: (1040, 28463)
Vocabulary size: 28463


In [49]:
print("Number of classes (train): ", len(le.classes_))
print("Number of classes (val): ", len(le.classes_))

Number of classes (train):  3
Number of classes (val):  3


In [50]:
n_features = X_train_vec.shape[1]
n_classes = len(le.classes_)

---

# FNN


## Model


In [51]:
# Build model function
def build_model(input_dim, output_dim, params):
    model = Sequential()

    model.add(Dense(params["hidden1"], activation="relu", input_shape=(input_dim,)))
    model.add(Dropout(params["dropout1"]))

    if params["n_layers"] == 2:
        model.add(Dense(params["hidden2"], activation="relu"))
        model.add(Dropout(params["dropout2"]))

    # Single-label multiclass: softmax + sparse categorical crossentropy
    model.add(Dense(output_dim, activation="softmax"))

    model.compile(
        optimizer=Adam(learning_rate=params["lr"]),
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )

    return model

In [52]:
# Objective function for Optuna
def objective(trial):

    params = {
        "n_layers": trial.suggest_int("n_layers", 1, 2),
        "hidden1": trial.suggest_categorical("hidden1", [256, 512, 768]),
        "hidden2": trial.suggest_categorical("hidden2", [128, 256]),
        "dropout1": trial.suggest_float("dropout1", 0.3, 0.6),
        "dropout2": trial.suggest_float("dropout2", 0.2, 0.5),
        "lr": trial.suggest_float("lr", 1e-4, 5e-3, log=True),
        "batch_size": trial.suggest_categorical("batch_size", [8, 16, 32, 64]),
    }

    model = build_model(input_dim=n_features, output_dim=n_classes, params=params)

    early_stop = EarlyStopping(
        monitor="val_loss", patience=3, restore_best_weights=True
    )

    # Use precomputed numeric vectors for training/validation
    model.fit(
        X_train_vec,
        y_train,
        validation_data=(X_val_vec, y_val),
        epochs=20,
        batch_size=params["batch_size"],
        callbacks=[early_stop],
        verbose=0,
    )

    y_val_prob = model.predict(X_val_vec)
    y_val_pred = np.argmax(y_val_prob, axis=1)

    f1 = f1_score(y_val, y_val_pred, average="macro")

    return f1

In [None]:
# Optuna study
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10)

In [None]:
print("Best f1:", study.best_value)
print("Best trial:", study.best_trial.number)

Best F1-macro: 0.4971091475575549
Best trial: 1


In [None]:
best_params = study.best_params

print("Best params:")
for k, v in best_params.items():
    print(f"  {k}: {v}")

Best params:
  n_layers: 1
  hidden1: 256
  hidden2: 256
  dropout1: 0.5172951825972278
  dropout2: 0.3833761552479368
  lr: 0.0021434166412957903
  batch_size: 32


In [None]:
# Build final model with best hyperparameters
model = build_model(n_features, n_classes, best_params)

In [None]:
# Fit final model
model.fit(
    X_train_vec, y_train, epochs=20, batch_size=best_params["batch_size"], verbose=1
)

Epoch 1/20
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.5749 - loss: 0.9171
Epoch 2/20
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7280 - loss: 0.6740
Epoch 3/20
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.7851 - loss: 0.5176
Epoch 4/20
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8491 - loss: 0.3975
Epoch 5/20
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8953 - loss: 0.2963
Epoch 6/20
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9234 - loss: 0.2261
Epoch 7/20
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9432 - loss: 0.1770
Epoch 8/20
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9621 - loss: 0.1381
Epoch 9/20
[1m113/113[0m [32m━━━━━━━━

<keras.src.callbacks.history.History at 0x21fba177640>

## Eval


In [None]:
# Predict
y_prob = model.predict(X_val_vec)
y_pred = np.argmax(y_prob, axis=1)

[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 


In [None]:
# Compute metrics
metrics = {
    "accuracy_score": accuracy_score(y_val, y_pred),
    "precision_macro": precision_score(y_val, y_pred, average="macro", zero_division=0),
    "recall_macro": recall_score(y_val, y_pred, average="macro", zero_division=0),
    "f1_macro": f1_score(y_val, y_pred, average="macro", zero_division=0),
}

matrix_metrics = pd.DataFrame.from_dict(metrics, orient="index", columns=["score"])

In [None]:
matrix_metrics.round(4)

Unnamed: 0,score
accuracy_score,0.6404
precision_macro,0.5078
recall_macro,0.5081
f1_macro,0.5072


In [None]:
print(classification_report(y_val, y_pred, target_names=le.classes_, zero_division=0))

              precision    recall  f1-score   support

    negative       0.64      0.68      0.66       410
     neutral       0.16      0.13      0.15       105
    positive       0.72      0.71      0.71       525

    accuracy                           0.64      1040
   macro avg       0.51      0.51      0.51      1040
weighted avg       0.63      0.64      0.64      1040



## Test


In [None]:
df_test = pd.read_csv("../data/raw/val.csv")
df_test = df_test.iloc[-6:-1,:]
df_test

Unnamed: 0,brand,model,version,year,segment,comment,label
1034,huyndai,accent,,2024,sedan,Xe này xấu hơn phom 2023,negative
1035,skoda,kushaq,style,2025,suv,"Ghế lái chỉnh điện, tiện lợi.",positive
1036,honda,city,1.5 rs,2024,sedan,"City là nhất rồi, từ mẫu mã đẹp, vận hành mạnh...",positive
1037,vinfast,vf 6,,2024,suv,Nhờ tipcar chuyển lời hộ đến đội ngũ Vinfast r...,negative
1038,nissan,almera,,2024,sedan,"sai lầm của nissan khi xuống 3 máy , đi tầm 1-...",negative


In [None]:
# Try prediction on some samples
samples = df_test["comment"].tolist()
samples_vec = vec.transform(samples).toarray()

# Use the trained final model for predictions
probs = model.predict(samples_vec)
preds = np.argmax(probs, axis=1)

for i, (text, pred_idx) in enumerate(zip(samples, preds)):
    label_name = le.inverse_transform([pred_idx])[0]
    confidence = probs[i][pred_idx]
    print(f"Sample {i+1}:")
    print(f"\tText: {text}")
    print(f"\tPredicted label: {label_name}")
    print(f"\tConfidence: {confidence:.4f}\n")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
Sample 1:
	Text: Xe này xấu hơn phom 2023
	Predicted label: negative
	Confidence: 0.9539

Sample 2:
	Text: Ghế lái chỉnh điện, tiện lợi.
	Predicted label: neutral
	Confidence: 0.5907

Sample 3:
	Text: City là nhất rồi, từ mẫu mã đẹp, vận hành mạnh mẽ, cảm giác lái tốt, tính năng ok. Hơn hẳn so với vios và accent
	Predicted label: positive
	Confidence: 0.6123

Sample 4:
	Text: Nhờ tipcar chuyển lời hộ đến đội ngũ Vinfast rằng nên để màn hình ở vị trí trung tâm, quay thẳng theo dọc xe, ko nên để màn hình nghiêng về người lái, nhìn ko cân đối, gây xấu tổng thể cái xe. Mấy cái xe điện đều để vậy xấu, ko đẹp bằng mấy cái xe xăng của Vin, hoặc 1 số xe hãng khác. Mong vin thay đổi để phát triển tốt hơn.
	Predicted label: negative
	Confidence: 0.9395

Sample 5:
	Text: sai lầm của nissan khi xuống 3 máy , đi tầm 1-2 năm máy i3 sẽ cho thấy rõ nhược điểm của nó
	Predicted label: negative
	Confidence: 0.7894

