---

# Lib


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings("ignore")

from sklearn.preprocessing import LabelEncoder
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    make_scorer,
)

from xgboost import XGBClassifier
import optuna

---

# Read file


In [2]:
df_train = pd.read_csv("../data/processed/train.csv", encoding="utf-8")
df_val = pd.read_csv("../data/processed/val.csv", encoding="utf-8")

In [3]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3592 entries, 0 to 3591
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   comment  3592 non-null   object
 1   label    3592 non-null   object
 2   text     3592 non-null   object
dtypes: object(3)
memory usage: 84.3+ KB


In [4]:
df_val.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1040 entries, 0 to 1039
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   comment  1040 non-null   object
 1   label    1040 non-null   object
 2   text     1040 non-null   object
dtypes: object(3)
memory usage: 24.5+ KB


---

# Labels

## Train

In [5]:
matrix_labels_train = df_train["label"]

In [6]:
matrix_labels_train.head()

0    negative
1    negative
2    positive
3    positive
4    negative
Name: label, dtype: object

In [7]:
print(f"Number of labels: {len(matrix_labels_train.unique())}")
print(f"Labels: {matrix_labels_train.unique()}")

Number of labels: 3
Labels: ['negative' 'positive' 'neutral']


## Val

In [8]:
matrix_labels_val = df_val["label"]

In [9]:
matrix_labels_val.head()

0    negative
1    negative
2    positive
3    positive
4    negative
Name: label, dtype: object

In [10]:
print(f"Number of labels: {len(matrix_labels_val.unique())}")
print(f"Labels: {matrix_labels_val.unique()}")

Number of labels: 3
Labels: ['negative' 'positive' 'neutral']


---

# Train, val split


In [11]:
# Train
X_train = df_train[["comment"]]
y_train = matrix_labels_train

# Validation
X_val = df_val[["comment"]]
y_val = matrix_labels_val

In [12]:
# Encode labels
le= LabelEncoder()
y_train = le.fit_transform(y_train).astype(np.int64)

---

# Vectorize

In [13]:
vec = TfidfVectorizer(
    analyzer="char",
    min_df=3,
    max_df=0.95,
    ngram_range=(3, 5),
    max_features=30000,
)

In [14]:
# Fit TF-IDF on training and transform train/test
X_train_vec = vec.fit_transform(X_train["comment"])
X_val_vec = vec.transform(X_val["comment"])

In [16]:
print(f"Train shape: {X_train_vec.shape}")
print(f"Test shape: {X_val_vec.shape}")
print(f"Vocabulary size: {len(vec.get_feature_names_out())}")

Train shape: (3592, 30000)
Test shape: (1040, 30000)
Vocabulary size: 30000


---

# Load Model


---

## Svm

### Model


In [None]:
# SVC
svc = SVC(random_state=42)

In [20]:
# hyperparameters
param_grid = {
    "C": np.linspace(1, 10, 10),
    "kernel": ["linear", "rbf", "poly", "sigmoid"],
    "gamma": ["scale", "auto"],
}

In [21]:
scoring = 'f1_macro'

# Grid search
grid = GridSearchCV(
    svc,
    param_grid=param_grid,
    scoring=scoring,
    cv=5,
    n_jobs=-1,
    verbose=1,
)

In [22]:
# Fit
grid.fit(X_train_vec, y_train)

Fitting 5 folds for each of 80 candidates, totalling 400 fits


0,1,2
,estimator,SVC()
,param_grid,"{'C': array([ 1., ...8., 9., 10.]), 'gamma': ['scale', 'auto'], 'kernel': ['linear', 'rbf', ...]}"
,scoring,'f1_macro'
,n_jobs,-1
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,C,np.float64(4.0)
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [31]:
print("Best params:")
for item in grid.best_params_.items():
    print(f"\t{item[0]}: {item[1]}")
print()
print(f"Best CV score (f1): {grid.best_score_:.2f}")

Best params:
	C: 4.0
	gamma: scale
	kernel: linear

Best CV score (f1): 0.54


### Eval


In [32]:
# Predict
y_pred = grid.predict(X_val_vec)
y_pred = le.inverse_transform(y_pred)

In [33]:
# Compute metrics
metrics = {
    "accuracy_score": accuracy_score(y_val, y_pred),
    "precision_macro": precision_score(y_val, y_pred, average="macro", zero_division=0),
    "recall_macro": recall_score(y_val, y_pred, average="macro", zero_division=0),
    "f1_macro": f1_score(y_val, y_pred, average="macro", zero_division=0),
}

matrix_metrics = pd.DataFrame.from_dict(metrics, orient="index", columns=["score"])

In [34]:
matrix_metrics.round(4)

Unnamed: 0,score
accuracy_score,0.6923
precision_macro,0.5531
recall_macro,0.5474
f1_macro,0.5461


In [35]:
# Classified report
print(classification_report(y_val.values, y_pred, target_names=le.classes_, zero_division=0))

              precision    recall  f1-score   support

    negative       0.67      0.75      0.71       410
     neutral       0.22      0.13      0.17       105
    positive       0.77      0.76      0.77       525

    accuracy                           0.69      1040
   macro avg       0.55      0.55      0.55      1040
weighted avg       0.68      0.69      0.68      1040



### Test


In [38]:
df_test = pd.read_csv("../data/raw/test.csv")
df_test = df_test.iloc[10:15,:]
df_test

Unnamed: 0,brand,model,version,year,segment,comment,label
10,mitsubishi,xforce,ultimate,2024,suv,"nhìn đèn hậu , đầu xe nó kém sang mang phân kh...",negative
11,mg,zs,std,2024,suv,Mình mới lấy ZS hồi tháng 5...tới nay đi chưa ...,negative
12,kia,seltos,facelift,2024,suv,Tự Kia + Thaco làm khó mình =)))) dải sản phẩm...,negative
13,honda,hr-v,g,2025,suv,con xf chả có điểm gì ăn được hrv luôn...ai đã...,positive
14,toyota,vios,g,2025,sedan,Cái đèn xi nhan chán nhỉ,negative


In [39]:
# Try prediction on some samples
samples = df_test["comment"].tolist()
samples_vec = vec.transform(samples)
preds = grid.best_estimator_.predict(samples_vec)
preds = le.inverse_transform(preds)

for i, (text, pred) in enumerate(zip(samples, preds), start=1):
    print(f"Sample {i}:")
    print(f"\tText: {text}")
    print(f"\tPredicted: {pred}\n")

Sample 1:
	Text: nhìn đèn hậu , đầu xe nó kém sang mang phân khúc xe cỏ hơn .. .
	Predicted: negative

Sample 2:
	Text: Mình mới lấy ZS hồi tháng 5...tới nay đi chưa thấy vde gì lớn ngoài khá hao xăng vs mỗi lần bật tắt điều hòa hơi bất tiện
	Predicted: negative

Sample 3:
	Text: Tự Kia + Thaco làm khó mình =)))) dải sản phẩm xe thì con nào cũng ngáo giá, ế toàn phần. Các hãng thì đua nhau giảm sâu nhưng riêng các cụ này thì không, có giảm nhưng ko đáng kể thì bảo sao mất hút hết khách kkkk
	Predicted: negative

Sample 4:
	Text: con xf chả có điểm gì ăn được hrv luôn...ai đã từng lái thử 2 con xe sẽ biết, cảm giác lái con xf chán hơn hẳn hrv... sau khi lái thử cả 2 con xe thì t đã chốt cọc hrv tuần trước
	Predicted: positive

Sample 5:
	Text: Cái đèn xi nhan chán nhỉ
	Predicted: negative



---

## Logistic regression

### Model


In [None]:
log_reg = LogisticRegression(random_state=42, max_iter=1000)

In [237]:
# hyperparameters
param_grid = {
    "C": np.linspace(1, 10, 10),
    "penalty": ["l1", "l2", "elasticnet"],
    "solver": ["lbfgs", "liblinear", "newton-cg", "newton-cholesky", "sag", "saga"],
}

In [None]:
scoring = 'f1'

# Grid search
grid = GridSearchCV(
    log_reg,
    param_grid=param_grid,
    scoring=scoring,
    cv=5,
    n_jobs=-1,
    verbose=0,
)

In [239]:
grid.fit(X_train_vec, y_train)

0,1,2
,estimator,LogisticRegression()
,param_grid,"{'C': array([ 1., ...8., 9., 10.]), 'penalty': ['l1', 'l2', ...], 'solver': ['lbfgs', 'liblinear', ...]}"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,np.float64(3.0)
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'liblinear'
,max_iter,100


In [240]:
print("Best params:")
for item in grid.best_params_.items():
    print(f"\t{item[0]}: {item[1]}")
print()
print(f"Best CV score (f1): {grid.best_score_:.2f}")

Best params:
	C: 3.0
	penalty: l2
	solver: liblinear

Best CV score (f1): 0.70


### Eval


In [241]:
# Predict
y_pred = grid.predict(X_val_vec)
y_pred = le.inverse_transform(y_pred)

In [242]:
# Compute metrics
metrics = {
    "accuracy_score": accuracy_score(y_val, y_pred),
    "precision_macro": precision_score(y_val, y_pred, average="macro", zero_division=0),
    "recall_macro": recall_score(y_val, y_pred, average="macro", zero_division=0),
    "f1_macro": f1_score(y_val, y_pred, average="macro", zero_division=0),
}

matrix_metrics = pd.DataFrame.from_dict(metrics, orient="index", columns=["score"])

In [243]:
matrix_metrics.round(4)

Unnamed: 0,score
accuracy_score,0.7183
precision_macro,0.6264
recall_macro,0.5375
f1_macro,0.5235


In [244]:
# Classified report
print(classification_report(y_val.values, y_pred, target_names=le.classes_, zero_division=0))

              precision    recall  f1-score   support

    negative       0.70      0.73      0.71       410
     neutral       0.44      0.04      0.07       105
    positive       0.74      0.85      0.79       525

    accuracy                           0.72      1040
   macro avg       0.63      0.54      0.52      1040
weighted avg       0.69      0.72      0.69      1040



### Test


In [249]:
df_test = pd.read_csv("../data/raw/test.csv")
df_test = df_test.iloc[150:155,:]
df_test

Unnamed: 0,brand,model,version,year,segment,comment,label
150,skoda,slavia,style,2025,sedan,5-6km trên 100 cây luôn mà,positive
151,mazda,cx-3,1.5 at,2025,suv,Với nhu cầu chạy phố và đéo cần chở ai thì các...,neutral
152,toyota,corolla cross,v,2024,suv,"Không biết chê gì luôn, mọi thứ đều cân bằng, ...",positive
153,toyota,vios,e,2024,sedan,Vô lăng đánh lái hơi nặng bác ơi,negative
154,honda,city,1.5 rs,2024,sedan,mỗi cái chưa có cảm biến áp xuất lốp,negative


In [250]:
# Try prediction on some samples
samples = df_test["comment"].tolist()
samples_vec = vec.transform(samples)
preds = grid.best_estimator_.predict(samples_vec)
preds = le.inverse_transform(preds)

for i, (text, pred) in enumerate(zip(samples, preds), start=1):
    print(f"Sample {i}:")
    print(f"\tText: {text}")
    print(f"\tPredicted: {pred}\n")

Sample 1:
	Text: 5-6km trên 100 cây luôn mà
	Predicted: positive

Sample 2:
	Text: Với nhu cầu chạy phố và đéo cần chở ai thì các ô cmt hàng sau nên “nín”, các ô nên nhớ mua xe theo nhu cầu chứ ko phải mua xe phải thoat mãn cmt tiêu chuẩn của số đông =))
	Predicted: positive

Sample 3:
	Text: Không biết chê gì luôn, mọi thứ đều cân bằng, hệ thống treo ổn, cách âm ổn. Quá phù hợp trong tầm tiền, mà lại là Toy
	Predicted: positive

Sample 4:
	Text: Vô lăng đánh lái hơi nặng bác ơi
	Predicted: negative

Sample 5:
	Text: mỗi cái chưa có cảm biến áp xuất lốp
	Predicted: negative



---

## Xgboost

### Model


In [None]:
def objective(trial):

    params = {
        "objective": "multi:softprob",
        "num_class": 3,
        "eval_metric": "mlogloss",
        "tree_method": "hist",
        "random_state": 42,

        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 200, 800),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 1.0),
    }

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    f1_scores = []

    for train_idx, val_idx in skf.split(X_train_vec, y_train):
        X_tr, X_val = X_train_vec[train_idx], X_train_vec[val_idx]
        y_tr, y_val = y_train[train_idx], y_train[val_idx]

        model = XGBClassifier(**params)
        model.fit(X_tr, y_tr)

        y_pred = model.predict(X_val)
        f1 = f1_score(y_val, y_pred, average="macro")
        f1_scores.append(f1)

    return np.mean(f1_scores)

In [252]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10)

[I 2025-12-17 18:09:00,676] A new study created in memory with name: no-name-f81d4a4e-a00a-4c13-a61f-b43ee85137e5


[I 2025-12-17 18:15:16,094] Trial 0 finished with value: 0.4775998070633166 and parameters: {'max_depth': 7, 'learning_rate': 0.024738495464990965, 'n_estimators': 257, 'subsample': 0.664088616340235, 'colsample_bytree': 0.8053836577353533, 'min_child_weight': 7, 'gamma': 0.13917917111205325, 'reg_alpha': 2.4701019033804386, 'reg_lambda': 4.062743864858036}. Best is trial 0 with value: 0.4775998070633166.
[I 2025-12-17 18:19:11,426] Trial 1 finished with value: 0.5072955636700508 and parameters: {'max_depth': 3, 'learning_rate': 0.07096230041036487, 'n_estimators': 361, 'subsample': 0.8475696807517631, 'colsample_bytree': 0.6924092433546372, 'min_child_weight': 6, 'gamma': 0.2510625946051914, 'reg_alpha': 2.346607100760388, 'reg_lambda': 2.334644907221353}. Best is trial 1 with value: 0.5072955636700508.
[I 2025-12-17 18:23:15,179] Trial 2 finished with value: 0.49260199434142543 and parameters: {'max_depth': 5, 'learning_rate': 0.049418470041977955, 'n_estimators': 406, 'subsample': 0

In [254]:
print("Best F1-macro:", study.best_value)
print("Best trial:", study.best_trial.number)

Best F1-macro: 0.5314066426930667
Best trial: 6


In [255]:
best_params = study.best_params

print("Best params:")
for k, v in best_params.items():
    print(f"  {k}: {v}")

Best params:
  max_depth: 5
  learning_rate: 0.10626486740244691
  n_estimators: 421
  subsample: 0.8652682861346357
  colsample_bytree: 0.693211299714547
  min_child_weight: 7
  gamma: 0.7929982231128929
  reg_alpha: 1.0767350295379696
  reg_lambda: 3.287259555024159


In [256]:
# Build final model with best hyperparameters
best_params.update(
    {
        "objective": "multi:softprob",
        "num_class": 3,
        "eval_metric": "mlogloss",
        "tree_method": "hist",
        "random_state": 42,
    }
)

xgb_model = XGBClassifier(**best_params)

In [258]:
xgb_model.fit(X_train_vec, y_train)

0,1,2
,objective,'multi:softprob'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.693211299714547
,device,
,early_stopping_rounds,
,enable_categorical,False


### Eval


In [259]:
y_pred = xgb_model.predict(X_val_vec)
y_pred = le.inverse_transform(y_pred)

In [260]:
# Compute metrics
metrics = {
    "accuracy_score": accuracy_score(y_val, y_pred),
    "precision_macro": precision_score(y_val, y_pred, average="macro", zero_division=0),
    "recall_macro": recall_score(y_val, y_pred, average="macro", zero_division=0),
    "f1_macro": f1_score(y_val, y_pred, average="macro", zero_division=0),
}

matrix_metrics = pd.DataFrame.from_dict(metrics, orient="index", columns=["score"])

In [261]:
matrix_metrics.round(4)

Unnamed: 0,score
accuracy_score,0.6731
precision_macro,0.5529
recall_macro,0.5201
f1_macro,0.5189


In [262]:
# Classified report
print(classification_report(y_val, y_pred, target_names=le.classes_, zero_division=0))

              precision    recall  f1-score   support

    negative       0.66      0.69      0.67       410
     neutral       0.29      0.10      0.14       105
    positive       0.71      0.78      0.74       525

    accuracy                           0.67      1040
   macro avg       0.55      0.52      0.52      1040
weighted avg       0.65      0.67      0.65      1040



### Test


In [263]:
df_test = pd.read_csv("../data/raw/test.csv")
df_test = df_test.iloc[100:105,:]
df_test

Unnamed: 0,brand,model,version,year,segment,comment,label
100,byd,sealion 6,,2025,suv,Hệ thống treo chưa được tốt lắm.,negative
101,toyota,corolla cross,v,2024,suv,Cửa sổ trời toàn cảnh lại là nhược điểm. Cửa s...,negative
102,geely,coolray,flagship,2025,suv,nếu geely thật sự vào thì có thể mua chiếc để ...,positive
103,toyota,vios,,2025,sedan,Hút máu khách hàng mang tiền về cho chủ,negative
104,mitsubishi,xforce,ultimate,2024,suv,cái gì cũng đẹp mỗi tội là bê nguyên 1 cái bàn...,neutral


In [264]:
# Try prediction on some samples
samples = df_test["comment"].tolist()
samples_vec = vec.transform(samples)
preds = grid.best_estimator_.predict(samples_vec)
preds = le.inverse_transform(preds)

for i, (text, pred) in enumerate(zip(samples, preds), start=1):
    print(f"Sample {i}:")
    print(f"\tText: {text}")
    print(f"\tPredicted: {pred}\n")

Sample 1:
	Text: Hệ thống treo chưa được tốt lắm.
	Predicted: negative

Sample 2:
	Text: Cửa sổ trời toàn cảnh lại là nhược điểm. Cửa sổ trời nhỏ thôi như Cx5 lại ngon. Mặt ca lăng mới không đẹp như bản cũ. Ý kiến cá nhân
	Predicted: negative

Sample 3:
	Text: nếu geely thật sự vào thì có thể mua chiếc để dùng rồi
	Predicted: negative

Sample 4:
	Text: Hút máu khách hàng mang tiền về cho chủ
	Predicted: positive

Sample 5:
	Text: cái gì cũng đẹp mỗi tội là bê nguyên 1 cái bàn thờ đặt lên xe thì nhìn xấu quá thể
	Predicted: negative

