---

# Lib


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings("ignore")

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    make_scorer,
)

from xgboost import XGBClassifier

from utils.other import parse_label, matrix_labels

---

# Read file


In [2]:
df_train = pd.read_csv("../data/processed/train.csv", encoding="utf-8")
df_val = pd.read_csv("../data/processed/val.csv", encoding="utf-8")

In [3]:
df_train.head()

Unnamed: 0,comment,label
0,đuôi dạng coupe đẹp hẳn,{EXTERIOR#Positive};
1,đèn xấu,{EXTERIOR#Negative};
2,yc xăng nội_thất ok xforce chạy ga êm ồn xforc...,{EXTERIOR#Positive};{PERFORMANCE#Negative};{IN...
3,đi hài_lòng bốc ngon âm_rẻ tiết_kiệm xăng_lít ...,{PERFORMANCE#Positive};{COST#Positive};
4,bệ tì_tay màn_hình kết khai đồ trung_nhập indo,{INTERIOR#Positive};


In [4]:
df_val.head()

Unnamed: 0,comment,label
0,mông ok đấy,{EXTERIOR#Positive};
1,đi thử độ êm_khung gầm yc ngon tăng_tốc êm_mượ...,{PERFORMANCE#Positive};
2,chê trung_quốc đi xe trung_quốc xe trung_quốc ...,{BRAND#Positive};
3,định mua tết đồ_đạc thay_thế,{BRAND#Negative};
4,xe 500 t cặp đèn_pha led trăm củ,{COST#Negative};


---

# Labels

## Train

In [5]:
matrix_labels_train , mlb_train = matrix_labels(df_train[["label"]])

In [6]:
matrix_labels_train.head()

Unnamed: 0,BRAND#Negative,BRAND#Neutral,BRAND#Positive,COST#Negative,COST#Neutral,COST#Positive,EXTERIOR#Negative,EXTERIOR#Neutral,EXTERIOR#Positive,FEATURES#Negative,FEATURES#Neutral,FEATURES#Positive,INTERIOR#Negative,INTERIOR#Neutral,INTERIOR#Positive,PERFORMANCE#Negative,PERFORMANCE#Neutral,PERFORMANCE#Positive
0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,1,0,0
3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [7]:
print(f"Number of labels: {len(mlb_train.classes_)}")
print(f"Labels: {mlb_train.classes_}")

Number of labels: 18
Labels: ['BRAND#Negative' 'BRAND#Neutral' 'BRAND#Positive' 'COST#Negative'
 'COST#Neutral' 'COST#Positive' 'EXTERIOR#Negative' 'EXTERIOR#Neutral'
 'EXTERIOR#Positive' 'FEATURES#Negative' 'FEATURES#Neutral'
 'FEATURES#Positive' 'INTERIOR#Negative' 'INTERIOR#Neutral'
 'INTERIOR#Positive' 'PERFORMANCE#Negative' 'PERFORMANCE#Neutral'
 'PERFORMANCE#Positive']


## Val

In [8]:
matrix_labels_val , mlb_val = matrix_labels(df_val[["label"]])

In [9]:
matrix_labels_val.head()

Unnamed: 0,BRAND#Negative,BRAND#Neutral,BRAND#Positive,COST#Negative,COST#Neutral,COST#Positive,EXTERIOR#Negative,EXTERIOR#Neutral,EXTERIOR#Positive,FEATURES#Negative,FEATURES#Neutral,FEATURES#Positive,INTERIOR#Negative,INTERIOR#Neutral,INTERIOR#Positive,PERFORMANCE#Negative,PERFORMANCE#Neutral,PERFORMANCE#Positive
0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [10]:
print(f"Number of labels: {len(mlb_val.classes_)}")
print(f"Labels: {mlb_val.classes_}")

Number of labels: 18
Labels: ['BRAND#Negative' 'BRAND#Neutral' 'BRAND#Positive' 'COST#Negative'
 'COST#Neutral' 'COST#Positive' 'EXTERIOR#Negative' 'EXTERIOR#Neutral'
 'EXTERIOR#Positive' 'FEATURES#Negative' 'FEATURES#Neutral'
 'FEATURES#Positive' 'INTERIOR#Negative' 'INTERIOR#Neutral'
 'INTERIOR#Positive' 'PERFORMANCE#Negative' 'PERFORMANCE#Neutral'
 'PERFORMANCE#Positive']


---

# Train, val split


In [11]:
# Train
X_train = df_train[["comment"]]
y_train = matrix_labels_train

# Validation
X_val = df_val[["comment"]]
y_val = matrix_labels_val

In [12]:
pd.concat([X_train, pd.DataFrame(y_train, columns=mlb_train.classes_)], axis=1).head()

Unnamed: 0,comment,BRAND#Negative,BRAND#Neutral,BRAND#Positive,COST#Negative,COST#Neutral,COST#Positive,EXTERIOR#Negative,EXTERIOR#Neutral,EXTERIOR#Positive,FEATURES#Negative,FEATURES#Neutral,FEATURES#Positive,INTERIOR#Negative,INTERIOR#Neutral,INTERIOR#Positive,PERFORMANCE#Negative,PERFORMANCE#Neutral,PERFORMANCE#Positive
0,đuôi dạng coupe đẹp hẳn,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1,đèn xấu,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,yc xăng nội_thất ok xforce chạy ga êm ồn xforc...,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,1,0,0
3,đi hài_lòng bốc ngon âm_rẻ tiết_kiệm xăng_lít ...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
4,bệ tì_tay màn_hình kết khai đồ trung_nhập indo,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [13]:
pd.concat([X_val, pd.DataFrame(y_val, columns=mlb_val.classes_)], axis=1).head()

Unnamed: 0,comment,BRAND#Negative,BRAND#Neutral,BRAND#Positive,COST#Negative,COST#Neutral,COST#Positive,EXTERIOR#Negative,EXTERIOR#Neutral,EXTERIOR#Positive,FEATURES#Negative,FEATURES#Neutral,FEATURES#Positive,INTERIOR#Negative,INTERIOR#Neutral,INTERIOR#Positive,PERFORMANCE#Negative,PERFORMANCE#Neutral,PERFORMANCE#Positive
0,mông ok đấy,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1,đi thử độ êm_khung gầm yc ngon tăng_tốc êm_mượ...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,chê trung_quốc đi xe trung_quốc xe trung_quốc ...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,định mua tết đồ_đạc thay_thế,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,xe 500 t cặp đèn_pha led trăm củ,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


---

# Vectorize

In [14]:
vec = TfidfVectorizer(
    analyzer="char",
    min_df=3,
    max_df=0.95,
    ngram_range=(3, 5),
    sublinear_tf=True,
    max_features=30000,
)

In [15]:
# Fit TF-IDF on training and transform train/test
X_train_vec = vec.fit_transform(X_train["comment"])
X_val_vec = vec.transform(X_val["comment"])

In [16]:
print(f"Train shape: {X_train_vec.shape}")
print(f"Test shape: {X_val_vec.shape}")
print(f"Vocabulary size: {len(vec.get_feature_names_out())}")

Train shape: (1403, 13912)
Test shape: (500, 13912)
Vocabulary size: 13912


---

# Load Model


---

## Svm

### Model


In [17]:
# One-vs-Rest SVC
ovr_svc = OneVsRestClassifier(SVC())

In [18]:
# hyperparameters
param_grid = {
    "estimator__C": np.linspace(1, 10, 20),
    "estimator__kernel": ["linear", "rbf", "poly", "sigmoid"],
    "estimator__gamma": ["scale", "auto"],
}

In [19]:
scorer = make_scorer(f1_score, average="micro", zero_division=0)

# Grid search
grid = GridSearchCV(
    ovr_svc, 
    param_grid=param_grid, 
    scoring=scorer, 
    cv=5, 
    n_jobs=-1, 
    verbose=1
)

In [20]:
# Fit
grid.fit(X_train_vec, y_train.values)

Fitting 5 folds for each of 160 candidates, totalling 800 fits


0,1,2
,estimator,OneVsRestClas...timator=SVC())
,param_grid,"{'estimator__C': array([ 1. ... 10. ]), 'estimator__gamma': ['scale', 'auto'], 'estimator__kernel': ['linear', 'rbf', ...]}"
,scoring,make_scorer(f...ro_division=0)
,n_jobs,-1
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,C,np.float64(5.7368421052631575)
,kernel,'sigmoid'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [21]:
print("Best params:")
for item in grid.best_params_.items():
    print(f"\t{item[0]}: {item[1]}")
print()
print(f"Best CV score (f1_micro): {grid.best_score_:.2f}")

Best params:
	estimator__C: 5.7368421052631575
	estimator__gamma: scale
	estimator__kernel: sigmoid

Best CV score (f1_micro): 0.44


### Eval


In [22]:
# Predict
y_pred = grid.predict(X_val_vec)

In [23]:
# Compute metrics
metrics = {
    "precision_micro": precision_score(
        y_val.values, y_pred, average="micro", zero_division=0
    ),
    "recall_micro": recall_score(
        y_val.values, y_pred, average="micro", zero_division=0
    ),
    "f1_micro": f1_score(y_val.values, y_pred, average="micro", zero_division=0),
    "precision_macro": precision_score(
        y_val.values, y_pred, average="macro", zero_division=0
    ),
    "recall_macro": recall_score(
        y_val.values, y_pred, average="macro", zero_division=0
    ),
    "f1_macro": f1_score(y_val.values, y_pred, average="macro", zero_division=0),
}

matrix_metrics = pd.DataFrame.from_dict(metrics, orient="index", columns=["Score"])

In [24]:
matrix_metrics.round(4)

Unnamed: 0,Score
precision_micro,0.5201
recall_micro,0.4189
f1_micro,0.4641
precision_macro,0.3443
recall_macro,0.2947
f1_macro,0.3138


In [25]:
# Classified report
print(
    classification_report(
        y_val.values, y_pred, target_names=y_train.columns, zero_division=0
    )
)

                      precision    recall  f1-score   support

      BRAND#Negative       0.46      0.30      0.37        63
       BRAND#Neutral       0.00      0.00      0.00        10
      BRAND#Positive       0.38      0.21      0.27        77
       COST#Negative       0.65      0.58      0.61        59
        COST#Neutral       0.00      0.00      0.00        10
       COST#Positive       0.65      0.50      0.57        52
   EXTERIOR#Negative       0.57      0.56      0.56        63
    EXTERIOR#Neutral       0.17      0.09      0.12        11
   EXTERIOR#Positive       0.61      0.62      0.62        95
   FEATURES#Negative       0.57      0.66      0.61        38
    FEATURES#Neutral       0.00      0.00      0.00         6
   FEATURES#Positive       0.35      0.19      0.25        36
   INTERIOR#Negative       0.36      0.29      0.32        34
    INTERIOR#Neutral       0.00      0.00      0.00         5
   INTERIOR#Positive       0.50      0.49      0.49        47
PERFORM

### Test


In [26]:
df_test = pd.read_csv("../data/raw/val.csv")
df_test = df_test.iloc[-26:-20,:]
df_test

Unnamed: 0,brand,model,version,year,segment,comment,label
474,huyndai,accent,,2024,sedan,Minh đang chay 2022 huhu tiec quá moi sai 2 na...,{BRAND#Positive};
475,byd,sealion 6,,2025,suv,khung gầm nó có vẻ bị chê nhiều,{PERFORMANCE#Negative};
476,nissan,almera,,2024,sedan,Đổi lên bản mới này thì quá xứng đáng luôn,{BRAND#Positive};
477,toyota,vios,,2025,sedan,Dùng khung gầm daihatsu là k thích lắm,{EXTERIOR#Negative};
478,honda,hr-v,l,2025,suv,Bản L này có gì khác gì bản G đâu mà giá đắt h...,{COST#Negative};{FEATURES#Negative};
479,byd,atto 3,,2024,suv,"Atto3 đi đầm chắc, tăng tốc mượt, ghế trước sa...",{PERFORMANCE#Positive};{INTERIOR#Positive};{EX...


In [27]:
# Try prediction on some samples
samples = df_test["comment"].tolist()
samples_vec = vec.transform(samples)
preds = grid.best_estimator_.predict(samples_vec)


def decode_labels(pred_row, classes):
    return [cls for cls, val in zip(classes, pred_row) if val == 1]


for i, (text, pred_row) in enumerate(zip(samples, preds)):
    labels = decode_labels(pred_row, y_train.columns.tolist())
    print(f"Sample {i+1}:")
    print(f"\tText: {text}")
    print(f"\tPredicted labels: {labels}\n")

Sample 1:
	Text: Minh đang chay 2022 huhu tiec quá moi sai 2 nam . Sao no ko ra sớm hon
	Predicted labels: []

Sample 2:
	Text: khung gầm nó có vẻ bị chê nhiều
	Predicted labels: ['PERFORMANCE#Negative']

Sample 3:
	Text: Đổi lên bản mới này thì quá xứng đáng luôn
	Predicted labels: ['BRAND#Positive', 'EXTERIOR#Positive']

Sample 4:
	Text: Dùng khung gầm daihatsu là k thích lắm
	Predicted labels: ['EXTERIOR#Negative']

Sample 5:
	Text: Bản L này có gì khác gì bản G đâu mà giá đắt hơn 51tr. Chỉ là thêm Camera cập lề bên phải và cốp điện thôi. Thà mua bản G, lấy 51tr đó độ cả Camera 360, độ cốp điện, độ cả màn hình to... vẫn chưa hết tiền 51tr ấy.
	Predicted labels: ['FEATURES#Negative']

Sample 6:
	Text: Atto3 đi đầm chắc, tăng tốc mượt, ghế trước sau ngồi đều thoải mái dễ chịu. Thiết kế nội ngoại thất đẹp.
	Predicted labels: ['EXTERIOR#Positive', 'INTERIOR#Positive', 'PERFORMANCE#Positive']



---

## Logistic regression

### Model


### Eval


### Test


---

## Xgboost

### Model


### Eval


### Test
