---

# Libraries


In [1]:
import os
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

warnings.filterwarnings("ignore")

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)

from xgboost import XGBClassifier
import optuna

---

# Read file


In [2]:
# Load the processed splits. The val and test files are stacked into a single
# hold-out frame (df_val) that the validation metrics are computed on.
df_train = pd.read_csv("../data/processed/train.csv", encoding="utf-8")
df_1, df_2 = (
    pd.read_csv(csv_path, encoding="utf-8")
    for csv_path in ("../data/processed/val.csv", "../data/processed/test.csv")
)
df_val = pd.concat([df_1, df_2], ignore_index=True)

In [3]:
# Column names, non-null counts and dtypes of the training split.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3592 entries, 0 to 3591
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   brand    3592 non-null   object
 1   model    3592 non-null   object
 2   version  2506 non-null   object
 3   year     3592 non-null   int64 
 4   segment  3592 non-null   object
 5   comment  3592 non-null   object
 6   label    3592 non-null   object
dtypes: int64(1), object(6)
memory usage: 196.6+ KB


In [4]:
# Column names, non-null counts and dtypes of the combined val + test frame.
df_val.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1540 entries, 0 to 1539
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   brand    1540 non-null   object
 1   model    1540 non-null   object
 2   version  1051 non-null   object
 3   year     1540 non-null   int64 
 4   segment  1540 non-null   object
 5   comment  1540 non-null   object
 6   label    1540 non-null   object
dtypes: int64(1), object(6)
memory usage: 84.3+ KB


---

# Labels


## Train


In [3]:
# Target column of the training split (a pandas Series, despite the "matrix" name).
matrix_labels_train = df_train["label"]

In [4]:
# Peek at the first few training labels.
matrix_labels_train.head()

0    negative
1    positive
2     neutral
3    positive
4    negative
Name: label, dtype: object

In [5]:
# Distinct label count and the label values found in the training split.
print(
    f"Number of labels: {matrix_labels_train.nunique(dropna=False)}",
    f"Labels: {matrix_labels_train.unique()}",
    sep="\n",
)

Number of labels: 3
Labels: ['negative' 'positive' 'neutral']


## Val


In [6]:
# Target column of the combined val + test frame (a pandas Series).
matrix_labels_val = df_val["label"]

In [7]:
# Peek at the first few validation labels.
matrix_labels_val.head()

0    positive
1    positive
2    positive
3    negative
4    positive
Name: label, dtype: object

In [8]:
# Distinct label count and the label values found in the validation frame.
print(
    f"Number of labels: {matrix_labels_val.nunique(dropna=False)}",
    f"Labels: {matrix_labels_val.unique()}",
    sep="\n",
)

Number of labels: 3
Labels: ['positive' 'negative' 'neutral']


---

# Train, val split


In [9]:
# Features are the raw comment text, kept as a one-column DataFrame;
# targets are the label Series taken from the same frames.
X_train, y_train = df_train[["comment"]], matrix_labels_train  # train
X_val, y_val = df_val[["comment"]], matrix_labels_val  # validation

In [10]:
# Encode the string labels as integer class ids for XGBoost.
# The encoder is fitted on the original label Series rather than on `y_train`
# itself: fitting on `y_train` made the cell self-referential, so running it a
# second time re-fitted the encoder on already-encoded integers, turned
# `le.classes_` into [0, 1, 2] and broke every later `le.inverse_transform`.
le = LabelEncoder()
y_train = le.fit_transform(matrix_labels_train).astype(np.int64)
print(f"Labels: {le.classes_}")

Labels: ['negative' 'neutral' 'positive']


---

# Vectorize


In [11]:
# Character-level TF-IDF over 3- to 5-character n-grams.
# min_df / max_df drop n-grams that appear in fewer than 2 comments or in more
# than 95% of them; the vocabulary is capped at 30,000 features.
vec = TfidfVectorizer(
    analyzer="char",
    min_df=2,
    max_df=0.95,
    ngram_range=(3, 5),
    max_features=30000,
)

In [12]:
# Fit TF-IDF on the training comments only, then reuse the fitted vocabulary
# to transform the validation comments (the vectorizer is not refitted on them).
X_train_vec = vec.fit_transform(X_train["comment"])
X_val_vec = vec.transform(X_val["comment"])

In [13]:
# Sanity-check the vectorized matrices.
# The second matrix is the validation set (X_val_vec); it was previously
# printed under a "Test" label.
print(f"Train shape: {X_train_vec.shape}")
print(f"Val shape: {X_val_vec.shape}")
# len(vocabulary_) gives the same number as len(get_feature_names_out())
# without building an array of all feature names.
print(f"Vocabulary size: {len(vec.vocabulary_)}")

Train shape: (3592, 30000)
Test shape: (1540, 30000)
Vocabulary size: 30000


---

# Model training


In [14]:
# Define a StratifiedKFold splitter: 5 shuffled folds with a fixed seed,
# used by the hyperparameter search below.
cv_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

---

## XGBoost


### Model


In [15]:
def objective(trial):
    """Optuna objective: cross-validated macro-F1 of one XGBoost configuration.

    Samples a hyperparameter set from ``trial``, trains a fresh
    ``XGBClassifier`` on each training fold produced by ``cv_splitter`` and
    scores it on the matching held-out fold of the vectorized training data.

    Args:
        trial: Optuna trial used to sample the tunable hyperparameters.

    Returns:
        Mean macro-averaged F1 score across the folds.
    """
    # Settings that stay the same for every trial.
    fixed_params = {
        "objective": "multi:softprob",
        "num_class": 3,
        "eval_metric": "mlogloss",
        "tree_method": "hist",
        "random_state": 42,
    }
    # Search space sampled from the trial.
    tuned_params = {
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 500, 800),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-5, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-5, 1.0),
    }
    params = {**fixed_params, **tuned_params}

    fold_scores = []
    # Fold-local names are used so the notebook-level X_val / y_val are not shadowed.
    for fit_idx, holdout_idx in cv_splitter.split(X_train_vec, y_train):
        fold_model = XGBClassifier(**params)
        fold_model.fit(X_train_vec[fit_idx], y_train[fit_idx])

        holdout_pred = fold_model.predict(X_train_vec[holdout_idx])
        fold_scores.append(
            f1_score(y_train[holdout_idx], holdout_pred, average="macro")
        )

    return np.mean(fold_scores)

In [None]:
# Run the hyperparameter search, maximizing the objective's mean macro-F1.
# The sampler is seeded explicitly: the CV splitter and the model already use
# fixed seeds, so the unseeded sampler was the remaining source of run-to-run
# variation in the tuning results.
study = optuna.create_study(
    direction="maximize",
    study_name="xgb_f1_macro",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=20)

In [None]:
# Score and index of the best trial found by the study.
best_trial = study.best_trial
print("Best f1-macro:", best_trial.value)
print("Best trial:", best_trial.number)

Best f1-macro: 0.48725435270910095
Best trial: 14


In [None]:
# Tuned hyperparameters of the best trial, one per line.
best_params = study.best_params

print("Best params:")
for param_name, param_value in best_params.items():
    print(f"  {param_name}: {param_value}")

Best params:
  max_depth: 5
  learning_rate: 0.07590214057473325
  n_estimators: 697
  subsample: 0.7058329982069236
  colsample_bytree: 0.9920929369117178
  gamma: 0.0675097620876346
  reg_alpha: 0.7681049077910618
  reg_lambda: 0.02181451468909168


In [None]:
# Build final model with best hyperparameters.
# Only the fixed (non-tuned) settings are merged in. The tuned values already
# held in `best_params` are kept as they are; previously they were overwritten
# here by constants copied from an earlier run, which silently discarded the
# result of any new search.
best_params = {
    **best_params,
    "objective": "multi:softprob",
    "num_class": 3,
    "eval_metric": "mlogloss",
    "tree_method": "hist",
    "random_state": 42,
}

xgb_model = XGBClassifier(**best_params)

In [16]:
# Final model built from hard-coded hyperparameters, so it can be created
# without re-running the search. The tuned values equal the "Best params"
# output recorded in this notebook.
best_params = dict(
    # Fixed settings
    objective="multi:softprob",
    num_class=3,
    eval_metric="mlogloss",
    tree_method="hist",
    random_state=42,
    # Tuned settings
    max_depth=5,
    learning_rate=0.07590214057473325,
    n_estimators=697,
    subsample=0.7058329982069236,
    colsample_bytree=0.9920929369117178,
    gamma=0.0675097620876346,
    reg_alpha=0.7681049077910618,
    reg_lambda=0.02181451468909168,
)

xgb_model = XGBClassifier(**best_params)

In [17]:
# Fit the final model on the full vectorized training set with the encoded labels.
xgb_model.fit(X_train_vec, y_train)

0,1,2
,objective,'multi:softprob'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.9920929369117178
,device,
,early_stopping_rounds,
,enable_categorical,False


### Eval validation


In [23]:
# Predict class ids for the validation matrix and map them straight back to
# the original string labels so they can be compared with y_val.
y_pred = le.inverse_transform(xgb_model.predict(X_val_vec))

In [24]:
# Accuracy plus macro-averaged precision / recall / F1 on the validation set.
print(f"accuracy: {accuracy_score(y_val, y_pred):.4f}")
for metric_name, metric_fn in (
    ("precision", precision_score),
    ("recall", recall_score),
    ("f1", f1_score),
):
    macro_score = metric_fn(y_val, y_pred, average="macro", zero_division=0)
    print(f"{metric_name}: {macro_score:.4f}")

accuracy: 0.6584
precision: 0.5211
recall: 0.4980
f1: 0.4911


In [25]:
# Classification report on the validation set.
# `labels` pins the row order to the encoder's classes, so `target_names`
# always lines up with the right class, even if a class is missing from
# y_val / y_pred.
print(
    classification_report(
        y_val,
        y_pred,
        labels=le.classes_,
        target_names=le.classes_,
        zero_division=0,
    )
)

              precision    recall  f1-score   support

    negative       0.65      0.65      0.65       607
     neutral       0.23      0.06      0.09       156
    positive       0.68      0.78      0.73       777

    accuracy                           0.66      1540
   macro avg       0.52      0.50      0.49      1540
weighted avg       0.62      0.66      0.63      1540



### Test


In [21]:
# Load the test split on its own and work on a copy, so prediction columns
# can be added without touching the frame that was read from disk.
df_test = pd.read_csv("../data/processed/test.csv")
df_test_predict = df_test.copy()
df_test_predict.head()

Unnamed: 0,brand,model,version,year,segment,comment,label
0,mitsubishi,attrage,cvt,2025,sedan,ngoại thất hiện_đại hẳn,positive
1,nissan,almera,,2025,sedan,thương_hiệu ninsan chỗ chê gtr nổi_tiếng khấp ...,positive
2,mg,5,luxury,2025,sedan,xe trung_quốc thiết_kế đẹp,positive
3,hyundai,creta,n line,2025,suv,giá trưng_bày,negative
4,toyota,vios,,2025,sedan,vụ tai_nạn mấy hôm trung_quốc vios sởn gai_ốc ...,negative


In [None]:
# Predict labels and confidences for the test comments.
samples = df_test["comment"].tolist()
# Keep the TF-IDF output sparse, exactly like the training matrix. XGBoost
# treats unstored sparse entries as missing but dense zeros as real values, so
# densifying with .toarray() could change the predictions.
samples_vec = vec.transform(samples)

# predict_proba returns one probability per class for every row. predict()
# returns 1-D class ids, which made the argmax over axis=1 fail and left no
# probabilities to report as confidence.
probs = xgb_model.predict_proba(samples_vec)
preds = np.argmax(probs, axis=1)

# Vectorized assignment instead of a row-by-row .at loop: the predicted class
# name and the probability of that class.
df_test_predict["predicted_label"] = le.inverse_transform(preds)
df_test_predict["confidence"] = probs.max(axis=1)

In [None]:
# Classification report on the test split.
# `labels` pins the row order to the encoder's classes, so `target_names`
# always lines up with the right class, even if a class is missing from the
# true or predicted labels.
print(
    classification_report(
        df_test["label"],
        df_test_predict["predicted_label"],
        labels=le.classes_,
        target_names=le.classes_,
        zero_division=0,
    )
)

In [None]:
# Confusion matrix
# `labels` fixes the row/column order to le.classes_, the same sequence used
# for the tick labels below, so the axes cannot drift out of sync with the
# matrix when a class is absent from the test labels or predictions.
cm = confusion_matrix(
    df_test_predict["label"],
    df_test_predict["predicted_label"],
    labels=le.classes_,
)

# Plot confusion matrix
plt.figure(figsize=(8, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="coolwarm",
    xticklabels=le.classes_,
    yticklabels=le.classes_,
    cbar_kws={"label": "Count"},
)
plt.title("Confusion matrix", pad=20)
plt.ylabel("True", fontsize=12)
plt.xlabel("Predicted", fontsize=12)
plt.xticks(rotation=45, ha="right")
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

---

# Save model


In [None]:
model_save_path = "../models/ml/"
xgb_model_file = os.path.join(model_save_path, "xgboost.pkl")

# joblib.dump does not create missing directories, so make sure the target
# folder exists before writing.
os.makedirs(model_save_path, exist_ok=True)

# Save XGBoost model
# NOTE(review): the fitted TfidfVectorizer (`vec`) and LabelEncoder (`le`) are
# not saved in this cell; confirm they are persisted elsewhere, since the
# model expects features in this vectorizer's vocabulary.
joblib.dump(xgb_model, xgb_model_file)
print(f"XGBoost model saved to {xgb_model_file}")