---

# Lib


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings("ignore")

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
    make_scorer,
)

from xgboost import XGBClassifier

from utils.other import parse_label

---

# Read file


In [None]:
df = pd.read_csv("../data/processed/train_preprocessed.csv")

In [None]:
df.head()

Unnamed: 0,comment,label
0,mẫu đẹp hiện_đại mặt đồng_hồ trung_tâm âm mặt ...,{EXTERIOR#Positive};{INTERIOR#Negative};
1,creta tầm giá xforce chút phiên_bản rẻ thiết_k...,{COST#Neutral};{EXTERIOR#Neutral};
2,giá cx5 dl mặc_dù rộng say cx5,{COST#Positive};
3,giá đợi chương_trình khuyến_mại,{COST#Negative};
4,creta xe mượt_mà hyundai phân_khúc khung gầm t...,{EXTERIOR#Positive};{INTERIOR#Positive};{PERFO...


---

# Label


In [None]:
matrix_label_train , mlb_train = matrix_labels(df_train[["label"]])
matrix_label_val , mlb_val = matrix_labels(df_val[["label"]])

In [None]:
matrix_label_train.head()

In [None]:
matrix_label_val.head()

---

# Train, val split


In [12]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, random_state=42
)

---

# Vectorize

In [13]:
vec = TfidfVectorizer(
    min_df=3,
    max_df=0.95,
    ngram_range=(1, 2),
    sublinear_tf=True,
    max_features=30000,
)

In [14]:
# Fit TF-IDF on training and transform train/test
X_train_vec = vec.fit_transform(X_train["comment"])
X_val_vec = vec.transform(X_val["comment"])

In [15]:
print(f"Train shape: {X_train_vec.shape}")
print(f"Test shape: {X_val_vec.shape}")
print(f"Vocabulary size: {len(vec.get_feature_names_out())}")

Train shape: (635, 696)
Test shape: (273, 696)
Vocabulary size: 696


In [16]:
print("Number of labels:", y.shape[1])
print("Number of labels train", y_train.shape[1])
print("Number of labels test", y_val.shape[1])

Number of labels: 18
Number of labels train 18
Number of labels test 18


---

# Load Model


---

## Svm

### Model


In [17]:
# One-vs-Rest SVC
ovr_svc = OneVsRestClassifier(SVC())

In [18]:
# hyperparameters
param_grid = {
    "estimator__C": np.linspace(1, 10, 40),
    "estimator__kernel": ["linear", "rbf", "poly", "sigmoid"],
    "estimator__gamma": ["scale", "auto"],
}

In [19]:
scorer = make_scorer(f1_score, average="micro", zero_division=0)

# Grid search
grid = GridSearchCV(
    ovr_svc, 
    param_grid=param_grid, 
    scoring=scorer, 
    cv=5, 
    n_jobs=-1, 
    verbose=1
)

In [20]:
# Fit
grid.fit(X_train_vec, y_train.values)

Fitting 5 folds for each of 320 candidates, totalling 1600 fits


0,1,2
,estimator,OneVsRestClas...timator=SVC())
,param_grid,"{'estimator__C': array([ 1. ... 10. ]), 'estimator__gamma': ['scale', 'auto'], 'estimator__kernel': ['linear', 'rbf', ...]}"
,scoring,make_scorer(f...ro_division=0)
,n_jobs,-1
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,C,np.float64(3.076923076923077)
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [21]:
print("Best params:")
for item in grid.best_params_.items():
    print(f"\t{item[0]}: {item[1]}")
print()
print(f"Best CV score (f1_micro): {grid.best_score_:.2f}")

Best params:
	estimator__C: 3.076923076923077
	estimator__gamma: scale
	estimator__kernel: linear

Best CV score (f1_micro): 0.46


### Eval


In [22]:
# Predict
y_pred = grid.predict(X_val_vec)

In [23]:
# Compute metrics
metrics = {
    "precision_micro": precision_score(
        y_val.values, y_pred, average="micro", zero_division=0
    ),
    "recall_micro": recall_score(
        y_val.values, y_pred, average="micro", zero_division=0
    ),
    "f1_micro": f1_score(y_val.values, y_pred, average="micro", zero_division=0),
    "precision_macro": precision_score(
        y_val.values, y_pred, average="macro", zero_division=0
    ),
    "recall_macro": recall_score(
        y_val.values, y_pred, average="macro", zero_division=0
    ),
    "f1_macro": f1_score(y_val.values, y_pred, average="macro", zero_division=0),
}

matrix_metrics = pd.DataFrame.from_dict(metrics, orient="index", columns=["Score"])

In [24]:
matrix_metrics.round(4)

Unnamed: 0,Score
precision_micro,0.5539
recall_micro,0.3679
f1_micro,0.4421
precision_macro,0.3693
recall_macro,0.2399
f1_macro,0.2803


In [25]:
# Classified report
print(
    classification_report(
        y_val.values, y_pred, target_names=y.columns, zero_division=0
    )
)

                      precision    recall  f1-score   support

      BRAND#Negative       0.50      0.29      0.36        42
       BRAND#Neutral       0.00      0.00      0.00         2
      BRAND#Positive       0.40      0.20      0.27        20
       COST#Negative       0.59      0.59      0.59        46
        COST#Neutral       0.00      0.00      0.00         4
       COST#Positive       0.43      0.45      0.44        29
   EXTERIOR#Negative       0.77      0.27      0.40        37
    EXTERIOR#Neutral       0.00      0.00      0.00         3
   EXTERIOR#Positive       0.60      0.54      0.57        52
   FEATURES#Negative       0.67      0.20      0.31        30
    FEATURES#Neutral       0.00      0.00      0.00         1
   FEATURES#Positive       0.47      0.33      0.39        21
   INTERIOR#Negative       0.38      0.29      0.33        17
    INTERIOR#Neutral       0.00      0.00      0.00         1
   INTERIOR#Positive       0.64      0.24      0.35        29
PERFORM

### Test


In [None]:
df_test = pd.read_csv("../data/raw/train.csv")
df_test = df_test.iloc[700:705,:]
df_test

In [58]:
# Try prediction on some samples
samples = df_test["comment"].tolist()
samples_vec = vec.transform(samples)
preds = grid.best_estimator_.predict(samples_vec)


def decode_labels(pred_row, classes):
    return [cls for cls, val in zip(classes, pred_row) if val == 1]


for i, (text, pred_row) in enumerate(zip(samples, preds)):
    labels = decode_labels(pred_row, y.columns.tolist())
    print(f"Sample {i+1}:")
    print(f"\tText: {text}")
    print(f"\tPredicted labels: {labels}\n")

Sample 1:
	Text: Xe điện không trạm sạc xài tốt ở miền Nam, chính xác là miền Tây, từ Sài Gòn đến Cần Thơ chỉ 150 km. Trạm sạc 7kw /11kw thì rất nhiều quán cà phê ở các tỉnh miền Tây sẵn sàng lắp đặt cho khách hàng sạc. Dân miền Tây hào phóng, sẵn sàng giúp đỡ người dưng. Chạy xe gần hết điện bí quá tấp vô nhà dân bên đường xin sạc nhờ 1-2 tiếng. Người ta còn không lấy tiền điện nữa. Miền Bắc thì tui không rõ.
	Predicted labels: ['BRAND#Negative', 'FEATURES#Positive', 'PERFORMANCE#Positive']

Sample 2:
	Text: Điểm sướng nhất của phanh tái sinh khi đổ đèo là giảm sử dụng phanh chân, nếu quen thì gần như không phải dùng phanh chân mấy. Còn thêm được mấy % pin thì được thôi, k quan trọng lắm.
	Predicted labels: ['FEATURES#Positive']

Sample 3:
	Text: Atto3 đi đầm chắc, êm ái, không gian rộng rãi, thật tuyệt vời .
	Predicted labels: ['PERFORMANCE#Positive']

Sample 4:
	Text: con này bên ngoài đẹp vãi luôn đấy
	Predicted labels: ['EXTERIOR#Positive']

Sample 5:
	Text: Chiếc atto 3 này mình 

---

## Logistic regression

### Model


### Eval


### Test


---

## Xgboost

### Model


### Eval


### Test
