---

# Lib

In [259]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, make_scorer

from utils.other import parse_label

---

# Read file

In [225]:
df = pd.read_csv("../data/processed/train_preprocessed.csv")

In [226]:
df.head()

Unnamed: 0,comment,label
0,mẫu đẹp hiện_đại mặt đồng_hồ trung_tâm âm mặt ...,{EXTERIOR#Positive};{INTERIOR#Negative};
1,creta tầm giá xforce chút phiên_bản thiết_kế v...,{COST#Neutral};{EXTERIOR#Neutral};
2,giá_như cx5 dl mặc_dù rộng k say cx5,{COST#Positive};
3,giá đợi chương_trình khuyến_mại,{COST#Negative};
4,creta xe mượt hyundai phân khúc khung gầm treo...,{EXTERIOR#Positive};{INTERIOR#Positive};{PERFO...


In [227]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 908 entries, 0 to 907
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   comment  908 non-null    object
 1   label    908 non-null    object
dtypes: object(2)
memory usage: 14.3+ KB


---

# Label

In [228]:
mlb = MultiLabelBinarizer()

In [229]:
# Parse labels to lists
df["parsed_labels"] = df["label"].apply(parse_label)

In [230]:
df.head()

Unnamed: 0,comment,label,parsed_labels
0,mẫu đẹp hiện_đại mặt đồng_hồ trung_tâm âm mặt ...,{EXTERIOR#Positive};{INTERIOR#Negative};,"[EXTERIOR#Positive, INTERIOR#Negative]"
1,creta tầm giá xforce chút phiên_bản thiết_kế v...,{COST#Neutral};{EXTERIOR#Neutral};,"[COST#Neutral, EXTERIOR#Neutral]"
2,giá_như cx5 dl mặc_dù rộng k say cx5,{COST#Positive};,[COST#Positive]
3,giá đợi chương_trình khuyến_mại,{COST#Negative};,[COST#Negative]
4,creta xe mượt hyundai phân khúc khung gầm treo...,{EXTERIOR#Positive};{INTERIOR#Positive};{PERFO...,"[EXTERIOR#Positive, INTERIOR#Positive, PERFORM..."


In [231]:
# Binary matrix for multi-label classification
matrix_label = mlb.fit_transform(df['parsed_labels'])

# To dataframe
y = pd.DataFrame(matrix_label, columns=mlb.classes_)

In [232]:
y.head()

Unnamed: 0,BRAND#Negative,BRAND#Neutral,BRAND#Positive,COST#Negative,COST#Neutral,COST#Positive,EXTERIOR#Negative,EXTERIOR#Neutral,EXTERIOR#Positive,FEATURES#Negative,FEATURES#Neutral,FEATURES#Positive,INTERIOR#Negative,INTERIOR#Neutral,INTERIOR#Positive,PERFORMANCE#Negative,PERFORMANCE#Neutral,PERFORMANCE#Positive
0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0
1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1


In [233]:
X = df.drop(columns=["label", "parsed_labels"]).copy()

In [234]:
X.head()

Unnamed: 0,comment
0,mẫu đẹp hiện_đại mặt đồng_hồ trung_tâm âm mặt ...
1,creta tầm giá xforce chút phiên_bản thiết_kế v...
2,giá_như cx5 dl mặc_dù rộng k say cx5
3,giá đợi chương_trình khuyến_mại
4,creta xe mượt hyundai phân khúc khung gầm treo...


---

# Train, val split

In [235]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Vectorize

In [None]:
vec = TfidfVectorizer(
    analyzer='char',
    min_df = 2,
    max_df = 0.9,
    ngram_range=(3, 5),
    sublinear_tf=True,
    max_features=20000,
)

In [237]:
# Fit TF-IDF on training and transform train/test
X_train_vec = vec.fit_transform(X_train["comment"])
X_test_vec = vec.transform(X_test["comment"])

In [238]:
print(f"Train shape: {X_train_vec.shape}")
print(f"Test shape: {X_test_vec.shape}")
print(f"Vocabulary size: {len(vec.get_feature_names_out())}")

Train shape: (681, 15494)
Test shape: (227, 15494)
Vocabulary size: 15494


---

# Load Model

## Svm

### Model

In [239]:
# One-vs-Rest SVC to support kernel/gamma grid
ovr_svc = OneVsRestClassifier(SVC())

In [240]:
# hyperparameters
param_grid = {
    "estimator__C": np.linspace(1, 100, 100) ,
    "estimator__kernel": ["linear", "rbf", "poly", "sigmoid"],
    "estimator__gamma": ["scale", "auto"],
}

In [241]:
scorer = make_scorer(f1_score, average="micro", zero_division=0)

# Grid search
grid = GridSearchCV(
    ovr_svc, 
    param_grid=param_grid, 
    scoring=scorer, 
    cv=5, 
    n_jobs=-1, 
    verbose=1
)

In [242]:
# Fit
grid.fit(X_train_vec, y_train.values)

Fitting 5 folds for each of 800 candidates, totalling 4000 fits


0,1,2
,estimator,OneVsRestClas...timator=SVC())
,param_grid,"{'estimator__C': array([ 1., ...  100.]), 'estimator__gamma': ['scale', 'auto'], 'estimator__kernel': ['linear', 'rbf', ...]}"
,scoring,make_scorer(f...ro_division=0)
,n_jobs,-1
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,C,np.float64(5.0)
,kernel,'sigmoid'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [243]:
print("Best params:")
for item in grid.best_params_.items():
    print(f"\t{item[0]}: {item[1]}")
print()
print(f"Best CV score (f1_micro): {grid.best_score_:.2f}")

Best params:
	estimator__C: 5.0
	estimator__gamma: scale
	estimator__kernel: sigmoid

Best CV score (f1_micro): 0.43


### Eval

In [244]:
# Predict
y_pred = grid.predict(X_test_vec)

In [245]:
# Compute metrics
metrics = {
    "precision_micro": precision_score(y_test.values, y_pred, average="micro", zero_division=0),
    "recall_micro": recall_score(y_test.values, y_pred, average="micro", zero_division=0),
    "f1_micro": f1_score(y_test.values, y_pred, average="micro", zero_division=0),
    "precision_macro": precision_score(y_test.values, y_pred, average="macro", zero_division=0),
    "recall_macro": recall_score(y_test.values, y_pred, average="macro", zero_division=0),
    "f1_macro": f1_score(y_test.values, y_pred, average="macro", zero_division=0)
}

In [246]:
for item in metrics.items():
    print(f"{item[0]}: {item[1]:.4f}")

precision_micro: 0.5062
recall_micro: 0.3754
f1_micro: 0.4311
precision_macro: 0.3667
recall_macro: 0.2599
f1_macro: 0.2940


In [247]:
# Classified report
print(classification_report(y_test.values, y_pred, target_names=y.columns, zero_division=0))

                      precision    recall  f1-score   support

      BRAND#Negative       0.32      0.28      0.30        32
       BRAND#Neutral       0.00      0.00      0.00         1
      BRAND#Positive       0.22      0.11      0.15        18
       COST#Negative       0.64      0.57      0.61        40
        COST#Neutral       0.50      0.25      0.33         4
       COST#Positive       0.29      0.53      0.37        19
   EXTERIOR#Negative       0.60      0.20      0.30        30
    EXTERIOR#Neutral       0.00      0.00      0.00         3
   EXTERIOR#Positive       0.58      0.51      0.55        41
   FEATURES#Negative       0.67      0.33      0.44        24
    FEATURES#Neutral       0.00      0.00      0.00         1
   FEATURES#Positive       0.50      0.38      0.43        16
   INTERIOR#Negative       0.36      0.29      0.32        14
    INTERIOR#Neutral       0.00      0.00      0.00         1
   INTERIOR#Positive       0.64      0.38      0.47        24
PERFORM

### Test

In [257]:
df_test = pd.DataFrame(
    {
        "comment":["Ngoại thất đẹp"]
    }
)

In [258]:
# Try prediction on a few samples
samples = df_test["comment"].iloc[0:10].tolist()
samples_vec = vec.transform(samples)
preds = grid.best_estimator_.predict(samples_vec)

def decode_labels(pred_row, classes):
    return [cls for cls, val in zip(classes, pred_row) if val == 1]

for i, (text, pred_row) in enumerate(zip(samples, preds)):
    labels = decode_labels(pred_row, y.columns.tolist())
    print(f"Sample {i+1}:")
    print(f"\tText: {text}")
    print(f"\tPredicted labels: {labels}\n")

Sample 1:
	Text: Ngoại thất đẹp
	Predicted labels: ['EXTERIOR#Positive']



## Logistic regression

### Model

### Eval

### Test

## Xgboost

### Model

### Eval

### Test