In [1]:
import re

import joblib
import numpy as np
import pandas as pd
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.svm import LinearSVC, SVC
from transformers import AutoTokenizer, AutoModel

In [2]:
# Number of emotion labels; it selects which pre-processed data directory is used.
no_label_data = 9

# All CSV files of one label configuration live in the same directory.
_cleaned_dir = f"../DataPreprocess/Cleaned_Data/{no_label_data}_label"

data_path = _cleaned_dir + "/clean_data.csv"        # full data set
train_data_path = _cleaned_dir + "/train_data.csv"  # training split
dev_data_path = _cleaned_dir + "/dev_data.csv"      # development (validation) split
test_data_path = _cleaned_dir + "/test_data.csv"    # test split

In [3]:
# Read the three splits and the full data set (in that order), replacing
# missing cells with empty strings so text columns never contain NaN.
train_data, valid_data, test_data, df = (
    pd.read_csv(csv_path).fillna("")
    for csv_path in (train_data_path, dev_data_path, test_data_path, data_path)
)

# Report the size of every frame as a quick sanity check.
print(f"Train Set Shape: {train_data.shape}")
print(f"Valid Set Shape: {valid_data.shape}")
print(f"Test Set Shape: {test_data.shape}")
print(f"Full Set Shape: {df.shape}")

# Preview the first rows of the full data set.
df.head(5)

Train Set Shape: (10888, 12)
Valid Set Shape: (3077, 12)
Test Set Shape: (1568, 12)
Full Set Shape: (15533, 12)


Unnamed: 0,Index,Utterance,Speaker,Id_speaker,Utterance_id,Date,Time,Emotion,Emotion_Mutiple,Dialog_id,Label,Utterance_clean
0,1,Bao tiền,Nguyễn Thanh Tú,100031059109987,1,18/02/2022,08:07:47,Neutral,Neutral,1,0,Bao tiền
1,2,Nguyễn Thanh Tú bạn có khum haha,Nguyễn Thị Diễm,100007602498241,2,18/02/2022,08:08:10,Joy,Joy,1,1,bạn có khum haha
2,3,Nguyễn Thị Diễm nổ giá đii đừng ib.,Nguyễn Thanh Tú,100031059109987,3,18/02/2022,08:08:27,Anger,Anger,1,4,nổ giá đii đừng ib .
3,4,T có nha,Dao Phuong Anh,100009157681703,1,18/02/2022,08:37:06,Neutral,Neutral,2,0,T có nha
4,5,Dao Phuong Anh check ib ạ,Nguyễn Thị Diễm,100007602498241,2,18/02/2022,08:37:18,Neutral,Neutral,2,0,check ib ạ


In [4]:
# Checkpoint to load the tokenizer and encoder from: a local directory chosen by
# the number of labels (presumably a fine-tuned PhoBERT -- confirm).
# model_type = "vinai/phobert-large"
model_type = f"../PhoBERT/phobert_{no_label_data}"

tokenizer = AutoTokenizer.from_pretrained(model_type)

# Run on the GPU when one is available and fall back to the CPU otherwise,
# instead of failing outright on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# output_hidden_states=True makes the forward pass also return the hidden
# states of every layer, which the sentence embedding is built from.
model = AutoModel.from_pretrained(model_type, output_hidden_states=True).to(device)

In [5]:
def sentences_embedding(text):
    """Embed a single utterance as one fixed-size vector.

    The text is encoded with the module-level ``tokenizer`` (special tokens
    added, truncation enabled) and passed through the module-level ``model``
    without gradient tracking. The hidden states of the last four layers are
    averaged, and the result is mean-pooled over all tokens.

    Args:
        text: The utterance to embed.

    Returns:
        list: The sentence vector as a plain Python list of floats.
    """
    # Encode into a batch holding exactly one sequence, placed on whatever
    # device the model lives on (no assumption that CUDA is present).
    token_ids = tokenizer.encode(text, add_special_tokens=True, truncation=True)
    model_device = next(model.parameters()).device
    input_ids = torch.tensor([token_ids], device=model_device)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(input_ids)

    # Element 2 of the output is read as the tuple of per-layer hidden states
    # (assumes the model was loaded with output_hidden_states=True and returns
    # a pooled output before it -- TODO confirm if the model class changes).
    hidden_states = outputs[2]

    # Stack only the last four layers; the batch axis has size 1, so index it
    # away to get one (tokens, hidden) matrix per layer.
    last_four_layers = torch.stack(hidden_states[-4:], dim=0)[:, 0]

    # Average across the four layers, then across tokens.
    token_vectors = torch.mean(last_four_layers, dim=0)
    sentence_embedding = torch.mean(token_vectors, dim=0)

    return sentence_embedding.cpu().tolist()

In [6]:
# [sentences_embedding(row['Utterance_clean'], row['Id_speaker']) for index,row in train_data.head(5).iterrows()]

In [7]:
# Features: one sentence embedding per cleaned utterance, computed split by split.
# Targets: the 'Label' column of the corresponding split.
X_train = [sentences_embedding(utterance) for utterance in train_data['Utterance_clean']]
y_train = train_data['Label']

X_dev = [sentences_embedding(utterance) for utterance in valid_data['Utterance_clean']]
y_dev = valid_data['Label']

X_test = [sentences_embedding(utterance) for utterance in test_data['Utterance_clean']]
y_test = test_data['Label']

In [8]:
# Use GridSearchCV (5-fold cross-validation) to find the best SVM
# hyper-parameters; the search is fitted on the dev split.
# NOTE: 'gamma' has no effect for the linear kernel, so those grid points
# are evaluated twice with the same outcome.
params = {
    'C': [0.1, 0.3, 1, 3, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto'],
    'class_weight': [None, 'balanced'],
}
svm = SVC()
clf = GridSearchCV(estimator=svm, param_grid=params, cv=5)
clf.fit(X_dev, y_dev)

In [9]:
# clf.fit(X_dev, y_dev)

In [10]:
# Keep the winning hyper-parameter combination for the final fit and print it.
best_params = clf.best_params_
print("Best parameters:", best_params)

Best parameters: {'C': 3, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf'}


In [11]:
# Train a fresh SVC on the training split with the hyper-parameters chosen
# by the grid search. Note that `clf` is rebound from the search object to
# this final classifier.
svc_kwargs = {
    name: best_params[name]
    for name in ('C', 'gamma', 'kernel', 'class_weight')
}
clf = SVC(**svc_kwargs)

clf.fit(X_train, y_train)

In [14]:
# Evaluate the model on the test set.
y_pred = clf.predict(X_test)

# Some classes may never be predicted, which leaves their precision undefined.
# zero_division=0 reports those scores as 0 explicitly (the same value the
# default produces) without emitting an undefined-metric warning per average.
print(classification_report(y_test, y_pred, digits=4, zero_division=0))

              precision    recall  f1-score   support

           0     0.5251    0.5656    0.5446       389
           1     0.5380    0.6072    0.5705       443
           2     0.3712    0.3172    0.3421       268
           3     0.6000    0.5435    0.5703       138
           4     0.4167    0.3125    0.3571        32
           5     0.3551    0.3393    0.3470       112
           6     0.5600    0.5000    0.5283        28
           7     0.4820    0.4558    0.4685       147
           8     0.0000    0.0000    0.0000        11

    accuracy                         0.4962      1568
   macro avg     0.4276    0.4046    0.4143      1568
weighted avg     0.4876    0.4962    0.4904      1568



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
# Compute the confusion matrix of the test predictions
# (rows: true labels, columns: predicted labels).
# `confusion_matrix` is imported in the notebook's top import cell.
conf_matrix = confusion_matrix(y_test, y_pred)

print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[220  58  48  14   3  24   2  20   0]
 [ 71 269  69   6   3  14   2   9   0]
 [ 48 120  85   1   1   8   0   5   0]
 [ 14  11   3  75   0   8   3  24   0]
 [  6   4   2   2  10   4   0   4   0]
 [ 24  21  15   5   0  38   2   7   0]
 [  4   3   2   2   0   2  14   1   0]
 [ 30  11   4  19   5   9   2  67   0]
 [  2   3   1   1   2   0   0   2   0]]
