In [1]:
import pickle
import numpy as np
import pandas as pd
import random
import re
import torch

from underthesea import word_tokenize 

from transformers import AutoTokenizer, AutoModel

In [2]:
def seed_everything(seed=2021):
    """Seed every random number generator this notebook relies on.

    Prints the seed, then seeds Python's ``random`` module, NumPy's global
    generator and PyTorch (via ``manual_seed``, ``cuda.manual_seed`` and
    ``cuda.manual_seed_all``). Finally it turns off cuDNN benchmarking and
    turns on cuDNN deterministic mode.

    Args:
        seed: Integer seed handed to every generator. Defaults to 2021.

    Returns:
        None.
    """
    print(seed)

    # Python and NumPy global generators.
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators, seeded in the same order as before.
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)

    # cuDNN: no autotuned kernel selection, deterministic mode on.
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

In [3]:
# Seed all random number generators with the default seed (2021) before any other work.
seed_everything()

2021


# Check data

In [4]:
# Number of labels in the label set to use; it selects the data folder,
# the PhoBERT checkpoint directory and the output file name below.
no_label_data = 9

In [5]:
# Paths to the cleaned full dataset and to its train / dev / test splits
# for the chosen label set.
data_path = f"../DataPreprocess/Cleaned_Data/{no_label_data}_label/clean_data.csv"
train_data_path = f"../DataPreprocess/Cleaned_Data/{no_label_data}_label/train_data.csv"
dev_data_path = f"../DataPreprocess/Cleaned_Data/{no_label_data}_label/dev_data.csv"
test_data_path = f"../DataPreprocess/Cleaned_Data/{no_label_data}_label/test_data.csv"

In [6]:
# Read the three splits and the full cleaned dataset; every missing cell
# is replaced by an empty string.
train_data, valid_data, test_data, df = (
    pd.read_csv(csv_path).fillna("")
    for csv_path in (train_data_path, dev_data_path, test_data_path, data_path)
)

# Report the size of each frame as a quick sanity check.
print(f"Train Set Shape: {train_data.shape}")
print(f"Valid Set Shape: {valid_data.shape}")
print(f"Test Set Shape: {test_data.shape}")
print(f"Full Set Shape: {df.shape}")

# Preview the first rows of the full dataset.
df.head(5)

Train Set Shape: (10888, 12)
Valid Set Shape: (3077, 12)
Test Set Shape: (1568, 12)
Full Set Shape: (15533, 12)


Unnamed: 0,Index,Utterance,Speaker,Id_speaker,Utterance_id,Date,Time,Emotion,Emotion_Mutiple,Dialog_id,Label,Utterance_clean
0,1,Bao tiền,Nguyễn Thanh Tú,100031059109987,1,18/02/2022,08:07:47,Neutral,Neutral,1,0,Bao tiền
1,2,Nguyễn Thanh Tú bạn có khum haha,Nguyễn Thị Diễm,100007602498241,2,18/02/2022,08:08:10,Joy,Joy,1,1,bạn có khum haha
2,3,Nguyễn Thị Diễm nổ giá đii đừng ib.,Nguyễn Thanh Tú,100031059109987,3,18/02/2022,08:08:27,Anger,Anger,1,4,nổ giá đii đừng ib .
3,4,T có nha,Dao Phuong Anh,100009157681703,1,18/02/2022,08:37:06,Neutral,Neutral,2,0,T có nha
4,5,Dao Phuong Anh check ib ạ,Nguyễn Thị Diễm,100007602498241,2,18/02/2022,08:37:18,Neutral,Neutral,2,0,check ib ạ


### Dùng PhoBERT để embedding

In [7]:
# Local PhoBERT checkpoint directory for the selected label set
# (presumably a fine-tuned model - confirm). The stock alternative
# would be "vinai/phobert-large".
model_type = f"../PhoBERT/phobert_{no_label_data}"

tokenizer = AutoTokenizer.from_pretrained(model_type)

# output_hidden_states=True is required by sentences_embedding, which reads
# the per-layer hidden states from the model output.
model = AutoModel.from_pretrained(model_type, output_hidden_states=True)
model = model.cuda()  # run the encoder on the GPU

In [8]:
# Display the module tree of the loaded encoder.
model

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(64001, 1024, padding_idx=1)
    (position_embeddings): Embedding(258, 1024, padding_idx=1)
    (token_type_embeddings): Embedding(1, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0): RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            (d

In [9]:
def sentences_embedding(text):
    """Embed one utterance as a single vector with the module-level PhoBERT model.

    The text is tokenized (special tokens included, truncation enabled) and
    sent through ``model`` as a batch of one on the GPU without gradient
    tracking. The hidden states of the last four layers are averaged, and
    the result is then averaged over all token positions (special tokens
    included).

    Args:
        text: The utterance string to embed.

    Returns:
        A 1-D tensor on the GPU holding the sentence embedding.
        Assumes each hidden state has shape (1, seq_len, hidden) - TODO confirm.
    """
    token_ids = tokenizer.encode(text, add_special_tokens=True, truncation=True)
    batch = torch.tensor([token_ids]).cuda()  # batch containing a single sequence

    # Inference only: no gradients needed.
    with torch.no_grad():
        model_out = model(batch)

    # Index 2 of the output holds the per-layer hidden states
    # (available because the model was loaded with output_hidden_states=True).
    all_layers = torch.stack(model_out[2], dim=0)

    # For each of the last four layers, summing over axis 0 collapses the
    # batch axis (size 1 here), leaving one vector per token.
    last_four = [torch.sum(layer, dim=0) for layer in all_layers[-4:]]

    # Average across the four layers, then across the tokens.
    per_token = torch.mean(torch.stack(last_four), dim=0)
    return torch.mean(per_token, dim=0)

# Iterate over all utterances and collect per-dialogue features

In [10]:
# Initialise the containers for the fields to extract.
# Each dict is keyed by dialogue id when the dataset is traversed.
utt_ids = {}
speakers = {}
labels = {}
texts_embeded = {}
texts = {}
train_id = None
test_id = None


In [11]:
# Largest number of distinct speakers found in any single dialogue;
# used as the width of the one-hot speaker vectors.
speakers_per_dialog = df.groupby('Dialog_id')['Id_speaker'].nunique()
n_speaker = int(speakers_per_dialog.max())
n_speaker

29

In [12]:
# Walk every row of the full dataset and file its fields by dialogue id and
# utterance id (speaker, label, raw text and embedding).
for index, row in df.iterrows():
    dial_id, utt_id = int(row['Dialog_id']), int(row['Utterance_id'])
    speaker_id, label = row['Id_speaker'], row['Label']

    # A float in the text column is treated as "no text" and becomes "".
    raw_text = row['Utterance_clean']
    sentence = raw_text if not isinstance(raw_text, float) else str("")

    # One embedding per utterance, copied to the CPU as a plain Python list.
    embedded_vec = sentences_embedding(sentence).cpu().tolist()

    # First utterance seen for this dialogue: create its containers.
    if dial_id not in utt_ids:
        utt_ids[dial_id] = []
        for store in (speakers, labels, texts_embeded, texts):
            store[dial_id] = {}

    # Record the utterance id in encounter order and index its fields by that id.
    utt_ids[dial_id].append(utt_id)
    speakers[dial_id][utt_id] = speaker_id
    labels[dial_id][utt_id] = label
    texts[dial_id][utt_id] = sentence
    texts_embeded[dial_id][utt_id] = embedded_vec

In [13]:
# Initialise the per-dialogue outputs that will hold the fields as ordered
# lists / arrays (filled in the conversion loop that follows).
speakers_list = {}
labels_list = {}
texts_embeded_list = {}
texts_list = {}

In [14]:
# Unique dialogue ids present in the full dataset.
video_ids = list(set(df['Dialog_id']))

# Convert every per-dialogue dict into lists / arrays ordered by utterance id.
for dialog_id in video_ids:
    # Sort the utterance ids in place. The previous code referenced `.sort`
    # without calling it, so the ids were never sorted and every list below
    # silently followed CSV row order instead.
    utt_ids[dialog_id].sort()
    ordered_utt_ids = utt_ids[dialog_id]

    # Labels in utterance order.
    labels_list[dialog_id] = [labels[dialog_id][i] for i in ordered_utt_ids]

    # Speakers in utterance order, each encoded as a one-hot vector of width
    # n_speaker. Indices are assigned per dialogue, so the same speaker may
    # get a different index in another dialogue.
    speaker_set = list(set(speakers[dialog_id].values()))
    speaker_dict = {speaker: position for position, speaker in enumerate(speaker_set)}
    speakers_list[dialog_id] = []
    for i in ordered_utt_ids:
        onehot_speaker = [0] * n_speaker
        onehot_speaker[speaker_dict[speakers[dialog_id][i]]] = 1
        speakers_list[dialog_id].append(onehot_speaker)

    # Utterance texts in utterance order (list of strings).
    texts_list[dialog_id] = [texts[dialog_id][i] for i in ordered_utt_ids]

    # Utterance embeddings in utterance order, stacked into one NumPy array
    # with one row per utterance.
    texts_embeded_list[dialog_id] = np.array(
        [texts_embeded[dialog_id][i] for i in ordered_utt_ids]
    )

In [15]:
# Unique dialogue ids belonging to each split (train / test / dev).
train_index = list(set(train_data['Dialog_id']))
test_index = list(set(test_data['Dialog_id']))
dev_index = list(set(valid_data['Dialog_id']))

In [16]:
# Bundle everything into one list. Positions:
#   0 utt_ids, 1 speakers_list, 2 labels_list, 3 texts_embeded_list,
#   4 texts_list, 5 train_index, 6 test_index, 7 dev_index
data_processed = [utt_ids, speakers_list, labels_list, texts_embeded_list, texts_list, train_index, test_index, dev_index]

In [17]:
# Sanity check: show the five per-dialogue fields (utterance ids, speakers,
# labels, embeddings, texts) for the dialogue with id 1.
print("Hội thoại 1: ")
for field in data_processed[:5]:
    print(field[1])

Hội thoại 1: 
[1, 2, 3]
[[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
[0, 1, 4]
[[ 0.34161964  0.33523902  0.0203289  ... -0.43199632 -0.01290933
  -0.6750083 ]
 [ 0.16943173  0.67860115  0.45725024 ... -0.08745073  0.29054123
  -0.11620389]
 [-0.00783821 -0.07530842  0.06312995 ...  0.0498473  -0.10333988
   0.05112461]]
['Bao tiền', 'bạn có khum haha', 'nổ giá đii đừng ib .']


In [18]:
import pickle

# Serialise the bundled features to a pickle file in the working directory;
# the file name carries the label-set size.
output_path = f'Data_test_{no_label_data}.pkl'
with open(output_path, 'wb') as out_file:
    pickle.dump(data_processed, out_file)