In [1]:
# Importing the libraries needed
import pandas as pd
import numpy as np

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import seaborn as sns
import transformers
import json
from tqdm import tqdm

from transformers import AutoTokenizer, AutoModel
import emoji

import pickle
import logging
import random
from underthesea import word_tokenize 
import re
from sklearn.metrics import f1_score, accuracy_score
import warnings

# Ignore all warnings
warnings.filterwarnings("ignore")
logging.basicConfig(level=logging.ERROR)

# Load data

In [2]:
# Select the dataset variant that has `no_label_data` emotion labels.
# This value picks the data folder below and sets the classifier's output size.
no_label_data = 7

In [3]:
# Paths to the cleaned full dataset and its train/dev/test splits
data_path = f"../DataPreprocess/Cleaned_Data/{no_label_data}_label/clean_data.csv"
train_data_path = f"../DataPreprocess/Cleaned_Data/{no_label_data}_label/train_data.csv"
dev_data_path = f"../DataPreprocess/Cleaned_Data/{no_label_data}_label/dev_data.csv"
test_data_path = f"../DataPreprocess/Cleaned_Data/{no_label_data}_label/test_data.csv"

In [7]:
# Preprocessing leaves some input texts as NaN, so replace them with "" via fillna("")
df = pd.read_csv(data_path).fillna("")
train_data = pd.read_csv(train_data_path).fillna("")
dev_data = pd.read_csv(dev_data_path).fillna("")
test_data = pd.read_csv(test_data_path).fillna("")
# Report the split sizes and the schema of the full dataset as a sanity check
print(f"Train Size: {train_data.shape}")
print(f"Dev Size: {dev_data.shape}")
print(f"Test Size: {test_data.shape}")
df.info()

Train Size: (10888, 12)
Dev Size: (3077, 12)
Test Size: (1568, 12)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15533 entries, 0 to 15532
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Index            15533 non-null  object
 1   Utterance        15533 non-null  object
 2   Speaker          15533 non-null  object
 3   Id_speaker       15533 non-null  uint64
 4   Utterance_id     15533 non-null  int64 
 5   Date             15533 non-null  object
 6   Time             15533 non-null  object
 7   Emotion          15533 non-null  object
 8   Emotion_Mutiple  15533 non-null  object
 9   Dialog_id        15533 non-null  int64 
 10  Label            15533 non-null  int64 
 11  Utterance_clean  15533 non-null  object
dtypes: int64(3), object(8), uint64(1)
memory usage: 1.4+ MB


### Random seed

In [8]:
def seed_everything(seed=2021):
    """Seed every random number generator used by the notebook.

    Prints the seed, then seeds Python's `random`, NumPy, and PyTorch
    (CPU plus current and all CUDA devices). Finally puts cuDNN in
    deterministic mode with autotuning (benchmark) switched off.
    """
    print(seed)

    # Each seeder takes the same integer; they are applied in a fixed order.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

In [9]:
# Seed all RNGs with the default seed (2021) before the datasets and model are built
seed_everything()

2021


### DataLoader

In [10]:
# Declare hyperparameters and shared objects
MAX_LEN = 256  # passed to EmotionData as the tokenizer's max_length
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 4  # used for both the dev and the test loaders
LEARNING_RATE = 1e-05
# NOTE(review): truncation is requested per call inside EmotionData; it is unclear whether
# passing truncation=True to from_pretrained has any effect — confirm against the library docs.
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-large", truncation=True)
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
class EmotionData(Dataset):
    """Torch dataset that tokenizes (utterance, speaker id) pairs for emotion classification.

    Args:
        dataframe: pandas DataFrame with the columns `Utterance_clean`,
            `Id_speaker` and `Label`.
        tokenizer: object exposing `encode_plus` (a Hugging Face tokenizer).
        max_len: length every encoded sequence is padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.Utterance_clean
        self.speaker_ids = dataframe.Id_speaker
        self.targets = self.data.Label
        self.max_len = max_len

    def __len__(self):
        """Number of utterances in the dataset."""
        return len(self.text)

    def __getitem__(self, index):
        """Encode the sample at position `index`.

        Returns:
            dict with `ids`, `mask`, `token_type_ids` (long tensors of length
            `max_len`) and `targets` (scalar float tensor holding the label;
            the training loop casts it to long).
        """
        # DataLoader samplers yield positions in [0, len); use .iloc so the lookup
        # also works when the frame's index is not a default RangeIndex
        # (e.g. after filtering or sampling rows).
        text = str(self.text.iloc[index])
        # Collapse any run of whitespace into a single space
        text = " ".join(text.split())
        speaker_id = str(self.speaker_ids.iloc[index])

        # The speaker id is encoded as the second segment of the pair; only the
        # utterance (first segment) is truncated when the pair is too long.
        inputs = self.tokenizer.encode_plus(
            text,
            speaker_id,
            add_special_tokens=True,
            max_length=self.max_len,
            # `pad_to_max_length=True` is deprecated; this is its replacement.
            padding='max_length',
            truncation='only_first',
            return_token_type_ids=True
        )

        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [12]:
# Nhập dữ liệu
training_set = EmotionData(train_data, tokenizer, MAX_LEN)
valid_set = EmotionData(dev_data, tokenizer, MAX_LEN)
testing_set = EmotionData(test_data, tokenizer, MAX_LEN)
full_set = EmotionData(df, tokenizer, MAX_LEN)

In [13]:
# training_set[0]

### Model for classification

In [14]:
class PhoBertClass(torch.nn.Module):
    """PhoBERT-large encoder followed by a two-layer classification head.

    The head is Linear -> ReLU -> Dropout(0.3) -> Linear applied to the hidden
    state of the first token, and outputs `no_label_data` logits per sample.
    """

    def __init__(self):
        super(PhoBertClass, self).__init__()
        self.l1 = AutoModel.from_pretrained("vinai/phobert-large")
        self.tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-large")
        # Size the head from the encoder config rather than hard-coding 1024,
        # so the layers always match the loaded checkpoint.
        hidden_size = self.l1.config.hidden_size
        self.pre_classifier = torch.nn.Linear(hidden_size, hidden_size)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(hidden_size, no_label_data)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return raw (unnormalized) class logits, one row per input sequence."""
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_state = output_1[0]
        # Use the first token's hidden state as the sequence representation
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.functional.relu(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

    def predict(self, X):
        """Predict class indices for raw text.

        Args:
            X: a string or a list of strings.

        Returns:
            1-D long tensor with one predicted class index per input text.
        """
        # Tokenize the input text; request token_type_ids explicitly because
        # forward() requires them.
        inputs = self.tokenizer(
            X,
            padding=True,
            truncation=True,
            return_token_type_ids=True,
            return_tensors="pt",
        )

        # Move inputs to the same device as the model
        target_device = self.l1.device
        input_ids = inputs["input_ids"].to(target_device)
        attention_mask = inputs["attention_mask"].to(target_device)
        token_type_ids = inputs["token_type_ids"].to(target_device)

        # Inference must run with dropout disabled; restore the previous mode
        # afterwards so calling predict() in the middle of training is safe.
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                logits = self.forward(input_ids, attention_mask, token_type_ids)
                # argmax over logits equals argmax over softmax probabilities
                predicted_labels = torch.argmax(logits, dim=1)
        finally:
            self.train(was_training)
        return predicted_labels

    def predict_all(self, X):
        """Predict labels as a list of Python ints, with a best-effort fallback.

        Args:
            X: a single text, or a list/tuple of texts.

        Returns:
            List of int labels, one per text. When prediction fails for a text,
            the failure is logged and the label predicted for the empty string
            is used instead.
        """
        texts = list(X) if isinstance(X, (list, tuple)) else [X]
        preds = []
        for text in texts:
            try:
                # Use this instance, not a notebook-global `model`
                preds.append(int(self.predict(text)))
            except Exception as e:
                logging.error("predict failed for %r (%s); falling back to empty input", text, e)
                preds.append(int(self.predict('')))
        return preds


In [15]:
# Instantiate the classifier and move it to the selected device
model = PhoBertClass().to(device)

Some weights of the model checkpoint at vinai/phobert-large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### Fine-tuning

In [17]:
# Creating the loss function and optimizer
# CrossEntropyLoss is applied to the raw logits returned by the model and integer class targets.
loss_function = torch.nn.CrossEntropyLoss()
# A single Adam parameter group covers every model parameter (encoder and head alike).
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE, weight_decay=0.001)
# Bare expression: display the optimizer configuration as the cell output
optimizer

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 1e-05
    weight_decay: 0.001
)

In [18]:
def calculate_f1_score(y_pred, y_true):
    """Weighted-average F1 of predictions `y_pred` against labels `y_true` (both tensors)."""
    # Copy both tensors to the CPU before scoring them
    truth, guess = y_true.cpu(), y_pred.cpu()
    return f1_score(truth, guess, average='weighted')
def calcuate_accuracy(y_pred, y_true):
    """Accuracy of predictions `y_pred` against labels `y_true` (both tensors)."""
    # Copy both tensors to the CPU before scoring them
    truth, guess = y_true.cpu(), y_pred.cpu()
    return accuracy_score(truth, guess)

In [19]:
# Train
def train(model, epoch, data_loader):
    """Run one training epoch and return the weighted F1 score on the training data.

    Uses the notebook-level `device`, `loss_function` and `optimizer`.

    Args:
        model: module called as `model(ids, mask, token_type_ids)` returning logits.
        epoch: epoch number, shown in the progress bar description.
        data_loader: yields dicts with `ids`, `mask`, `token_type_ids`, `targets`.

    Returns:
        Weighted F1 score over all batches seen in this epoch.

    Raises:
        ValueError: if `data_loader` yields no batches.
    """
    batch_preds = []
    batch_labels = []
    tr_loss = 0
    nb_tr_steps = 0

    model.train()
    # Iterate the loader directly so tqdm can read its length and show a total
    for data in tqdm(data_loader, desc=f"Train epoch {epoch}"):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        nb_tr_steps += 1

        # detach() replaces the legacy `.data` access; predictions need no gradient
        _, big_idx = torch.max(outputs.detach(), dim=1)
        batch_preds.append(big_idx)
        batch_labels.append(targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if nb_tr_steps == 0:
        raise ValueError("data_loader yielded no batches; nothing to train on")

    # Concatenate once instead of re-allocating a growing tensor every batch
    preds = torch.cat(batch_preds)
    labels = torch.cat(batch_labels)
    epoch_f1 = calculate_f1_score(preds, labels)

    print(f"Training Loss Epoch: {tr_loss/nb_tr_steps}")
    print(f"Training Accuracy Epoch: {calcuate_accuracy(preds, labels)}")
    print(f"Training F1_Score Epoch: {epoch_f1}")

    return epoch_f1

In [20]:
# Valid and test
def valid(model, data_loader):
    """Evaluate `model` on `data_loader` without updating any weights.

    Uses the notebook-level `device` and `loss_function`.

    Args:
        model: module called as `model(ids, mask, token_type_ids)` returning logits.
        data_loader: yields dicts with `ids`, `mask`, `token_type_ids`, `targets`.

    Returns:
        Tuple `(f1, preds)`: the weighted F1 score and a 1-D tensor of predicted
        class indices in loader order (on the same device as the model outputs).

    Raises:
        ValueError: if `data_loader` yields no batches.
    """
    model.eval()
    batch_preds = []
    batch_labels = []
    total_loss = 0
    nb_steps = 0
    with torch.no_grad():
        # Iterate the loader directly so tqdm can read its length and show a total
        for data in tqdm(data_loader, desc="Eval"):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.long)

            outputs = model(ids, mask, token_type_ids)
            loss = loss_function(outputs, targets)
            total_loss += loss.item()
            nb_steps += 1

            # Index of the largest logit is the predicted class
            _, big_idx = torch.max(outputs, dim=1)
            batch_preds.append(big_idx)
            batch_labels.append(targets)

    if nb_steps == 0:
        raise ValueError("data_loader yielded no batches; nothing to evaluate")

    # Concatenate once instead of re-allocating a growing tensor every batch
    preds = torch.cat(batch_preds)
    labels = torch.cat(batch_labels)
    epoch_f1 = calculate_f1_score(preds, labels)

    print(f"Loss Epoch: {total_loss/nb_steps}")
    print(f"Accuracy Epoch: {calcuate_accuracy(preds, labels)}")
    print(f"F1_Score Epoch: {epoch_f1}")

    return epoch_f1, preds

In [21]:
# Initialize the data loaders
# Training batches are reshuffled on every pass over the loader
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

# Evaluation keeps the dataset order so predictions line up with the rows
valid_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

train_loader = DataLoader(training_set, **train_params)
valid_loader = DataLoader(valid_set, **valid_params)
test_loader = DataLoader(testing_set, **valid_params)
# Loader over the full cleaned dataset, built with the training parameters
data_loader = DataLoader(full_set, **train_params)

# Train

In [None]:
# Train: keep the checkpoint with the best weighted F1 on the dev set
EPOCHS = 8
best_f1 = 0
for epoch in range(EPOCHS):
    train(model, epoch, train_loader)
    f1, _ = valid(model, valid_loader)
    if best_f1 < f1:
        best_f1 = f1
        # Use a context manager so the checkpoint file is flushed and closed
        # as soon as the dump finishes (the bare open() leaked the handle).
        with open('Best_model.pkl', 'wb') as checkpoint_file:
            pickle.dump(model, checkpoint_file)

        output_file_path = f'phobert_{no_label_data}_id'

        # Also export the fine-tuned encoder and tokenizer in Hugging Face format;
        # the classification head is only stored in the pickle above.
        model_to_save = model.l1
        model_to_save.save_pretrained(output_file_path)
        tokenizer.save_pretrained(output_file_path)

        print(f'Save EPOCH {epoch}')

341it [07:58,  1.40s/it]


Training Loss Epoch: 1.3179061928452633
Training Accuracy Epoch: 0.49889786921381335
Training F1_Score Epoch: 0.44693321082637444


770it [00:50, 15.36it/s]


Valid Loss Epoch: 1.1099855108114032
Valid Accuracy Epoch: 0.6100097497562561
Valid F1_Score Epoch: 0.5670201796135781
Save EPOCH 0


341it [07:58,  1.40s/it]


Training Loss Epoch: 0.9994463726572277
Training Accuracy Epoch: 0.644746509919177
Training F1_Score Epoch: 0.6073465831932118


770it [00:50, 15.36it/s]


Valid Loss Epoch: 1.0066336103461004
Valid Accuracy Epoch: 0.6327591810204745
Valid F1_Score Epoch: 0.5956425403652843
Save EPOCH 1


341it [07:58,  1.40s/it]


Training Loss Epoch: 0.876072172545268
Training Accuracy Epoch: 0.6939750183688465
Training F1_Score Epoch: 0.6668697486849804


328it [00:21, 15.33it/s]

# Test

In [None]:
# Reload the best checkpoint written during training.
# NOTE: pickle.load can execute arbitrary code from the file; only load
# checkpoints produced by this notebook, never files from untrusted sources.
with open('Best_model.pkl', 'rb') as checkpoint_file:
    best_model = pickle.load(checkpoint_file)

In [None]:
# f1, predicts = valid(best_model, valid_loader)

In [None]:
# Evaluate the best checkpoint on the test split; `predicts` holds the predicted class indices
f1, predicts = valid(best_model, test_loader)