In [None]:
!pip install transformers

In [1]:
import os

# Make the experiment folder the working directory for the rest of the notebook.
# NOTE(review): absolute path, presumably a Google Drive mount inside Colab — confirm
# the drive is mounted first; os.chdir raises FileNotFoundError if the folder is missing.
os.chdir("/content/drive/MyDrive/Notebooks/PAPER_EXPERIMENT")

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer
from transformers.optimization import AdamW
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
from tqdm import tqdm
import warnings

# Pick the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Suppress every Python warning from here on (deprecation notices included).
warnings.filterwarnings("ignore")

# Create Dataloader

In [None]:
class MyDataset(Dataset):
    """Map-style dataset that serves one review together with its index arrays.

    The review texts come from the ``review_tokenized`` column of a CSV file and
    the remaining fields from four ``.npy`` files. Entry ``idx`` of every source
    is returned together, so all sources are assumed to be aligned row by row
    (TODO confirm against the preprocessing step).
    """

    def __init__(self,
                 review_path,
                 dim_path,
                 word_index_path,
                 dim_attention_mask_path,
                 sentiment_label_path):
        """
        :param review_path: CSV file with a ``review_tokenized`` column
        :param dim_path: ``.npy`` file served under the key ``dim_index``
        :param word_index_path: ``.npy`` file served under the key ``word_index``
        :param dim_attention_mask_path: ``.npy`` file served under ``dim_attention_mask``
        :param sentiment_label_path: ``.npy`` file served under ``sentiment_label``
        """
        super().__init__()
        tokenized_column = pd.read_csv(review_path)["review_tokenized"]
        # Each cell holds the textual form of a token sequence; evaluate it and
        # glue the tokens back together without a separator.
        # NOTE(review): eval() executes arbitrary expressions found in the CSV.
        # If the file can come from an untrusted source, switch to
        # ast.literal_eval after confirming every cell is a plain literal.
        self.review = ["".join(eval(cell)) for cell in tokenized_column]
        self.dim = np.load(dim_path)
        self.word_index = np.load(word_index_path)
        self.dim_attention_mask = np.load(dim_attention_mask_path)
        self.sentiment_label = np.load(sentiment_label_path)

    def __len__(self):
        """Number of reviews read from the CSV file."""
        return len(self.review)

    def __getitem__(self, idx):
        """Return the sample at ``idx``: the raw review string plus four LongTensors."""
        sample = {"review": self.review[idx]}
        array_fields = (
            ("dim_index", self.dim),
            ("word_index", self.word_index),
            ("dim_attention_mask", self.dim_attention_mask),
            ("sentiment_label", self.sentiment_label),
        )
        for key, source in array_fields:
            sample[key] = torch.LongTensor(source[idx])
        return sample

In [None]:
def get_DataLoader(dataset, batch_size=8, drop_last=True, shuffle=True):
    """Wrap ``dataset`` in a ``torch.utils.data.DataLoader``.

    :param dataset: dataset to iterate over
    :param batch_size: number of samples per batch
    :param drop_last: drop the final batch when it is smaller than ``batch_size``
    :param shuffle: reshuffle the samples on every pass
    :return: the configured ``DataLoader``
    """
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
    )

# Define Weighted Embedding Module

In [None]:
class WeightedEmbedding(nn.Module):
    """Build word-level vectors out of character-level encoder states.

    Each word is described by a triple ``(word_id, start, end)``. Depending on
    the character span ``[start, end)`` the word vector is:

    * a similarity-weighted average of the span's character states, with
      weights ``softmax(word_vector . char_states)``, when the span covers more
      than one character;
    * the single character state when the span covers exactly one character;
    * the frozen pre-trained word vector when the span ends at or beyond the
      encoder sequence length;
    * for the first empty span (``end <= start``), that triple's pre-trained
      vector is copied into this and every remaining slot of the review.
    """

    def __init__(self, word_embedding):
        """
        :param word_embedding: size=(vocab_size, 768)
        """
        super(WeightedEmbedding, self).__init__()
        # as_tensor accepts numpy arrays as well as tensors and enforces float32.
        word_embedding = torch.as_tensor(word_embedding, dtype=torch.float)
        self.word_embedding = nn.Embedding.from_pretrained(word_embedding)
        self.embedding_size = word_embedding.shape[1]
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, ernie_output, word_index):
        """
        :param ernie_output: size=(batch_size, 512, 768)
        :param word_index: size=(batch_size, 300, 3), rows are (word_id, start, end)
        :return weighted_embedding: size=(batch_size, 300, 768)
        Note that the sequence includes [CLS]/[SEP]/[PAD] tokens
        """
        # Keep every tensor on the encoder output's device. The previous
        # torch.LongTensor(...) conversion only accepted CPU data and the
        # result buffer was always allocated on the CPU.
        word_index = torch.as_tensor(
            word_index, dtype=torch.long, device=ernie_output.device
        )
        batch_size = word_index.shape[0]
        max_seq_len = word_index.shape[1]
        ernie_seq_len = ernie_output.size(1)

        weighted_embeddings = ernie_output.new_zeros(
            (batch_size, max_seq_len, self.embedding_size)
        )
        # Span boundaries only drive control flow, so read them once as Python
        # ints instead of comparing 0-dim tensors inside the loops.
        spans = word_index[:, :, 1:].tolist()
        for i, review_spans in enumerate(spans):
            for j, (start, end) in enumerate(review_spans):
                if end >= ernie_seq_len:
                    # Span is outside the encoder sequence: use the static vector.
                    weighted_embeddings[i, j] = self.word_embedding(word_index[i, j, 0])
                    continue
                span_len = end - start
                if span_len > 1:
                    word_vector = self.word_embedding(word_index[i, j, 0]).unsqueeze(0)  # size=(1, 768)
                    char_embedding = ernie_output[i, start:end, :]  # size=(span, 768)
                    weights = self.softmax(
                        torch.matmul(word_vector, char_embedding.transpose(0, 1))
                    )  # size=(1, span)
                    weighted_embeddings[i, j] = torch.matmul(weights, char_embedding).squeeze(0)
                elif span_len == 1:
                    weighted_embeddings[i, j] = ernie_output[i, start]
                else:
                    # Empty span: fill this and all remaining slots, then stop.
                    weighted_embeddings[i, j:] = self.word_embedding(word_index[i, j, 0])
                    break
        return weighted_embeddings

In [None]:
# test the module

# torch.manual_seed(42)
# embeddings = torch.rand([10, 3])
# print(embeddings)
# print()
# ernie_outputs = torch.rand([1, 12, 3])
# print(ernie_outputs)
# print()
# word_index = np.array([
#     [[1, 1, 2],
#      [5, 3, 4],
#      [6, 4, 6],
#      [7, 6, 11],
#      [0, 11, 11]]
# ])
# weighted_embedding_layer = WeightedEmbedding(embeddings)
# weighted_embedding_layer(ernie_outputs, word_index)

# Define Dimension Attention Module

In [None]:
class DimensionAttention(nn.Module):
    """Attend from each dimension word over the words of a review.

    Review words and dimension words are looked up in the same frozen
    embedding table. For every dimension an attention distribution over the
    review's words is computed and used to average the word vectors. Slots
    whose mask value is 1 are replaced by the embedding stored at index 0.
    """

    def __init__(self, word_embeddings):
        """
        :param word_embeddings: size=(vocab_size, 768)
        """
        super(DimensionAttention, self).__init__()
        word_embeddings = torch.as_tensor(word_embeddings, dtype=torch.float)
        # Take the width from the table itself instead of hard-coding 768.
        embedding_size = word_embeddings.shape[1]
        self.linear = nn.Linear(embedding_size, embedding_size)
        self.tanh = nn.Tanh()
        self.softmax = nn.Softmax(dim=1)  # normalises over the word axis
        self.embedding = nn.Embedding.from_pretrained(word_embeddings)

    def forward(self, word_index, dim_index, dim_attention_mask):
        """
        :param word_index: size=(batch_size, 300, 3)
        :param dim_index: size=(batch_size, 5)
        :param dim_attention_mask: size=(batch_size, 5), 1 marks a slot to overwrite
            with the index-0 embedding
        :return attention_dim_embedding: (batch_size, 5, 768)
        """
        # Work on the device that holds the embedding table. The previous
        # torch.LongTensor(...) conversions only accepted CPU data.
        device = self.embedding.weight.device
        word_ids = torch.as_tensor(word_index, dtype=torch.long, device=device)[:, :, 0]  # size=(batch_size, 300)
        dim_index = torch.as_tensor(dim_index, dtype=torch.long, device=device)
        batch_size = word_ids.shape[0]

        word_embedding = self.embedding(word_ids)  # size=(batch_size, 300, 768)
        dim_embedding = self.embedding(dim_index)  # size=(batch_size, 5, 768)
        mask = torch.as_tensor(dim_attention_mask, device=device)
        mask = mask.reshape(batch_size, -1, 1).to(word_embedding.dtype)  # size=(batch_size, 5, 1)
        # Row 0 of the table; broadcasts against (batch_size, 5, 768) below.
        padding_embedding = self.embedding.weight[0]

        U = self.tanh(self.linear(word_embedding))  # size=(batch_size, 300, 768)
        alpha = self.softmax(
            torch.bmm(U, dim_embedding.transpose(1, 2))
        )  # size=(batch_size, 300, 5)
        attention_dim_embedding = torch.bmm(alpha.transpose(1, 2), word_embedding)  # size=(batch_size, 5, 768)
        attention_dim_embedding = attention_dim_embedding * (1 - mask) + \
                                    padding_embedding * mask  # size=(batch_size, 5, 768)
        return attention_dim_embedding

# Define Top-k Pool Module

In [None]:
class TopKPool(nn.Module):
    """Pool the last axis by averaging its ``k`` largest entries."""

    def __init__(self, k):
        """
        :param k: how many of the largest values to average
        """
        super().__init__()
        self.k = k

    def forward(self, x):
        """
        :param x: (batch_size, seq_len, embedding_size)
        :return: (batch_size, seq_len)
        """
        largest_values, _ = torch.topk(x, self.k, dim=-1)
        return largest_values.mean(dim=-1)

# Define Main Module

In [None]:
class MainModule(nn.Module):
    """ERNIE encoder + weighted word embeddings + dimension attention + Bi-LSTM head.

    The forward pass produces, per review, a softmax distribution over 4
    classes for each of 18 outputs (shape ``(batch_size, 18, 4)``).
    """

    def __init__(self, embeddings):
        """
        :param embeddings: size=(vocab_size, 768), dtype=numpy.array
        """
        super(MainModule, self).__init__()

        self.word_embeddings = torch.FloatTensor(embeddings)
        # Kept for backward compatibility; forward() in this class does not use it.
        self.embedding = nn.Embedding.from_pretrained(self.word_embeddings, freeze=True)
        self.ernie = AutoModel.from_pretrained("nghuyong/ernie-1.0")
        self.weighted_embedding = WeightedEmbedding(self.word_embeddings)
        self.dim_attention = DimensionAttention(self.word_embeddings)

        # Half the embedding width per direction, so the bidirectional output
        # is as wide as the embeddings again.
        self.lstm_dim = embeddings.shape[1] // 2  # must be <int> type
        self.bi_lstm = nn.LSTM(
            input_size=embeddings.shape[1], hidden_size=self.lstm_dim,
            bidirectional=True, batch_first=True
        )

        # Kernel spans the whole feature axis, so each position collapses to one value.
        self.pool = nn.MaxPool1d(embeddings.shape[1], stride=1)
        # 305 = 300 word slots + 5 dimension slots; 72 = 18 outputs * 4 classes.
        self.linear = nn.Linear(305, 72)
        self.softmax = nn.Softmax(dim=-1)

    def init_hidden(self, batch_size, device=None):
        """Draw a random initial (h0, c0) pair for the bidirectional LSTM.

        :param batch_size: number of sequences in the batch
        :param device: where to allocate the states; defaults to the LSTM's device
        :return: two tensors of size=(2, batch_size, lstm_dim)
        """
        if device is None:
            device = self.bi_lstm.weight_ih_l0.device
        # The attribute is `lstm_dim`; the former `lstm_hid_dim` was never defined.
        state_shape = (2, batch_size, self.lstm_dim)
        # NOTE(review): random initial states make evaluation non-deterministic;
        # kept as in the original design.
        return (torch.randn(state_shape, device=device),
                torch.randn(state_shape, device=device))

    def forward(self, input_ids, token_type_ids, attention_mask,
                word_index, dim_index, dim_attention_mask):
        """
        :param input_ids / token_type_ids / attention_mask: size=(batch_size, 512)
        :param word_index: size=(batch_size, 300, 3)
        :param dim_index: size=(batch_size, 5)
        :param dim_attention_mask: size=(batch_size, 5)
        :return: size=(batch_size, 18, 4)
        """
        batch_size = input_ids.shape[0]

        # get ERNIE char embeddings, size = (batch_size, 512, 768)
        ernie_output = self.ernie(
            input_ids=input_ids, token_type_ids=token_type_ids,
            attention_mask=attention_mask
        )
        ernie_char_embeddings = ernie_output.last_hidden_state

        # get weighted word embeddings based on char-word similarity
        # size = (batch_size, 300, 768)
        # Pass the hidden-state tensor, not the encoder's output object: the
        # sub-module calls .size() on it and slices it.
        weighted_embeddings = self.weighted_embedding(
            ernie_char_embeddings, word_index
        )

        # get dimension word embedding based on review-dimension attention mechanism
        # size = (batch_size, 5, 768)
        att_dim_embeddings = self.dim_attention(
            word_index, dim_index, dim_attention_mask
        )

        # fuse the weighted word embeddings and dimension word embeddings
        # if concatenate, size = (batch_size, 305, 768)
        fuse_embeddings = torch.cat([weighted_embeddings, att_dim_embeddings], dim=1)

        # Bi-LSTM layer, size = (batch_size, 305, 768)
        hidden_state = self.init_hidden(batch_size, device=fuse_embeddings.device)
        lstm_output, hidden_state = self.bi_lstm(fuse_embeddings, hidden_state)

        # max pooling layer, size = (batch_size, 305, 1)
        pooled_output = self.pool(lstm_output)

        # squeeze, size = (batch_size, 305)
        squeeze_output = pooled_output.squeeze(dim=2)

        # linear layer, size = (batch_size, 72)
        linear_output = self.linear(squeeze_output)

        # reshape, size = (batch_size, 18, 4)
        # view() keeps the autograd graph; resize_() is rejected on tensors
        # that require grad.
        linear_output = linear_output.view(batch_size, 18, 4)

        # softmax layer, size = (batch_size, 18, 4)
        output = self.softmax(linear_output)

        return output

# Define Loss Function

In [None]:
def loss_func(y_pred, target, eps=1e-12):
    """
    Loss function for multi-output classification: cross-entropy between the
    predicted probabilities and the targets, averaged over samples and outputs.

    :param y_pred, target: size=(batch_size, 18, 4)
    :param eps: lower bound applied to ``y_pred`` before the logarithm
    :return: scalar loss tensor
    """
    target = target.float()
    num_samples = target.size(0)
    num_dimensions = target.size(1)
    # Without the clamp a probability of exactly 0 gives log(0) = -inf, and
    # 0 * -inf turns the whole loss into NaN.
    log_pred = torch.log(y_pred.clamp_min(eps))
    loss = -(target * log_pred).sum() / (num_dimensions * num_samples)
    return loss

# Define Evaluate Metric Function

In [None]:
def metrics(y_pred, y_true):
    """
    Calculate precision, recall and f1-score.

    A macro-averaged classification report is computed for every output
    dimension separately, and the three scores are then averaged over the
    dimensions.

    :param y_pred: size=(batch_size, 18, 4)
    :param y_true: size=(batch_size, 18, 4), one-hot labels
    :return: (precision, recall, f1_score) as floats
    """
    # detach() first: numpy() refuses tensors that are still attached to the
    # autograd graph, which is the case for training predictions.
    y_pred = y_pred.detach().cpu().numpy()
    y_true = y_true.detach().cpu().numpy()
    pred_label = y_pred.argmax(axis=-1)
    # For one-hot rows argmax recovers the class index directly; it does not
    # depend on the batch size or on a fixed number of dimensions.
    true_label = y_true.argmax(axis=-1)
    num_dimensions = pred_label.shape[1]
    precision_lst, recall_lst, f1_lst = [], [], []
    for i in range(num_dimensions):
        rtn = classification_report(
            true_label[:, i], pred_label[:, i], digits=4, output_dict=True
        )
        macro_avg = rtn["macro avg"]
        precision_lst.append(macro_avg["precision"])
        recall_lst.append(macro_avg["recall"])
        f1_lst.append(macro_avg["f1-score"])
    precision = np.mean(precision_lst)
    recall = np.mean(recall_lst)
    f1_score = np.mean(f1_lst)
    return precision, recall, f1_score

# Define Training Function

In [None]:
def _encode_batch(tokenizer, batch_data, device):
    """Tokenize one batch of reviews and move every model input to ``device``.

    :return: (keyword arguments for the model, sentiment labels on ``device``)
    """
    encoded = tokenizer(batch_data["review"], padding="max_length", truncation=True)
    model_inputs = {
        key: torch.LongTensor(value).to(device) for (key, value) in encoded.items()
    }
    for key in ("word_index", "dim_index", "dim_attention_mask"):
        model_inputs[key] = batch_data[key].to(device)
    return model_inputs, batch_data["sentiment_label"].to(device)


def train(train_loader, dev_loader, embeddings,
          epochs, lr, warmup_rate, device=None):
    """
    Main training function: trains a MainModule and evaluates it after every epoch.

    :param train_loader: loader yielding training batches (dicts with the keys
        "review", "word_index", "dim_index", "dim_attention_mask", "sentiment_label")
    :param dev_loader: loader yielding evaluation batches of the same layout
    :param embeddings: pre-trained word embedding matrix handed to MainModule
    :param epochs: number of passes over ``train_loader``
    :param lr: learning rate for AdamW
    :param warmup_rate: fraction of all optimisation steps used for linear warm-up
    :param device: torch device to run on; defaults to CUDA when available, else CPU
    :return: the trained model
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    tokenizer = AutoTokenizer.from_pretrained("nghuyong/ernie-1.0", model_max_length=512)

    model = MainModule(embeddings)
    model.to(device)

    num_training_steps = epochs * len(train_loader)
    num_warmup_steps = int(warmup_rate * num_training_steps)
    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps
    )

    for epoch in range(epochs):

        # ---- Training stage ----
        model.train()
        print()
        print(f"Running EPOCH {epoch+1}")
        print()
        train_loss = []
        precision = []
        recall = []
        f1_score = []

        for batch_data in tqdm(train_loader):
            optimizer.zero_grad()
            model_inputs, sentiment_label = _encode_batch(tokenizer, batch_data, device)
            y_pred = model(**model_inputs)
            loss = loss_func(y_pred, sentiment_label)
            loss.backward()
            optimizer.step()
            scheduler.step()
            train_loss.append(loss.item())
            # Metrics only need values; detach so no autograd graph is carried along.
            p, r, f1 = metrics(y_pred.detach(), sentiment_label)
            precision.append(p)
            recall.append(r)
            f1_score.append(f1)

        epoch_loss = np.mean(train_loss)
        epoch_p = np.mean(precision)
        epoch_r = np.mean(recall)
        epoch_f1 = np.mean(f1_score)

        print(f"EPOCH {epoch+1} Train End : avg_loss = {epoch_loss:.4f}")
        print(f"precision: {epoch_p:.4f}, recall: {epoch_r:.4f}, f1-score:{epoch_f1:.4f}")
        print()

        # ---- Evaluation stage ----
        model.eval()
        val_loss = []
        val_precision = []
        val_recall = []
        val_f1 = []

        with torch.no_grad():
            for batch_data in tqdm(dev_loader):
                model_inputs, sentiment_label = _encode_batch(tokenizer, batch_data, device)
                y_pred = model(**model_inputs)
                loss = loss_func(y_pred, sentiment_label)
                p, r, f1 = metrics(y_pred, sentiment_label)
                val_loss.append(loss.item())
                val_precision.append(p)
                val_recall.append(r)
                val_f1.append(f1)

        epoch_val_loss = np.mean(val_loss)
        epoch_val_p = np.mean(val_precision)
        epoch_val_r = np.mean(val_recall)
        epoch_val_f1 = np.mean(val_f1)

        print(f"EPOCH {epoch+1} Evaluation End : avg_loss = {epoch_val_loss:.4f}")
        print(f"precision: {epoch_val_p:.4f}, recall: {epoch_val_r:.4f}, f1-score:{epoch_val_f1:.4f}")
        print()
        print("="*60)

    # Hand the trained model back; previously it was lost when the function returned.
    return model