# Build the dataset

In [1]:
from torch.utils.data import Dataset, DataLoader, ConcatDataset
from transformers import BertTokenizer
import torch
from torch.nn.utils.rnn import pad_sequence
import pandas as pd

In [2]:
# Load the tokenizer for "bert-base-uncased", the same checkpoint name the model cell passes to bert_ABSA.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [3]:
class absa_dataset(Dataset):
    """Aspect-sentiment samples built from a DataFrame of stringified lists.

    The first three columns of each row are read as (tokens, tags,
    polarities), each stored as the text of a Python list, e.g.
    "['the', 'battery']" and "[-1, 2]".  A polarity of -1 marks a word that
    is not part of the aspect; any other value is taken as the sample label.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows; only the first three columns are used.
    tokenizer : object
        Must provide ``tokenize(str) -> list[str]`` and
        ``convert_tokens_to_ids(list[str]) -> list[int]``.
    cls_token, sep_token : str, optional
        Marker strings placed before the sentence and between sentence and
        aspect.  The defaults keep the notebook's original lower-case
        markers so existing checkpoints still see the same inputs.
        NOTE(review): with "bert-base-uncased" the recorded output of the
        inspection cell shows both default markers encoded as id 100, i.e.
        they are not recognised as BERT's special tokens.  Pass
        ``tokenizer.cls_token`` / ``tokenizer.sep_token`` (and retrain) to
        use the real ones.
    """

    def __init__(self, df, tokenizer, cls_token='[cls]', sep_token='[sep]'):
        self.df = df
        self.tokenizer = tokenizer
        self.cls_token = cls_token
        self.sep_token = sep_token

    def __getitem__(self, idx):
        """Return (word pieces, ids tensor, segment tensor, polarity tensor) for row ``idx``.

        The word-piece sequence is ``cls + sentence + sep + aspect``; segment
        ids are 0 up to and including the separator and 1 for the aspect
        pieces.  The polarity tensor is 0-dimensional.
        """
        # The tags column is unpacked to keep the column positions explicit
        # but is not needed to build the sample.
        tokens, _tags, pols = self.df.iloc[idx, :3].values
        # NOTE(review): this parser drops every apostrophe, including ones
        # inside a word — confirm that matches how the CSVs were written.
        tokens = tokens.replace("'", "").strip("][").split(', ')
        pols = pols.strip('][').split(', ')

        bert_tokens = []
        bert_att = []
        pols_label = 0  # label used when no word carries a polarity
        for i, word in enumerate(tokens):
            # Use the tokenizer given to the constructor, not a global one.
            pieces = self.tokenizer.tokenize(word)
            bert_tokens += pieces
            polarity = int(pols[i])
            if polarity != -1:
                bert_att += pieces
                # If several words carry a polarity, the last one wins.
                pols_label = polarity

        # Two extra zeros cover the leading marker and the separator.
        segment_ids = [0] * (len(bert_tokens) + 2) + [1] * len(bert_att)
        bert_tokens = [self.cls_token] + bert_tokens + [self.sep_token] + bert_att

        bert_ids = self.tokenizer.convert_tokens_to_ids(bert_tokens)

        ids_tensor = torch.tensor(bert_ids)
        pols_tensor = torch.tensor(pols_label)
        segment_tensor = torch.tensor(segment_ids)

        return bert_tokens, ids_tensor, segment_tensor, pols_tensor

    def __len__(self):
        """Number of rows in the backing DataFrame."""
        return len(self.df)

In [4]:
# Wrap each of the four CSV files (laptops/restaurants x train/test) in its own absa_dataset.
# `df` is rebound for every file, so after this cell it refers to the restaurants test frame only.
df = pd.read_csv("laptops_train.csv")
laptops_train_ds = absa_dataset(df, tokenizer)

df = pd.read_csv("laptops_test.csv")
laptops_test_ds = absa_dataset(df, tokenizer)

df = pd.read_csv("restaurants_train.csv")
restaurants_train_ds = absa_dataset(df, tokenizer)

df = pd.read_csv("restaurants_test.csv")
restaurants_test_ds = absa_dataset(df, tokenizer)

In [5]:
# Inspect one training sample: w = word pieces, x = token ids, y = segment ids, z = polarity label.
w,x,y,z = laptops_train_ds.__getitem__(121)
print(w)
print(len(w))
print(x)
print(len(x))
print(y)
print(len(y))
print(z)
# z is a 0-dim tensor (built from a single int), so it has no length to print.

['[cls]', 'the', 'battery', 'life', 'seems', 'to', 'be', 'very', 'good', ',', 'and', 'have', 'had', 'no', 'issues', 'with', 'it', '.', '[sep]', 'battery', 'life']
21
tensor([ 100, 1996, 6046, 2166, 3849, 2000, 2022, 2200, 2204, 1010, 1998, 2031,
        2018, 2053, 3314, 2007, 2009, 1012,  100, 6046, 2166])
21
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1])
21
tensor(2)


In [6]:
def create_mini_batch(samples):
    """Collate dataset samples into padded batch tensors.

    Each sample is indexed as (word pieces, ids, segment ids, label); the
    word pieces at index 0 are ignored.

    Returns (ids, segments, masks, labels): ids and segments are
    right-padded with zeros to the longest sequence in the batch, masks is a
    long tensor holding 1 wherever ids is non-zero, and labels stacks the
    per-sample label tensors.
    """
    def padded(field):
        # Zero-pad the chosen per-sample tensor to a (batch, max_len) tensor.
        return pad_sequence([sample[field] for sample in samples], batch_first=True)

    ids_tensors = padded(1)
    segments_tensors = padded(2)

    # Padding uses id 0, so every non-zero id is a position to attend to.
    masks_tensors = (ids_tensors != 0).long()

    label_ids = torch.stack([sample[3] for sample in samples])

    return ids_tensors, segments_tensors, masks_tensors, label_ids


In [7]:
# Merge the laptop and restaurant domains into one training set and one test set.
train_ds = ConcatDataset([laptops_train_ds, restaurants_train_ds])
test_ds = ConcatDataset([laptops_test_ds, restaurants_test_ds])

# Training batches are reshuffled on every pass over the loader.
train_loader = DataLoader(train_ds, batch_size=4, collate_fn=create_mini_batch, shuffle=True)
# Evaluation keeps the dataset order: the aggregate metrics do not depend on it, and a fixed
# order lets each prediction be matched back to its row in test_ds.
test_loader = DataLoader(test_ds, batch_size=50, collate_fn=create_mini_batch, shuffle=False)

In [8]:
# Peek at the first collated training batch, then stop iterating.
for batch in train_loader:
    # create_mini_batch order: w = padded token ids, x = segment ids, y = attention mask, z = labels.
    w,x,y,z = batch
    print(w)
    print(w.size())
    print(x)
    print(x.size())
    print(y)
    print(y.size())
    print(z)
    print(z.size())
    break

tensor([[  100,  1996,  2048,  2732, 27828,  2187,  3243,  2070,  2051,  3283,
          2000,  2330,  2037,  2219,  2173,  1012,   100, 27828,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0],
        [  100,  1996,  2833,  2467, 16958,  4840,  1998,  2366, 13364,  1012,
           100,  2833,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0],
        [  100,  3811, 16755,  2023,  2004,  2307,  3643,  2005,  6581, 10514,
          6182,  1998,  2326,  1012,   100,  2326,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0],
        [  100,  2174,  1010,  1045,  2001,  2045,  2005,  1037,  2147,  4596,
          2025,  2146,  3283,  2043,  2026, 11729,  2013,  2414,  4384,  1037,
          2200,  2312,  2300,  8569,  2290,  2006,  1996,  5894,  1012,   100,
 

# BERT aspect sentiment classification

In [9]:
from transformers import BertModel

In [10]:
class bert_ABSA(torch.nn.Module):
    """BERT encoder followed by a 3-way linear classifier on the pooled output.

    Parameters
    ----------
    pretrain_model : str
        Name or path handed to ``BertModel.from_pretrained``.
    """

    def __init__(self, pretrain_model):
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrain_model)
        self.linear = torch.nn.Linear(self.bert.config.hidden_size, 3)
        self.loss_fn = torch.nn.CrossEntropyLoss()

    def forward(self, ids_tensors, lable_tensors, masks_tensors, segments_tensors):
        """Return the cross-entropy loss when labels are given, else the logits.

        Parameters
        ----------
        ids_tensors : tensor of token ids, passed to BERT as ``input_ids``.
        lable_tensors : tensor of class indices, or None to get logits back.
            (The misspelt name is kept because callers pass it by keyword.)
        masks_tensors : attention mask passed to BERT (may be None).
        segments_tensors : passed to BERT as ``token_type_ids``.

        Returns
        -------
        A scalar loss tensor if ``lable_tensors`` is not None, otherwise the
        output of the linear layer (one row of 3 scores per input sequence).
        """
        bert_outputs = self.bert(
            input_ids=ids_tensors,
            attention_mask=masks_tensors,
            token_type_ids=segments_tensors,
        )
        # Index rather than tuple-unpack: the encoder may return either a
        # plain tuple or a mapping-style output object.  Unpacking the latter
        # yields its field names (strings), whereas position 1 is the pooled
        # output in both cases.
        pooled_outputs = bert_outputs[1]
        linear_outputs = self.linear(pooled_outputs)

        if lable_tensors is not None:
            return self.loss_fn(linear_outputs, lable_tensors)
        return linear_outputs

In [11]:
# Use the first CUDA device when one is available; otherwise fall back to the CPU.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Learning rate handed to the optimizer below.
lr = 2e-5
# Build the classifier on top of "bert-base-uncased" and move it to DEVICE.
model = bert_ABSA("bert-base-uncased").to(DEVICE)
# Adam over all model parameters, so the BERT encoder is updated together with the linear head.
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [12]:
import time
import numpy as np

In [13]:
def evl_time(t):
    """Split a duration ``t`` in seconds into whole (hours, minutes, seconds).

    Each component is truncated to an int, so fractional seconds are dropped.
    """
    total_minutes, seconds = divmod(t, 60)
    hours, minutes = divmod(total_minutes, 60)
    return int(hours), int(minutes), int(seconds)

def load_model(path):
    """Load weights from the checkpoint file ``path`` into the global ``model``.

    Tensors are mapped onto ``DEVICE`` while loading, so a checkpoint saved on
    a GPU can also be restored on a CPU-only machine.  Loading stays
    non-strict (mismatched keys do not raise), but any missing or unexpected
    keys are printed instead of being ignored silently.  Returns None.
    """
    # NOTE(security): torch.load unpickles the file; only load checkpoints from a trusted source.
    state_dict = torch.load(path, map_location=DEVICE)
    result = model.load_state_dict(state_dict, strict=False)
    if result.missing_keys or result.unexpected_keys:
        print("load_model: checkpoint does not fully match the model;",
              "missing keys:", list(result.missing_keys),
              "unexpected keys:", list(result.unexpected_keys))
    
def save_model(name):
    """Write the global ``model``'s state_dict to the file ``name`` via torch.save."""
    weights = model.state_dict()
    torch.save(weights, name)

def train(loader, epochs, log_every=1):
    """Fine-tune the global ``model`` with the global ``optimizer``.

    Parameters
    ----------
    loader : iterable with ``len()``
        Yields (ids, segments, masks, labels) batches, in the order produced
        by ``create_mini_batch``.
    epochs : int
        Number of full passes over ``loader``.
    log_every : int, optional
        Print a progress line every ``log_every`` batches (the last batch of
        an epoch is always printed).  The default of 1 prints every batch.

    Side effects
    ------------
    Updates the model weights in place, prints progress lines with the running
    mean loss and an estimated remaining time, and overwrites
    'train-epoch-temp.pkl' with the current weights after every epoch.  The
    model is left in eval mode when the function exits.

    Raises
    ------
    ValueError
        If ``log_every`` is smaller than 1.
    """
    if log_every < 1:
        raise ValueError("log_every must be >= 1")

    all_data = len(loader)
    # BertModel.from_pretrained hands the encoder back in eval mode, so
    # training mode (dropout active) has to be switched on explicitly.
    model.train()
    try:
        for epoch in range(epochs):
            finish_data = 0
            losses = []
            current_times = []

            for data in loader:
                t0 = time.time()
                ids_tensors, segments_tensors, masks_tensors, label_ids = data
                ids_tensors = ids_tensors.to(DEVICE)
                segments_tensors = segments_tensors.to(DEVICE)
                label_ids = label_ids.to(DEVICE)
                masks_tensors = masks_tensors.to(DEVICE)

                loss = model(ids_tensors=ids_tensors, lable_tensors=label_ids, masks_tensors=masks_tensors, segments_tensors=segments_tensors)
                losses.append(loss.item())
                loss.backward()
                optimizer.step()
                # Clear the gradients now so the next batch starts from zero.
                optimizer.zero_grad()

                finish_data += 1
                current_times.append(round(time.time()-t0,3))
                if finish_data % log_every == 0 or finish_data == all_data:
                    # Mean seconds per batch so far in this epoch.
                    current = np.mean(current_times)
                    # Remaining = rest of this epoch + all later epochs.
                    hr, mins, sec = evl_time(current*(all_data-finish_data) + current*all_data*(epochs-epoch-1))
                    print('epoch:', epoch, " batch:", finish_data, "/" , all_data, " loss:", np.mean(losses), " hr:", hr, " min:", mins," sec:", sec)

            # The same file name is reused, so only the latest epoch is kept.
            save_model('train-epoch-temp.pkl')
    finally:
        # Leave the model ready for inference even if training is interrupted.
        model.eval()

In [14]:
# Fine-tune for 6 epochs over the combined training loader; train() prints one progress line per batch.
train(train_loader, 6)

149355006318  hr: 0  min: 0  sec: 16
epoch: 5  batch: 1212 / 1463  loss: 0.06295723273701141  hr: 0  min: 0  sec: 16
epoch: 5  batch: 1213 / 1463  loss: 0.06291003511636134  hr: 0  min: 0  sec: 16
epoch: 5  batch: 1214 / 1463  loss: 0.06286994091632006  hr: 0  min: 0  sec: 16
epoch: 5  batch: 1215 / 1463  loss: 0.0631474026678841  hr: 0  min: 0  sec: 16
epoch: 5  batch: 1216 / 1463  loss: 0.06309828165570396  hr: 0  min: 0  sec: 16
epoch: 5  batch: 1217 / 1463  loss: 0.06305265262035287  hr: 0  min: 0  sec: 16
epoch: 5  batch: 1218 / 1463  loss: 0.0630028611862195  hr: 0  min: 0  sec: 15
epoch: 5  batch: 1219 / 1463  loss: 0.06297745786395159  hr: 0  min: 0  sec: 15
epoch: 5  batch: 1220 / 1463  loss: 0.06307165179672158  hr: 0  min: 0  sec: 15
epoch: 5  batch: 1221 / 1463  loss: 0.06302445060405236  hr: 0  min: 0  sec: 15
epoch: 5  batch: 1222 / 1463  loss: 0.06297631300524248  hr: 0  min: 0  sec: 15
epoch: 5  batch: 1223 / 1463  loss: 0.06292759960403346  hr: 0  min: 0  sec: 15
epoch

# test

In [15]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
# import seaborn as sn
# import matplotlib.pyplot as plt

In [16]:
# Restore weights from 'bert_ABSA_2.pkl' into the global model.
# NOTE(review): train() above saves to 'train-epoch-temp.pkl', so this file is not produced by
# any cell shown in this notebook — confirm where it comes from.
load_model('bert_ABSA_2.pkl')

In [17]:
def draw_c_matrix(c_matrix, title="test"):
    """Draw a square matrix as an annotated heat map, save it and show it.

    Rows and columns are labelled 0..len(c_matrix)-1.  The figure is written
    to ``title + '.jpg'`` in the working directory before being displayed.

    NOTE(review): this relies on the module-level names ``sn`` and ``plt``;
    the only imports for them visible in this notebook are commented out, so
    calling the function as-is raises NameError.
    NOTE(review): the matrix rows end up on the y axis, which is labelled
    "prediction".  If ``c_matrix`` comes from sklearn's
    ``confusion_matrix(y_true, y_pred)`` the rows are the true labels and the
    two axis labels are swapped — confirm against the caller.
    """
    labels = list(range(len(c_matrix)))
    frame = pd.DataFrame(c_matrix, index=labels, columns=labels)
    sn.heatmap(frame, annot=True, fmt='g')
    plt.ylabel("prediction")
    plt.xlabel("ground truth")
    plt.title(title)
    plt.savefig(title+'.jpg')
    plt.show()

def test(loader):
    """Run the global ``model`` over ``loader`` and collect labels and predictions.

    Parameters
    ----------
    loader : iterable
        Yields (ids, segments, masks, labels) batches, in the order produced
        by ``create_mini_batch``.

    Returns
    -------
    (trueth, pred) : tuple of two lists of int
        Ground-truth label ids and predicted label ids, in loader order.

    The model is switched to eval mode and left that way.
    """
    pred = []
    trueth = []
    # Make sure dropout is off; nothing else in this function guarantees it.
    model.eval()
    with torch.no_grad():
        for ids_tensors, segments_tensors, masks_tensors, label_ids in loader:
            ids_tensors = ids_tensors.to(DEVICE)
            segments_tensors = segments_tensors.to(DEVICE)
            masks_tensors = masks_tensors.to(DEVICE)

            # Passing None as the labels makes the model return logits.
            outputs = model(ids_tensors, None, masks_tensors=masks_tensors, segments_tensors=segments_tensors)

            # Index of the highest score in each row is the predicted class.
            predictions = outputs.argmax(dim=1)

            pred += predictions.tolist()
            trueth += label_ids.tolist()

    return trueth, pred

def predict(sentence, aspect, tokenizer, cls_token='[cls]', sep_token='[sep]'):
    """Classify the polarity of ``aspect`` within ``sentence`` using the global ``model``.

    Parameters
    ----------
    sentence, aspect : str
        Raw text; both are split into word pieces with ``tokenizer``.
    tokenizer : object
        Must provide ``tokenize`` and ``convert_tokens_to_ids``.
    cls_token, sep_token : str, optional
        Markers placed before the sentence and between sentence and aspect.
        The defaults are the lower-case strings this notebook has always used.
        NOTE(review): with "bert-base-uncased" the recorded dataset output
        shows these lower-case markers encoded as id 100, i.e. not as BERT's
        special tokens.  Whatever is passed here must match the markers the
        loaded weights were trained with.

    Returns
    -------
    (word_pieces, predictions, outputs)
        The input word pieces, a 1-element tensor with the predicted label
        id, and the (1, 3) tensor of raw scores from the model.

    The model is switched to eval mode and left that way.
    """
    t1 = tokenizer.tokenize(sentence)
    t2 = tokenizer.tokenize(aspect)

    word_pieces = [cls_token] + t1 + [sep_token] + t2

    # Segment 0 covers the leading marker, the sentence and the separator;
    # segment 1 covers the aspect pieces.
    segment_ids = [0] * (len(t1) + 2) + [1] * len(t2)

    ids = tokenizer.convert_tokens_to_ids(word_pieces)
    input_tensor = torch.tensor([ids]).to(DEVICE)
    # Wrap in a list so the segment ids get the same (1, seq_len) shape as
    # the token ids instead of relying on broadcasting of a 1-D tensor.
    segment_tensor = torch.tensor([segment_ids]).to(DEVICE)

    # Make sure dropout is off for inference.
    model.eval()
    with torch.no_grad():
        # Single unpadded sequence, so no attention mask is supplied;
        # None for the labels makes the model return logits.
        outputs = model(input_tensor, None, None, segments_tensors=segment_tensor)
        predictions = outputs.argmax(dim=1)

    return word_pieces, predictions, outputs

In [18]:
# Evaluate on the combined test loader (timed with %time). test() returns (ground truth, predictions),
# so x holds the true label ids and y the predicted ones.
%time x, y = test(test_loader)
# Per-class precision / recall / F1 for the three label ids 0, 1 and 2.
print(classification_report(x, y, target_names=[str(i) for i in range(3)]))

Wall time: 8.84 s
              precision    recall  f1-score   support

           0       0.74      0.71      0.72       346
           1       0.62      0.68      0.65       355
           2       0.89      0.88      0.89      1025

    accuracy                           0.80      1726
   macro avg       0.75      0.75      0.75      1726
weighted avg       0.81      0.80      0.80      1726



In [19]:
# Sentence with two opposing opinions, queried for the aspect "movie".
# Prints the input word pieces (x), the predicted label id (y) and the raw scores (z).
# NOTE(review): the label ids presumably mean 0 = negative, 1 = neutral, 2 = positive — confirm against the CSV encoding.
x, y, z = predict("Not impressed with this movie, but the actors performance well.", "movie", tokenizer)
print(x)
print(y)
print(z)

['[cls]', 'not', 'impressed', 'with', 'this', 'movie', ',', 'but', 'the', 'actors', 'performance', 'well', '.', '[sep]', 'movie']
tensor([0], device='cuda:0')
tensor([[ 0.7563,  0.1108, -1.3406]], device='cuda:0')


In [20]:
# Same sentence as the "movie" query, now asking about the aspect "actors".
x, y, z = predict("Not impressed with this movie, but the actors performance well.", "actors", tokenizer)
print(x)
print(y)
print(z)

['[cls]', 'not', 'impressed', 'with', 'this', 'movie', ',', 'but', 'the', 'actors', 'performance', 'well', '.', '[sep]', 'actors']
tensor([2], device='cuda:0')
tensor([[-1.9568, -2.5082,  5.1625]], device='cuda:0')


In [21]:
# Two-sentence laptop review, queried for the aspect "price" (mentioned in the first sentence).
x, y, z = predict("For the price you pay this product is very good. However, battery life is a little lack-luster coming from a MacBook Pro.", "price", tokenizer)
print(x)
print(y)
print(z)

['[cls]', 'for', 'the', 'price', 'you', 'pay', 'this', 'product', 'is', 'very', 'good', '.', 'however', ',', 'battery', 'life', 'is', 'a', 'little', 'lack', '-', 'lust', '##er', 'coming', 'from', 'a', 'mac', '##book', 'pro', '.', '[sep]', 'price']
tensor([1], device='cuda:0')
tensor([[-2.8797e-01,  3.3837e-01, -1.4844e-04]], device='cuda:0')


In [22]:
# Same review as the "price" query, now with the two-word aspect "battery life" from the second sentence.
x, y, z = predict("For the price you pay this product is very good. However, battery life is a little lack-luster coming from a MacBook Pro.", "battery life", tokenizer)
print(x)
print(y)
print(z)

['[cls]', 'for', 'the', 'price', 'you', 'pay', 'this', 'product', 'is', 'very', 'good', '.', 'however', ',', 'battery', 'life', 'is', 'a', 'little', 'lack', '-', 'lust', '##er', 'coming', 'from', 'a', 'mac', '##book', 'pro', '.', '[sep]', 'battery', 'life']
tensor([0], device='cuda:0')
tensor([[ 2.4636, -1.9068, -0.8480]], device='cuda:0')


In [23]:
# Sentence from outside the laptop/restaurant training domains, queried for the aspect "python".
x, y, z = predict("Python's design philosophy emphasizes code readability with its notable use of significant whitespace", "python", tokenizer)
print(x)
print(y)
print(z)

['[cls]', 'python', "'", 's', 'design', 'philosophy', 'emphasizes', 'code', 'read', '##ability', 'with', 'its', 'notable', 'use', 'of', 'significant', 'whites', '##pace', '[sep]', 'python']
tensor([1], device='cuda:0')
tensor([[-0.8861,  0.0493, -0.0957]], device='cuda:0')


In [24]:
# Aspect that the tokenizer has to split into sub-word pieces ("AMD Ryzen"), paired with the opinion word "slow".
x, y, z = predict("The AMD Ryzen is really slow.", "AMD Ryzen", tokenizer)
print(x)
print(y)
print(z)

['[cls]', 'the', 'am', '##d', 'ry', '##zen', 'is', 'really', 'slow', '.', '[sep]', 'am', '##d', 'ry', '##zen']
tensor([2], device='cuda:0')
tensor([[ 0.5644, -1.6248,  0.6487]], device='cuda:0')


In [25]:
# Counterpart to the "slow" query: same aspect "AMD Ryzen", opinion word changed to "fast" (and no final period).
x, y, z = predict("The AMD Ryzen is really fast", "AMD Ryzen", tokenizer)
print(x)
print(y)
print(z)

['[cls]', 'the', 'am', '##d', 'ry', '##zen', 'is', 'really', 'fast', '[sep]', 'am', '##d', 'ry', '##zen']
tensor([2], device='cuda:0')
tensor([[-2.1341, -2.8694,  5.8294]], device='cuda:0')


In [26]:
# Comparative sentence, queried for the aspect the comparison favours ("Apple").
x, y, z = predict("Apple is better than Microsoft.", "Apple", tokenizer)
print(x)
print(y)
print(z)

['[cls]', 'apple', 'is', 'better', 'than', 'microsoft', '.', '[sep]', 'apple']
tensor([2], device='cuda:0')
tensor([[-0.6887, -1.6353,  2.0344]], device='cuda:0')


In [27]:
# Same comparative sentence, queried for the other side of the comparison ("Microsoft").
x, y, z = predict("Apple is better than Microsoft.", "Microsoft", tokenizer)
print(x)
print(y)
print(z)

['[cls]', 'apple', 'is', 'better', 'than', 'microsoft', '.', '[sep]', 'microsoft']
tensor([0], device='cuda:0')
tensor([[ 1.0112, -1.4838, -0.0873]], device='cuda:0')


In [28]:
# Comparative sentence in which the queried aspect "cpu" occurs twice, once for each brand.
x, y, z = predict("AMD's cpu is better than Intel's cpu", "cpu", tokenizer)
print(x)
print(y)
print(z)

['[cls]', 'am', '##d', "'", 's', 'cpu', 'is', 'better', 'than', 'intel', "'", 's', 'cpu', '[sep]', 'cpu']
tensor([2], device='cuda:0')
tensor([[-0.3281,  0.1926,  0.5652]], device='cuda:0')


In [29]:
# Same comparative sentence, queried for the brand on the losing side of the comparison ("Intel").
x, y, z = predict("AMD's cpu is better than Intel's cpu", "Intel", tokenizer)
print(x)
print(y)
print(z)

['[cls]', 'am', '##d', "'", 's', 'cpu', 'is', 'better', 'than', 'intel', "'", 's', 'cpu', '[sep]', 'intel']
tensor([0], device='cuda:0')
tensor([[ 0.2396, -0.5333, -0.5908]], device='cuda:0')


In [30]:
# Two-word aspect "delivery times" with the opinion word "fastest".
x, y, z = predict("I have to say they have one of the fastest delivery times in the city","delivery times", tokenizer)
print(x)
print(y)
print(z)

['[cls]', 'i', 'have', 'to', 'say', 'they', 'have', 'one', 'of', 'the', 'fastest', 'delivery', 'times', 'in', 'the', 'city', '[sep]', 'delivery', 'times']
tensor([2], device='cuda:0')
tensor([[-2.0156, -2.8365,  5.4197]], device='cuda:0')


In [31]:
# Counterpart to the "fastest" query: same aspect "delivery times", opinion word changed to "slowest".
x, y, z = predict("I have to say they have one of the slowest delivery times in the city","delivery times", tokenizer)
print(x)
print(y)
print(z)

['[cls]', 'i', 'have', 'to', 'say', 'they', 'have', 'one', 'of', 'the', 'slow', '##est', 'delivery', 'times', 'in', 'the', 'city', '[sep]', 'delivery', 'times']
tensor([0], device='cuda:0')
tensor([[ 2.2668, -1.7621, -0.6383]], device='cuda:0')


In [32]:
# Restaurant sentence with opposite opinions on two aspects, queried for "service".
x, y, z = predict("Great food but the service was dreadful","service", tokenizer)
print(x)
print(y)
print(z)

['[cls]', 'great', 'food', 'but', 'the', 'service', 'was', 'dreadful', '[sep]', 'service']
tensor([0], device='cuda:0')
tensor([[ 2.0604, -1.5593, -0.6599]], device='cuda:0')


In [33]:
# Same restaurant sentence, queried for the other aspect ("food").
x, y, z = predict("Great food but the service was dreadful","food", tokenizer)
print(x)
print(y)
print(z)

['[cls]', 'great', 'food', 'but', 'the', 'service', 'was', 'dreadful', '[sep]', 'food']
tensor([2], device='cuda:0')
tensor([[-0.9337, -0.6863,  1.9533]], device='cuda:0')
