# to dataset

In [1]:
from torch.utils.data import Dataset, DataLoader, ConcatDataset
from transformers import BertTokenizer
import torch
from torch.nn.utils.rnn import pad_sequence
import pandas as pd

In [2]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [3]:
class ae_dataset(Dataset):
    def __init__(self, df, tokenizer):
        self.df = df
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        tokens, tags, pols = self.df.iloc[idx, :3].values

        tokens = tokens.replace("'", "").strip("][").split(', ')
        tags = tags.strip('][').split(', ')
        pols = pols.strip('][').split(', ')

        bert_tokens = []
        bert_tags = []
        bert_pols = []
        for i in range(len(tokens)):
            t = tokenizer.tokenize(tokens[i])
            bert_tokens += t
            bert_tags += [int(tags[i])]*len(t)
            bert_pols += [int(pols[i])]*len(t)
        
        bert_ids = tokenizer.convert_tokens_to_ids(bert_tokens)

        ids_tensor = torch.tensor(bert_ids)
        tags_tensor = torch.tensor(bert_tags)
        pols_tensor = torch.tensor(bert_pols)

        return bert_tokens, ids_tensor, tags_tensor, pols_tensor

    def __len__(self):
        return len(self.df)


In [4]:
df = pd.read_csv("laptops_train.csv")
laptops_train_ds = ae_dataset(df, tokenizer)

df = pd.read_csv("laptops_test.csv")
laptops_test_ds = ae_dataset(df, tokenizer)

df = pd.read_csv("restaurants_train.csv")
restaurants_train_ds = ae_dataset(df, tokenizer)

df = pd.read_csv("restaurants_test.csv")
restaurants_test_ds = ae_dataset(df, tokenizer)

In [5]:
w,x,y,z = laptops_train_ds.__getitem__(121)
print(w)
print(len(w))
print(x)
print(len(x))
print(y)
print(len(y))
print(z)
print(len(z))

['the', 'battery', 'life', 'seems', 'to', 'be', 'very', 'good', ',', 'and', 'have', 'had', 'no', 'issues', 'with', 'it', '.']
17
tensor([1996, 6046, 2166, 3849, 2000, 2022, 2200, 2204, 1010, 1998, 2031, 2018,
        2053, 3314, 2007, 2009, 1012])
17
tensor([0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
17
tensor([0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
17


In [6]:
train_ds = ConcatDataset([laptops_train_ds, restaurants_train_ds])
test_ds = ConcatDataset([laptops_test_ds, restaurants_test_ds])

In [7]:
def create_mini_batch(samples):
    ids_tensors = [s[1] for s in samples]
    ids_tensors = pad_sequence(ids_tensors, batch_first=True)

    tags_tensors = [s[2] for s in samples]
    tags_tensors = pad_sequence(tags_tensors, batch_first=True)

    pols_tensors = [s[3] for s in samples]
    pols_tensors = pad_sequence(pols_tensors, batch_first=True)
    
    masks_tensors = torch.zeros(ids_tensors.shape, dtype=torch.long)
    masks_tensors = masks_tensors.masked_fill(ids_tensors != 0, 1)
    
    return ids_tensors, tags_tensors, pols_tensors, masks_tensors

In [8]:
train_loader = DataLoader(train_ds, batch_size=5, collate_fn=create_mini_batch, shuffle = True)
test_loader = DataLoader(test_ds, batch_size=50, collate_fn=create_mini_batch, shuffle = True)

In [9]:
for batch in train_loader:
    w,x,y,z = batch
    print(w)
    print(w.size())
    print(x)
    print(x.size())
    print(y)
    print(y.size())
    print(z)
    print(z.size())
    break

tensor([[ 1045,  3641,  1037, 27333,  2072,  1998,  2356,  1018,  2335,  2005,
          2009,  2021,  2196,  2288,  2009,  1012,     0,     0,     0,     0,
             0,     0,     0,     0,     0],
        [ 2204,  2189,  1010,  2307,  2833,  1010, 26203,  2326, 15184,  7597,
          1012,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0],
        [ 1996,  4524,  9050,  2031,  2019,  5151,  5510,  2007,  1037, 27547,
         14902,  1010,  2119, 21271,  2100,  2664,  2025, 16031,  8029,  1012,
             0,     0,     0,     0,     0],
        [ 3524,  3095,  2003,  1038,  5802,  6528, 14626, 14477,  9397,  2890,
          7405,  6024,  1997,  2115,  2449,  2021,  2049,  1996,  2190, 11345,
          2006,  1996,  1057,  9333,   999],
        [ 1996,  6046,  2515,  1000, 23961,  1000,  2197,  2146,  2021,  1045,
          1000,  1049,  1000,  2469,  2019, 12200,  6046,  2052,  9611,  2008,
          3291,  1012,     0, 

# bert aspect extraction

In [4]:
from transformers import BertModel

In [5]:
class bert_aspect_extraction(torch.nn.Module):
    def __init__(self, pretrain_model):
        super(bert_aspect_extraction, self).__init__()
        self.bert = BertModel.from_pretrained(pretrain_model)
        self.linear = torch.nn.Linear(self.bert.config.hidden_size, 2)
        self.loss_fn = torch.nn.CrossEntropyLoss()

    def forward(self, ids_tensors, tags_tensors, masks_tensors):
        bert_outputs,_ = self.bert(input_ids=ids_tensors, attention_mask=masks_tensors)
        # print(bert_outputs.size())
        linear_outputs = self.linear(bert_outputs)
        # print(linear_outputs.size())

        if tags_tensors is not None:
            tags_tensors = tags_tensors.view(-1)
            linear_outputs = linear_outputs.view(-1,2)
            # print(linear_outputs.size())
            # print(tags_tensors.size())
            loss = self.loss_fn(linear_outputs, tags_tensors)
            return loss
        else:
            return linear_outputs

In [6]:
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
lr = 2e-5
model = bert_aspect_extraction("bert-base-uncased").to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [7]:
import time
import numpy as np

In [8]:
def evl_time(t):
    min, sec= divmod(t, 60)
    hr, min = divmod(min, 60)
    return int(hr), int(min), int(sec)

def load_model(path):
    model.load_state_dict(torch.load(path), strict=False)
    
def save_model(name):
    torch.save(model.state_dict(), name)

def train(loader, epochs):
    all_data = len(loader)
    for epoch in range(epochs):
        finish_data = 0
        losses = []
        current_times = []
        correct_predictions = 0
        
        for data in loader:
            t0 = time.time()
            ids_tensors, tags_tensors, _, masks_tensors = data
            ids_tensors = ids_tensors.to(DEVICE)
            tags_tensors = tags_tensors.to(DEVICE)
            masks_tensors = masks_tensors.to(DEVICE)

            loss = model(ids_tensors=ids_tensors, tags_tensors=tags_tensors, masks_tensors=masks_tensors)
            losses.append(loss.item())
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            finish_data += 1
            current_times.append(round(time.time()-t0,3))
            current = np.mean(current_times)
            hr, min, sec = evl_time(current*(all_data-finish_data) + current*all_data*(epochs-epoch-1))
            print('epoch:', epoch, " batch:", finish_data, "/" , all_data, " loss:", np.mean(losses), " hr:", hr, " min:", min," sec:", sec)         

        save_model('train-epoch-temp.pkl')

In [15]:
train(train_loader, 3)

  loss: 0.006587694087148549  hr: 0  min: 0  sec: 16
epoch: 2  batch: 922 / 1170  loss: 0.00658561877199939  hr: 0  min: 0  sec: 16
epoch: 2  batch: 923 / 1170  loss: 0.006582272819261408  hr: 0  min: 0  sec: 16
epoch: 2  batch: 924 / 1170  loss: 0.0066696877986890946  hr: 0  min: 0  sec: 15
epoch: 2  batch: 925 / 1170  loss: 0.00666298043509832  hr: 0  min: 0  sec: 15
epoch: 2  batch: 926 / 1170  loss: 0.006655934066853645  hr: 0  min: 0  sec: 15
epoch: 2  batch: 927 / 1170  loss: 0.006675612405716103  hr: 0  min: 0  sec: 15
epoch: 2  batch: 928 / 1170  loss: 0.006668631931586725  hr: 0  min: 0  sec: 15
epoch: 2  batch: 929 / 1170  loss: 0.0066617088813051  hr: 0  min: 0  sec: 15
epoch: 2  batch: 930 / 1170  loss: 0.006655523707208935  hr: 0  min: 0  sec: 15
epoch: 2  batch: 931 / 1170  loss: 0.006648549227490766  hr: 0  min: 0  sec: 15
epoch: 2  batch: 932 / 1170  loss: 0.0066438672362029595  hr: 0  min: 0  sec: 15
epoch: 2  batch: 933 / 1170  loss: 0.0066370976475257845  hr: 0  min:

# test

In [9]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
# import seaborn as sn
# import matplotlib.pyplot as plt

In [10]:
load_model('bert_aspect_extraction_2.pkl')

In [38]:
def draw_c_matrix(c_matrix, title="test"):
    aix = []
    for y in range(len(c_matrix)):
        aix.append(y)
    df_cm = pd.DataFrame(c_matrix, aix, aix)
    sn.heatmap(df_cm, annot=True, fmt='g')
    plt.ylabel("prediction")
    plt.xlabel("ground truth")
    plt.title(title)
    plt.savefig(title+'.jpg')
    plt.show()

def test(loader):
    pred = []
    trueth = []
    with torch.no_grad():
        for data in loader:

            ids_tensors, tags_tensors, _, masks_tensors = data
            ids_tensors = ids_tensors.to(DEVICE)
            tags_tensors = tags_tensors.to(DEVICE)
            masks_tensors = masks_tensors.to(DEVICE)

            outputs = model(ids_tensors=ids_tensors, tags_tensors=None, masks_tensors=masks_tensors)
            
            _, predictions = torch.max(outputs, dim=2)

            pred += list([int(j) for i in predictions for j in i ])
            trueth += list([int(j) for i in tags_tensors for j in i ])

    return trueth, pred

def predict(sentence, tokenizer):
    word_pieces = []
    tokens = tokenizer.tokenize(sentence)
    word_pieces += tokens

    ids = tokenizer.convert_tokens_to_ids(word_pieces)
    input_tensor = torch.tensor([ids]).to(DEVICE)

    with torch.no_grad():
        outputs = model(input_tensor, None, None)
        _, predictions = torch.max(outputs, dim=2)
    predictions = predictions[0].tolist()
    
    return word_pieces, predictions, outputs

In [35]:
%time x, y = test(test_loader)
print(classification_report(x, y, target_names=[str(i) for i in range(2)]))

Wall time: 23.7 s
              precision    recall  f1-score   support

           0       0.99      1.00      0.99    130191
           1       0.95      0.88      0.91      8871

    accuracy                           0.99    139062
   macro avg       0.97      0.94      0.95    139062
weighted avg       0.99      0.99      0.99    139062



In [63]:
x, y, z = predict("Not impressed with this movie, but the actors performance well.", tokenizer)
for i in range(len(x)):
    print((x[i], y[i]), end=" ")


('not', 0) ('impressed', 0) ('with', 0) ('this', 0) ('movie', 0) (',', 0) ('but', 0) ('the', 0) ('actors', 1) ('performance', 1) ('well', 0) ('.', 0) 

In [42]:
x, y, z = predict("The movie is terrible, but the actors performance well.", tokenizer)
for i in range(len(x)):
    print((x[i], y[i]), end=" ")

('the', 0) ('movie', 0) ('is', 0) ('terrible', 0) (',', 0) ('but', 0) ('the', 0) ('actors', 1) ('performance', 0) ('well', 0) ('.', 0) 

In [43]:
x, y, z = predict("Movie is terrible, but the actors performance well.", tokenizer)
for i in range(len(x)):
    print((x[i], y[i]), end=" ")

('movie', 1) ('is', 0) ('terrible', 0) (',', 0) ('but', 0) ('the', 0) ('actors', 1) ('performance', 1) ('well', 0) ('.', 0) 

In [44]:
x, y, z = predict("The AMD Ryzen is really fast !", tokenizer)
for i in range(len(x)):
    print((x[i], y[i]), end=" ")

('the', 0) ('am', 1) ('##d', 1) ('ry', 1) ('##zen', 1) ('is', 0) ('really', 0) ('fast', 0) ('!', 0) 

In [45]:
x, y, z = predict("For the price you pay this product is very good. However, battery life is a little lack-luster coming from a MacBook Pro.", tokenizer)
for i in range(len(x)):
    print((x[i], y[i]), end=" ")

('for', 0) ('the', 0) ('price', 1) ('you', 0) ('pay', 0) ('this', 0) ('product', 0) ('is', 0) ('very', 0) ('good', 0) ('.', 0) ('however', 0) (',', 0) ('battery', 1) ('life', 1) ('is', 0) ('a', 0) ('little', 0) ('lack', 0) ('-', 0) ('lust', 0) ('##er', 0) ('coming', 0) ('from', 0) ('a', 0) ('mac', 0) ('##book', 0) ('pro', 0) ('.', 0) 

In [46]:
x, y, z = predict("For the elephants you pay this product is very good. However, lions is a little lack-luster coming from a MacBook Pro.", tokenizer)
for i in range(len(x)):
    print((x[i], y[i]), end=" ")

('for', 0) ('the', 0) ('elephants', 1) ('you', 0) ('pay', 0) ('this', 0) ('product', 0) ('is', 0) ('very', 0) ('good', 0) ('.', 0) ('however', 0) (',', 0) ('lions', 1) ('is', 0) ('a', 0) ('little', 0) ('lack', 0) ('-', 0) ('lust', 0) ('##er', 0) ('coming', 0) ('from', 0) ('a', 0) ('mac', 0) ('##book', 0) ('pro', 0) ('.', 0) 

In [47]:
x, y, z = predict("For the java you pay this product is very good. However, python is a little lack-luster coming from a MacBook Pro.", tokenizer)
for i in range(len(x)):
    print((x[i], y[i]), end=" ")

('for', 0) ('the', 0) ('java', 1) ('you', 0) ('pay', 0) ('this', 0) ('product', 0) ('is', 0) ('very', 0) ('good', 0) ('.', 0) ('however', 0) (',', 0) ('python', 1) ('is', 0) ('a', 0) ('little', 0) ('lack', 0) ('-', 0) ('lust', 0) ('##er', 0) ('coming', 0) ('from', 0) ('a', 0) ('mac', 0) ('##book', 0) ('pro', 0) ('.', 0) 

In [48]:
x, y, z = predict("Apple is better than Microsoft.", tokenizer)
for i in range(len(x)):
    print((x[i], y[i]), end=" ")

('apple', 0) ('is', 0) ('better', 0) ('than', 0) ('microsoft', 0) ('.', 0) 

In [49]:
x, y, z = predict("Apple is better than Microsoft. Microsoft is bad.", tokenizer)
for i in range(len(x)):
    print((x[i], y[i]), end=" ")

('apple', 0) ('is', 0) ('better', 0) ('than', 0) ('microsoft', 1) ('.', 0) ('microsoft', 1) ('is', 0) ('bad', 0) ('.', 0) 

In [50]:
x, y, z = predict("Apple is better than Microsoft. Apple is good.", tokenizer)
for i in range(len(x)):
    print((x[i], y[i]), end=" ")

('apple', 0) ('is', 0) ('better', 0) ('than', 0) ('microsoft', 1) ('.', 0) ('apple', 1) ('is', 0) ('good', 0) ('.', 0) 

In [51]:
x, y, z = predict("AMD's cpu is better than Intel's cpu", tokenizer)
for i in range(len(x)):
    print((x[i], y[i]), end=" ")

('am', 0) ('##d', 0) ("'", 0) ('s', 0) ('cpu', 1) ('is', 0) ('better', 0) ('than', 0) ('intel', 0) ("'", 0) ('s', 0) ('cpu', 1) 

In [52]:
x, y, z = predict("AMD is better than Intel.", tokenizer)
for i in range(len(x)):
    print((x[i], y[i]), end=" ")

('am', 1) ('##d', 1) ('is', 0) ('better', 0) ('than', 0) ('intel', 0) ('.', 0) 

In [53]:
x, y, z = predict("AMD is better than Intel. Intel is so expensive", tokenizer)
for i in range(len(x)):
    print((x[i], y[i]), end=" ")

('am', 1) ('##d', 1) ('is', 0) ('better', 0) ('than', 0) ('intel', 1) ('.', 0) ('intel', 1) ('is', 0) ('so', 0) ('expensive', 0) 

In [54]:
x, y, z = predict("AMD is better than Intel. It is so expensive", tokenizer)
for i in range(len(x)):
    print((x[i], y[i]), end=" ")

('am', 1) ('##d', 1) ('is', 0) ('better', 0) ('than', 0) ('intel', 1) ('.', 0) ('it', 0) ('is', 0) ('so', 0) ('expensive', 0) 

In [55]:
x, y, z = predict("I think Apple is better than Microsoft", tokenizer)
for i in range(len(x)):
    print((x[i], y[i]), end=" ")

('i', 0) ('think', 0) ('apple', 0) ('is', 0) ('better', 0) ('than', 0) ('microsoft', 0) 

In [56]:
x, y, z = predict("cpu and graphic card are strong", tokenizer)
for i in range(len(x)):
    print((x[i], y[i]), end=" ")

('cpu', 1) ('and', 0) ('graphic', 1) ('card', 1) ('are', 0) ('strong', 0) 

In [57]:
x, y, z = predict("dogs and cats are cute", tokenizer)
for i in range(len(x)):
    print((x[i], y[i]), end=" ")

('dogs', 1) ('and', 0) ('cats', 1) ('are', 0) ('cute', 0) 

In [58]:
x, y, z = predict("Python's design philosophy emphasizes code readability with its notable use of significant whitespace", tokenizer)
for i in range(len(x)):
    print((x[i], y[i]), end=" ")

('python', 0) ("'", 0) ('s', 0) ('design', 0) ('philosophy', 0) ('emphasizes', 0) ('code', 1) ('read', 1) ('##ability', 1) ('with', 0) ('its', 0) ('notable', 0) ('use', 0) ('of', 0) ('significant', 0) ('whites', 1) ('##pace', 0) 

In [59]:
x, y, z = predict("I have to say they have one of the fastest delivery times in the city", tokenizer)
for i in range(len(x)):
    print((x[i], y[i]), end=" ")

('i', 0) ('have', 0) ('to', 0) ('say', 0) ('they', 0) ('have', 0) ('one', 0) ('of', 0) ('the', 0) ('fastest', 0) ('delivery', 1) ('times', 1) ('in', 0) ('the', 0) ('city', 0) 

In [60]:
x, y, z = predict("I have to say they have one of the fastest ASIB-KIU in the city", tokenizer)
for i in range(len(x)):
    print((x[i], y[i]), end=" ")

('i', 0) ('have', 0) ('to', 0) ('say', 0) ('they', 0) ('have', 0) ('one', 0) ('of', 0) ('the', 0) ('fastest', 0) ('as', 1) ('##ib', 1) ('-', 1) ('ki', 1) ('##u', 1) ('in', 0) ('the', 0) ('city', 0) 

* need NLI preprocessing (it, this, he...) 