In [None]:
!git clone https://github.com/Cld338/Aspect-Term-Extraction-and-Analysis-Custom ATE
!pip install transformers

In [None]:
ATE_file = "bert_ATE_epoch3.pkl"
ABSA_file = "bert_ABSA_epoch3.pkl"

# import

In [None]:
from google.colab import drive
drive.mount('/content/gdrive/')

In [1]:
from ATE.model.bert import bert_ATE, bert_ABSA
from ATE.data.dataset import dataset_ATM, dataset_ABSA

In [2]:
from torch.utils.data import DataLoader, ConcatDataset
from transformers import BertTokenizer
import torch
from torch.nn.utils.rnn import pad_sequence
import pandas as pd
import time
import numpy as np
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Settings

In [3]:
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
pretrain_model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(pretrain_model_name)
lr = 2e-5
model_ATE = bert_ATE(pretrain_model_name).to(DEVICE)
optimizer_ATE = torch.optim.Adam(model_ATE.parameters(), lr=lr)
model_ABSA = bert_ABSA(pretrain_model_name).to(DEVICE)
optimizer_ABSA = torch.optim.Adam(model_ABSA.parameters(), lr=lr)

In [4]:
def evl_time(t):
    min, sec= divmod(t, 60)
    hr, min = divmod(min, 60)
    return int(hr), int(min), int(sec)

def load_model(model, path):
    model.load_state_dict(torch.load(path), strict=False)
    return model
    
def save_model(model, name):
    torch.save(model.state_dict(), name)

# Acpect Term Extraction

In [4]:
laptops_train_ds = dataset_ATM(pd.read_csv("ATE/data/laptops_train.csv"), tokenizer)
laptops_test_ds = dataset_ATM(pd.read_csv("ATE/data/laptops_test.csv"), tokenizer)
restaurants_train_ds = dataset_ATM(pd.read_csv("ATE/data/restaurants_train.csv"), tokenizer)
restaurants_test_ds = dataset_ATM(pd.read_csv("ATE/data/restaurants_test.csv"), tokenizer)
twitter_train_ds = dataset_ATM(pd.read_csv("ATE/data/twitter_train.csv"), tokenizer)
twitter_test_ds = dataset_ATM(pd.read_csv("ATE/data/twitter_test.csv"), tokenizer)

In [5]:
# w,x,y,z = laptops_train_ds.__getitem__(121)
# print(w)
# print(x)
# print(x.size())
# print(y)
# print(y.size())
# print(z)
# print(z.size())

In [8]:
train_ds = ConcatDataset([laptops_train_ds, restaurants_train_ds, twitter_train_ds])
test_ds = ConcatDataset([laptops_test_ds, restaurants_test_ds, twitter_test_ds])

In [9]:
def create_mini_batch(samples):
    ids_tensors = [s[1] for s in samples]
    ids_tensors = pad_sequence(ids_tensors, batch_first=True)

    tags_tensors = [s[2] for s in samples]
    tags_tensors = pad_sequence(tags_tensors, batch_first=True)

    pols_tensors = [s[3] for s in samples]
    pols_tensors = pad_sequence(pols_tensors, batch_first=True)
    
    masks_tensors = torch.zeros(ids_tensors.shape, dtype=torch.long)
    masks_tensors = masks_tensors.masked_fill(ids_tensors != 0, 1)
    
    return ids_tensors, tags_tensors, pols_tensors, masks_tensors

In [10]:
train_loader = DataLoader(train_ds, batch_size=5, collate_fn=create_mini_batch, shuffle = True)
test_loader = DataLoader(test_ds, batch_size=50, collate_fn=create_mini_batch, shuffle = True)

In [11]:
# for batch in train_loader:
#     w,x,y,z = batch
#     print(w)
#     print(w.size())
#     print(x)
#     print(x.size())
#     print(y)
#     print(y.size())
#     print(z)
#     print(z.size())
#     break

In [19]:
def train_model_ATE(loader, epochs):
    all_data = len(loader)
    for epoch in range(epochs):
        finish_data = 0
        losses = []
        current_times = []
        correct_predictions = 0
        
        for data in loader:
            t0 = time.time()
            ids_tensors, tags_tensors, _, masks_tensors = data
            ids_tensors = ids_tensors.to(DEVICE)
            tags_tensors = tags_tensors.to(DEVICE)
            masks_tensors = masks_tensors.to(DEVICE)

            loss = model_ATE(ids_tensors=ids_tensors, tags_tensors=tags_tensors, masks_tensors=masks_tensors)
            losses.append(loss.item())
            loss.backward()
            optimizer_ATE.step()
            optimizer_ATE.zero_grad()

            finish_data += 1
            current_times.append(round(time.time()-t0,3))
            current = np.mean(current_times)
            hr, min, sec = evl_time(current*(all_data-finish_data) + current*all_data*(epochs-epoch-1))
            print('epoch:', epoch, " batch:", finish_data, "/" , all_data, " loss:", np.mean(losses), " hr:", hr, " min:", min," sec:", sec)         

        save_model(model_ATE, f'/content/gdrive/MyDrive/제대_사사과정/{ATE_file}')
        
def test_model_ATE(loader):
    pred = []
    trueth = []
    with torch.no_grad():
        for data in loader:

            ids_tensors, tags_tensors, _, masks_tensors = data
            ids_tensors = ids_tensors.to(DEVICE)
            tags_tensors = tags_tensors.to(DEVICE)
            masks_tensors = masks_tensors.to(DEVICE)

            outputs = model_ATE(ids_tensors=ids_tensors, tags_tensors=None, masks_tensors=masks_tensors)

            _, predictions = torch.max(outputs, dim=2)

            pred += list([int(j) for i in predictions for j in i ])
            trueth += list([int(j) for i in tags_tensors for j in i ])

    return trueth, pred



In [None]:
%time train_model_ATE(train_loader, 3)

In [14]:
model_ATE = load_model(model_ATE, f'/content/gdrive/MyDrive/제대_사사과정/{ATE_file}')

In [18]:
%time x, y = test_model_ATE(test_loader)
print(classification_report(x, y, target_names=[str(i) for i in range(3)]))

Wall time: 23.1 s
              precision    recall  f1-score   support

           0       0.99      0.99      0.99    140373
           1       0.84      0.92      0.88      6486
           2       0.93      0.73      0.82      3837

    accuracy                           0.98    150696
   macro avg       0.92      0.88      0.90    150696
weighted avg       0.99      0.98      0.98    150696



# Aspect Based Sentiment Analysis

In [5]:
laptops_train_ds = dataset_ABSA(pd.read_csv("ATE/data/laptops_train.csv"), tokenizer)
laptops_test_ds = dataset_ABSA(pd.read_csv("ATE/data/laptops_test.csv"), tokenizer)
restaurants_train_ds = dataset_ABSA(pd.read_csv("ATE/data/restaurants_train.csv"), tokenizer)
restaurants_test_ds = dataset_ABSA(pd.read_csv("ATE/data/restaurants_test.csv"), tokenizer)
twitter_train_ds = dataset_ABSA(pd.read_csv("ATE/data/twitter_train.csv"), tokenizer)
twitter_test_ds = dataset_ABSA(pd.read_csv("ATE/data/twitter_test.csv"), tokenizer)

In [6]:
w,x,y,z = laptops_train_ds.__getitem__(121)
print(w)
print(len(w))
print(x)
print(len(x))
print(y)
print(len(y))
print(z)

['[cls]', 'the', 'battery', 'life', 'seems', 'to', 'be', 'very', 'good', ',', 'and', 'have', 'had', 'no', 'issues', 'with', 'it', '.', '[sep]', 'battery', 'life']
21
tensor([ 100, 1996, 6046, 2166, 3849, 2000, 2022, 2200, 2204, 1010, 1998, 2031,
        2018, 2053, 3314, 2007, 2009, 1012,  100, 6046, 2166])
21
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1])
21
tensor(2)


In [7]:
def create_mini_batch2(samples):
    ids_tensors = [s[1] for s in samples]
    ids_tensors = pad_sequence(ids_tensors, batch_first=True)

    segments_tensors = [s[2] for s in samples]
    segments_tensors = pad_sequence(segments_tensors, batch_first=True)

    label_ids = torch.stack([s[3] for s in samples])
    
    masks_tensors = torch.zeros(ids_tensors.shape, dtype=torch.long)
    masks_tensors = masks_tensors.masked_fill(ids_tensors != 0, 1)

    return ids_tensors, segments_tensors, masks_tensors, label_ids

In [8]:
train_ds = ConcatDataset([laptops_train_ds, restaurants_train_ds, twitter_train_ds])
test_ds = ConcatDataset([laptops_test_ds, restaurants_test_ds, twitter_test_ds])

train_loader = DataLoader(train_ds, batch_size=4, collate_fn=create_mini_batch2, shuffle = True)
test_loader = DataLoader(test_ds, batch_size=50, collate_fn=create_mini_batch2, shuffle = True)

In [9]:
# for batch in train_loader:
#     w,x,y,z = batch
#     print(w)
#     print(w.size())
#     print(x)
#     print(x.size())
#     print(y)
#     print(y.size())
#     print(z)
#     print(z.size())
#     break

In [20]:
def train_model_ABSA(loader, epochs):
    all_data = len(loader)
    for epoch in range(epochs):
        finish_data = 0
        losses = []
        current_times = []
        correct_predictions = 0
        
        for data in loader:
            t0 = time.time()
            ids_tensors, segments_tensors, masks_tensors, label_ids = data
            ids_tensors = ids_tensors.to(DEVICE)
            segments_tensors = segments_tensors.to(DEVICE)
            label_ids = label_ids.to(DEVICE)
            masks_tensors = masks_tensors.to(DEVICE)

            loss = model_ABSA(ids_tensors=ids_tensors, lable_tensors=label_ids, masks_tensors=masks_tensors, segments_tensors=segments_tensors)
            losses.append(loss.item())
            loss.backward()
            optimizer_ABSA.step()
            optimizer_ABSA.zero_grad()

            finish_data += 1
            current_times.append(round(time.time()-t0,3))
            current = np.mean(current_times)
            hr, min, sec = evl_time(current*(all_data-finish_data) + current*all_data*(epochs-epoch-1))
            print('epoch:', epoch, " batch:", finish_data, "/" , all_data, " loss:", np.mean(losses), " hr:", hr, " min:", min," sec:", sec)         

        save_model(model_ABSA, f'/content/gdrive/MyDrive/제대_사사과정/{ABSA_file}')
        
def test_model_ABSA(loader):
    pred = []
    trueth = []
    with torch.no_grad():
        for data in loader:

            ids_tensors, segments_tensors, masks_tensors, label_ids = data
            ids_tensors = ids_tensors.to(DEVICE)
            segments_tensors = segments_tensors.to(DEVICE)
            masks_tensors = masks_tensors.to(DEVICE)

            outputs = model_ABSA(ids_tensors, None, masks_tensors=masks_tensors, segments_tensors=segments_tensors)
            
            _, predictions = torch.max(outputs, dim=1)

            pred += list([int(i) for i in predictions])
            trueth += list([int(i) for i in label_ids])

    return trueth, pred



In [None]:
%time train_model_ABSA(train_loader, 3)

In [12]:
model_ABSA = load_model(model_ABSA, f'/content/gdrive/MyDrive/제대_사사과정/{ABSA_file}')

In [13]:
%time x, y = test_model_ABSA(test_loader)
print(classification_report(x, y, target_names=[str(i) for i in range(3)]))

Wall time: 10.1 s
              precision    recall  f1-score   support

           0       0.72      0.75      0.74       497
           1       0.67      0.74      0.70       710
           2       0.89      0.83      0.86      1239

    accuracy                           0.79      2446
   macro avg       0.76      0.77      0.77      2446
weighted avg       0.79      0.79      0.79      2446



# ATE + ABSA

In [57]:
def predict_model_ABSA(sentence, aspect, tokenizer):
    t1 = tokenizer.tokenize(sentence)
    t2 = tokenizer.tokenize(aspect)

    word_pieces = ['[cls]']
    word_pieces += t1
    word_pieces += ['[sep]']
    word_pieces += t2

    segment_tensor = [0] + [0]*len(t1) + [0] + [1]*len(t2)

    ids = tokenizer.convert_tokens_to_ids(word_pieces)
    input_tensor = torch.tensor([ids]).to(DEVICE)
    segment_tensor = torch.tensor(segment_tensor).to(DEVICE)

    with torch.no_grad():
        outputs = model_ABSA(input_tensor, None, None, segments_tensors=segment_tensor)
        _, predictions = torch.max(outputs, dim=1)
    
    return word_pieces, predictions, outputs

def predict_model_ATE(sentence, tokenizer):
    word_pieces = []
    tokens = tokenizer.tokenize(sentence)
    word_pieces += tokens

    ids = tokenizer.convert_tokens_to_ids(word_pieces)
    input_tensor = torch.tensor([ids]).to(DEVICE)

    with torch.no_grad():
        outputs = model_ATE(input_tensor, None, None)
        _, predictions = torch.max(outputs, dim=2)
    predictions = predictions[0].tolist()

    return word_pieces, predictions, outputs

def ATE_ABSA(text):
    terms = []
    word = ""
    x, y, z = predict_model_ATE(text, tokenizer)
    for i in range(len(y)):
        if y[i] == 1:
            if len(word) != 0:
                terms.append(word.replace(" ##",""))
            word = x[i]
        if y[i] == 2:
            word += (" " + x[i])
            
    
    if len(word) != 0:
            terms.append(word.replace(" ##",""))
            
    print("tokens:", x)
    print("ATE:", terms)
    
    if len(terms) != 0:
        for i in terms:
            _, c, p = predict_model_ABSA(text, i, tokenizer)
            print("term:", [i], "class:", [int(c)], "ABSA:", [float(p[0][0]), float(p[0][1]), float(p[0][2])])


In [58]:
model_ABSA = load_model(model_ABSA, f'/content/gdrive/MyDrive/제대_사사과정/{ABSA_file}')
model_ATE = load_model(model_ATE, f'/content/gdrive/MyDrive/제대_사사과정/{ATE_file}')

In [59]:
text = "For the price you pay this product is very good. However, battery life is a little lack-luster coming from a MacBook Pro."
ATE_ABSA(text)

tokens: ['for', 'the', 'price', 'you', 'pay', 'this', 'product', 'is', 'very', 'good', '.', 'however', ',', 'battery', 'life', 'is', 'a', 'little', 'lack', '-', 'lust', '##er', 'coming', 'from', 'a', 'mac', '##book', 'pro', '.']
ATE: ['price', 'battery life']
term: ['price'] class: [2] ABSA: [-2.057527542114258, -0.6292028427124023, 2.606888771057129]
term: ['battery life'] class: [0] ABSA: [5.0118207931518555, -2.3663508892059326, -1.8548927307128906]


In [60]:
text = "I think Apple is better than Microsoft."
ATE_ABSA(text)

tokens: ['i', 'think', 'apple', 'is', 'better', 'than', 'microsoft', '.']
ATE: ['apple', 'microsoft']
term: ['apple'] class: [1] ABSA: [-0.6019173264503479, 1.7071634531021118, -0.6558032631874084]
term: ['microsoft'] class: [0] ABSA: [3.7963366508483887, 0.2745559811592102, -3.170161485671997]


# Cyberpunk 2077 - Xbox One

https://www.amazon.com/-/zh_TW/Cyberpunk-2077-Xbox-One/product-reviews/B07DJW4WZC/ref=cm_cr_arp_d_paging_btm_next_2?ie=UTF8&reviewerType=all_reviews&pageNumber=2

In [61]:
text = "Spent 5 hours downloading updates."
ATE_ABSA(text)

tokens: ['spent', '5', 'hours', 'download', '##ing', 'updates', '.']
ATE: ['download', '##ing updates']
term: ['download'] class: [1] ABSA: [-1.8432576656341553, 5.523550987243652, -3.1730458736419678]
term: ['##ing updates'] class: [1] ABSA: [-2.6808316707611084, 6.011416912078857, -2.864603042602539]


In [62]:
text = "Install is buggy, so after downloading a day one patch that's nearly 3 times the size of the game, it glitched on the CDs and had to reinstall the game from scratch."
ATE_ABSA(text)

tokens: ['install', 'is', 'bug', '##gy', ',', 'so', 'after', 'download', '##ing', 'a', 'day', 'one', 'patch', 'that', "'", 's', 'nearly', '3', 'times', 'the', 'size', 'of', 'the', 'game', ',', 'it', 'g', '##lit', '##ched', 'on', 'the', 'cds', 'and', 'had', 'to', 'reins', '##tal', '##l', 'the', 'game', 'from', 'scratch', '.']
ATE: ['install', 'patch', 'size', 'game', 'cds', 'game']
term: ['install'] class: [1] ABSA: [-2.2875146865844727, 4.987157821655273, -2.462806463241577]
term: ['patch'] class: [0] ABSA: [3.8425865173339844, -1.115385890007019, -1.8236310482025146]
term: ['size'] class: [1] ABSA: [1.2320547103881836, 1.6156060695648193, -2.0953636169433594]
term: ['game'] class: [1] ABSA: [-2.642533779144287, 5.77440071105957, -2.5892865657806396]
term: ['cds'] class: [1] ABSA: [-2.673746347427368, 5.815953731536865, -2.6103947162628174]
term: ['game'] class: [1] ABSA: [-2.642533779144287, 5.77440071105957, -2.5892865657806396]


In [71]:
text = "Cyberpunk 2077 freezes constantly, frame rates are terrible, and it's extremely frustrating to try to play."
ATE_ABSA(text)

tokens: ['cyber', '##pu', '##nk', '207', '##7', 'freeze', '##s', 'constantly', ',', 'frame', 'rates', 'are', 'terrible', ',', 'and', 'it', "'", 's', 'extremely', 'frustrating', 'to', 'try', 'to', 'play', '.']
ATE: ['cyberpu', 'frame rates']
term: ['cyberpu'] class: [0] ABSA: [4.44415283203125, -0.36560752987861633, -3.3459084033966064]
term: ['frame rates'] class: [0] ABSA: [5.2562408447265625, -2.305537700653076, -2.0652124881744385]


In [66]:
text = "Cyberpunk 2077 is completely unplayable on xbox one. They should have never released this for current gen."
ATE_ABSA(text)

tokens: ['cyber', '##pu', '##nk', '207', '##7', 'is', 'completely', 'un', '##play', '##able', 'on', 'xbox', 'one', '.', 'they', 'should', 'have', 'never', 'released', 'this', 'for', 'current', 'gen', '.']
ATE: ['xbox']
term: ['xbox'] class: [1] ABSA: [-2.0123980045318604, 5.7579731941223145, -3.235884666442871]


In [67]:
text = "It’s just a cash grab, the game crashes constantly, runs at like 20 fps, half the environment and characters only load when you’re three feet away from them. Unless you’re in a small space the game looks awful. The worst game i’ve ever played in years visually. It looks worse than later xbox 360 games."
ATE_ABSA(text)

tokens: ['it', '’', 's', 'just', 'a', 'cash', 'grab', ',', 'the', 'game', 'crashes', 'constantly', ',', 'runs', 'at', 'like', '20', 'f', '##ps', ',', 'half', 'the', 'environment', 'and', 'characters', 'only', 'load', 'when', 'you', '’', 're', 'three', 'feet', 'away', 'from', 'them', '.', 'unless', 'you', '’', 're', 'in', 'a', 'small', 'space', 'the', 'game', 'looks', 'awful', '.', 'the', 'worst', 'game', 'i', '’', 've', 'ever', 'played', 'in', 'years', 'visually', '.', 'it', 'looks', 'worse', 'than', 'later', 'xbox', '360', 'games', '.']
ATE: ['runs', 'environment', 'characters', 'xbox 360']
term: ['runs'] class: [0] ABSA: [4.292036056518555, -0.47526031732559204, -3.092445135116577]
term: ['environment'] class: [0] ABSA: [3.7919883728027344, 0.2691774070262909, -3.255924940109253]
term: ['characters'] class: [0] ABSA: [3.6958675384521484, 0.34112149477005005, -3.180785655975342]
term: ['xbox 360'] class: [0] ABSA: [3.6143949031829834, 0.5955154299736023, -3.378544330596924]


In [68]:
text = "CD Projekt Red should have just abandoned the current gen consoles instead of cheating people out of their money."
ATE_ABSA(text)

tokens: ['cd', 'pro', '##je', '##kt', 'red', 'should', 'have', 'just', 'abandoned', 'the', 'current', 'gen', 'consoles', 'instead', 'of', 'cheating', 'people', 'out', 'of', 'their', 'money', '.']
ATE: ['gen', 'consoles']
term: ['gen'] class: [1] ABSA: [-2.468400478363037, 5.807504177093506, -2.8556580543518066]
term: ['consoles'] class: [1] ABSA: [-2.6495633125305176, 5.937881946563721, -2.8111135959625244]
