# **Aspect-based Sentiment Analysis using BERTs**

## **Prepare Dataset**

In [None]:
!pip install -q datasets==3.2.0

## **Load Dataset**

In [None]:
from datasets import load_dataset

ds = load_dataset("Chow05/SemEval-2014-Task-4")

In [None]:
ds

DatasetDict({
    train: Dataset({
        features: ['Tokens', 'Tags', 'Polarities'],
        num_rows: 3602
    })
    test: Dataset({
        features: ['Tokens', 'Tags', 'Polarities'],
        num_rows: 1119
    })
})

In [None]:
ds['train'][0]

{'Tokens': ['But', 'the', 'staff', 'was', 'so', 'horrible', 'to', 'us', '.'],
 'Tags': ['0', '0', '1', '0', '0', '0', '0', '0', '0'],
 'Polarities': ['-1', '-1', '0', '-1', '-1', '-1', '-1', '-1', '-1']}

## **Build Vocab**

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")

In [None]:
def tokenize_and_align_labels(examples):
    sentences, sentence_tags = [], []
    labels = []
    for tokens, pols in zip(examples['Tokens'], examples['Polarities']):

        bert_tokens = []
        bert_att = []
        pols_label = 0
        for i in range(len(tokens)):
            t = tokenizer.tokenize(tokens[i])
            bert_tokens += t
            if int(pols[i]) != -1:
                bert_att += t
                pols_label = int(pols[i])

        sentences.append(" ".join(bert_tokens))
        sentence_tags.append(" ".join(bert_att))
        labels.append(pols_label)

    tokenized_inputs = tokenizer(sentences, sentence_tags, padding=True, truncation=True, return_tensors="pt")
    tokenized_inputs['labels'] = labels
    return tokenized_inputs

In [None]:
preprocessed_ds = ds.map(tokenize_and_align_labels, batched=True)
preprocessed_ds

In [None]:
preprocessed_ds['train'][0]

{'Tokens': ['But', 'the', 'staff', 'was', 'so', 'horrible', 'to', 'us', '.'],
 'Tags': ['0', '0', '1', '0', '0', '0', '0', '0', '0'],
 'Polarities': ['-1', '-1', '0', '-1', '-1', '-1', '-1', '-1', '-1'],
 'input_ids': [2,
  47,
  14,
  1138,
  23,
  86,
  9244,
  20,
  182,
  13,
  9,
  3,
  1138,
  3,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  

In [None]:
len(tokenizer)

30000

## **Evaluate**

In [None]:
!pip install -q evaluate==0.4.3

In [None]:
id2label = {
    0: 'Negative',
    1: 'Neutral',
    2: 'Positive'
}

label2id = {
    'Negative': 0,
    'Neutral': 1,
    'Positive': 2
}

In [None]:
import numpy as np
import evaluate

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

## **Model**

In [None]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "albert/albert-base-v2",
    num_labels=3, id2label=id2label, label2id=label2id
)

In [None]:
model

AlbertForSequenceClassification(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertSdpaAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=

In [None]:
import torch
from torchinfo import summary

# Define dummy input
dummy_input = {
    "input_ids": torch.randint(0, 100, (64, 100)),  # Batch size 1, sequence length 128
    "attention_mask": torch.ones(64, 100)  # All tokens are attended
}

summary(
    model,
    input_data=dummy_input,
    col_names=["input_size", "output_size", "num_params"]
)

Layer (type:depth-idx)                                            Input Shape               Output Shape              Param #
AlbertForSequenceClassification                                   --                        [64, 3]                   --
├─AlbertModel: 1-1                                                --                        [64, 768]                 --
│    └─AlbertEmbeddings: 2-1                                      [64, 100]                 [64, 100, 128]            --
│    │    └─Embedding: 3-1                                        [64, 100]                 [64, 100, 128]            3,840,000
│    │    └─Embedding: 3-2                                        [64, 100]                 [64, 100, 128]            256
│    │    └─Embedding: 3-3                                        [1, 100]                  [1, 100, 128]             65,536
│    │    └─LayerNorm: 3-4                                        [64, 100, 128]            [64, 100, 128]            256
│    │    └─Dr

## **Training**

In [None]:
# Disable wandb
import os
os.environ['WANDB_DISABLED'] = 'true'

# # Use wandb
# import wandb
# wandb.init(
#     project="aspect-based-sentiment-analysis",
#     name="distilbert/distilbert-base-uncased" # "transformer-encoder", "lstm", "conv1d"
# )

In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="ATSC-albert-base-v2-For-SemEval-2014-Task-4",
    logging_dir="logs",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=4,
    num_train_epochs=54,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # report_to="wandb",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=preprocessed_ds["train"],
    eval_dataset=preprocessed_ds["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.8649,0.895392,0.649687
2,0.6734,0.568423,0.77748
3,0.5461,0.464129,0.809651
4,0.4142,0.453952,0.827525
5,0.3211,0.594624,0.803396
6,0.2437,0.497421,0.832887
7,0.1958,0.491639,0.816801
8,0.1601,0.634829,0.827525
9,0.1095,0.653261,0.829312
10,0.0885,0.721237,0.820375


Epoch,Training Loss,Validation Loss,Accuracy
1,0.8649,0.895392,0.649687
2,0.6734,0.568423,0.77748
3,0.5461,0.464129,0.809651
4,0.4142,0.453952,0.827525
5,0.3211,0.594624,0.803396
6,0.2437,0.497421,0.832887
7,0.1958,0.491639,0.816801
8,0.1601,0.634829,0.827525
9,0.1095,0.653261,0.829312
10,0.0885,0.721237,0.820375


TrainOutput(global_step=3080, training_loss=0.08017959315549014, metrics={'train_runtime': 5199.7528, 'train_samples_per_second': 38.1, 'train_steps_per_second': 0.592, 'total_flos': 1208285163155400.0, 'train_loss': 0.08017959315549014, 'epoch': 54.0353982300885})

In [None]:
trainer.save_model("/content/ATSC-albert-base-v2-For-SemEval-2014-Task-4")
tokenizer.save_pretrained("/content/ATSC-albert-base-v2-For-SemEval-2014-Task-4")
best_model_path = "/content/ATSC-albert-base-v2-For-SemEval-2014-Task-4"

model = AutoModelForSequenceClassification.from_pretrained(best_model_path)
tokenizer = AutoTokenizer.from_pretrained(best_model_path)


In [None]:
model.push_to_hub("Chow05/ATSC-albert-base-v2-For-SemEval-2014-Task-4", token=" ")
tokenizer.push_to_hub("Chow05/ATSC-albert-base-v2-For-SemEval-2014-Task-4", token=" ")


## **Inference**

In [None]:
from transformers import pipeline

token_classifier = pipeline(
    model="Chow05/distilbert-base-uncased-For-SemEval-2014-Task-4",
    aggregation_strategy="simple"
)

classifier = pipeline(
    model="Chow05/ATSC-albert-base-v2-For-SemEval-2014-Task-4"
)

In [2]:
test_sentence = 'The bread is top notch as well'

results = token_classifier(test_sentence)
sentence_tags = " ".join([result['word'] for result in results])
pred_label = classifier(f'{test_sentence} [SEP] {sentence_tags}')
sentence_tags, pred_label

('bread', [{'label': 'Positive', 'score': 0.999703586101532}])

In [3]:
test_sentence = 'Our agreed favorite is the orrecchiete with sausage and chicken'

results = token_classifier(test_sentence)
sentence_tags = " ".join([result['word'] for result in results])
pred_label = classifier(f'{test_sentence} [SEP] {sentence_tags}')
sentence_tags, pred_label

('orr ##ec ##chi ##ete with sausage and chicken',
 [{'label': 'Neutral', 'score': 0.997747004032135}])

In [4]:
test_sentence = '''
The dish was bursting with flavor,
perfectly cooked, and left me craving more.
'''

results = token_classifier(test_sentence)
sentence_tags = " ".join([result['word'] for result in results])
pred_label = classifier(f'{test_sentence} [SEP] {sentence_tags}')
sentence_tags, pred_label

('dish flavor cooked', [{'label': 'Positive', 'score': 0.9997000694274902}])

In [5]:
test_sentence = '''
The food was absolutely delicious,
with rich flavors and fresh ingredients that made every bite enjoyable.
The presentation was stunning,
and the balance of textures and spices elevated the entire dining experience.
'''

results = token_classifier(test_sentence)
sentence_tags = " ".join([result['word'] for result in results])
pred_label = classifier(f'{test_sentence} [SEP] {sentence_tags}')
sentence_tags, pred_label

('food flavors ingredients presentation balance textures spices dining',
 [{'label': 'Neutral', 'score': 0.83456951379776}])

In [6]:
test_sentence = 'Elon Musk is the richest person in the world'

results = token_classifier(test_sentence)
sentence_tags = " ".join([result['word'] for result in results])
pred_label = classifier(f'{test_sentence} [SEP] {sentence_tags}')
sentence_tags, pred_label

('##sk', [{'label': 'Positive', 'score': 0.9966268539428711}])