## Restaurant NER (AE - Aspect Extraction) By Distilbert

### Preprocess training data

In [2]:
import json
import pandas as pd

rest_ner_training_file = '/home/chenfenghuang/Documents/ABSA/data/ae/rest/train.json'
rest_ner_training_samples = json.load(open(rest_ner_training_file))
len(rest_ner_training_samples)

1850

In [3]:
rest_ner_training_samples

{'0': {'label': ['O', 'O', 'O', 'B'],
  'sentence': ['I', 'LOVE', 'their', 'Thai']},
 '1': {'label': ['O', 'B', 'O', 'O', 'O'],
  'sentence': ['The', 'service', 'was', 'attentive', '.']},
 '2': {'label': ['O', 'O', 'O', 'O', 'O', 'O'],
  'sentence': ['I', 'go', 'twice', 'a', 'month', '!']},
 '3': {'label': ['O',
   'B',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'B',
   'I',
   'I',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'B',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O'],
  'sentence': ['The',
   'food',
   'was',
   'average',
   'to',
   'above',
   '-',
   'average',
   ';',
   'the',
   'French',
   'Onion',
   'soup',
   'filling',
   'yet',
   'not',
   'overly',
   'impressive',
   ',',
   'and',
   'the',
   'desserts',
   'not',
   'brilliant',
   'in',
   'any',
   'way',
   '.']},
 '4': {'label': ['O', 'O', 'B', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
  'sentence': ['Ask',
   'for',
   'Usha',
   ',',
   'the',
   'nicest',
 

In [4]:
dict_rest_ner_training_samples = {'sentence_id':[], 'sentence':[], 'label':[]}

for key in rest_ner_training_samples.keys():
    dict_rest_ner_training_samples['sentence_id'].append(key)
    dict_rest_ner_training_samples['sentence'].append(rest_ner_training_samples[key]['sentence'])
    dict_rest_ner_training_samples['label'].append(rest_ner_training_samples[key]['label'])
    
dict_rest_ner_training_samples

df_rest_ner_training_samples = pd.DataFrame.from_dict(dict_rest_ner_training_samples)
df_rest_ner_training_samples

Unnamed: 0,sentence_id,sentence,label
0,0,"[I, LOVE, their, Thai]","[O, O, O, B]"
1,1,"[The, service, was, attentive, .]","[O, B, O, O, O]"
2,2,"[I, go, twice, a, month, !]","[O, O, O, O, O, O]"
3,3,"[The, food, was, average, to, above, -, averag...","[O, B, O, O, O, O, O, O, O, O, B, I, I, O, O, ..."
4,4,"[Ask, for, Usha, ,, the, nicest, bartender, in...","[O, O, B, O, O, O, O, O, O, O]"
...,...,...,...
1845,1845,"[You, can, get, a, table, without, a, reservat...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1846,1846,"[I, have, eaten, at, some, of, the, 'best, ', ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1847,1847,"[Worth, visiting, the, 1st, Ave, spot, because...","[O, O, O, B, I, I, O, O, O, O, O, O, O]"
1848,1848,"[Its, located, in, greenewich, village, .]","[O, O, O, O, O, O]"


In [5]:
df_rest_ner_training_samples_exploded = df_rest_ner_training_samples.explode(['sentence', 'label'])
df_rest_ner_training_samples_exploded

Unnamed: 0,sentence_id,sentence,label
0,0,I,O
0,0,LOVE,O
0,0,their,O
0,0,Thai,B
1,1,The,O
...,...,...,...
1849,1849,front,O
1849,1849,of,O
1849,1849,the,O
1849,1849,guest,O


In [6]:
def get_mapping(data, column):
    t2id = {}
    id2t = {}
 
    vocab = list(set(data[column]))
    
    id2t = {i:t for i, t in enumerate(vocab)}
    t2id = {t:i for  i, t in enumerate(vocab)}
    return t2id, id2t


token2id, id2token = get_mapping(df_rest_ner_training_samples_exploded, 'sentence')
tag2id, id2tag = get_mapping(df_rest_ner_training_samples_exploded, 'label')

print(list(token2id.items())[0:5], list(id2token.items())[0:5])
print(tag2id, id2tag)

[('un', 0), ('Cantonese', 1), ('NAME', 2), ('dined', 3), ('delivered', 4)] [(0, 'un'), (1, 'Cantonese'), (2, 'NAME'), (3, 'dined'), (4, 'delivered')]
{'O': 0, 'B': 1, 'I': 2} {0: 'O', 1: 'B', 2: 'I'}


In [7]:
df_rest_ner_training_samples_exploded['word_id'] = df_rest_ner_training_samples_exploded['sentence'].map(token2id)
df_rest_ner_training_samples_exploded['tag_id'] = df_rest_ner_training_samples_exploded['label'].map(tag2id)
df_rest_ner_training_samples_exploded.head(20)

Unnamed: 0,sentence_id,sentence,label,word_id,tag_id
0,0,I,O,1765,0
0,0,LOVE,O,2201,0
0,0,their,O,2083,0
0,0,Thai,B,2913,1
1,1,The,O,20,0
1,1,service,B,98,1
1,1,was,O,2469,0
1,1,attentive,O,2226,0
1,1,.,O,3792,0
2,2,I,O,1765,0


In [8]:
df_rest_ner_training_samples_exploded.isnull().sum()

sentence_id    0
sentence       0
label          0
word_id        0
tag_id         0
dtype: int64

In [9]:
df_rest_ner_training_samples = df_rest_ner_training_samples_exploded.groupby(['sentence_id'])['sentence', 'label', 'word_id', 'tag_id'].agg(lambda x: list(x))
df_rest_ner_training_samples

  df_rest_ner_training_samples = df_rest_ner_training_samples_exploded.groupby(['sentence_id'])['sentence', 'label', 'word_id', 'tag_id'].agg(lambda x: list(x))


Unnamed: 0_level_0,sentence,label,word_id,tag_id
sentence_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,"[I, LOVE, their, Thai]","[O, O, O, B]","[1765, 2201, 2083, 2913]","[0, 0, 0, 1]"
1,"[The, service, was, attentive, .]","[O, B, O, O, O]","[20, 98, 2469, 2226, 3792]","[0, 1, 0, 0, 0]"
10,"[Everything, is, always, cooked, to, perfectio...","[O, O, O, O, O, O, O, O, B, O, O, O, O, B, O, ...","[844, 3732, 1500, 484, 1086, 135, 302, 941, 98...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, ..."
100,"[good, music, ,, great, food, ,, speedy, servi...","[O, B, O, O, B, O, O, B, O, O, O]","[1381, 1441, 302, 1337, 184, 302, 2110, 98, 27...","[0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0]"
1000,"[But, the, pizza, is, way, to, expensive, .]","[O, O, B, O, O, O, O, O]","[1294, 941, 1575, 3732, 3465, 1086, 1289, 3792]","[0, 0, 1, 0, 0, 0, 0, 0]"
...,...,...,...,...
995,"[The, only, beverage, we, did, receive, was, w...","[O, O, O, O, O, O, O, O, O, O, O, O]","[20, 1053, 790, 1983, 2642, 1845, 2469, 3221, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
996,"[A, friend, from, Seattle, and, I, went, on, a...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[1836, 287, 481, 2709, 2963, 1765, 2482, 1736,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
997,"[Went, on, a, 3, day, oyster, binge, ,, with, ...","[O, O, O, O, O, O, O, O, O, B, O, O, O, O, O, ...","[447, 1736, 1137, 1613, 3274, 3584, 2091, 302,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
998,"[Service, is, fast, and, friendly, .]","[B, O, O, O, O, O]","[1761, 3732, 1692, 2963, 1850, 3792]","[1, 0, 0, 0, 0, 0]"


In [10]:
max_sent_length = max([len(each) for each in list(df_rest_ner_training_samples['sentence'])])
max_sent_length

74

### Load and finetune pretrained Bert model

In [11]:
from transformers import AutoModelForTokenClassification, AutoTokenizer

pretrained_bert_model_name = 'distilbert-base-uncased'
pretrained_bert_tokenizer = AutoTokenizer.from_pretrained(pretrained_bert_model_name)

tokens = pretrained_bert_tokenizer(list(df_rest_ner_training_samples['sentence'])[0], is_split_into_words=True)
tokens

{'input_ids': [101, 1045, 2293, 2037, 7273, 102], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [12]:
from datasets import Dataset

def align_ner_labels(samples):
    inputs = pretrained_bert_tokenizer(samples['sentence'], truncation=True, padding='max_length', max_length=128, is_split_into_words=True) # Truncate if one sentence is more than 128 words

    label_input = []
    for i, label in enumerate(samples['label']):
        word_ids = inputs.word_ids(batch_index=i)
        
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            else:
                label_ids.append(tag2id[label[word_idx]])
        
        if i == 0:
            print(i)
            print(label)
            print(word_ids)
            print(label_ids)
            
        label_input.append(label_ids)
        
    inputs['labels'] = label_input
    
    # print('inputs', inputs.keys())
    
    return inputs

train_datasets = Dataset.from_pandas(df_rest_ner_training_samples[:1600]).map(align_ner_labels, batched=True, batch_size=100).remove_columns(['sentence', 'label', 'word_id', 'tag_id', 'sentence_id'])
validation_datasets = Dataset.from_pandas(df_rest_ner_training_samples[1600:]).map(align_ner_labels, batched=True, batch_size=10).remove_columns(['sentence', 'label', 'word_id', 'tag_id', 'sentence_id'])

Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

0
['O', 'O', 'O', 'B']
[None, 0, 1, 2, 3, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]
[-100, 0, 0, 0, 1, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -10

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

0
['B', 'I', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
[None, 0, 0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 9, 9, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]
[-100, 1, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -10

In [13]:
train_datasets

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1600
})

In [14]:
validation_datasets

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 250
})

In [4]:
import torch 

#device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')   # mps is for Apple M1 GPU\
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorForTokenClassification
import torch

pretrained_bert_model = AutoModelForTokenClassification.from_pretrained(pretrained_bert_model_name, num_labels=len(tag2id.keys()), id2label=id2tag, label2id=tag2id)

pretrained_bert_model.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN t

DistilBertForTokenClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
    

In [None]:
args = TrainingArguments(
    output_dir='./AE_models/rest_ner_model_distilbert',
    per_device_train_batch_size=128,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    logging_strategy='epoch',
    evaluation_strategy='epoch',
    report_to='none'
)

# Default padding
data_collator = DataCollatorForTokenClassification(pretrained_bert_tokenizer) 
    
trainer = Trainer(
    model=pretrained_bert_model,
    args=args,
    train_dataset=train_datasets,
    eval_dataset=validation_datasets,
    # data_collator=data_collator, # for padding, optional here
    tokenizer=pretrained_bert_tokenizer
)

trainer.train()
trainer.save_model()


# solution to "AttributeError: module 'torch.distributed' has no attribute 'is_initialized'"
# https://stackoverflow.com/questions/72641886/attributeerror-module-torch-distributed-has-no-attribute-is-initialized-in/72641887

***** Running training *****
  Num examples = 1600
  Num Epochs = 10
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 130
  Number of trainable parameters = 66365187


  0%|          | 0/130 [00:00<?, ?it/s]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


***** Running Evaluation *****
  Num examples = 250
  Batch size = 32


{'loss': 0.4956, 'learning_rate': 4.5e-05, 'epoch': 1.0}


  0%|          | 0/8 [00:00<?, ?it/s]

{'eval_loss': 0.28295931220054626, 'eval_runtime': 4.0475, 'eval_samples_per_second': 61.766, 'eval_steps_per_second': 1.977, 'epoch': 1.0}


***** Running Evaluation *****
  Num examples = 250
  Batch size = 32


{'loss': 0.2114, 'learning_rate': 4e-05, 'epoch': 2.0}


  0%|          | 0/8 [00:00<?, ?it/s]

{'eval_loss': 0.19204199314117432, 'eval_runtime': 3.999, 'eval_samples_per_second': 62.515, 'eval_steps_per_second': 2.0, 'epoch': 2.0}


***** Running Evaluation *****
  Num examples = 250
  Batch size = 32


{'loss': 0.1475, 'learning_rate': 3.5e-05, 'epoch': 3.0}


  0%|          | 0/8 [00:00<?, ?it/s]

{'eval_loss': 0.16163046658039093, 'eval_runtime': 3.9602, 'eval_samples_per_second': 63.127, 'eval_steps_per_second': 2.02, 'epoch': 3.0}


***** Running Evaluation *****
  Num examples = 250
  Batch size = 32


{'loss': 0.1025, 'learning_rate': 3e-05, 'epoch': 4.0}


  0%|          | 0/8 [00:00<?, ?it/s]

{'eval_loss': 0.1507633626461029, 'eval_runtime': 3.9693, 'eval_samples_per_second': 62.983, 'eval_steps_per_second': 2.015, 'epoch': 4.0}


***** Running Evaluation *****
  Num examples = 250
  Batch size = 32


{'loss': 0.0742, 'learning_rate': 2.5e-05, 'epoch': 5.0}


  0%|          | 0/8 [00:00<?, ?it/s]

{'eval_loss': 0.14325420558452606, 'eval_runtime': 3.9798, 'eval_samples_per_second': 62.818, 'eval_steps_per_second': 2.01, 'epoch': 5.0}


***** Running Evaluation *****
  Num examples = 250
  Batch size = 32


{'loss': 0.0529, 'learning_rate': 2e-05, 'epoch': 6.0}


  0%|          | 0/8 [00:00<?, ?it/s]

{'eval_loss': 0.1564081758260727, 'eval_runtime': 3.9559, 'eval_samples_per_second': 63.197, 'eval_steps_per_second': 2.022, 'epoch': 6.0}


***** Running Evaluation *****
  Num examples = 250
  Batch size = 32


{'loss': 0.039, 'learning_rate': 1.5e-05, 'epoch': 7.0}


  0%|          | 0/8 [00:00<?, ?it/s]

{'eval_loss': 0.17028847336769104, 'eval_runtime': 4.073, 'eval_samples_per_second': 61.379, 'eval_steps_per_second': 1.964, 'epoch': 7.0}


***** Running Evaluation *****
  Num examples = 250
  Batch size = 32


{'loss': 0.0298, 'learning_rate': 1e-05, 'epoch': 8.0}


  0%|          | 0/8 [00:00<?, ?it/s]

{'eval_loss': 0.1794525533914566, 'eval_runtime': 3.9664, 'eval_samples_per_second': 63.03, 'eval_steps_per_second': 2.017, 'epoch': 8.0}


***** Running Evaluation *****
  Num examples = 250
  Batch size = 32


{'loss': 0.0255, 'learning_rate': 5e-06, 'epoch': 9.0}


  0%|          | 0/8 [00:00<?, ?it/s]

{'eval_loss': 0.18778251111507416, 'eval_runtime': 4.1178, 'eval_samples_per_second': 60.712, 'eval_steps_per_second': 1.943, 'epoch': 9.0}


***** Running Evaluation *****
  Num examples = 250
  Batch size = 32


{'loss': 0.0218, 'learning_rate': 0.0, 'epoch': 10.0}


  0%|          | 0/8 [00:00<?, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to ./AE_models/rest_ner_model_distilbert
Configuration saved in ./AE_models/rest_ner_model_distilbert/config.json
Model weights saved in ./AE_models/rest_ner_model_distilbert/pytorch_model.bin
tokenizer config file saved in ./AE_models/rest_ner_model_distilbert/tokenizer_config.json
Special tokens file saved in ./AE_models/rest_ner_model_distilbert/special_tokens_map.json


{'eval_loss': 0.1909467726945877, 'eval_runtime': 4.0107, 'eval_samples_per_second': 62.333, 'eval_steps_per_second': 1.995, 'epoch': 10.0}
{'train_runtime': 780.7064, 'train_samples_per_second': 20.494, 'train_steps_per_second': 0.167, 'train_loss': 0.12002299771859096, 'epoch': 10.0}


In [None]:
from transformers import EarlyStoppingCallback


args = TrainingArguments(
    output_dir='./AE_models/rest_ner_model_distilbert_earlyStop',
    per_device_train_batch_size=128,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    logging_strategy='steps',
    evaluation_strategy='steps',
    eval_steps = 10,
    load_best_model_at_end=True,
    report_to='none'
)

data_collator = DataCollatorForTokenClassification(pretrained_bert_tokenizer)
    
trainer = Trainer(
    model=pretrained_bert_model,
    args=args,
    train_dataset=train_datasets,
    eval_dataset=validation_datasets,
    # data_collator=data_collator, # for padding, optional here
    tokenizer=pretrained_bert_tokenizer,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)

trainer.train()
trainer.save_model()


PyTorch: setting up devices
***** Running training *****
  Num examples = 1600
  Num Epochs = 10
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 130
  Number of trainable parameters = 66365187


  0%|          | 0/130 [00:00<?, ?it/s]

***** Running Evaluation *****
  Num examples = 250
  Batch size = 32


  0%|          | 0/8 [00:00<?, ?it/s]

{'eval_loss': 0.2922757565975189, 'eval_runtime': 4.1442, 'eval_samples_per_second': 60.326, 'eval_steps_per_second': 1.93, 'epoch': 0.77}


***** Running Evaluation *****
  Num examples = 250
  Batch size = 32


  0%|          | 0/8 [00:00<?, ?it/s]

{'eval_loss': 0.28691521286964417, 'eval_runtime': 4.1955, 'eval_samples_per_second': 59.588, 'eval_steps_per_second': 1.907, 'epoch': 1.54}


***** Running Evaluation *****
  Num examples = 250
  Batch size = 32


  0%|          | 0/8 [00:00<?, ?it/s]

{'eval_loss': 0.29062438011169434, 'eval_runtime': 4.1608, 'eval_samples_per_second': 60.085, 'eval_steps_per_second': 1.923, 'epoch': 2.31}


***** Running Evaluation *****
  Num examples = 250
  Batch size = 32


  0%|          | 0/8 [00:00<?, ?it/s]

{'eval_loss': 0.3034421503543854, 'eval_runtime': 4.1251, 'eval_samples_per_second': 60.604, 'eval_steps_per_second': 1.939, 'epoch': 3.08}


***** Running Evaluation *****
  Num examples = 250
  Batch size = 32


  0%|          | 0/8 [00:00<?, ?it/s]

{'eval_loss': 0.3107999563217163, 'eval_runtime': 4.1851, 'eval_samples_per_second': 59.735, 'eval_steps_per_second': 1.912, 'epoch': 3.85}


***** Running Evaluation *****
  Num examples = 250
  Batch size = 32


  0%|          | 0/8 [00:00<?, ?it/s]

{'eval_loss': 0.29551100730895996, 'eval_runtime': 4.3019, 'eval_samples_per_second': 58.114, 'eval_steps_per_second': 1.86, 'epoch': 4.62}


### Load finetune Bert model for Rest NER task, and do prediction

In [None]:
from transformers import AutoModelForTokenClassification, AutoTokenizer

finetuned_rest_tokenizer = AutoTokenizer.from_pretrained('./AE_models/rest_ner_model_distilbert')
finetuned_rest_model = AutoModelForTokenClassification.from_pretrained('.AE_models/rest_ner_model_distilbert', num_labels=len(tag2id.keys()))

2023-09-13 11:27:10.292489: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


NameError: name 'tag2id' is not defined

In [None]:
from transformers import pipeline

rest_ner_pipeline = pipeline('ner', model=finetuned_rest_model, tokenizer=finetuned_rest_tokenizer)

examples = ['Serves really good food, nice environment', 'Oh yeah ... the view was good too']
rest_ner_results = rest_ner_pipeline(examples)
print(rest_ner_results)

[[{'entity': 'B', 'score': 0.9978817, 'index': 4, 'word': 'food', 'start': 19, 'end': 23}, {'entity': 'B', 'score': 0.99630994, 'index': 7, 'word': 'environment', 'start': 30, 'end': 41}], [{'entity': 'B', 'score': 0.9970036, 'index': 7, 'word': 'view', 'start': 16, 'end': 20}]]


### Evaluation

In [None]:
rest_ner_test_file = './data/ae/rest/test.json'
rest_ner_test_samples = json.load(open(rest_ner_test_file))
len(rest_ner_test_samples)

676

In [None]:
dict_rest_ner_test_samples = {'sentence_id':[], 'sentence':[], 'full_sentence':[], 'label':[]}

for key in rest_ner_test_samples.keys():
    dict_rest_ner_test_samples['sentence_id'].append(key)
    dict_rest_ner_test_samples['sentence'].append(rest_ner_test_samples[key]['sentence'])
    dict_rest_ner_test_samples['full_sentence'].append(' '.join(rest_ner_test_samples[key]['sentence']))
    dict_rest_ner_test_samples['label'].append(rest_ner_test_samples[key]['label'])
    

In [None]:
rest_ner_results = rest_ner_pipeline(dict_rest_ner_test_samples['full_sentence'])

In [None]:
rest_ner_results[:5]

[[],
 [{'entity': 'B',
   'score': 0.99839336,
   'index': 4,
   'word': 'su',
   'start': 19,
   'end': 21},
  {'entity': 'B',
   'score': 0.9987388,
   'index': 5,
   'word': '##shi',
   'start': 21,
   'end': 24}],
 [{'entity': 'B',
   'score': 0.97772455,
   'index': 4,
   'word': 'portions',
   'start': 16,
   'end': 24}],
 [{'entity': 'B',
   'score': 0.99576145,
   'index': 1,
   'word': 'green',
   'start': 0,
   'end': 5},
  {'entity': 'I',
   'score': 0.9977702,
   'index': 2,
   'word': 'tea',
   'start': 6,
   'end': 9},
  {'entity': 'I',
   'score': 0.99857545,
   'index': 3,
   'word': 'cr',
   'start': 10,
   'end': 12},
  {'entity': 'I',
   'score': 0.9987343,
   'index': 4,
   'word': '##eme',
   'start': 12,
   'end': 15},
  {'entity': 'I',
   'score': 0.9982901,
   'index': 5,
   'word': 'br',
   'start': 16,
   'end': 18},
  {'entity': 'I',
   'score': 0.99826485,
   'index': 6,
   'word': '##ule',
   'start': 18,
   'end': 21},
  {'entity': 'I',
   'score': 0.99847

In [None]:
reformatted_rest_ner_results = []

for sentence_result in rest_ner_results:
    if len(sentence_result) == 0:
        reformatted_rest_ner_results.append({})
        continue
    last_label = sentence_result[0]['entity']
    last_token = sentence_result[0]['word']
    reformatted_sent_result = {}
    for token_result in sentence_result[1:]:
        label = token_result['entity']
        token = token_result['word']
        
        if token.startswith('##') is False:
            if last_label != '':
                reformatted_sent_result[last_token] = last_label
            last_token = token
            last_label = label
        else:
            last_token = last_token + token[2:] # remove '##' 
            last_label = label            

    if last_label != '':
        reformatted_sent_result[last_token] = last_label
    
    reformatted_rest_ner_results.append(reformatted_sent_result)
    
reformatted_rest_ner_results[:5]         

[{},
 {'sushi': 'B'},
 {'portions': 'B'},
 {'green': 'B', 'tea': 'I', 'creme': 'I', 'brulee': 'I'},
 {}]

In [None]:
y_pred = []

for tokens, ner_tags in zip(dict_rest_ner_test_samples['sentence'], reformatted_rest_ner_results):
    for token in tokens:
        if token not in ner_tags.keys():
            y_pred.append('O')
        else:
            y_pred.append(ner_tags[token])
    # print(tokens, ner_tags, pred)
    
y_pred[:10]
len(y_pred)

10096

In [None]:
y_true = []
for each in dict_rest_ner_test_samples['label']:
    y_true.extend(each)

len(y_true)

10096

In [None]:
from sklearn import metrics
metrics.accuracy_score(y_true, y_pred)

0.9522583201267829

In [None]:
metrics.precision_recall_fscore_support(y_true, y_pred, labels=['B', 'I', 'O'])

(array([0.7816092 , 0.70720721, 0.96760051]),
 array([0.66666667, 0.56071429, 0.9831595 ]),
 array([0.71957672, 0.62549801, 0.97531796]),
 array([ 612,  280, 9204]))

In [None]:
metrics.confusion_matrix(y_true, y_pred, labels=['B', 'I', 'O'])

array([[ 408,   10,  194],
       [  14,  157,  109],
       [ 100,   55, 9049]])

In [None]:
finetuned_rest_tokenizer2 = AutoTokenizer.from_pretrained('./rest_ner_model_distilbert_earlyStop')
finetuned_rest_model2 = AutoModelForTokenClassification.from_pretrained('./rest_ner_model_distilbert_earlyStop', num_labels=len(tag2id.keys()))
rest_ner_pipeline = pipeline('ner', model=finetuned_rest_model2, tokenizer=finetuned_rest_tokenizer2)
rest_ner_results = rest_ner_pipeline(dict_rest_ner_test_samples['full_sentence'])

reformatted_rest_ner_results = []

for sentence_result in rest_ner_results:
    if len(sentence_result) == 0:
        reformatted_rest_ner_results.append({})
        continue
    last_label = sentence_result[0]['entity']
    last_token = sentence_result[0]['word']
    reformatted_sent_result = {}
    for token_result in sentence_result[1:]:
        label = token_result['entity']
        token = token_result['word']
        
        if token.startswith('##') is False:
            if last_label != '':
                reformatted_sent_result[last_token] = last_label
            last_token = token
            last_label = label
        else:
            last_token = last_token + token[2:] # remove '##' 
            last_label = label            

    if last_label != '':
        reformatted_sent_result[last_token] = last_label
    
    reformatted_rest_ner_results.append(reformatted_sent_result)

y_pred = []
for tokens, ner_tags in zip(dict_rest_ner_test_samples['sentence'], reformatted_rest_ner_results):
    for token in tokens:
        if token not in ner_tags.keys():
            y_pred.append('O')
        else:
            y_pred.append(ner_tags[token])

y_true = []
for each in dict_rest_ner_test_samples['label']:
    y_true.extend(each)
print(len(y_pred), len(y_true))



loading file vocab.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file ./rest_ner_model_distilbert_earlyStop/config.json
Model config DistilBertConfig {
  "_name_or_path": "./rest_ner_model_distilbert_earlyStop",
  "activation": "gelu",
  "architectures": [
    "DistilBertForTokenClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "I",
    "1": "B",
    "2": "O"
  },
  "initializer_range": 0.02,
  "label2id": {
    "B": 1,
    "I": 0,
    "O": 2
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.24.0",
  "vocab_size": 30522
}

loading weights file ./rest_ner_model_disti

10096 10096


In [None]:
metrics.accuracy_score(y_true, y_pred)


0.9508716323296355

In [None]:
metrics.precision_recall_fscore_support(y_true, y_pred, labels=['B', 'I', 'O'])

(array([0.77821012, 0.68949772, 0.96646374]),
 array([0.65359477, 0.53928571, 0.9831595 ]),
 array([0.71047957, 0.60521042, 0.97474013]),
 array([ 612,  280, 9204]))

In [None]:
metrics.confusion_matrix(y_true, y_pred, labels=['B', 'I', 'O'])

array([[ 400,    9,  203],
       [  18,  151,  111],
       [  96,   59, 9049]])

## Laptop NER (AE)

In [None]:
import json
import pandas as pd

laptop_ner_training_file = '/home/chenfenghuang/Documents/ABSA/data/ae/laptop/train.json'
laptop_ner_training_samples = json.load(open(laptop_ner_training_file))
len(laptop_ner_training_samples)

2895

In [None]:
dict_laptop_ner_training_samples = {'sentence_id':[], 'sentence':[], 'label':[]}

for key in laptop_ner_training_samples.keys():
    dict_laptop_ner_training_samples['sentence_id'].append(key)
    dict_laptop_ner_training_samples['sentence'].append(laptop_ner_training_samples[key]['sentence'])
    dict_laptop_ner_training_samples['label'].append(laptop_ner_training_samples[key]['label'])
    
dict_laptop_ner_training_samples

df_laptop_ner_training_samples = pd.DataFrame.from_dict(dict_laptop_ner_training_samples)
df_laptop_ner_training_samples

Unnamed: 0,sentence_id,sentence,label
0,0,"[Keyboard, is, great, but, primary, and, secon...","[B, O, O, O, O, O, O, B, I, O, O, O, O, O]"
1,1,"[I, bought, this, laptop, about, a, month, ago...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
2,2,"[I, am, however, pleased, that, it, is, still,...","[O, O, O, O, O, O, O, O, O, O, O, O]"
3,3,"[I, went, to, my, local, Best, Buy, looking, f...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
4,4,"[The, Apple, MC371LL/, A, 2.4Ghz, 15.4-, inch,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
...,...,...,...
2890,2890,"[After, talking, it, over, with, the, very, kn...","[O, O, O, O, O, O, O, O, B, I, O, O, O, O, O, ..."
2891,2891,"[If, internet, connectivity, is, important, I,...","[O, B, I, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2892,2892,"[My, friend, just, had, to, replace, his, enti...","[O, O, O, O, O, O, O, O, B, O, O, O, O, O, O, ..."
2893,2893,"[I, work, with, kids, and, they, love, making,...","[O, O, O, O, O, O, O, O, O, O, O, O, O]"


In [None]:
df_laptop_ner_training_samples_exploded = df_laptop_ner_training_samples.explode(['sentence', 'label'])
df_laptop_ner_training_samples_exploded

Unnamed: 0,sentence_id,sentence,label
0,0,Keyboard,B
0,0,is,O
0,0,great,O
0,0,but,O
0,0,primary,O
...,...,...,...
2894,2894,machine,O
2894,2894,for,O
2894,2894,personal,O
2894,2894,use,O


In [None]:
token2id, id2token = get_mapping(df_laptop_ner_training_samples_exploded, 'sentence')
tag2id, id2tag = get_mapping(df_laptop_ner_training_samples_exploded, 'label')

print(list(token2id.items())[0:5], list(id2token.items())[0:5])
print(tag2id, id2tag)

[('engine', 0), ('Win7', 1), ('crap', 2), ('many', 3), ('D620', 4)] [(0, 'engine'), (1, 'Win7'), (2, 'crap'), (3, 'many'), (4, 'D620')]
{'B': 0, 'I': 1, 'O': 2} {0: 'B', 1: 'I', 2: 'O'}


In [None]:
df_laptop_ner_training_samples_exploded['word_id'] = df_laptop_ner_training_samples_exploded['sentence'].map(token2id)
df_laptop_ner_training_samples_exploded['tag_id'] = df_laptop_ner_training_samples_exploded['label'].map(tag2id)
df_laptop_ner_training_samples_exploded.head(20)

Unnamed: 0,sentence_id,sentence,label,word_id,tag_id
0,0,Keyboard,B,2678,0
0,0,is,O,1618,2
0,0,great,O,855,2
0,0,but,O,2867,2
0,0,primary,O,3529,2
0,0,and,O,81,2
0,0,secondary,O,2373,2
0,0,control,B,1021,0
0,0,buttons,I,3268,1
0,0,could,O,738,2


In [None]:
df_laptop_ner_training_samples_exploded.isnull().sum()

sentence_id    0
sentence       0
label          0
word_id        0
tag_id         0
dtype: int64

In [None]:
df_laptop_ner_training_samples = df_laptop_ner_training_samples_exploded.groupby(['sentence_id'])['sentence', 'label', 'word_id', 'tag_id'].agg(lambda x: list(x))
df_laptop_ner_training_samples

  df_laptop_ner_training_samples = df_laptop_ner_training_samples_exploded.groupby(['sentence_id'])['sentence', 'label', 'word_id', 'tag_id'].agg(lambda x: list(x))


Unnamed: 0_level_0,sentence,label,word_id,tag_id
sentence_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,"[Keyboard, is, great, but, primary, and, secon...","[B, O, O, O, O, O, O, B, I, O, O, O, O, O]","[2678, 1618, 855, 2867, 3529, 81, 2373, 1021, ...","[0, 2, 2, 2, 2, 2, 2, 0, 1, 2, 2, 2, 2, 2]"
1,"[I, bought, this, laptop, about, a, month, ago...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[128, 4387, 579, 2949, 3030, 2693, 1927, 488, ...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]"
10,"[-, Stay, away, from, Apple, ,, or, hope, you,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[4362, 1177, 1721, 907, 736, 2626, 3333, 1261,...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]"
100,"[also, you, may, need, to, charge, it, once, a...","[O, O, O, O, O, B, O, O, O, O, O, O, O, O, O, ...","[3845, 2126, 3002, 2292, 4080, 1230, 2022, 386...","[2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
1000,"[The, notebook, would, not, turn, back, on, ag...","[O, O, O, O, O, O, O, O, O]","[4662, 3323, 2432, 806, 3960, 4428, 3319, 4401...","[2, 2, 2, 2, 2, 2, 2, 2, 2]"
...,...,...,...,...
995,"[The, salesman, talked, us, into, this, comput...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[4662, 1163, 164, 28, 4407, 579, 1878, 1721, 9...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
996,"[The, switchable, graphic, card, is, pretty, s...","[O, B, I, I, O, O, O, O, O, O, B, O, O, O, O]","[4662, 2218, 588, 2698, 1618, 2990, 129, 4147,...","[2, 0, 1, 1, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2]"
997,"[Its, small, enough, where, I, can, take, it, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[797, 4099, 4304, 2802, 128, 1027, 1608, 2022,...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
998,"[They, only, stay, charged, a, little, over, a...","[O, O, O, B, O, O, O, O, O, O]","[1139, 581, 87, 3417, 2693, 1873, 745, 2003, 3...","[2, 2, 2, 0, 2, 2, 2, 2, 2, 2]"


In [None]:
max_sent_length = max([len(each) for each in list(df_laptop_ner_training_samples['sentence'])])
max_sent_length

83

In [None]:
pretrained_bert_tokenizer = AutoTokenizer.from_pretrained(pretrained_bert_model_name)

tokens = pretrained_bert_tokenizer(list(df_laptop_ner_training_samples['sentence'])[0], is_split_into_words=True)
tokens

{'input_ids': [101, 9019, 2003, 2307, 2021, 3078, 1998, 3905, 2491, 11287, 2071, 2022, 2062, 25634, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
train_datasets = Dataset.from_pandas(df_laptop_ner_training_samples[:2600]).map(align_ner_labels, batched=True, batch_size=100).remove_columns(['sentence', 'label', 'word_id', 'tag_id', 'sentence_id'])
validation_datasets = Dataset.from_pandas(df_laptop_ner_training_samples[2600:]).map(align_ner_labels, batched=True, batch_size=10).remove_columns(['sentence', 'label', 'word_id', 'tag_id', 'sentence_id'])

Map:   0%|          | 0/2600 [00:00<?, ? examples/s]

0
['B', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'I', 'O', 'O', 'O', 'O', 'O']
[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]
[-100, 0, 2, 2, 2, 2, 2, 2, 0, 1, 2, 2, 2, 2, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -10

Map:   0%|          | 0/295 [00:00<?, ? examples/s]

0
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]
[-100, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -10

In [None]:
train_datasets

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 2600
})

In [None]:
validation_datasets


Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 295
})

In [None]:
torch.cuda.empty_cache()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
pretrained_bert_model = AutoModelForTokenClassification.from_pretrained(pretrained_bert_model_name, num_labels=len(tag2id.keys()), id2label=id2tag, label2id=tag2id)

pretrained_bert_model.to(device)

NameError: name 'torch' is not defined

In [None]:
args = TrainingArguments(
    output_dir='./laptop_ner_model_distilbert',
    per_device_train_batch_size=128,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    logging_strategy='epoch',
    evaluation_strategy='epoch',
    report_to='none'
)

# Default padding
data_collator = DataCollatorForTokenClassification(pretrained_bert_tokenizer) 
    
trainer = Trainer(
    model=pretrained_bert_model,
    args=args,
    train_dataset=train_datasets,
    eval_dataset=validation_datasets,
    # data_collator=data_collator, # for padding, optional here
    tokenizer=pretrained_bert_tokenizer
)

trainer.train()
trainer.save_model()


# solution to "AttributeError: module 'torch.distributed' has no attribute 'is_initialized'"
# https://stackoverflow.com/questions/72641886/attributeerror-module-torch-distributed-has-no-attribute-is-initialized-in/72641887

## Load Testing  - to do...

### Config File

### Result - Distil Bert

### Result - Bert

### Config File

### Result - Distil Bert

### Result - Bert