In [2]:
import tensorflow as tf
from transformers import T5Tokenizer, TFT5ForConditionalGeneration
import pandas as pd
import re
from sklearn.model_selection import train_test_split

# Load Model and Tokenizer

In [3]:
# Set up the T5 tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-base')

# Load the pre-trained T5 model
#model = TFT5ForConditionalGeneration.from_pretrained('t5-base')

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [6]:
pad_id = model.config.pad_token_id

# Data

In [4]:
df = pd.read_csv('D:/Kuliah/S2/Tesis/Code/Generative-ABSA-main/data/quadruplet/rest15/all_data.csv')
df.head()

Unnamed: 0,input,triplet,label,quadruplet_label
0,Judging from previous posts this used to be a ...,"(place,good,NEG)","('place', 'restaurant general', 'negative')","(place, good, NEG, restaurant general)"
1,"We , there were four of us , arrived at noon -...","(staff,rude,NEG)","('staff', 'service general', 'negative')","(staff, rude, NEG, service general)"
2,The food was lousy - too sweet or too salty an...,"(food,lousy,NEG);(food,too sweet,NEG);(food,to...","('food', 'food quality', 'negative'), ('portio...","(food, lousy, NEG, food quality);(food, too sw..."
3,Avoid this place,"(place,Avoid,NEG)","('place', 'restaurant general', 'negative')","(place, Avoid, NEG, restaurant general)"
4,"I have eaten at Saul , many times , the food i...","(food,good,POS)","('food', 'food quality', 'positive')","(food, good, POS, food quality)"


In [5]:
def make_data(x, y, pad_id):
    input_encodings = dict(tokenizer(x, padding=True, truncation=True, return_tensors="pt"))
    target_encodings = dict(tokenizer(y, padding=True, truncation=True, return_tensors="pt"))
    decoder_input_ids = target_encodings['input_ids']
    labels = decoder_input_ids.masked_fill_(decoder_input_ids == pad_id, -100)
    encodings = {
        'input_ids': tf.convert_to_tensor(input_encodings['input_ids'].numpy()),
        'attention_mask': tf.convert_to_tensor(input_encodings['attention_mask'].numpy()),
        'decoder_input_ids': tf.convert_to_tensor(decoder_input_ids.numpy()),
        'labels': tf.convert_to_tensor(labels.numpy()),
    }
    return encodings

In [6]:
#test pakai cara bart
x = df['input'].values.tolist()
y = df['quadruplet_label'].values.tolist()
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
# train_inputs = make_data(x_train, y_train, pad_id)
# test_inputs = make_data(x_test, y_test, pad_id)

In [7]:
pad_id

0

# Tokenize

In [9]:
# Define the training data and pre-process it with the tokenizer
train_inputs = dict(tokenizer(x_train, padding=True, truncation=True, return_tensors="tf"))
train_outputs = dict(tokenizer(y_train, padding=True, truncation=True, return_tensors="tf"))

test_inputs = dict(tokenizer(x_test, padding=True, truncation=True, return_tensors="tf"))
test_outputs = dict(tokenizer(y_test, padding=True, truncation=True, return_tensors="tf"))

train_inputs = {**train_inputs, "labels": train_outputs["input_ids"], 'decoder_attention_mask':train_outputs['attention_mask']} 
test_inputs = {**test_inputs, "labels": test_outputs["input_ids"], 'decoder_attention_mask':test_outputs['attention_mask']}

# Train Model

In [14]:
save_path = 'tf-t5-many-epoch-quadruplet'
checkpoint_filepath = save_path + "/" + "T5-{epoch:04d}-{val_loss:.4f}.ckpt"
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=False,
    monitor='val_loss',
    mode='min',
    save_best_only=True)

callbacks = [model_checkpoint_callback] 
metrics = [tf.keras.metrics.SparseTopKCategoricalAccuracy(name='accuracy') ]

In [15]:
# Define the fine-tuning parameters
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
model.compile(optimizer=optimizer, metrics=metrics)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [16]:
# Fine-tune the model
history = model.fit(x = train_inputs, epochs=200, batch_size=8, callbacks=callbacks,
validation_data=test_inputs)

Epoch 1/200



INFO:tensorflow:Assets written to: tf-t5-many-epoch-quadruplet\T5-0001-1.1564.ckpt\assets


INFO:tensorflow:Assets written to: tf-t5-many-epoch-quadruplet\T5-0001-1.1564.ckpt\assets


Epoch 2/200



INFO:tensorflow:Assets written to: tf-t5-many-epoch-quadruplet\T5-0002-0.6651.ckpt\assets


INFO:tensorflow:Assets written to: tf-t5-many-epoch-quadruplet\T5-0002-0.6651.ckpt\assets


Epoch 3/200



INFO:tensorflow:Assets written to: tf-t5-many-epoch-quadruplet\T5-0003-0.3861.ckpt\assets


INFO:tensorflow:Assets written to: tf-t5-many-epoch-quadruplet\T5-0003-0.3861.ckpt\assets


Epoch 4/200



INFO:tensorflow:Assets written to: tf-t5-many-epoch-quadruplet\T5-0004-0.2553.ckpt\assets


INFO:tensorflow:Assets written to: tf-t5-many-epoch-quadruplet\T5-0004-0.2553.ckpt\assets


Epoch 5/200



INFO:tensorflow:Assets written to: tf-t5-many-epoch-quadruplet\T5-0005-0.1871.ckpt\assets


INFO:tensorflow:Assets written to: tf-t5-many-epoch-quadruplet\T5-0005-0.1871.ckpt\assets


Epoch 6/200



INFO:tensorflow:Assets written to: tf-t5-many-epoch-quadruplet\T5-0006-0.1489.ckpt\assets


INFO:tensorflow:Assets written to: tf-t5-many-epoch-quadruplet\T5-0006-0.1489.ckpt\assets


Epoch 7/200



INFO:tensorflow:Assets written to: tf-t5-many-epoch-quadruplet\T5-0007-0.1298.ckpt\assets


INFO:tensorflow:Assets written to: tf-t5-many-epoch-quadruplet\T5-0007-0.1298.ckpt\assets


Epoch 8/200



INFO:tensorflow:Assets written to: tf-t5-many-epoch-quadruplet\T5-0008-0.1170.ckpt\assets


INFO:tensorflow:Assets written to: tf-t5-many-epoch-quadruplet\T5-0008-0.1170.ckpt\assets


Epoch 9/200



INFO:tensorflow:Assets written to: tf-t5-many-epoch-quadruplet\T5-0009-0.1095.ckpt\assets


INFO:tensorflow:Assets written to: tf-t5-many-epoch-quadruplet\T5-0009-0.1095.ckpt\assets


Epoch 10/200



INFO:tensorflow:Assets written to: tf-t5-many-epoch-quadruplet\T5-0010-0.1027.ckpt\assets


INFO:tensorflow:Assets written to: tf-t5-many-epoch-quadruplet\T5-0010-0.1027.ckpt\assets


Epoch 11/200



INFO:tensorflow:Assets written to: tf-t5-many-epoch-quadruplet\T5-0011-0.0987.ckpt\assets


INFO:tensorflow:Assets written to: tf-t5-many-epoch-quadruplet\T5-0011-0.0987.ckpt\assets


Epoch 12/200



INFO:tensorflow:Assets written to: tf-t5-many-epoch-quadruplet\T5-0012-0.0955.ckpt\assets


INFO:tensorflow:Assets written to: tf-t5-many-epoch-quadruplet\T5-0012-0.0955.ckpt\assets


Epoch 13/200



INFO:tensorflow:Assets written to: tf-t5-many-epoch-quadruplet\T5-0013-0.0918.ckpt\assets


INFO:tensorflow:Assets written to: tf-t5-many-epoch-quadruplet\T5-0013-0.0918.ckpt\assets


Epoch 14/200



INFO:tensorflow:Assets written to: tf-t5-many-epoch-quadruplet\T5-0014-0.0908.ckpt\assets


INFO:tensorflow:Assets written to: tf-t5-many-epoch-quadruplet\T5-0014-0.0908.ckpt\assets


Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200



INFO:tensorflow:Assets written to: tf-t5-many-epoch-quadruplet\T5-0018-0.0893.ckpt\assets


INFO:tensorflow:Assets written to: tf-t5-many-epoch-quadruplet\T5-0018-0.0893.ckpt\assets


Epoch 19/200



INFO:tensorflow:Assets written to: tf-t5-many-epoch-quadruplet\T5-0019-0.0850.ckpt\assets


INFO:tensorflow:Assets written to: tf-t5-many-epoch-quadruplet\T5-0019-0.0850.ckpt\assets


Epoch 20/200
Epoch 21/200
Epoch 22/200



INFO:tensorflow:Assets written to: tf-t5-many-epoch-quadruplet\T5-0022-0.0848.ckpt\assets


INFO:tensorflow:Assets written to: tf-t5-many-epoch-quadruplet\T5-0022-0.0848.ckpt\assets


Epoch 23/200
Epoch 24/200



INFO:tensorflow:Assets written to: tf-t5-many-epoch-quadruplet\T5-0024-0.0830.ckpt\assets


INFO:tensorflow:Assets written to: tf-t5-many-epoch-quadruplet\T5-0024-0.0830.ckpt\assets


Epoch 25/200
Epoch 26/200



INFO:tensorflow:Assets written to: tf-t5-many-epoch-quadruplet\T5-0026-0.0825.ckpt\assets


INFO:tensorflow:Assets written to: tf-t5-many-epoch-quadruplet\T5-0026-0.0825.ckpt\assets


Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 79/200
Epoch 80/200
Epoch 81/200
Epoch 82/200
Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103

In [17]:
model.save_pretrained(save_path)

# Inference

In [7]:
import random

In [8]:
save_path = 'tf-t5-many-epoch-quadruplet'
loaded_model = TFT5ForConditionalGeneration.from_pretrained(save_path)

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at tf-t5-many-epoch-quadruplet.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [10]:
for i in range(5):
    idx = random.randint(0, len(x_test))
    input_text =  x_test[idx]
    encoded_query = dict(tokenizer(input_text, padding=True, truncation=True, return_tensors="tf"))
    input_ids = encoded_query["input_ids"]
    attention_mask = encoded_query["attention_mask"]
    generated_answer = loaded_model.generate(input_ids, attention_mask=attention_mask, max_length=150)
    decoded_answer = tokenizer.decode(generated_answer.numpy()[0])

    print('Masukkan: ',x_test[idx])
    print('Label sebenarnya: ',y_test[idx])
    print('Hasil prediksi model: ',decoded_answer)
    print('-'*60)

Masukkan:  The place is beautiful 
Label sebenarnya:  (place, beautiful, POS, ambience general)
Hasil prediksi model:  <pad> (place, beautiful, POS, ambience general)</s>
------------------------------------------------------------
Masukkan:  I fell in love with the egg noodles in the beef broth with shrimp dumplings and slices of BBQ roast pork 
Label sebenarnya:  (egg noodles in the beef broth with shrimp dumplings and slices of BBQ roast pork, love, POS, food quality)
Hasil prediksi model:  <pad> (egg noodles in the beef broth with shrimp dumplings and slices of BBQ roast pork, fell in love, POS, food quality)</s>
------------------------------------------------------------
Masukkan:  The atmosphere is nothing special , but it feels like a Sushi establishment in Tokyo 
Label sebenarnya:  (atmosphere, nothing special, POS, ambience general)
Hasil prediksi model:  <pad> (atmosphere, nothing special, NEU, ambience general)</s>
-----------------------------------------------------------

In [20]:
pred_quadruplet = []
for i in range(len(x_test)):
    input_text =  x_test[i]
    encoded_query = dict(tokenizer(input_text, padding=True, truncation=True, return_tensors="tf"))
    input_ids = encoded_query["input_ids"]
    attention_mask = encoded_query["attention_mask"]
    generated_answer = loaded_model.generate(input_ids, attention_mask=attention_mask)
    decoded_answer = tokenizer.decode(generated_answer.numpy()[0])
    pred_quadruplet.append(decoded_answer)



In [24]:
df = pd.DataFrame(data={
    'input' : x_test,
    'output': y_test,
    'predicted' : pred_quadruplet
})

In [26]:
df.to_csv('rest15_quadruplet.csv', index=False)

# Post-process

In [28]:
from Levenshtein import distance

extract quadruplet

In [63]:
def extract_quadruplet(label):
    sample = [quadruplet.replace('(', '[').replace(')', ']') for quadruplet in label.split(';')]
    sample = [re.findall('\[(.*?)\]', quadruplet) for quadruplet in sample]
    quadruplets = []
    for quadruplet in sample:
        #untuk menghindari quadruplet kosong dari label hasil prediksi
        if quadruplet:
            quadruplet = quadruplet[0].split(',')
            quadruplets.append(tuple(quadruplet))
    return quadruplets

In [65]:
def compute_f1_scores(pred_pt, gold_pt):
    """
    Function to compute F1 scores with pred and gold pairs/triplets
    The input needs to be already processed
    """
    # number of true postive, gold standard, predicted aspect terms
    n_tp, n_gold, n_pred = 0, 0, 0

    for i in range(len(pred_pt)):
        n_gold += len(gold_pt[i])
        n_pred += len(pred_pt[i])

        for t in pred_pt[i]:
            if t in gold_pt[i]:
                n_tp += 1

    precision = float(n_tp) / float(n_pred) if n_pred != 0 else 0
    recall = float(n_tp) / float(n_gold) if n_gold != 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if precision != 0 or recall != 0 else 0
    scores = {'precision': precision, 'recall': recall, 'f1': f1}

    return scores

In [64]:
idx = random.randint(0, len(x_test))
print(extract_quadruplet(pred_quadruplet[idx]))
print(extract_quadruplet(y_test[idx]))

[('service', ' quick', ' POS', ' service general')]
[('service', ' quick', ' POS', ' service general'), ('service', ' friendly', ' POS', ' service general')]


In [66]:
idx = random.randint(0, len(x_test))
pred_label = extract_quadruplet(pred_quadruplet[idx])
true_label = extract_quadruplet(y_test[idx])
compute_f1_scores(pred_label, true_label)

{'precision': 0.25, 'recall': 0.25, 'f1': 0.25}

In [29]:
distance("lewenstein", "levenshtein")

2

In [62]:
arr = ['a', 'b', 'c']
tuple(arr)

('a', 'b', 'c')