#### Import libraries

In [1]:
!pip install --quiet datasets
!pip install --quiet transformers


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 KB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.9/132.9 KB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 KB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 KB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.2/114.2 KB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.8/158.8 KB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━

In [2]:
import tensorflow as tf
import re
import nltk
import numpy as np
import math
import time
import pandas as pd
from tensorflow.keras import layers
import tensorflow_datasets as tfds
import matplotlib.pyplot as plt
from datasets import load_dataset
from transformers import AutoTokenizer, TFT5ForConditionalGeneration

print(tf.__version__)

2.11.0


In [3]:
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


#### Loading data

In [4]:
dataset = load_dataset('aegrif/CIS6930_DAAGR_Empathetic_Dialogues')
dataset

Downloading readme:   0%|          | 0.00/750 [00:00<?, ?B/s]

Downloading and preparing dataset None/None to /root/.cache/huggingface/datasets/aegrif___parquet/aegrif--CIS6930_DAAGR_Empathetic_Dialogues-3358e2c61020f15c/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/8.55M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating test split:   0%|          | 0/10973 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/84167 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/12077 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/aegrif___parquet/aegrif--CIS6930_DAAGR_Empathetic_Dialogues-3358e2c61020f15c/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    test: Dataset({
        features: ['conv_id', 'utterance_idx', 'context', 'prompt', 'utterance', 'new_context', 'previous_utterance'],
        num_rows: 10973
    })
    train: Dataset({
        features: ['conv_id', 'utterance_idx', 'context', 'prompt', 'utterance', 'new_context', 'previous_utterance'],
        num_rows: 84167
    })
    validation: Dataset({
        features: ['conv_id', 'utterance_idx', 'context', 'prompt', 'utterance', 'new_context', 'previous_utterance'],
        num_rows: 12077
    })
})

Filter dataset to not use the first utterance in each conversation because we are training responses and there is no previous utterance for the first utterance in the conversation.

In [5]:
# Define the filtering function
def filter_function(example):
  return example['utterance_idx'] != 1

# Loop through the splits in the dataset and filter them
for split in dataset.keys():
  dataset[split] = dataset[split].filter(filter_function)

Filter:   0%|          | 0/10973 [00:00<?, ? examples/s]

Filter:   0%|          | 0/84167 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12077 [00:00<?, ? examples/s]

In [6]:
dataset["train"][0]

{'conv_id': 'hit:0_conv:1',
 'utterance_idx': 2,
 'context': 'sentimental',
 'prompt': 'i remember going to the fireworks with my best friend. there was a lot of people, but it only felt like us in the world.',
 'utterance': 'was this a friend you were in love with, or just a best friend?',
 'new_context': 'disappointed',
 'previous_utterance': 'i remember going to see the fireworks with my best friend. it was the first time we ever spent time alone together. although there was a lot of people, we felt like the only people in the world.'}

In [7]:
tr_questions = [q.strip() for q in dataset['train']["previous_utterance"]]
tr_answers = [a.strip() for a in dataset['train']["utterance"]]
print("question: " + tr_questions[1]) #list of questions
print("answer: " + tr_answers[1])

question: was this a friend you were in love with, or just a best friend?
answer: this was a best friend. i miss her.


#### Data preprocessing

In [8]:
##tokenize data
tokenizer = AutoTokenizer.from_pretrained("t5-small")

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [9]:
#process data
def process_data_addTask(data):
  data['input1'] = 'question: %s </s>' % (data['previous_utterance'])
  data['target'] = '%s </s>' % data['utterance']
  return data

In [10]:
# map add_eos_to_examples function to the dataset example wise 
train_dataset = dataset['train'].map(process_data_addTask)
validation_dataset = dataset['validation'].map(process_data_addTask)
train_dataset['target'][0]

Map:   0%|          | 0/64636 [00:00<?, ? examples/s]

Map:   0%|          | 0/9308 [00:00<?, ? examples/s]

'was this a friend you were in love with, or just a best friend? </s>'

In [11]:
def preprocess_function(data):
  context = data['utterance']
  question = data['input1']
  answer = data['target']
  

  encoder_inputs = tokenizer(question, truncation=True, 
                               return_tensors='tf', max_length=128,
                               padding="max_length")
    
  decoder_inputs = tokenizer(answer, truncation=True, 
                               return_tensors='tf', max_length=128,
                              padding="max_length")
  
  input_ids = encoder_inputs['input_ids'][0]
  input_attention = encoder_inputs['attention_mask'][0]
  target_ids = decoder_inputs['input_ids'][0]
  target_attention = decoder_inputs['attention_mask'][0]

  outputs = {'input_ids':input_ids, 'attention_mask': input_attention, 
               'labels':target_ids, 'decoder_attention_mask':target_attention}
  return outputs

In [12]:
train_ds=  train_dataset.map(preprocess_function)
validation_ds=  validation_dataset.map(preprocess_function)
train_ds  

Map:   0%|          | 0/64636 [00:00<?, ? examples/s]

Map:   0%|          | 0/9308 [00:00<?, ? examples/s]

Dataset({
    features: ['conv_id', 'utterance_idx', 'context', 'prompt', 'utterance', 'new_context', 'previous_utterance', 'input1', 'target', 'input_ids', 'attention_mask', 'labels', 'decoder_attention_mask'],
    num_rows: 64636
})

In [13]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator(return_tensors="tf")

In [14]:
columns = ['input_ids',  'attention_mask', 'labels', 'decoder_attention_mask']

encoded_tf_dataset = train_ds.to_tf_dataset(
    columns=columns,
    shuffle=True,
    batch_size=128,
    collate_fn=data_collator
)

encoded_tf_dataset2 = validation_ds.to_tf_dataset(
    columns=columns,
   shuffle=True,
    batch_size=16,
    collate_fn=data_collator
)

In [15]:
encoded_tf_dataset

<PrefetchDataset element_spec={'labels': TensorSpec(shape=(None, 128), dtype=tf.int64, name=None), 'input_ids': TensorSpec(shape=(None, 128), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(None, 128), dtype=tf.int64, name=None), 'decoder_attention_mask': TensorSpec(shape=(None, 128), dtype=tf.int64, name=None)}>

In [16]:
model = TFT5ForConditionalGeneration.from_pretrained("t5-small")

Downloading tf_model.h5:   0%|          | 0.00/242M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [17]:
data_dir = "/content/drive/MyDrive/code/Final Code Files/T5_models_no_emo"
log_dir = f"{data_dir}/experiments/t5/logs"
save_path = f"{data_dir}/experiments/t5/models"
cache_path_train = f"{data_dir}/cache/t5.train"
cache_path_validation = f"{data_dir}/cache/t5.validation"

In [18]:
import datetime
steps = 100
start_profile_batch = steps+10
stop_profile_batch = start_profile_batch + 100
profile_range = f"{start_profile_batch},{stop_profile_batch}"

log_path = log_dir + "/" + datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_path, histogram_freq=1,
                                                     update_freq=20,profile_batch=profile_range)

checkpoint_filepath = data_dir + "/" + "T5-{epoch:04d}-{val_loss:.4f}.ckpt"
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=False,
    monitor='val_loss',
    mode='min',
    save_best_only=True)

early_stopping_callback = tf.keras.callbacks.EarlyStopping(
    # Stop training when `val_loss` is no longer improving
    monitor="val_loss",
    # "no longer improving" being further defined as "for at least 4 epochs"
    patience=10,
    verbose=1,
    mode="min"
)

callbacks = [tensorboard_callback, model_checkpoint_callback, early_stopping_callback] 
metrics = [tf.keras.metrics.SparseTopKCategoricalAccuracy(name='accuracy')]

In [19]:
# learning_rate = CustomSchedule()
learning_rate = 0.001  # Instead set a static learning rate
optimizer = tf.keras.optimizers.Adam(learning_rate)

In [20]:
model.compile(optimizer=optimizer, metrics=metrics)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [21]:
epochs_done = 0
model.fit(x=encoded_tf_dataset, validation_data=encoded_tf_dataset2, epochs=50, callbacks=callbacks)

Epoch 1/50


Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089






Epoch 2/50



Epoch 3/50



Epoch 4/50



Epoch 5/50



Epoch 6/50



Epoch 7/50



Epoch 8/50



Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 18: early stopping


<keras.callbacks.History at 0x7f3a4ac3ac70>

### Test Model

In [22]:
question = "what do you mean it has not been easy ? how close have you come to cheating ?"
print(question)

what do you mean it has not been easy ? how close have you come to cheating ?


In [29]:
from google.colab import drive
drive.mount('/content/drive/test')

Mounted at /content/drive/test


In [23]:
input_text =  f"question: {question} </s>"
encoded_query = tokenizer(input_text, 
                         return_tensors='tf', padding=True, truncation=True, max_length=128)
input_ids = encoded_query["input_ids"]
attention_mask = encoded_query["attention_mask"]
generated_answer = model.generate(input_ids, attention_mask=attention_mask, 
                                 max_length=128, top_p=0.95, top_k=50, repetition_penalty=float(2))
decoded_answer = tokenizer.decode(generated_answer.numpy()[0])
print("Answer: ", decoded_answer)

Answer:  <pad> i have been cheating for years now.</s>


In [24]:
data_dir = "/content/drive/MyDrive/code/T5_models_no_emo"
log_dir = f"{data_dir}/experiments/t5/logs"
save_path = f"{data_dir}/experiments/t5/models"
cache_path_train = f"{data_dir}/cache/t5.train"
cache_path_test = f"{data_dir}/cache/t5.test"

In [None]:
/content/drive/MyDrive/code/Final Code Files

In [25]:
# Save the transformer model as a SavedModel.
save_directory = "/content/drive/MyDrive/code/Final_Code_Files/T5_models/saved_t5_models" 
model.save(save_directory)



In [26]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid.
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [27]:
model.push_to_hub("aegrif/CIS6930_DAAGR_T5_NoEmo")

tf_model.h5:   0%|          | 0.00/374M [00:00<?, ?B/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

In [30]:
loaded_model = TFT5ForConditionalGeneration.from_pretrained('aegrif/CIS6930_DAAGR_T5_NoEmo')

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/374M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at aegrif/CIS6930_DAAGR_T5_NoEmo.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


Downloading (…)neration_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

In [31]:
input_text =  f"question: {question} </s>"
encoded_query = tokenizer(input_text, 
                         return_tensors='tf', padding=True, truncation=True, max_length=128)
input_ids = encoded_query["input_ids"]
attention_mask = encoded_query["attention_mask"]
generated_answer = loaded_model.generate(input_ids, attention_mask=attention_mask, 
                                 max_length=128, top_p=0.95, top_k=500, repetition_penalty=float(4))
decoded_answer = tokenizer.decode(generated_answer.numpy()[0])
print("Answer: ", decoded_answer)

Answer:  <pad> i have been cheating for years now.</s>
