# Setup

In [None]:
%%capture
!pip install transformers
!pip install datasets

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding
from tensorflow.keras.models import Model
from datasets import load_dataset

print(tf.__version__)

2.11.0


In [None]:
# Mount Google Drive at /content/drive; later cells write checkpoints and the
# saved model underneath this mount point.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Fail fast when TensorFlow does not report the first GPU; otherwise print
# the name of the device that was found.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


# Load Data

## Import from the Hugging Face Hub

In [None]:
# Load the project's Empathetic Dialogues dataset by its repository id.
dataset = load_dataset('aegrif/CIS6930_DAAGR_Empathetic_Dialogues')



  0%|          | 0/3 [00:00<?, ?it/s]

# Add emotion label

In [None]:
# Emotion name -> integer class id used as the classification target.
label_map = {
    'disappointed': 0,
    'annoyed': 1,
    'excited': 2,
    'afraid': 3,
    'disgusted': 4,
    'grateful': 5,
    'impressed': 6,
    'prepared': 7,
}


def map_label(example):
    """Attach the integer class id that corresponds to an example's emotion.

    The emotion name is read from ``example['new_context']``, looked up in
    ``label_map`` and stored under ``example['b_labels']``.

    Args:
        example: Mapping with a ``'new_context'`` entry; it is modified in place.

    Returns:
        The same ``example`` object, now carrying ``'b_labels'``.

    Raises:
        KeyError: If ``'new_context'`` is missing or its value is not a key
            of ``label_map``.
    """
    emotion = example['new_context']
    example['b_labels'] = label_map[emotion]
    return example

# Run map_label over the dataset so every example gains a 'b_labels' entry.
dataset = dataset.map(map_label)



In [None]:
# Show the splits with their columns and row counts (confirms 'b_labels' was added).
print(dataset)

DatasetDict({
    test: Dataset({
        features: ['conv_id', 'utterance_idx', 'context', 'prompt', 'utterance', 'previous_utterance', 'new_context', 'b_labels'],
        num_rows: 10973
    })
    train: Dataset({
        features: ['conv_id', 'utterance_idx', 'context', 'prompt', 'utterance', 'previous_utterance', 'new_context', 'b_labels'],
        num_rows: 84167
    })
    validation: Dataset({
        features: ['conv_id', 'utterance_idx', 'context', 'prompt', 'utterance', 'previous_utterance', 'new_context', 'b_labels'],
        num_rows: 12077
    })
})


### Model

In [None]:
import transformers
from transformers import DistilBertTokenizerFast
from transformers import TFDistilBertForSequenceClassification
from transformers import TextClassificationPipeline

In [None]:
# Integer class ids ('b_labels') for the train, validation and test splits, in that order.
y_train, y_val, y_test = (
    dataset[split]['b_labels'] for split in ('train', 'validation', 'test')
)

In [None]:
# Raw 'utterance' values for the train, validation and test splits, in that order.
X_train, X_val, X_test = (
    dataset[split]['utterance'] for split in ('train', 'validation', 'test')
)

In [None]:
# Tokenizer for the same 'distilbert-base-uncased' checkpoint that is fine-tuned below.
db_tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Encode the training and validation texts with truncation and padding enabled.
train_encodings, val_encodings = (
    db_tokenizer(texts, truncation=True, padding=True) for texts in (X_train, X_val)
)

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/vocab.txt
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/tokenizer_config.json
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropo

In [None]:
def _as_tf_dataset(encodings, labels):
    """Pair tokenizer encodings with their labels as a ``tf.data.Dataset``.

    Each element of the resulting dataset is a ``(features_dict, label)`` tuple.
    """
    return tf.data.Dataset.from_tensor_slices((dict(encodings), labels))


train_dataset = _as_tf_dataset(train_encodings, y_train)
val_dataset = _as_tf_dataset(val_encodings, y_val)

In [None]:
from transformers import TFDistilBertForSequenceClassification, TFTrainer, TFTrainingArguments, logging

# Emit INFO-level transformers logs so progress is visible in the cell output.
logging.set_verbosity_info()

# Hyper-parameters plus the evaluation / checkpoint schedule for fine-tuning.
training_args = TFTrainingArguments(
    # Checkpoints are written to Google Drive.
    output_dir='/content/drive/MyDrive/CIS6930_NLP_Group_Project/Final Code Files/DBert',
    overwrite_output_dir=True,
    num_train_epochs=15,
    per_device_train_batch_size=256,
    per_device_eval_batch_size=256,
    warmup_steps=500,
    weight_decay=1e-5,
    learning_rate=1e-5,
    logging_steps=500,
    logging_dir='./logs',
    # Evaluate and checkpoint on the same 100-step cadence.
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=100,
    save_total_limit=2,
    # NOTE(review): confirm that TFTrainer actually honours this flag.
    load_best_model_at_end=True,
)

# Store the emotion names in the model config. Without these mappings the saved
# (and later pushed) config.json only contains the generic LABEL_0 ... LABEL_7
# names, so a predicted class id cannot be turned back into an emotion.
id2label = {class_id: emotion for emotion, class_id in label_map.items()}
label2id = dict(label_map)

# Create the model inside the distribution strategy scope chosen by the training arguments.
with training_args.strategy.scope():
    trainer_model = TFDistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased',
        num_labels=len(label_map),  # 8 emotion classes
        id2label=id2label,
        label2id=label2id,
    )

# The validation split is what the periodic evaluation runs against.
trainer = TFTrainer(
    model=trainer_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Tensorflow: setting up strategy
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5

In [None]:
# Start fine-tuning; evaluation and checkpointing follow the settings in training_args.
trainer.train()

***** Running training *****
  Num examples = 84167
  Num Epochs = 15.0
  Instantaneous batch size per device = 256
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 1
  Steps per epoch = 329
  Total optimization steps = 4935
***** Running Evaluation *****
  Num examples in dataset = 12077
  Num examples in used in evaluation = 12288
  Batch size = 256
{'eval_loss': 2.055497169494629, 'epoch': 0.303951367781155, 'step': 100}
Saving checkpoint for step 100 at /content/drive/MyDrive/CIS6930_NLP_Group_Project/Final Code Files/DBert/checkpoint/ckpt-1
***** Running Evaluation *****
  Num examples in dataset = 12077
  Num examples in used in evaluation = 12288
  Batch size = 256
{'eval_loss': 1.9532055854797363, 'epoch': 0.60790273556231, 'step': 200}
Saving checkpoint for step 200 at /content/drive/MyDrive/CIS6930_NLP_Group_Project/Final Code Files/DBert/checkpoint/ckpt-2
***** Running Evaluation *****
  Num examples in dataset = 12077


In [None]:
# Evaluate using the eval_dataset that was handed to the trainer.
# NOTE(review): the recorded output of this cell is "AttributeError: ignored",
# i.e. the call did not complete in the saved run. Investigate the cause before
# relying on this cell; a finished notebook should not contain a failing cell.
trainer.evaluate()

AttributeError: ignored

In [None]:
# Save the fine-tuned model and its tokenizer into the same directory so that
# both can be reloaded from one path.
save_directory = "/content/drive/MyDrive/CIS6930_NLP_Group_Project/Final Code Files" 

# Writes the model configuration and weights.
trainer_model.save_pretrained(save_directory)

# Writes the tokenizer files; as the last expression, its return value (the
# saved file paths) is displayed as the cell output.
db_tokenizer.save_pretrained(save_directory)

Configuration saved in /content/drive/MyDrive/CIS6930_NLP_Group_Project/Final Code Files/config.json
Model weights saved in /content/drive/MyDrive/CIS6930_NLP_Group_Project/Final Code Files/tf_model.h5
tokenizer config file saved in /content/drive/MyDrive/CIS6930_NLP_Group_Project/Final Code Files/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/CIS6930_NLP_Group_Project/Final Code Files/special_tokens_map.json


('/content/drive/MyDrive/CIS6930_NLP_Group_Project/Final Code Files/tokenizer_config.json',
 '/content/drive/MyDrive/CIS6930_NLP_Group_Project/Final Code Files/special_tokens_map.json',
 '/content/drive/MyDrive/CIS6930_NLP_Group_Project/Final Code Files/vocab.txt',
 '/content/drive/MyDrive/CIS6930_NLP_Group_Project/Final Code Files/added_tokens.json',
 '/content/drive/MyDrive/CIS6930_NLP_Group_Project/Final Code Files/tokenizer.json')

In [None]:
# Load the saved model and tokenizer, then run a quick test

In [None]:
# Reload the tokenizer and model from the directory they were just saved to,
# to check that the saved files are usable on their own.
tokenizer_fine_tuned = DistilBertTokenizerFast.from_pretrained(save_directory)

model_fine_tuned = TFDistilBertForSequenceClassification.from_pretrained(save_directory)


loading file vocab.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file /content/drive/MyDrive/CIS6930_NLP_Group_Project/Final Code Files/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 

In [None]:
# Smoke test: classify one sentence with the model reloaded from disk.
test_text = "i have always been loyal to my wife"

# Encode the sentence as a TensorFlow tensor of token ids.
encode_options = dict(truncation=True, padding=True, return_tensors='tf')
predict_input = tokenizer_fine_tuned.encode(test_text, **encode_options)

# Element 0 of the model output is used as the per-class scores.
output = model_fine_tuned(predict_input)[0]

# Class id with the highest score for the first row; displayed as the cell output.
prediction_value = tf.argmax(output, axis=1).numpy()[0]
prediction_value

5

In [None]:
# Edge case: run the reloaded model on an empty string.
test_text = ""

# Encode the (empty) text as a TensorFlow tensor of token ids.
encode_options = dict(truncation=True, padding=True, return_tensors='tf')
predict_input = tokenizer_fine_tuned.encode(test_text, **encode_options)

# Element 0 of the model output is used as the per-class scores.
output = model_fine_tuned(predict_input)[0]

# Class id with the highest score for the first row; displayed as the cell output.
prediction_value = tf.argmax(output, axis=1).numpy()[0]
prediction_value

5

In [None]:
# NOTE(review): `evaluate` is installed here, but no visible cell imports or uses it —
# confirm it is still needed; if so, move the install to the setup cell at the top.
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 KB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.0


# Push to the Hugging Face Hub

In [None]:
# Log in to the Hugging Face Hub interactively; the token is typed at the prompt
# rather than stored in the notebook.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid.
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Upload the reloaded fine-tuned model to the project's Hub repository.
model_fine_tuned.push_to_hub("aegrif/CIS6930_DAAGR_Classification")

Configuration saved in /tmp/tmpnanhtdev/config.json
Model weights saved in /tmp/tmpnanhtdev/tf_model.h5
Uploading the following files to aegrif/CIS6930_DAAGR_Classification: README.md,tf_model.h5,config.json


tf_model.h5:   0%|          | 0.00/268M [00:00<?, ?B/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Upload the tokenizer to the same repository as the model so both load from one id.
tokenizer_fine_tuned.push_to_hub("aegrif/CIS6930_DAAGR_Classification")

tokenizer config file saved in /tmp/tmp43qnzria/tokenizer_config.json
Special tokens file saved in /tmp/tmp43qnzria/special_tokens_map.json
Uploading the following files to aegrif/CIS6930_DAAGR_Classification: special_tokens_map.json,tokenizer.json,vocab.txt,tokenizer_config.json


CommitInfo(commit_url='https://huggingface.co/aegrif/CIS6930_DAAGR_Classification/commit/5ba261416c8fd26052750084bd1141018b357a2b', commit_message='Upload tokenizer', commit_description='', oid='5ba261416c8fd26052750084bd1141018b357a2b', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Round-trip check: load the model and tokenizer back from the Hub repository
# they were just pushed to.
model_test = TFDistilBertForSequenceClassification.from_pretrained('aegrif/CIS6930_DAAGR_Classification')
tokenizer_test = DistilBertTokenizerFast.from_pretrained('aegrif/CIS6930_DAAGR_Classification')

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--aegrif--CIS6930_DAAGR_Classification/snapshots/5ba261416c8fd26052750084bd1141018b357a2b/config.json
Model config DistilBertConfig {
  "_name_or_path": "/content/drive/MyDrive/CIS6930_NLP_Group_Project/Final Code Files",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "

Downloading (…)okenizer_config.json:   0%|          | 0.00/315 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--aegrif--CIS6930_DAAGR_Classification/snapshots/5ba261416c8fd26052750084bd1141018b357a2b/vocab.txt
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--aegrif--CIS6930_DAAGR_Classification/snapshots/5ba261416c8fd26052750084bd1141018b357a2b/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--aegrif--CIS6930_DAAGR_Classification/snapshots/5ba261416c8fd26052750084bd1141018b357a2b/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--aegrif--CIS6930_DAAGR_Classification/snapshots/5ba261416c8fd26052750084bd1141018b357a2b/tokenizer_config.json


In [None]:
# Smoke test: classify the same sentence with the model loaded from the Hub.
test_text = "i have always been loyal to my wife"

# Encode the sentence as a TensorFlow tensor of token ids.
encode_options = dict(truncation=True, padding=True, return_tensors='tf')
predict_input = tokenizer_test.encode(test_text, **encode_options)

# Element 0 of the model output is used as the per-class scores.
output = model_test(predict_input)[0]

# Class id with the highest score for the first row; displayed as the cell output.
prediction_value = tf.argmax(output, axis=1).numpy()[0]
prediction_value

5