<a href="https://colab.research.google.com/github/CIS6930-NLP/final_project/blob/main/DistBERT_EmotionClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import re
import nltk
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding
from tensorflow.keras.models import Model

print(tf.__version__)

2.11.0


In [2]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
#check GPU availability
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [4]:
#load data
train_file = "/content/drive/MyDrive/code/Updated Data/train.csv"
val_file = "/content/drive/MyDrive/code/Updated Data/valid.csv"
test_file = "/content/drive/MyDrive/code/Updated Data/test.csv"
def data_loading(filepath): 
  data = pd.read_csv(filepath, encoding = 'utf-8')
  data = data.drop('conv_id', axis = 1)
  data = data.drop('utterance_idx', axis = 1)
  data = data.drop('speaker_idx', axis = 1)
  data = data.drop('selfeval', axis = 1)
  data = data.drop('tags', axis = 1)
  data = data.dropna()
  return data

train_data = data_loading(train_file)
val_data = data_loading(val_file)
test_data = data_loading(test_file)
train_data.head()

Unnamed: 0.1,Unnamed: 0,context,prompt,utterance
0,0,sentimental,I remember going to the fireworks with my best...,I remember going to see the fireworks with my ...
1,1,sentimental,I remember going to the fireworks with my best...,Was this a friend you were in love with_comma_...
2,2,sentimental,I remember going to the fireworks with my best...,This was a best friend. I miss her.
3,3,sentimental,I remember going to the fireworks with my best...,Where has she gone?
4,4,sentimental,I remember going to the fireworks with my best...,We no longer talk.


In [5]:
#emotion preprocessing
#group emotions
emotions = {}
emotions['excited'] = emotions['surprised'] = emotions['joyful'] = "excited"
emotions['afraid'] = emotions['terrified'] = emotions['anxious']= emotions['apprehensive']='afraid'
emotions['disgusted'] = emotions['embarrassed']= emotions['guilty'] = emotions['ashamed'] ="disgusted"
emotions['angry'] = emotions ['annoyed'] = emotions['jealous'] =emotions[ 'furious' ] = "annoyed"
emotions['faithful'] = emotions ['trusting']=emotions ['grateful']= emotions['caring'] = emotions['hopeful'] = "grateful"
emotions['sad'] = emotions['disappointed'] = emotions['devastated']= emotions ['lonely']=emotions['nostalgic']=emotions['sentimental'] = "disappointed"
emotions['proud']= emotions['impressed']= emotions['content'] = "impressed"
emotions['anticipating']=emotions[ 'prepared']=emotions ['confident'] = "prepared"
dicttt=emotions

In [6]:
train_context = train_data['context']
train_question = train_data['prompt']
train_answer = train_data['utterance']


val_context = val_data['context']
val_question = val_data['prompt']
val_answer = val_data['utterance']

test_context = test_data['context']
test_question = test_data['prompt']
test_answer = test_data['utterance']
# print(len(train_context))
# print(len(train_question))
# print(len(train_answer))

  # Maximum number of samples to preprocess
MAX_SAMPLES = 50000

def preprocess_sentence(sentence):
    sentence = sentence.lower().strip()
    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    sentence = re.sub(r'[" "]+', " ", sentence)
    sentence = re.sub(r"[^a-zA-Z?.!,]+", " ", sentence)
    sentence = sentence.strip()

    return sentence

train_context = [preprocess_sentence(emotions[sentence]) for sentence in train_context]
train_questions = [preprocess_sentence(sentence) for sentence in train_question]
train_answers = [preprocess_sentence(sentence) for sentence in train_answer]

val_context = [preprocess_sentence(emotions[sentence]) for sentence in val_context]
val_questions = [preprocess_sentence(sentence) for sentence in val_question]
val_answers = [preprocess_sentence(sentence) for sentence in val_answer]

test_context = [preprocess_sentence(emotions[sentence]) for sentence in test_context]
test_questions = [preprocess_sentence(sentence) for sentence in test_question]
test_answers = [preprocess_sentence(sentence) for sentence in test_answer]

In [None]:
print('Sample context: {}'.format(train_context[20]))
print('Sample question: {}'.format(train_questions[20]))
print('Sample answer: {}'.format(train_answers[20]))

Sample context: grateful
Sample question: i have always been loyal to my wife .
Sample answer: what do you mean it hasn t been easy ? how close have you come to cheating ?


In [7]:
# create a dataset 
train_data_clf = pd.DataFrame(
    {'label': train_context,
     'questions': train_questions
    })

val_data_clf = pd.DataFrame(
    {'label': val_context,
     'questions': val_questions
    })

test_data_clf = pd.DataFrame(
    {'label': test_context,
     'questions': test_questions
    })

train_data_clf.head()

Unnamed: 0,label,questions
0,disappointed,i remember going to the fireworks with my best...
1,disappointed,i remember going to the fireworks with my best...
2,disappointed,i remember going to the fireworks with my best...
3,disappointed,i remember going to the fireworks with my best...
4,disappointed,i remember going to the fireworks with my best...


In [None]:
train_data_clf['b_labels'] = train_data_clf['label'].map({'disappointed': 0, 'annoyed': 1, 
                                                          'excited': 2, 'afraid': 3, 'disgusted': 4, 
                                                          'grateful': 5, 'impressed': 6, 'prepared': 7,}) # create new column as 'b_labels'
y_train = train_data_clf['b_labels'].values

val_data_clf['b_labels'] = val_data_clf['label'].map({'disappointed': 0, 'annoyed': 1, 
                                                          'excited': 2, 'afraid': 3, 'disgusted': 4, 
                                                          'grateful': 5, 'impressed': 6, 'prepared': 7,}) # create new column as 'b_labels'
y_val = val_data_clf['b_labels'].values

test_data_clf['b_labels'] = test_data_clf['label'].map({'disappointed': 0, 'annoyed': 1, 
                                                          'excited': 2, 'afraid': 3, 'disgusted': 4, 
                                                          'grateful': 5, 'impressed': 6, 'prepared': 7,}) # create new column as 'b_labels'
y_test = test_data_clf['b_labels'].values

In [12]:
train_data_clf['b_labels'] = train_data_clf['label'].astype('category').cat.codes

In [13]:
val_data_clf['b_labels'] = val_data_clf['label'].astype('category').cat.codes
test_data_clf['b_labels'] = test_data_clf['label'].astype('category').cat.codes


In [17]:
X_train = train_data_clf ['questions'].to_list()
X_val = val_data_clf ['questions'].to_list()
X_test = test_data_clf ['questions'].to_list()

y_train = train_data_clf['b_labels'].to_list()
y_val = val_data_clf['b_labels'].to_list()
y_test = test_data_clf['b_labels'].to_list()

In [18]:
train_data_clf['b_labels'].value_counts()

2    15519
5    11596
1    11074
0    10100
4    10083
3     9935
6     8042
7     7818
Name: b_labels, dtype: int64

### Model

In [20]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m68.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.1-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.2/199.2 KB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m73.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.1 tokenizers-0.13.2 transformers-4.26.1


In [21]:
from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification
from transformers import TextClassificationPipeline

In [24]:
db_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

train_encodings = db_tokenizer(X_train, truncation = True, padding = True  )

val_encodings = db_tokenizer(X_val, truncation = True, padding = True )

In [25]:
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    y_train
))


val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    y_val
))

In [22]:
db_model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=8)

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/363M [00:00<?, ?B/s]

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_layer_norm', 'activation_13', 'vocab_transform', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier', 'dropout_19', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use i

In [26]:
from transformers import TFDistilBertForSequenceClassification, TFTrainer, TFTrainingArguments


training_args = TFTrainingArguments(
    output_dir='./results',          
    num_train_epochs=5,              
    per_device_train_batch_size=16,  
    per_device_eval_batch_size=64,   
    warmup_steps=500,                
    weight_decay=1e-5,               
    logging_dir='./logs',            
    eval_steps=100                   
)

with training_args.strategy.scope():
    trainer_model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels = 8)


trainer = TFTrainer(
    model=trainer_model,                 
    args=training_args,                  
    train_dataset=train_dataset,         
    eval_dataset=val_dataset,            
)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_layer_norm', 'activation_13', 'vocab_transform', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier', 'classifier', 'dropout_39']
You should probably TRAIN this model on a down-stream task to be able to use i

In [27]:
trainer.train()

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


In [28]:
trainer.evaluate()

{'eval_loss': 1.8126793916894015}

In [32]:
save_directory = "/content/drive/MyDrive/code/saved_db_models" 

trainer_model.save_pretrained(save_directory)

db_tokenizer.save_pretrained(save_directory)

('/content/drive/MyDrive/code/saved_db_models/tokenizer_config.json',
 '/content/drive/MyDrive/code/saved_db_models/special_tokens_map.json',
 '/content/drive/MyDrive/code/saved_db_models/vocab.txt',
 '/content/drive/MyDrive/code/saved_db_models/added_tokens.json')

In [None]:
##load model and test 

In [33]:
tokenizer_fine_tuned = DistilBertTokenizer.from_pretrained(save_directory)

model_fine_tuned = TFDistilBertForSequenceClassification.from_pretrained(save_directory)


Some layers from the model checkpoint at /content/drive/MyDrive/code/saved_db_models were not used when initializing TFDistilBertForSequenceClassification: ['dropout_39']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/code/saved_db_models and are newly initialized: ['dropout_79']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
test_text = "i have always been loyal to my wife"
predict_input = tokenizer_fine_tuned.encode(
    test_text,
    truncation = True,
    padding = True,
    return_tensors = 'tf'    
)
output = model_fine_tuned(predict_input)[0]

prediction_value = tf.argmax(output, axis = 1).numpy()[0]

prediction_value

5