# Approach 1: Transformer with OHE (Used in the final model)
Make sure you set the runtime type to GPU.

## Importing the Libraries and data

In [None]:
#import libraries
!pip install transformers
!pip install tensorflow-addons
import pandas as pd
import numpy as np
import torch
import tensorflow as tf
import tensorflow_addons as tfa
import matplotlib.pyplot as plt
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, BartForConditionalGeneration, BartTokenizer, pipeline
from huggingface_hub import notebook_login
from tqdm.notebook import tqdm
tqdm.pandas()

In [None]:
#Parameters for easy toggling

#data preparation switches
spell_check = False
use_paraphrasing = False
paraphrase_selective = False
test_mode = True

#the same checkpoint name is used for the classifier and its tokenizer
model_name = tokenizer_name = 'roberta-large'

#training switches
epochs = 40
use_class_weights = True
schedule_lr = False
use_swa = True

#early stopping: on/off, patience in epochs, restore the best weights at the end
early_stopping, patience, best_weights = True, 15, True

In [None]:
# Load data

#training data, reduced to the label and text columns
data = pd.read_csv("WASSA23_essay_level_with_labels_train.tsv", sep="\t")[['emotion', 'essay']]

if not test_mode:
  #development data
  dev_data = pd.read_csv("WASSA23_essay_level_dev.tsv", sep="\t")

  #gold standard dev labels; column names are supplied here and only 'emotion' is used
  gold_columns = ['1', '2', 'emotion',1,2,3,4,5,6,7,8,9]
  df = pd.read_csv('/content/drive/MyDrive/MSML641 project/goldstandard_dev.tsv', sep="\t", names=gold_columns)
  dev_data['emotion'] = df['emotion']
else:
  #test mode: the test split is loaded into dev_data instead of the dev split
  dev_data = pd.read_csv("WASSA23_essay_level_test.tsv", sep="\t")

In [None]:
#create a list of unique emotions
#unique_emotions = list(set(data["emotion"].str.split("/").explode().unique())) #uncomment to create a new sequence

#fixed sequence to account for saved model: the position in this list is the
#column of the one-hot label matrix
unique_emotions = [
    'Disgust',
    'Hope',
    'Fear',
    'Anger',
    'Sadness',
    'Joy',
    'Surprise',
    'Neutral',
]

unique_emotions

In [None]:
#split multi-label strings on "/" so every single emotion gets its own row, then count them
single_emotions = data["emotion"].str.split("/").explode()
occurences = single_emotions.value_counts()
occurences

## Spell Check

In [None]:
#fix spelling using huggingface spell correction model
def fix_spelling_iterate(df):
  '''
    Correct the spelling of the 'essay' field of a single row.

    Intended for DataFrame.apply(..., axis=1), so `df` is one row here.
    Relies on the module-level `fix_spelling` pipeline that is created below
    when the `spell_check` flag is set.

    Returns the same row with 'essay' replaced by the corrected text.
  '''
  #take in essay
  essay = df['essay']

  #cap the generated length at the character count of the input
  #NOTE(review): the generator presumably counts max_length in tokens, so this is a loose upper bound - confirm
  corrected = fix_spelling(essay, max_length=len(essay))

  #a text2text-generation pipeline returns [{'generated_text': ...}];
  #unwrap it so the column keeps holding plain strings
  if isinstance(corrected, list) and corrected:
    corrected = corrected[0]
  if isinstance(corrected, dict):
    corrected = corrected.get('generated_text', essay)

  df['essay'] = corrected

  #return the row
  return df

#if spell check flag is set
if spell_check:

  #instantiate huggingface pipeline
  fix_spelling = pipeline("text2text-generation",model="oliverguhr/spelling-correction-english-base")

  #apply the function on training and development data
  data = data.progress_apply(fix_spelling_iterate, axis=1)
  dev_data = dev_data.progress_apply(fix_spelling_iterate, axis=1)

## Paraphrasing

In [None]:
#check count of each label combination (the raw 'emotion' string, e.g. "A/B", is the key)
rows_per_combination = data.groupby(['emotion']).size()
frequency_list = rows_per_combination.to_dict()
frequency_list

In [None]:
#if paraphrasing flag is set
if use_paraphrasing:
  #selective mode keeps every original row and uses a cut-off of 40;
  #otherwise the record list starts empty and the cut-off is 400000
  paraphrase_threshold = 40 if paraphrase_selective else 400000
  data_dict = data.to_dict(orient="records") if paraphrase_selective else []

  #instantiate paraphrasing model and shift it to gpu (cpu when cuda is unavailable)
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model = BartForConditionalGeneration.from_pretrained('eugenesiow/bart-paraphrase')
  model = model.to(device)

  #instantiate the tokenizer
  tokenizer = BartTokenizer.from_pretrained('eugenesiow/bart-paraphrase')

  #define the paraphrasing function
  def paraphrase(df):
    '''
      Paraphrase the essay of one row when its label combination occurs fewer
      than `paraphrase_threshold` times, and append the result to `data_dict`.
    '''
    input_sentence = df['essay']
    emotion = df['emotion']

    #label combinations at or above the cut-off are skipped
    if frequency_list[emotion] >= paraphrase_threshold:
      return

    #tokenize essay and paraphrase
    encoded = tokenizer(input_sentence, return_tensors='pt').to(device)
    output_ids = model.generate(encoded['input_ids'])
    paraphrased = tokenizer.batch_decode(output_ids, skip_special_tokens=True)

    #append to training data records
    data_dict.append({'essay': paraphrased[0], 'emotion':emotion})

  #apply above function to training data (used for its side effect on data_dict)
  data.progress_apply(paraphrase, axis=1)

  #get new training data by converting the records to a dataframe
  data = pd.DataFrame(data_dict)




## Training

In [None]:
#One Hot Encode the labels: one 0/1 column per emotion, set when the
#emotion name occurs in the row's 'emotion' string
num_classes = len(unique_emotions)
for emotion in unique_emotions:
    train_flag = data["emotion"].str.contains(emotion)
    dev_flag = dev_data["emotion"].str.contains(emotion)
    data[emotion] = train_flag.astype(int)
    dev_data[emotion] = dev_flag.astype(int)

#drop string emotion column (only from the training frame; dev_data keeps it)
data = data.drop(columns=["emotion"])


In [None]:
#get training and testing texts and labels
train_texts, train_labels = data['essay'], data[unique_emotions]
test_texts, test_labels = dev_data['essay'], dev_data[unique_emotions]

# Load tokenizer and pre-trained model with one output per emotion
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
model = TFAutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_classes,
)

# Compute balanced weights for the training data
# (computed with compute_sample_weight, despite the variable name)
class_weights = compute_sample_weight(class_weight='balanced', y=train_labels)

In [None]:
# Tokenize train and test texts
train_encodings = tokenizer(train_texts.to_list(), truncation=True, padding=True)
test_encodings = tokenizer(test_texts.to_list(), truncation=True, padding=True)

# Create TensorFlow datasets, add class weights to training dataset if flag is set
train_components = (dict(train_encodings), train_labels.astype(np.float32))
if use_class_weights:
  #a third tuple element is added carrying the weights
  train_components = train_components + (class_weights,)
train_dataset = tf.data.Dataset.from_tensor_slices(train_components)

test_components = (dict(test_encodings), test_labels.astype(np.float32))
test_dataset = tf.data.Dataset.from_tensor_slices(test_components)

In [None]:
#lr scheduler: decays from 3e-05 to 0 over num_steps steps
num_steps = len(train_dataset) * epochs // 16
lr_scheduler = PolynomialDecay(initial_learning_rate=3e-05, end_learning_rate=0.0, decay_steps=num_steps)

# Define training parameters: the schedule is only used when the flag is set
learning_rate = lr_scheduler if schedule_lr else 3e-05
optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=learning_rate, decay=0.008) #, weight_decay=0.008

if use_swa:
  #wrap the base optimizer in the tensorflow-addons SWA optimizer
  #NOTE(review): the averaged weights are never assigned back to the model in this notebook - confirm this is intended
  optimizer = tfa.optimizers.SWA(optimizer)

loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
metric = tf.keras.metrics.BinaryAccuracy('accuracy')

# Compile model
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

#early stopping, monitoring validation loss, if flag is set
callbacks = None
if early_stopping:
  early_stopping_callback = EarlyStopping(monitor="val_loss", patience=patience, restore_best_weights=best_weights)
  callbacks = [early_stopping_callback]

#fit the model (callbacks stays None when early stopping is off)
history = model.fit(
    train_dataset.shuffle(len(train_labels)).batch(16),
    epochs=epochs,
    batch_size=16,
    validation_data=test_dataset.shuffle(len(test_labels)).batch(16),
    callbacks=callbacks,
)


In [None]:
#plot loss
curves = history.history
for key, label in (('loss', 'train loss'), ('val_loss', 'val loss')):
  plt.plot(curves[key], label=label)

#plot a line at the best (lowest) validation loss
best_val_loss = np.min(curves['val_loss'])
plt.axhline(best_val_loss, linestyle='--', color='r', label='best val loss')
plt.legend()


In [None]:
#plot accuracy for the training and validation runs
for key, label in (('accuracy', 'train accuracy'), ('val_accuracy', 'val accuracy')):
  plt.plot(history.history[key], label=label)
plt.legend()

In [None]:
#plot learning rate: evaluate the schedule at every step from 0 to num_steps - 1
schedule_steps = tf.range(0, num_steps)
plt.plot(lr_scheduler(schedule_steps))

## Evaluation

In [None]:
#finding the best threshold
def predict_on_test_dataset(test_dataset, thresholds):
  '''
    Print macro F1 and element-wise accuracy on the test dataset (here, dev data)
    for every logit threshold.

    test_dataset: unbatched tf.data dataset; it is batched in 16s here
    thresholds: iterable of logit cut-offs; a label is predicted when its logit
                is greater than the cut-off

    Uses the module-level `model` for inference and `test_labels` as ground truth.
    Returns nothing; one line "threshold f1 accuracy" is printed per threshold.
  '''

  #the logits do not depend on the threshold, so run inference only once
  logits = np.asarray(model.predict(test_dataset.batch(16)).logits)
  y_true = test_labels.to_numpy()

  #iterate over thresholds
  for threshold in thresholds:

    #based on threshold, convert logits to 0/1
    y_pred = (logits > threshold).astype(int)

    #accuracy over every (essay, emotion) cell of the label matrix
    accuracy = np.sum(y_pred == y_true) / y_true.size

    #print accuracy and F1 macro
    print(threshold, f1_score(y_true, y_pred, average="macro",zero_division=1), accuracy)

#instantiate a list of thresholds
#Try different values
thresholds = [-0.3,-0.25,-0.2,-0.15, -0.1, -0.075]

#call the function
predict_on_test_dataset(test_dataset, thresholds)

In [None]:
#Get the string version of predictions
def get_predicted_emotions(data, threshold=-0.075):
    '''
      Predict the emotion string of every essay in `data` with the trained model.

      data: dataframe with an 'essay' column
      threshold: logit cut-off; an emotion is assigned when its logit is greater

      Uses the module-level `tokenizer`, `model` and `unique_emotions`.
      Returns a dataframe with the columns 'essay' and 'emotion', where 'emotion'
      holds the assigned emotion names sorted alphabetically and joined with "/"
      (an empty string when no logit passes the threshold).
    '''

    #get the essay column
    essays = data['essay']

    #get the encodings
    encodings = tokenizer(essays.tolist(), truncation=True, padding=True)

    #create tensorflow dataset
    dataset = tf.data.Dataset.from_tensor_slices(dict(encodings))

    #get predictions as a plain numpy matrix so the rows can be scanned cheaply
    logits = np.asarray(model.predict(dataset.batch(16)).logits)

    #boolean matrix: True where the logit passes the threshold
    selected = logits > threshold

    #per row, collect the names of the selected columns and sort them once
    y_pred = [
        "/".join(sorted(name for name, hit in zip(unique_emotions, row) if hit))
        for row in selected
    ]

    #create a dataframe with the essays and the predicted emotions
    df = pd.DataFrame({'essay': essays, 'emotion': y_pred})

    #return the dataframe
    return df

In [None]:
#if paraphrasing flag is set
if use_paraphrasing:
  #same values as in the training-data paraphrasing cell; not read in this cell
  paraphrase_threshold = 40 if paraphrase_selective else 400000

  #always start from an empty list: the rebuilt dev_data must hold exactly one
  #row per dev essay, so training rows are never mixed in
  data_dict = []

  #instantiate paraphrasing model and shift it to gpu
  model_para = BartForConditionalGeneration.from_pretrained('eugenesiow/bart-paraphrase')
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model_para = model_para.to(device)

  #instantiate the tokenizer
  tokenizer_para = BartTokenizer.from_pretrained('eugenesiow/bart-paraphrase')

  #define the paraphrasing function
  def paraphrase(df):
    '''
      Paraphrase the essay of one row and append the result to `data_dict`.
      Every row is paraphrased; no frequency check is applied here.
      The 'emotion' value is carried over when the row has one.
    '''

    #tokenize essay and paraphrase
    batch = tokenizer_para(df['essay'], return_tensors='pt').to(device)
    generated_ids = model_para.generate(batch['input_ids'])
    generated_sentence = tokenizer_para.batch_decode(generated_ids, skip_special_tokens=True)

    #rows without an 'emotion' field are still paraphrased
    record = {'essay': generated_sentence[0]}
    if 'emotion' in df:
      record['emotion'] = df['emotion']

    #append to the paraphrased dev records
    data_dict.append(record)

  #apply above function to the dev data (used for its side effect on data_dict)
  dev_data.progress_apply(paraphrase, axis=1)

  #get new dev data by converting the records to a dataframe
  dev_data = pd.DataFrame(data_dict)




In [None]:
#get predictions on the dev data
#no threshold is passed, so the function's default logit threshold is used
dev_predictions = get_predicted_emotions(dev_data)
#display the resulting dataframe
dev_predictions

In [None]:
#if no emotion is assigned, assign neutral to that row
#(an empty string is what get_predicted_emotions produces when no logit passed the threshold)
dev_predictions['emotion'] = dev_predictions['emotion'].replace('', 'Neutral')

In [None]:
#save to csv
#only the 'emotion' column is written, tab-separated, without header or index
dev_predictions['emotion'].to_csv('predictions_EMO.tsv', sep="\t", header=False, index=False)

## Saving the model

In [None]:
#log in to huggingface (You'll need a token)
#needed before the push_to_hub calls in the following cells
notebook_login()

In [None]:
#save the model
#push_to_hub=True is passed, so the saved model is also pushed to the Hub
model.save_pretrained("adityapatkar/WASSA_EMO", push_to_hub=True)

In [None]:
#save the tokenizer
#pushed to the same repo name as the model so both can be loaded from one name
tokenizer.push_to_hub("adityapatkar/WASSA_EMO")

## Prediction using huggingface hub model

In [None]:
#fixed label order; get_predicted_emotions maps logit columns to names through this list
unique_emotions = ['Disgust', 'Hope', 'Fear', 'Anger', 'Sadness', 'Joy', 'Surprise', 'Neutral']

#get the model and tokenizer from huggingface hub
model_name = 'adityapatkar/WASSA_EMO'
tokenizer = AutoTokenizer.from_pretrained(model_name) #add arg: use_auth_token="Add_Your_Auth_Token"
model = TFAutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=8,
) #add arg: use_auth_token="Add_Your_Auth_Token"

#get the test set
test_set = pd.read_csv("WASSA23_essay_level_test.tsv", sep="\t")

#find predictions on the test set
test_predictions = get_predicted_emotions(test_set)

#see the predictions
test_predictions

In [None]:
#if no emotion is assigned, assign neutral to that row
predicted_labels = test_predictions['emotion'].replace('', 'Neutral')
test_predictions['emotion'] = predicted_labels

#save to csv: first the labels alone, then essays together with labels
tsv_options = dict(sep="\t", header=False, index=False)
test_predictions['emotion'].to_csv('predictions_EMO.tsv', **tsv_options)
test_predictions.to_csv('predictions_EMO_Essay.tsv', **tsv_options)

## Predicting if multiple emotions exist

In [None]:
#classifier for predicting if multiple emotions are present in an essay

data = pd.read_csv("/content/WASSA23_essay_level_with_labels_train.tsv", sep="\t")

#label is 1 when the emotion string holds more than one "/"-separated emotion, else 0
data['multiple_emotions'] = data['emotion'].apply(lambda x: 1 if len(x.split("/")) > 1 else 0)
dev_data['multiple_emotions'] = dev_data['emotion'].apply(lambda x: 1 if len(x.split("/")) > 1 else 0)

train_texts = data['essay']
train_labels = data['multiple_emotions']
test_texts = dev_data['essay']
test_labels = dev_data['multiple_emotions']

# Load tokenizer and pre-trained model (two output logits: class 0 and class 1)
tokenizer = AutoTokenizer.from_pretrained('roberta-base')
model = TFAutoModelForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

# Tokenize train and test texts
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True)
test_encodings = tokenizer(test_texts.tolist(), truncation=True, padding=True)

# Create TensorFlow datasets; labels are integer class ids for the sparse loss below
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels.astype(np.int32)
))
test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    test_labels.astype(np.int32)
))

optimizer = tf.keras.optimizers.AdamW(learning_rate=2e-5)

#the model emits two raw logits per essay and inference takes softmax + argmax,
#so train with a sparse categorical loss on logits instead of binary cross-entropy
#on values that are not probabilities
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

# Compile model
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

# Train model
model.fit(train_dataset.shuffle(len(train_labels)).batch(16), epochs=5, batch_size=16, validation_data=test_dataset.shuffle(len(test_labels)).batch(16))




In [None]:
def multiple_emotions_present(essay):
    '''
    Takes in an essay and returns 0 if no multiple emotions are present and 1 if multiple emotions are present

    essay: a single essay as a string
    Uses the module-level `tokenizer` and `model` of the multiple-emotions classifier.
    '''
    #wrap the essay in a list so the tokenizer returns one row of token ids;
    #a bare string gives a flat id list that from_tensor_slices would split token by token
    encodings = tokenizer([essay], truncation=True, padding=True)

    #create 1 row dataset
    dataset = tf.data.Dataset.from_tensor_slices(dict(encodings))

    #predict: send as batch of 1
    y_pred = model.predict(dataset.batch(1))

    #softmax does not change the ordering, so the argmax of the logits is the predicted class
    predicted_class = np.argmax(y_pred.logits, axis=1)
    return predicted_class[0]

In [None]:
#display one dev essay (entry 203 of the 'essay' column) for manual inspection
dev_data['essay'][203]

In [None]:
#sample essay passed to multiple_emotions_present in the next cell
essay = "After reading the article, you can't help but feel bad for the people that were involved in the train crash. IT was a freak accident and something you can't control. It 's just sad because they had to leave all of their family behind and their lives will most likely never be the same. While reading it, i felt really worried because it could happen to anyone."

In [None]:
#run the multiple-emotions classifier on the sample essay
multiple_emotions_present(essay)

# Training on GoEmotions

In [None]:
#import libraries
!pip install transformers
import pandas as pd
import numpy as np
import torch
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, BartForConditionalGeneration, BartTokenizer, pipeline
from huggingface_hub import notebook_login
from tqdm.notebook import tqdm
tqdm.pandas()

In [None]:
#Parameters for easy toggling

#data preparation switches
spell_check = False
use_paraphrasing = False
paraphrase_selective = False

#the same checkpoint name is used for the classifier and its tokenizer
model_name = tokenizer_name = 'roberta-large'

#training switches
epochs = 15
use_class_weights = True
schedule_lr = False

#early stopping: on/off, patience in epochs, restore the best weights at the end
early_stopping, patience, best_weights = True, 10, True

In [None]:
# Load data

#training data: only the first 5000 rows of the csv are kept
data = pd.read_csv("goemotions.csv").iloc[:5000]

#development data
dev_data = pd.read_csv("WASSA23_essay_level_dev.tsv", sep="\t")

#gold standard dev labels; column names are supplied here and only 'emotion' is used
gold_columns = ['1', '2', 'emotion',1,2,3,4,5,6,7,8,9]
df = pd.read_csv('/content/goldstandard_dev.tsv', sep="\t", names=gold_columns)
dev_data['emotion'] = df['emotion']

In [None]:
#create a list of unique emotions
#unique_emotions = list(set(data["emotion"].str.split("/").explode().unique())) #uncomment to create a new sequence

#fixed sequence to account for saved model: the position in this list is the
#column of the one-hot label matrix
unique_emotions = [
    'Disgust',
    'Hope',
    'Fear',
    'Anger',
    'Sadness',
    'Joy',
    'Surprise',
    'Neutral',
]

unique_emotions

In [None]:
#One Hot Encode the labels of the dev data only; the training frame is
#indexed by these column names directly in the next cell
num_classes = len(unique_emotions)
for emotion in unique_emotions:
    dev_flag = dev_data["emotion"].str.contains(emotion)
    dev_data[emotion] = dev_flag.astype(int)


In [None]:
#get training and testing texts and labels
train_texts, train_labels = data['essay'], data[unique_emotions]
test_texts, test_labels = dev_data['essay'], dev_data[unique_emotions]

# Load tokenizer and pre-trained model with one output per emotion
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
model = TFAutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_classes,
)

# Compute balanced weights for the training data
# (computed with compute_sample_weight, despite the variable name)
class_weights = compute_sample_weight(class_weight='balanced', y=train_labels)

In [None]:
# Tokenize train and test texts
train_encodings = tokenizer(train_texts.to_list(), truncation=True, padding=True)
test_encodings = tokenizer(test_texts.to_list(), truncation=True, padding=True)

# Create TensorFlow datasets, add class weights to training dataset if flag is set
train_components = (dict(train_encodings), train_labels.astype(np.float32))
if use_class_weights:
  #a third tuple element is added carrying the weights
  train_components = train_components + (class_weights,)
train_dataset = tf.data.Dataset.from_tensor_slices(train_components)

test_components = (dict(test_encodings), test_labels.astype(np.float32))
test_dataset = tf.data.Dataset.from_tensor_slices(test_components)

In [None]:
#lr scheduler: decays from 3e-05 to 0 over num_steps steps
num_steps = len(train_dataset) * epochs // 16
lr_scheduler = PolynomialDecay(initial_learning_rate=3e-05, end_learning_rate=0.0, decay_steps=num_steps)

# Define training parameters: schedule when the flag is set, constant 2e-05 otherwise
learning_rate = lr_scheduler if schedule_lr else 2e-05
optimizer = tf.keras.optimizers.AdamW(learning_rate=learning_rate) #, weight_decay=0.008
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
metric = tf.keras.metrics.BinaryAccuracy('accuracy')

# Compile model
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

#early stopping, monitoring validation loss, if flag is set
callbacks = None
if early_stopping:
  early_stopping_callback = EarlyStopping(monitor="val_loss", patience=patience, restore_best_weights=best_weights)
  callbacks = [early_stopping_callback]

#fit the model (callbacks stays None when early stopping is off)
history = model.fit(
    train_dataset.shuffle(len(train_labels)).batch(16),
    epochs=epochs,
    batch_size=16,
    validation_data=test_dataset.shuffle(len(test_labels)).batch(16),
    callbacks=callbacks,
)


In [None]:
#Parameters for easy toggling

#data preparation switches
spell_check = False
use_paraphrasing = False
paraphrase_selective = False

#the same checkpoint name is used for the classifier and its tokenizer
model_name = tokenizer_name = 'roberta-large'

#training switches
epochs = 10
use_class_weights = True
schedule_lr = True

#early stopping: on/off, patience in epochs, restore the best weights at the end
early_stopping, patience, best_weights = True, 7, True

In [None]:
# Load data

#training data, reduced to the label and text columns
data = pd.read_csv("/content/WASSA23_essay_level_with_labels_train.tsv", sep="\t")[['emotion', 'essay']]

#One Hot Encode the labels: one 0/1 column per emotion
num_classes = len(unique_emotions)
for emotion in unique_emotions:
    train_flag = data["emotion"].str.contains(emotion)
    data[emotion] = train_flag.astype(int)

#drop string emotion column
data = data.drop(columns=["emotion"])

#get training texts and labels (the dev texts and labels from the earlier cells are reused)
train_texts, train_labels = data['essay'], data[unique_emotions]

# Compute balanced weights for the training data
class_weights = compute_sample_weight(class_weight='balanced', y=train_labels)

In [None]:
# Tokenize train texts
train_encodings = tokenizer(train_texts.to_list(), truncation=True, padding=True)

# Create TensorFlow dataset, add class weights to training dataset if flag is set
train_components = (dict(train_encodings), train_labels.astype(np.float32))
if use_class_weights:
  #a third tuple element is added carrying the weights
  train_components = train_components + (class_weights,)
train_dataset = tf.data.Dataset.from_tensor_slices(train_components)

In [None]:
#lr scheduler: decays from 3e-05 to 0 over num_steps steps
num_steps = len(train_dataset) * epochs // 16
lr_scheduler = PolynomialDecay(
    initial_learning_rate = 3e-05,
    end_learning_rate = 0.0,
    decay_steps = num_steps
)


# Define training parameters
if schedule_lr:
  optimizer = tf.keras.optimizers.AdamW(learning_rate=lr_scheduler) #, weight_decay=0.008
else:
  optimizer = tf.keras.optimizers.AdamW(learning_rate=2e-05)
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
metric = tf.keras.metrics.BinaryAccuracy('accuracy')

# Compile model: without this call the optimizer, loss and metric created above
# are never attached to the model and fit keeps using the previous compile settings
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

#early stopping, monitoring validation loss, if flag is set
callbacks = None
if early_stopping:
  early_stopping_callback = EarlyStopping(
      monitor="val_loss",
      patience=patience,
      restore_best_weights=best_weights
  )
  callbacks = [early_stopping_callback]

#fit the model, continuing from the weights of the previous training run
#(callbacks stays None when early stopping is off)
history = model.fit(train_dataset.shuffle(len(train_labels)).batch(16), epochs=epochs, batch_size=16, validation_data=test_dataset.shuffle(len(test_labels)).batch(16), callbacks=callbacks)


In [None]:
#run another round of training with the same datasets and settings;
#early_stopping_callback must exist from the previous cell, and history is overwritten
history = model.fit(train_dataset.shuffle(len(train_labels)).batch(16), epochs=epochs, batch_size=16, validation_data=test_dataset.shuffle(len(test_labels)).batch(16), callbacks=[early_stopping_callback])