In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from transformers import AutoTokenizer

In [None]:
# Attach Google Drive to the Colab runtime so the project folder is reachable

from google.colab import drive
drive.mount('/content/drive')

# All logs and model checkpoints are saved under this directory, so make it
# the working directory and treat it as the project root from here on
project_root = '/content/drive/My Drive/emotion_detection'
os.chdir(project_root)
os.getcwd()

In [None]:
# Read the combined, tab-separated corpus from datasets/ under the current working directory
combined_csv_path = f"{os.getcwd()}/datasets/combined_data.csv"
data_df = pd.read_csv(combined_csv_path, sep='\t')
data_df

In [None]:
# Sorted, de-duplicated names of the source datasets present in the table

available_datasets = np.unique(data_df["dataset"])
print(available_datasets)

# Data Description


This data table contains texts from several datasets combined together. It contains  
a versatile range of text styles. The sources for the texts include Reddit posts,  
tweets from different users, dialogues taken from English textbooks, and manually collected sentences which were used for chatbot building. The merged datasets are:  


1.   [Empathetic Dialogues](https://github.com/facebookresearch/EmpatheticDialogues), [Paper](https://arxiv.org/pdf/1811.00207v5.pdf)
2.   [CrowdFlower Tweet Data](https://www.kaggle.com/pashupatigupta/emotion-detection-from-text)
3.   [GoEmotions Reddit Posts](https://github.com/google-research/google-research/tree/master/goemotions), [Paper](https://arxiv.org/pdf/2005.00547v2.pdf)
4.   [DailyDialogue dialogue text](http://yanran.li/dailydialog), [Paper](https://arxiv.org/pdf/1710.03957.pdf)


The versatility of the texts may help in building a more robust emotion classifier. That is to say, a model can learn to discern emotional patterns both from the contemporary style of social media texts and from something more formal, such as the texts found in the *EmpatheticDialogues* and *DailyDialogue* datasets.

Texts coming from different datasets have several kinds of emotion labels. Some use very fine-grained labels, while others have labels for only a small number of emotions. For instance, the *EmpatheticDialogues* dataset uses 32 labels. To reconcile them, we coalesce the fine-grained labels from the data into 6 basic emotions according to Parrott's emotion hierarchy ([Parrott's Emotion Grouping](https://en.wikipedia.org/wiki/Emotion_classification)). A more visual explanation is given in the following wheel: [Wheel of Emotions](https://www.becalmwithtati.com/wp-content/uploads/2019/02/Mindfulness_for_Emotions.jpeg).  

This way, we obtain the *broadEmo* label in the dataset. The fine-level emotion label is also present in the data, so that both broad and fine emotions can be predicted with a model.  

Moreover, the dataset name is also stored with each row so that dataset subsetting can be performed. This may be useful for training models only on specific kinds of data.

In [None]:
# Number of missing values in each column
null_counts = data_df.isna().sum()
print(null_counts)

In [None]:
# Row count per broad emotion label
broad_label_counts = data_df["broadEmo"].value_counts()
print("\nSentiment Counts:")
print(broad_label_counts)
print()

# Distinct broad labels, in order of first appearance in the table
broad_labels = data_df["broadEmo"]
classesBroad = broad_labels.unique().tolist()
print(classesBroad)

In [None]:
# Class balance of the broad emotion labels.
# Huge imbalance towards neutral label
#
# One bar is drawn per label from explicit counts. ax.hist() on a string
# column bins the category positions into 10 equal-width bins, and calling
# set_xticklabels() without fixing the tick positions does not guarantee
# that a label ends up under the bar it names.

broad_counts = data_df['broadEmo'].value_counts()  # missing labels are excluded

fig, ax = plt.subplots(figsize=(16, 8))

ax.bar(broad_counts.index.astype(str), broad_counts.to_numpy())

ax.set_title("Broad emotion label distribution")
ax.set_xlabel("broadEmo")
ax.set_ylabel("Number of rows")
ax.tick_params(axis='x', labelrotation=45)
plt.show()

In [None]:
# Row count per fine-grained emotion label
fine_label_counts = data_df["fineEmo"].value_counts()
print("\nSentiment Counts:")
print(fine_label_counts)
print()

# Distinct fine-grained labels, in order of first appearance in the table
fine_labels = data_df["fineEmo"]
classesFine = fine_labels.unique().tolist()
print(classesFine)

In [None]:
# Class balance of the fine-grained emotion labels.
# Huge imbalance towards neutral class. Consider using separate datasets from combination
#
# One bar is drawn per label from explicit counts. ax.hist() on a string
# column bins the category positions into 10 equal-width bins, so with more
# than ten labels several classes would be merged into a single bar, and the
# tick labels set via set_xticklabels() would not line up with the bars.

fine_counts = data_df['fineEmo'].value_counts()  # missing labels are excluded

fig, ax = plt.subplots(figsize=(16, 8))

ax.bar(fine_counts.index.astype(str), fine_counts.to_numpy())

ax.set_title("Fine-grained emotion label distribution")
ax.set_xlabel("fineEmo")
ax.set_ylabel("Number of rows")
ax.tick_params(axis='x', labelrotation=45)
plt.show()

In [None]:
# Measure how long the Empathetic Dialogues sentences are once tokenized
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", normalization=True)

# One entry per sentence: the number of token ids returned by encode()
is_empathetic = data_df["dataset"] == "Empathetic Dialogues"
token_lens = [
  len(tokenizer.encode(sentence))
  for sentence in data_df.loc[is_empathetic, "sentence"]
]

# Histogram of the token counts
fig, ax = plt.subplots()
ax.hist(token_lens)
plt.show()

In [None]:
# This is too long, we should probably truncate our context + utterance to a certain threshold

# Some of the sentences are too long. Luckily, not many of them are, so we are
# safe to disregard those.
threshold_len = 128

arr = np.array(token_lens)

# Count the sentences above the threshold: summing the boolean mask gives the
# number of True entries (summing arr[mask] would add up the lengths instead).
num_too_long = int((arr > threshold_len).sum())
print("Sentences longer than {} tokens: {} of {}".format(threshold_len, num_too_long, len(arr)))

# Longest tokenized sentence; kept as the last expression so the cell displays it
max(token_lens)

In [None]:
# Keep only the rows that come from the Empathetic Dialogues dataset
from_emp_dialogues = data_df["dataset"] == "Empathetic Dialogues"
empDialogueData = data_df[from_emp_dialogues]
empDialogueData