# Setup

In [1]:
dataset_name = 'go_emotions'    # Name of HF dataset to load
dataset_subset = 'raw'          # Name of HF subset

synthetic_dataset_filename = 'synthetic_datasets.parquet'
synthetic_dataset_dir = '../../synthetic_dataset/'

label_tools = '../../label_tools.py'

## Common Imports

In [1]:
import pandas as pd
import numpy as np
from datasets import load_dataset
from IPython.display import display
import os

import torch
from sklearn.model_selection import train_test_split
from simpletransformers.classification import MultiLabelClassificationModel, MultiLabelClassificationArgs

from label_tools import encode, inverse_encode

ImportError: attempted relative import with no known parent package

In [3]:
print("CUDA Enabled? " + str(torch.cuda.is_available())) 
if torch.cuda.is_available():
    print("Device: " + torch.cuda.get_device_name())

CUDA Enabled? True
Device: NVIDIA GeForce RTX 3080 Ti


# Building Datasets
## go_emotions

In [4]:
# go_emotions: https://huggingface.co/datasets/go_emotions
original_dataset = load_dataset(dataset_name, dataset_subset)
original_dataset = original_dataset['train'].to_pandas()

# Remove unnecessary columns.
# All records have example_very_unclear = False
original_dataset = original_dataset.drop(['id', 'author', 'subreddit', 'link_id', 'parent_id', 'created_utc', 'rater_id', 'example_very_unclear'], axis=1)
display(original_dataset)

# Remove unlabeled records
original_dataset = original_dataset.loc[(original_dataset.drop(labels='text', axis=1)!=0).any(axis=1)]

# Aggregate labels in a list column
emotion_labels = original_dataset.columns.tolist()[1:]

original_dataset['labels'] = list(zip(
    original_dataset.admiration.tolist(),
    original_dataset.amusement.tolist(),
    original_dataset.annoyance.tolist(),
    original_dataset.anger.tolist(),
    original_dataset.approval.tolist(),
    original_dataset.caring.tolist(),
    original_dataset.confusion.tolist(),
    original_dataset.curiosity.tolist(),
    original_dataset.desire.tolist(),
    original_dataset.disappointment.tolist(),
    original_dataset.disapproval.tolist(),
    original_dataset.disgust.tolist(),
    original_dataset.embarrassment.tolist(),
    original_dataset.excitement.tolist(),
    original_dataset.fear.tolist(),
    original_dataset.gratitude.tolist(),
    original_dataset.grief.tolist(),
    original_dataset.joy.tolist(),
    original_dataset.love.tolist(),
    original_dataset.nervousness.tolist(),
    original_dataset.optimism.tolist(),
    original_dataset.pride.tolist(),
    original_dataset.realization.tolist(),
    original_dataset.relief.tolist(),
    original_dataset.remorse.tolist(),
    original_dataset.sadness.tolist(),
    original_dataset.surprise.tolist(),
    original_dataset.neutral.tolist()
))



original_dataset

Unnamed: 0,text,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,desire,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,>sexuality shouldn’t be a grouping category I...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"You do right, if you don't care then fuck 'em!",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,Man I love reddit.,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,"[NAME] was nowhere near them, he was by the Fa...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
211220,Everyone likes [NAME].,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
211221,Well when you’ve imported about a gazillion of...,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
211222,That looks amazing,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
211223,The FDA has plenty to criticize. But like here...,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,text,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,desire,...,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral,labels
0,That game hurt.,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"You do right, if you don't care then fuck 'em!",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,Man I love reddit.,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[NAME] was nowhere near them, he was by the Fa...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
5,Right? Considering it’s such an important docu...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
211219,"Well, I'm glad you're out of all that now. How...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
211220,Everyone likes [NAME].,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
211221,Well when you’ve imported about a gazillion of...,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,"(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
211222,That looks amazing,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"(1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


## Train/Test Split

In [5]:
subset = original_dataset.sample(n=100000, random_state=42, axis=0)
X = subset['text']
y = subset['labels']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

train = pd.concat([X_train, y_train], axis=1)
train

(80000,) (80000,)
(20000,) (20000,)


Unnamed: 0,text,labels
137204,"Oh, naggers. Of course.","(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
187765,I don't even know what I'm going to do when th...,"(0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
50888,Frinkiac not working for you?,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
101712,It violates my right to enjoy my tinnitus. eee...,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
16121,I just picked it back up after getting a pro c...,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
...,...,...
134162,"Hilarious video, thank you for making us smile :)","(0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
195586,He was having a nice stroll through the park t...,"(1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4093,Lol well if you see an awkward looking girl in...,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
154711,Fuck pancreatic cancer. Kills so many and it b...,"(0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [6]:
model_args = MultiLabelClassificationArgs()
model_args.overwrite_output_dir = True
model_args.num_train_epochs = 2
model_args.output_dir = 'outputs/distilbert_full_emotions/'
model_args.threshold = 0.4
model_args.max_seq_length = 256
model_args.num_train_epochs = 3
model_args.learning_rate = 2e-5
model_args.weight_decay = 0.01

model = MultiLabelClassificationModel(
    "distilbert",
    "distilbert-base-uncased",
    use_cuda=True,
    num_labels=28,
    args=model_args
)

model.train_model(train)

Some weights of DistilBertForMultiLabelSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/160 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/10000 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/10000 [00:00<?, ?it/s]

Running Epoch 3 of 3:   0%|          | 0/10000 [00:00<?, ?it/s]

(30000, 0.12789167047354083)

In [14]:
test = pd.concat([X_test, y_test], axis=1)

predictions, raw_outputs = model.predict(test.text.tolist())

predictions

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

[[0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0

In [18]:
from sklearn.metrics import classification_report
predictions = np.array(predictions).astype(bool)
print(classification_report(y_test, predictions, target_names=emotion_labels))

ValueError: You appear to be using a legacy multi-label data representation. Sequence of sequences are no longer supported; use a binary array or sparse matrix instead - the MultiLabelBinarizer transformer can convert to this format.

In [15]:
from sklearn.metrics import precision_score, recall_score, f1_score

In [16]:
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1_score = f1_score(y_test, predictions, average='weighted')

print(f"distilBERT_full (Go) Weighted Precision: {precision:.3f}")
print(f"distilBERT_full (Go) Weighted Recall: {recall:.3f}")
print(f"distilBERT_full (Go) Weighted F1 Score: {f1_score:.3f}")

ValueError: You appear to be using a legacy multi-label data representation. Sequence of sequences are no longer supported; use a binary array or sparse matrix instead - the MultiLabelBinarizer transformer can convert to this format.