In [2]:
import torch
from transformers import AutoModel, AutoTokenizer
from transformers import TextClassificationPipeline
import pandas as pd
import numpy as np
import tqdm
from tqdm import tqdm
import sys, os
from transformers import Pipeline
from torch import Tensor 
import torch.nn as nn
from torch.utils.data import DataLoader
from datasets import load_dataset
import torch.nn as nn
import sentence_transformers

# Add ../src/exlib/utils (resolved against the current working directory) to the
# import path; presumably this is where `projection_helper` lives.
sys.path.append(os.path.abspath('../src/exlib/utils'))
from projection_helper import project_points_onto_axes

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Load Data and Model

In [92]:
# Dataset identifier handed to `load_dataset`.
DATASET_REPO = "go_emotions"
# Checkpoint handed to `AutoModel.from_pretrained` for the encoder weights.
MODEL_REPO = "shreyahavaldar/roberta-base-go_emotions"
# Checkpoint handed to `AutoTokenizer.from_pretrained` for tokenization.
TOKENIZER_REPO = "roberta-base"

def load_data():
    """Fetch the dataset named by ``DATASET_REPO`` through ``load_dataset`` and return it."""
    return load_dataset(DATASET_REPO)

def load_model():
    """Load the ``MODEL_REPO`` checkpoint with ``AutoModel`` and place it on ``device``.

    Returns:
        The loaded model, after it has been moved to ``device``.
    """
    encoder = AutoModel.from_pretrained(MODEL_REPO)
    encoder.to(device)
    return encoder

class EmotionDataset(torch.utils.data.Dataset):
    """Torch dataset over one split of the ``DATASET_REPO`` dataset.

    Every item holds the tokenized text, the first entry of the example's
    ``labels`` list, and the whitespace-split words of the text padded with
    empty strings up to the longest example in the split (presumably so that
    the word lists of a batch all have the same length when collated).

    Args:
        split: Name of the dataset split to load (e.g. ``"train"``).
        max_length: Number of tokens each example is padded/truncated to.
            Defaults to 128.
    """

    def __init__(self, split, max_length=128):
        dataset = load_dataset(DATASET_REPO)[split]
        self.dataset = dataset
        self.tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_REPO)
        self.max_length = max_length
        # Largest number of whitespace-separated words in any text of the split;
        # 0 when the split has no rows.
        self.max_len = max((len(text.split()) for text in dataset['text']), default=0)

    def __len__(self):
        """Return the number of examples in the split."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the model inputs, label and padded word list for example ``idx``.

        Returns:
            dict with keys ``'input_ids'`` and ``'attention_mask'`` (flattened
            tokenizer outputs), ``'label'`` (tensor built from the first label
            only; any further labels of the example are ignored) and
            ``'word_list'`` (words of the text padded with ``''`` to
            ``self.max_len`` entries).
        """
        # Look the row up once instead of once per field.
        row = self.dataset[idx]
        text = row['text']
        label = row['labels'][0]
        encoding = self.tokenizer(
            text,
            return_tensors='pt',
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
        )
        word_list = text.split()
        # Pad with empty strings; a non-positive count adds nothing.
        word_list.extend([''] * (self.max_len - len(word_list)))
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label),
            'word_list': word_list
        }

class EmotionClassifier(nn.Module):
    """Encoder loaded from ``MODEL_REPO`` followed by a linear classification layer.

    NOTE(review): the linear head is constructed fresh in ``__init__`` and no
    weights are loaded into it in this class, so its logits are untrained
    unless the caller trains or loads them — confirm this is intended.

    Args:
        num_labels: Number of output classes of the linear head. Defaults to 28.
    """

    def __init__(self, num_labels=28):
        super().__init__()
        self.model = AutoModel.from_pretrained(MODEL_REPO)
        # Size the head from the encoder's own config instead of hard-coding
        # the width, so a different checkpoint does not silently mismatch.
        self.classifier = nn.Linear(self.model.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return classification logits for a batch.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.

        Returns:
            Output of the linear head applied to the encoder's
            ``last_hidden_state`` at sequence position 0.
        """
        encoded = self.model(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the sequence axis is used as the sentence representation.
        first_token_state = encoded.last_hidden_state[:, 0, :]
        return self.classifier(first_token_state)

### Sample inference on dataset

In [101]:
dataset = EmotionDataset("train")
dataloader = DataLoader(dataset, batch_size=4, shuffle=False)

model = EmotionClassifier()
# The batch tensors below are moved to `device`, so the model must live there
# too; otherwise the forward pass fails with a device mismatch on CUDA machines.
model.to(device)
model.eval()

# Inference only: no gradients are needed, so skip autograd bookkeeping.
with torch.no_grad():
    for batch in tqdm(dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        output = model(input_ids, attention_mask)
        # Decode the token ids back to text so the prediction can be read next to it.
        utterances = [dataset.tokenizer.decode(input_id, skip_special_tokens=True) for input_id in input_ids]
        for utterance, label in zip(utterances, output):
            print("Text: {}\nEmotion: {}\n".format(utterance, label.argmax()))
        # Only the first batch is shown as a sample.
        break

Some weights of RobertaModel were not initialized from the model checkpoint at shreyahavaldar/roberta-base-go_emotions and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/10853 [00:00<?, ?it/s]

Text: My favourite food is anything I didn't have to cook myself.
Emotion: 21

Text: Now if he does off himself, everyone will think hes having a laugh screwing with people instead of actually dead
Emotion: 12

Text: WHY THE FUCK IS BAYLESS ISOING
Emotion: 12

Text: To make her feel threatened
Emotion: 15






### Define Alignment Metric

In [96]:
class Metric(nn.Module):
    """Scores groups of words by how they project onto a two-axis emotion circumplex.

    Four anchor vectors are built from the emotions CSV, one per label in
    ``NV``, ``PV``, ``HA``, ``LA`` (presumably negative/positive valence and
    high/low arousal — confirm against the CSV). Each anchor is the mean
    sentence-transformer embedding of the words carrying that label. Word
    embeddings are projected with ``project_points_onto_axes`` using the
    anchors ``x1=NV``, ``x2=PV``, ``y1=LA``, ``y2=HA``.

    Args:
        model_name: Name passed to ``sentence_transformers.SentenceTransformer``.
    """

    def __init__(self, model_name:str="all-mpnet-base-v2"):
        super().__init__()
        self.model = sentence_transformers.SentenceTransformer(model_name)
        # define_circumplex returns the anchors in the order NV, PV, HA, LA.
        nv, pv, ha, la = self.define_circumplex()
        self.x1 = nv
        self.x2 = pv
        # The y axis runs from the LA anchor to the HA anchor.
        self.y1 = la
        self.y2 = ha

    def define_circumplex(self, emotions_path:str="../src/exlib/utils/russell_emotions.csv"):
        """Build one anchor vector per axis label from the emotions CSV.

        Args:
            emotions_path: CSV with an ``emotion`` column (the word) and a
                ``label`` column (one of ``NV``, ``PV``, ``HA``, ``LA``).

        Returns:
            List of four mean embeddings, ordered ``NV``, ``PV``, ``HA``, ``LA``.
        """
        emotions = pd.read_csv(emotions_path)
        axis_points = []
        for label in ("NV", "PV", "HA", "LA"):
            emotion_words = emotions.loc[emotions["label"] == label, "emotion"].values
            axis_points.append(np.mean(self.model.encode(emotion_words), axis=0))
        return axis_points

    def _project(self, embeddings):
        """Project embeddings onto the circumplex axes; return (x, y) coordinate arrays."""
        projection = project_points_onto_axes(embeddings, self.x1, self.x2, self.y1, self.y2)
        return np.asarray(projection[0]), np.asarray(projection[1])

    @staticmethod
    def _radial_score(xs, ys):
        """Reciprocal of the mean absolute gap between each point's norm and 1."""
        return 1/np.mean(np.abs(np.sqrt(xs**2 + ys**2) - 1))

    @staticmethod
    def _spread_score(xs, ys):
        """Reciprocal of the mean Euclidean distance over all unordered point pairs."""
        # Upper-triangle indices enumerate each pair (i < j) exactly once, in the
        # same row-major order as a nested i/j loop.
        first, second = np.triu_indices(len(xs), k=1)
        pair_dists = np.sqrt((xs[first] - xs[second])**2 + (ys[first] - ys[second])**2)
        return 1/np.mean(pair_dists)

    def distance_from_circumplex(self, embeddings):
        """Score how close the projected points lie to the unit circle.

        Returns:
            ``1 / mean(|sqrt(x**2 + y**2) - 1|)`` over the projected points, so
            larger means closer to the circle.
        """
        return self._radial_score(*self._project(embeddings))

    def mean_pairwise_dist(self, embeddings):
        """Score how tightly the projected points cluster.

        Returns:
            Reciprocal of the mean pairwise Euclidean distance between the
            projected points. With fewer than two points there are no pairs
            and the result is NaN.
        """
        return self._spread_score(*self._project(embeddings))

    def sigmoid(self, x):
        """Logistic function ``1 / (1 + exp(-x))``."""
        return 1/(1+np.exp(-x))

    # input: list of words
    def calculate_group_alignment(self, groups:list, language:str="english"):
        """Compute one alignment score per group of words.

        Args:
            groups: List of groups, each a list of words/strings to embed.
            language: Currently unused.

        Returns:
            List with one score per group: the circumplex score alone for a
            single-element group, otherwise the circumplex score multiplied by
            the pairwise-spread score.
        """
        alignments = []
        for group in groups:
            embeddings = self.model.encode(group)
            # Project once and reuse the coordinates for both scores.
            xs, ys = self._project(embeddings)
            score = self._radial_score(xs, ys)
            if len(embeddings) != 1:
                score = score*self._spread_score(xs, ys)
            alignments.append(score)
        return alignments

    def forward(self, zp, x=None, y=None, z=None, reduce=True, **kwargs):
        """Not implemented yet; currently returns ``None``."""
        pass

### Example Expert Alignment Calculation

In [176]:
# Build the metric once and score three small hand-picked word groups.
metric = Metric()
sample_groups = [
    ["hooray!", "yay!", "surprise!"],
    ["happy", "excited"],
    ["beautiful", "ugly"],
]

# One alignment score comes back per group, in the same order as the input.
alignments = metric.calculate_group_alignment(sample_groups)

for group, alignment in zip(sample_groups, alignments):
    print(f"Group: {group}, Alignment: {alignment}")

Group: ['hooray!', 'yay!', 'surprise!'], Alignment: 4.486955276930485
Group: ['happy', 'excited'], Alignment: 8.559298682657115
Group: ['beautiful', 'ugly'], Alignment: 2.1811173063494373


### Baselines

In [162]:
def word_level_groups(words):
    """Put every word into its own single-element group."""
    return [[word] for word in words]


def phrase_level_groups(words, size=3):
    """Chunk ``words`` into consecutive groups of at most ``size`` words."""
    return [words[start:start + size] for start in range(0, len(words), size)]


def sentence_level_groups(words):
    """Group words into sentences; a word ending in '.', '!' or '?' closes a sentence.

    Any words left after the last terminator form a final group.
    """
    groups = []
    current = []
    for word in words:
        current.append(word)
        if word.endswith((".", "!", "?")):
            groups.append(current)
            current = []
    if current:
        groups.append(current)
    return groups


def report_alignments(title, word_lists, make_groups, metric):
    """Print ``title``, then each example's groups and their mean alignment score."""
    print(title)
    for words in word_lists:
        groups = make_groups(words)
        print(groups)
        print(np.mean(metric.calculate_group_alignment(groups)))


# Only the first batch is used for the baseline demonstration.
batch = next(iter(dataloader))
# The batched 'word_list' is indexed by word position first; transpose it so each
# inner list holds the words of one example, then drop the '' padding entries.
word_lists = list(map(list, zip(*batch['word_list'])))
processed_word_lists = [
    [word for word in word_list if word != '']
    for word_list in word_lists
]

report_alignments("---- Word Level Groups ----", processed_word_lists, word_level_groups, metric)
report_alignments("\n---- Phrase Level Groups ----", processed_word_lists, phrase_level_groups, metric)
report_alignments("\n---- Sentence Level Groups ----", processed_word_lists, sentence_level_groups, metric)
    

---- Word Level Groups ----
[['My'], ['favourite'], ['food'], ['is'], ['anything'], ['I'], ["didn't"], ['have'], ['to'], ['cook'], ['myself.']]
1.2176552793157727
[['Now'], ['if'], ['he'], ['does'], ['off'], ['himself,'], ['everyone'], ['will'], ['think'], ['hes'], ['having'], ['a'], ['laugh'], ['screwing'], ['with'], ['people'], ['instead'], ['of'], ['actually'], ['dead']]
1.2696060271921354
[['WHY'], ['THE'], ['FUCK'], ['IS'], ['BAYLESS'], ['ISOING']]
1.2578318794807875
[['To'], ['make'], ['her'], ['feel'], ['threatened']]
1.8772896491302233

---- Phrase Level Groups ----
[['My', 'favourite', 'food'], ['is', 'anything', 'I'], ["didn't", 'have', 'to'], ['cook', 'myself.']]
5.190700584699706
[['Now', 'if', 'he'], ['does', 'off', 'himself,'], ['everyone', 'will', 'think'], ['hes', 'having', 'a'], ['laugh', 'screwing', 'with'], ['people', 'instead', 'of'], ['actually', 'dead']]
5.836896290912729
[['WHY', 'THE', 'FUCK'], ['IS', 'BAYLESS', 'ISOING']]
4.158414830104384
[['To', 'make', 'her'