In [23]:
import torch
from transformers import AutoModel, AutoTokenizer
from transformers import TextClassificationPipeline
import pandas as pd
import numpy as np
import tqdm
from tqdm import tqdm
import sys, os
from transformers import Pipeline
from torch import Tensor 
import torch.nn as nn
from torch.utils.data import DataLoader
from datasets import load_dataset
import torch.nn as nn
import sentence_transformers

# Append the utils directory (resolved against the current working directory)
# to sys.path so that `projection_helper` can be imported on the next line.
sys.path.append(os.path.abspath('../src/exlib/utils'))
from projection_helper import project_points_onto_axes

# Use the GPU when torch reports CUDA as available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Load Data and Model

In [24]:
# Repository identifiers consumed by the loaders defined below.
DATASET_REPO = "go_emotions"  # passed to load_dataset
MODEL_REPO = "shreyahavaldar/roberta-base-go_emotions"  # passed to AutoModel.from_pretrained
TOKENIZER_REPO = "roberta-base"  # passed to AutoTokenizer.from_pretrained

def load_data():
    """Load the dataset named by ``DATASET_REPO`` and return it unmodified.

    Returns:
        Whatever ``load_dataset(DATASET_REPO)`` yields; no split is selected here.
    """
    return load_dataset(DATASET_REPO)

def load_model():
    """Instantiate the ``MODEL_REPO`` checkpoint via ``AutoModel`` and move it to ``device``.

    Returns:
        The loaded model object (the same instance that ``.to(device)`` was called on).
    """
    backbone = AutoModel.from_pretrained(MODEL_REPO)
    # The return value of .to() is ignored on purpose: the loaded instance itself is returned.
    backbone.to(device)
    return backbone

#go emotions dataset
class EmotionDataset(torch.utils.data.Dataset):
    """Torch dataset that tokenizes one split of ``DATASET_REPO`` on the fly.

    Each item is a dict with:
        ``input_ids``      -- 1-D tensor of token ids, padded/truncated to ``max_length``
        ``attention_mask`` -- 1-D tensor matching ``input_ids``
        ``label``          -- scalar tensor holding the FIRST entry of the row's ``labels``
                              list; any further labels on the row are dropped.

    Args:
        split: Key used to select a split from ``load_dataset(DATASET_REPO)``.
        max_length: Length every text is padded or truncated to (default 128).
    """

    def __init__(self, split, max_length=128):
        self.dataset = load_dataset(DATASET_REPO)[split]
        self.tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_REPO)
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the selected split."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return its tensors together with its first label."""
        # Fetch the row once; indexing the dataset separately for text and labels
        # would materialise the same row twice.
        row = self.dataset[idx]
        encoding = self.tokenizer(
            row['text'],
            return_tensors='pt',
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
        )
        return {
            # The tokenizer returns a leading batch axis of size 1; flatten drops it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(row['labels'][0])
        }

#classifier for go emotions dataset
class EmotionClassifier(nn.Module):
    """Encoder loaded from ``MODEL_REPO`` followed by a linear head over the first token.

    The linear head is created here with default initialisation; nothing in this
    class loads trained weights into it.

    Args:
        num_labels: Width of the output layer (default 28).
    """

    def __init__(self, num_labels=28):
        super().__init__()
        self.model = AutoModel.from_pretrained(MODEL_REPO)
        # Size the head from the backbone's own config instead of a hard-coded 768
        # so the class keeps working if MODEL_REPO points at a different-width encoder.
        self.classifier = nn.Linear(self.model.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return logits of shape ``(batch, num_labels)``.

        Args:
            input_ids: Token ids, indexed as ``(batch, sequence)``.
            attention_mask: Mask forwarded unchanged to the backbone.
        """
        # Keyword arguments avoid depending on the backbone's positional parameter order.
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of every sequence is used as the sentence representation.
        cls_token = outputs.last_hidden_state[:, 0, :]
        return self.classifier(cls_token)

### Sample Inference on Dataset

In [25]:
dataset = EmotionDataset("train")
dataloader = DataLoader(dataset, batch_size=8, shuffle=False)

# The batches below are moved to `device`, so the model has to live there too;
# leaving it on the CPU raises a device-mismatch error as soon as CUDA is available.
model = EmotionClassifier().to(device)
model.eval()

# Smoke test: run a single batch and print the arg-max class index per utterance.
# NOTE: EmotionClassifier builds its linear head fresh and no trained weights are
# loaded into it in this notebook, so the printed indices are not real predictions.
with torch.no_grad():  # inference only, so skip autograd bookkeeping
    for batch in tqdm(dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        output = model(input_ids, attention_mask)
        utterances = [dataset.tokenizer.decode(input_id, skip_special_tokens=True) for input_id in input_ids]
        for utterance, label in zip(utterances, output):
            print("Text: {}\nEmotion: {}\n".format(utterance, label.argmax()))
        break

Some weights of RobertaModel were not initialized from the model checkpoint at shreyahavaldar/roberta-base-go_emotions and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/5427 [00:00<?, ?it/s]

Text: My favourite food is anything I didn't have to cook myself.
Emotion: 15

Text: Now if he does off himself, everyone will think hes having a laugh screwing with people instead of actually dead
Emotion: 9

Text: WHY THE FUCK IS BAYLESS ISOING
Emotion: 6

Text: To make her feel threatened
Emotion: 14

Text: Dirty Southern Wankers
Emotion: 8

Text: OmG pEyToN iSn'T gOoD eNoUgH tO hElP uS iN tHe PlAyOfFs! Dumbass Broncos fans circa December 2015.
Emotion: 14

Text: Yes I heard abt the f bombs! That has to be why. Thanks for your reply:) until then hubby and I will anxiously wait 😝
Emotion: 12

Text: We need more boards and to create a bit more space for [NAME]. Then we’ll be good.
Emotion: 0






### Define Alignment Metric

In [26]:
class Metric(nn.Module):
    """Scores how closely groups of words sit to a circle defined by four emotion axes.

    Four axis end-points (labels ``NV``, ``PV``, ``HA``, ``LA`` in the emotions CSV)
    are embedded with a sentence encoder. Word embeddings are then projected with
    ``project_points_onto_axes`` and scored by how far the projected points lie
    from radius 1.

    Args:
        model_name: Name handed to ``sentence_transformers.SentenceTransformer``.
        emotions_path: CSV with an ``emotion`` column (words) and a ``label`` column
            (one of ``NV``/``PV``/``HA``/``LA``). Defaults to the path the notebook
            has always used.
    """

    def __init__(self, model_name: str = "distiluse-base-multilingual-cased",
                 emotions_path: str = "../src/exlib/utils/russell_emotions.csv"):
        super().__init__()
        self.model = sentence_transformers.SentenceTransformer(model_name)
        self.emotions_path = emotions_path
        # define_circumplex returns the points in the order NV, PV, HA, LA.
        nv_point, pv_point, ha_point, la_point = self.define_circumplex()
        self.x1 = nv_point
        self.x2 = pv_point
        # The y axis deliberately starts at LA and ends at HA, i.e. the reverse of
        # the order in which those two points are returned.
        self.y1 = la_point
        self.y2 = ha_point

    def define_circumplex(self):
        """Return the mean embedding of each axis label, ordered NV, PV, HA, LA.

        Raises:
            ValueError: If the CSV has no rows for one of the four labels. Without
                this check the mean over an empty set would silently yield NaN.
        """
        emotions = pd.read_csv(self.emotions_path)
        axis_points = []
        for label in ("NV", "PV", "HA", "LA"):
            emotion_words = emotions.loc[emotions["label"] == label, "emotion"].tolist()
            if not emotion_words:
                raise ValueError(
                    f"No emotions labelled {label!r} found in {self.emotions_path!r}"
                )
            emotion_embeddings = self.model.encode(emotion_words)
            axis_points.append(np.mean(emotion_embeddings, axis=0))
        return axis_points

    def distance_from_circumplex(self, embeddings):
        """Return the mean of ``|sqrt(x**2 + y**2) - 1|`` over the projected points.

        Args:
            embeddings: Points handed to ``project_points_onto_axes`` together with
                the four axis end-points.
        """
        projection = project_points_onto_axes(embeddings, self.x1, self.x2, self.y1, self.y2)
        # Assumes the helper returns (x coordinates, y coordinates) as two
        # equal-length sequences -- TODO confirm against projection_helper.
        x_projections = np.asarray(projection[0], dtype=float)
        y_projections = np.asarray(projection[1], dtype=float)
        radii = np.sqrt(x_projections ** 2 + y_projections ** 2)
        return np.mean(np.abs(radii - 1))

    # input: list of words
    def calculate_group_alignment(self, groups: list, language: str = "english"):
        """Return ``1 - distance_from_circumplex`` for every group of words.

        Args:
            groups: Iterable of word lists; each list is encoded as one batch.
            language: Accepted for interface compatibility; currently unused.

        Returns:
            List with one alignment score per group, in input order.
        """
        return [
            1 - self.distance_from_circumplex(self.model.encode(group))
            for group in groups
        ]

    def forward(self, zp, x=None, y=None, z=None, reduce=True, **kwargs):
        """Placeholder: not implemented yet, always returns ``None``."""
        return None

### Example Group Alignment Calculation

In [27]:
# Constructing the metric loads the sentence encoder and computes the four axis points.
metric = Metric()

# One word list per group; each group receives a single alignment score.
sample_groups = [
    ["however", "therefore", "unless"],
    ["sad", "happy", "thrilled"],
    ["computer", "neural network", "compiler"],
    ["tired", "sleepy", "calm"],
]

alignments = metric.calculate_group_alignment(sample_groups)

# Scores come back in input order, so they can be paired with their groups directly.
for group, alignment in zip(sample_groups, alignments):
    print(f"Group: {group}, Alignment: {alignment}")

Group: ['however', 'therefore', 'unless'], Alignment: 0.19223788470945413
Group: ['sad', 'happy', 'thrilled'], Alignment: 0.6713999482756581
Group: ['computer', 'neural network', 'compiler'], Alignment: 0.3275037898438139
Group: ['tired', 'sleepy', 'calm'], Alignment: 0.653776497435735
