In [9]:
from transformers import pipeline
from tqdm import tqdm
import torch
import pandas as pd
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification




In [10]:
def initialize_model():
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", top_k=1, device = device)
    tokenizer = AutoTokenizer.from_pretrained("j-hartmann/emotion-english-distilroberta-base")
    model = AutoModelForSequenceClassification.from_pretrained("j-hartmann/emotion-english-distilroberta-base").to(device)
    return device, classifier, tokenizer, model

In [13]:
def get_emotion(device, tokenizer, model, text):
    max_length = 510  # Keep two spots for [CLS] and [SEP] tokens
    tokens = tokenizer.tokenize(text)
    chunks = [tokens[i:i+max_length] for i in range(0, len(tokens), max_length)]

    probabilities = []
    for chunk in chunks:
        ids = tokenizer.convert_tokens_to_ids(chunk)
        input_ids = torch.tensor([tokenizer.build_inputs_with_special_tokens(ids)]).to(device)
        attention_mask = torch.ones(input_ids.shape).to(device)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
            prob = torch.nn.functional.softmax(outputs.logits, dim=-1)
            probabilities.append(prob)

    mean_probabilities = torch.mean(torch.stack(probabilities), dim=0)
    emotion_index = torch.argmax(mean_probabilities).item()

    if emotion_index == 0:
        return "anger"
    elif emotion_index == 1:
        return "disgust"
    elif emotion_index == 2:
        return "fear"    
    elif emotion_index == 3:
        return "joy"
    elif emotion_index == 4:
        return "neutral"
    elif emotion_index == 5:
        return "sadness"
    elif emotion_index == 6:
        return "surprise"

In [14]:
def get_emotions(file_name, df, column_name):
    df = df[df[column_name].notna()]
    
    device, classifier, tokenizer, model = initialize_model()
    emotions = []
    for text in tqdm(df[column_name]):
        emotions.append(get_emotion(device, tokenizer, model, text))
    
    save_emotions(emotions, file_name, df, column_name)
    return emotions

def save_emotions(emotions, file_name, df, column_name):
    df['emotion'] = emotions
    df.to_csv(file_name + "_" + column_name + "_emotions.csv", index=False)

def main():
    # Update the file_name and column_name variables
    file_name = "translated_output"
    df = pd.read_csv(file_name + ".csv")
    column_name = "text"
    
    get_emotions(file_name, df, column_name)

if __name__ == "__main__":
    main()

  3%|▎         | 215/7605 [00:44<55:14,  2.23it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (757 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 7605/7605 [18:16<00:00,  6.94it/s]  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['emotion'] = emotions
