In [1]:
from tqdm import tqdm

# Register tqdm with pandas so that the `progress_apply` call used further
# down in this notebook is available on DataFrames.
tqdm.pandas()

# Loading Dataset

In [2]:
from datasets import load_dataset

# Load the "raw" configuration of the "go_emotions" dataset.
dataset = load_dataset("go_emotions", name="raw")

# Switch the output format to pandas, then materialise the full train split.
dataset.set_format("pandas")
df_train = dataset["train"][:]

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Every column from position 9 onward is treated as an emotion label.
# NOTE: this cell is not idempotent -- df_train is overwritten below, so
# running it a second time would slice the already-reduced frame.
emotions = df_train.columns[9:].tolist()

# keep just the comment text plus the emotion columns
columns = ["text", *emotions]
df_train = df_train[columns]

In [4]:
def _labelled_emotions(row):
    """Return the names of the emotion columns whose value equals 1 in ``row``."""
    return row.index[row == 1].tolist()


# New column holding, for every comment, the list of emotion names (strings)
# that are flagged with 1 in that row.
df_train["ds_emotions"] = df_train[emotions].progress_apply(_labelled_emotions, axis=1)

  0%|          | 0/211225 [00:00<?, ?it/s]

100%|██████████| 211225/211225 [00:12<00:00, 16675.19it/s]


In [32]:
# Does "neutral" ever appear together with other emotions?
# Print every distinct label combination that contains it.
values = df_train["ds_emotions"].value_counts().index
for value in values:
    if "neutral" not in value:
        continue
    print(value)

['neutral']


In [89]:
# Restrict to comments whose dataset labels include "neutral", draw a
# reproducible 20-row sample, and drop the per-emotion columns.
# (An earlier, now disabled, variant sampled 20 rows from the whole frame
# with the same seed.)
has_neutral = df_train["ds_emotions"].map(lambda labels: "neutral" in labels)
samples = (
    df_train.loc[has_neutral]
    .sample(n=20, random_state=7102023)
    .drop(columns=emotions)
)

# LLM Labeling

In [36]:
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser

# System role shared by every labeling request.
SYSTEM_MESSAGE = "You're an AI expert trained to analyze and categorize emotions present in text. Your goal is to analyze each piece of text according to instructions"

# The prompt offers "neutral" as a separate fallback, so it is left out of the
# enumerated emotion list. It is removed by name rather than by slicing off
# the last element, so the prompt stays correct whatever the column order is.
non_neutral_emotions = [emotion for emotion in emotions if emotion != "neutral"]

# `{{comment}}` survives the f-string as the `{comment}` placeholder that is
# filled in when the prompt is invoked; the emotion list is baked in here.
PROMPT_TEMPLATE = f"Given this comment {{comment}} , make an analysis of the emotions present in the comment according to this list of emotions ONLY {non_neutral_emotions} or neutral if there's no emotion and after the analysis write the class/es that apply according to the given list (one, two or three classes) inside <answer> (classes) </answer> containing the classes."


# Two-message chat prompt: fixed system instruction + templated human turn.
final_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", SYSTEM_MESSAGE),
        # few_shot_prompt,
        ("human", PROMPT_TEMPLATE),
    ]
)

In [60]:
import os

# API credentials are read from the environment so that real keys never have
# to be typed into (and saved with) the notebook. Each one falls back to ""
# when its variable is not set.
groq_api = os.environ.get("GROQ_API_KEY", "")
openai_api = os.environ.get("OPENAI_API_KEY", "")
fireworks_api = os.environ.get("FIREWORKS_API_KEY", "")

In [117]:
# Define models
# chat1: "llama3:8b" reached through a custom OpenAI-compatible base URL,
# sampled at temperature 0.
chat1 = ChatOpenAI(
    model="llama3:8b",
    base_url="https://a6e4-34-31-3-189.ngrok-free.app/v1",
    api_key=groq_api,
    temperature=0,
)
chain1 = final_prompt | chat1 | StrOutputParser()

# chat2: ChatOpenAI with the library's default model and settings; only the
# API key is supplied.
chat2 = ChatOpenAI(api_key=openai_api)
chain2 = final_prompt | chat2 | StrOutputParser()

In [115]:
from out_parser import extract_emotions

In [119]:
# Column that receives the labels produced by chain1 for each sampled comment.
# Initialising it with None gives it object dtype, so a list can be stored
# in each cell.
samples["ollama8"] = None
for index in tqdm(samples.index):
    # Scalar lookup by (row label, column) instead of building the whole row
    # with .loc[index] and then indexing into it.
    comment = samples.at[index, "text"]
    response1 = chain1.invoke({"comment": comment})
    # Turn the model's free-text reply into a list of emotion names.
    extracted_emotions = extract_emotions(response1)
    print(extracted_emotions)
    samples.at[index, "ollama8"] = extracted_emotions

  5%|▌         | 1/20 [00:07<02:27,  7.78s/it]

['neutral']


 10%|█         | 2/20 [00:11<01:38,  5.48s/it]

['amusement']


 15%|█▌        | 3/20 [00:14<01:11,  4.22s/it]

['amusement']


 20%|██        | 4/20 [00:16<00:53,  3.36s/it]

['disgust', 'annoyance']


 25%|██▌       | 5/20 [00:20<00:52,  3.48s/it]

['anger disgust surprise']


 30%|███       | 6/20 [00:23<00:46,  3.29s/it]

['anger', 'disappointment', 'disgust']


 35%|███▌      | 7/20 [00:26<00:43,  3.31s/it]

['amusement', 'surprise']


 40%|████      | 8/20 [00:29<00:38,  3.24s/it]

['amusement']


 45%|████▌     | 9/20 [00:34<00:40,  3.65s/it]

['anger', 'confusion', 'amusement']


 50%|█████     | 10/20 [00:36<00:31,  3.15s/it]

['anger', 'annoyance']


 55%|█████▌    | 11/20 [00:39<00:28,  3.14s/it]

['amusement']


 60%|██████    | 12/20 [00:41<00:24,  3.02s/it]

['approval']


 65%|██████▌   | 13/20 [00:44<00:20,  2.90s/it]

['sadness', 'relief']


 70%|███████   | 14/20 [00:47<00:17,  2.86s/it]

['amusement']


 75%|███████▌  | 15/20 [00:50<00:14,  2.82s/it]

['amusement']


 80%|████████  | 16/20 [00:52<00:10,  2.61s/it]

['excitement']


 85%|████████▌ | 17/20 [00:54<00:07,  2.56s/it]

['disappointment']


 90%|█████████ | 18/20 [00:56<00:04,  2.49s/it]

['anger', 'disgust']


 95%|█████████▌| 19/20 [00:59<00:02,  2.49s/it]

['amusement']


100%|██████████| 20/20 [01:02<00:00,  3.12s/it]

['excitement']





In [120]:
# save to csv
# index=False leaves the DataFrame index out of the written file.
# NOTE(review): the list-valued columns ("ds_emotions", "ollama8") will
# presumably be written as their string form -- confirm how they are parsed
# when the file is read back.
samples.to_csv("samples.csv", index=False)