import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DefaultDataCollator
)
from datasets import Dataset, DatasetDict, Value, Sequence

# 1. Load the three CSV shards, one DataFrame per file.
df1, df2, df3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "goemotions/data/full_dataset/goemotions_1.csv",
        "goemotions/data/full_dataset/goemotions_2.csv",
        "goemotions/data/full_dataset/goemotions_3.csv",
    )
)

# Stack the shards row-wise and renumber the index. Whether a plain
# concatenation is appropriate depends on how the CSVs are structured.
df = pd.concat([df1, df2, df3], ignore_index=True)

# Quick look at the combined frame.
print("Combined dataset size:", len(df))
print(df.head())


# In [None]:
# Assumption: the CSV exposes a "labels" column holding a list of emotion
# strings per row, e.g. row["labels"] == ["joy", "amusement"].

# Original emotion -> merged emotion. Related emotions are folded into a
# single target label; every other emotion maps to itself. Add further
# merges here as needed.
label_groups = {
    **dict.fromkeys(("anger", "annoyance", "disgust"), "anger"),
    **dict.fromkeys(("joy", "amusement", "excitement"), "joy"),
    **dict.fromkeys(("sadness", "grief", "disappointment"), "sadness"),
    **dict.fromkeys(("love", "caring"), "love"),
    **dict.fromkeys(("fear", "nervousness"), "fear"),
    # Emotions kept as their own class.
    **{
        emotion: emotion
        for emotion in (
            "surprise",
            "admiration",
            "approval",
            "confusion",
            "curiosity",
            "desire",
            "disapproval",
            "embarrassment",
            "gratitude",
            "optimism",
            "pride",
            "realization",
            "relief",
            "remorse",
            "neutral",
        )
    },
}

# Deduplicate the target labels and put them in alphabetical order.
unique_new_labels = sorted(set(label_groups.values()))
print("New label set:", unique_new_labels)
# Expected output:
# ['admiration', 'anger', 'approval', 'confusion', 'curiosity', 'desire',
#  'disapproval', 'embarrassment', 'fear', 'gratitude', 'joy', 'love',
#  'neutral', 'optimism', 'pride', 'realization', 'relief', 'remorse',
#  'sadness', 'surprise']

# Target label -> its position in unique_new_labels.
new_label2id = dict(zip(unique_new_labels, range(len(unique_new_labels))))

def map_labels_to_new(labels_list):
    """
    Convert one example's original emotion labels into a multi-hot vector
    over the merged label set.

    Args:
        labels_list: Iterable of original label strings for a single
            example. A bare string is treated as one label instead of
            being iterated character by character.

    Returns:
        A list of 0/1 ints with one slot per entry of ``unique_new_labels``.
        The slot at ``new_label2id[g]`` is 1 when at least one input label
        maps to group ``g`` through ``label_groups``.
    """
    # A plain str is itself iterable: without this guard "joy" would be
    # scanned as 'j', 'o', 'y' and silently produce an all-zero vector.
    if isinstance(labels_list, str):
        labels_list = [labels_list]

    # Start all zeros
    multi_hot = [0] * len(unique_new_labels)
    for old_lbl in labels_list:
        new_lbl = label_groups.get(old_lbl)
        if new_lbl is None:
            # Labels absent from label_groups are ignored.
            continue
        multi_hot[new_label2id[new_lbl]] = 1
    return multi_hot
