In [2]:
import torch
import datasets
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import pipeline
from datasets import DatasetDict, Dataset
from datasets import concatenate_datasets
from datasets import load_dataset 
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [97]:
#notwendige Funktion, um den Datansatz korrekt laden zu können (siehe Warnung im Output der Zeile emotion_dataset = load_dataset("dair-ai/emotion"))
import json

import datasets
from datasets.tasks import TextClassification


_CITATION = """/
@inproceedings{saravia-etal-2018-carer,
    title = "{CARER}: Contextualized Affect Representations for Emotion Recognition",
    author = "Saravia, Elvis  and
      Liu, Hsien-Chi Toby  and
      Huang, Yen-Hao  and
      Wu, Junlin  and
      Chen, Yi-Shin",
    booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing",
    month = oct # "-" # nov,
    year = "2018",
    address = "Brussels, Belgium",
    publisher = "Association for Computational Linguistics",
    url = "https://www.aclweb.org/anthology/D18-1404",
    doi = "10.18653/v1/D18-1404",
    pages = "3687--3697",
    abstract = "Emotions are expressed in nuanced ways, which varies by collective or individual experiences, knowledge, and beliefs. Therefore, to understand emotion, as conveyed through text, a robust mechanism capable of capturing and modeling different linguistic nuances and phenomena is needed. We propose a semi-supervised, graph-based algorithm to produce rich structural descriptors which serve as the building blocks for constructing contextualized affect representations from text. The pattern-based representations are further enriched with word embeddings and evaluated through several emotion recognition tasks. Our experimental results demonstrate that the proposed method outperforms state-of-the-art techniques on emotion recognition tasks.",
}
"""

_DESCRIPTION = """/
Emotion is a dataset of English Twitter messages with six basic emotions: anger, fear, joy, love, sadness, and surprise. For more detailed information please refer to the paper.
"""

_HOMEPAGE = "https://github.com/dair-ai/emotion_dataset"

_LICENSE = "The dataset should be used for educational and research purposes only"

_URLS = {
    "split": {
        "train": "data/train.jsonl.gz",
        "validation": "data/validation.jsonl.gz",
        "test": "data/test.jsonl.gz",
    },
    "unsplit": {
        "train": "data/data.jsonl.gz",
    },
}


class Emotion(datasets.GeneratorBasedBuilder):
    VERSION = datasets.Version("1.0.0")
    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="split", version=VERSION, description="Dataset split in train, validation and test"
        ),
        datasets.BuilderConfig(name="unsplit", version=VERSION, description="Unsplit dataset"),
    ]
    DEFAULT_CONFIG_NAME = "split"

    def _info(self):
        class_names = ["sadness", "joy", "love", "anger", "fear", "surprise"]
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {"text": datasets.Value("string"), "label": datasets.ClassLabel(names=class_names)}
            ),
            supervised_keys=("text", "label"),
            homepage=_HOMEPAGE,
            citation=_CITATION,
            license=_LICENSE,
            task_templates=[TextClassification(text_column="text", label_column="label")],
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        paths = dl_manager.download_and_extract(_URLS[self.config.name])
        if self.config.name == "split":
            return [
                datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": paths["train"]}),
                datasets.SplitGenerator(name=datasets.Split.VALIDATION, gen_kwargs={"filepath": paths["validation"]}),
                datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs={"filepath": paths["test"]}),
            ]
        else:
            return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": paths["train"]})]

    def _generate_examples(self, filepath):
        """Generate examples."""
        with open(filepath, encoding="utf-8") as f:
            for idx, line in enumerate(f):
                example = json.loads(line)
                yield idx, example

In [194]:
# Laden des Datasets von Huggingface
emotion_dataset = load_dataset("dair-ai/emotion")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [195]:
emotion_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [196]:
# Speichern der train-test-validation-splits in pd dfs
df_train = pd.DataFrame(emotion_dataset['train'])
df_test = pd.DataFrame(emotion_dataset['train'])
df_validation = pd.DataFrame(emotion_dataset['validation'])

In [8]:
# Zusammenführen der dfs
df_set = pd.concat([df_train, df_test, df_validation], ignore_index=True)

In [12]:
# Speichern des dfs als csv --> die weitere Verarbeitung und Übersetzung des Datasets ist dem Notebook "gpt_translator.ipynb" zu entnehmen
df_set.to_csv("dair_ai_emotion.csv", index=False)

In [103]:
# Laden des nun übersetzten Datasets
df = pd.read_csv("dair_ai_emotion_de.csv")

In [106]:
# Löschen der englischen Originaltexte
df = df.drop(['Unnamed: 0', 'text'], axis=1)

In [109]:
# Umstellen der Spalten
df = df[['text_de', 'label']]

In [111]:
# Umbenennung der Spalte "text_de" zu "text"
df = df.rename(columns={"text_de": "text"})

In [1]:
# Speichern des überarbeiteten Datensatzes
df.to_csv("dair_ai_emotion_de.csv", index=False)

NameError: name 'df' is not defined

In [3]:
# Laden des Datensatzes als Huggingface Dataset
dataset = load_dataset('csv', data_files='dair_ai_emotion_de.csv')

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 34000
    })
})

In [7]:
train_testval_split = dataset["train"].train_test_split(test_size=0.2)
test_val_split = train_testval_split['test'].train_test_split(test_size=0.5)
splitted_dataset = DatasetDict({
    'train': train_testval_split['train'],
    'test': test_val_split['test'],
    'validation': test_val_split['train']
})

print(f"Trainingsset: {splitted_dataset['train'].num_rows} Zeilen")
print(f"Testset: {splitted_dataset['test'].num_rows} Zeilen")
print(f"Validierungsset: {splitted_dataset['validation'].num_rows} Zeilen")

save_path = "dair-ai-amotion-de-test-train-val"

splitted_dataset.save_to_disk(save_path)


Trainingsset: 27200 Zeilen
Testset: 3400 Zeilen
Validierungsset: 3400 Zeilen


Saving the dataset (0/1 shards):   0%|          | 0/27200 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3400 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3400 [00:00<?, ? examples/s]

In [200]:
train_test_split = dataset['train'].train_test_split(test_size=0.2)

In [202]:
train_test_split.save_to_disk('C:/Users/JGras/instagram-topics/instagram-topics/Data/dair-ai-emotion-de-test-train')

Saving the dataset (0/1 shards):   0%|          | 0/27200 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/6800 [00:00<?, ? examples/s]

In [148]:
final_splits = {
    'train': train_val_split['train'],
    'test': train_test_split['test'],
    'validation': train_val_split['test']
}

In [153]:
dataset_dict = datasets.DatasetDict({
    'train': train_dataset,
    'validation': validation_dataset,
    'test': test_dataset
})

In [184]:
all_datasets = [dataset_dict[split] for split in dataset_dict.keys()]

In [188]:
combined_dataset = concatenate_datasets(all_datasets)

In [190]:
combined_dataset.save_to_disk('C:/Users/JGras/instagram-topics/instagram-topics/Data/dair-ai-emotion-de')

Saving the dataset (0/1 shards):   0%|          | 0/34000 [00:00<?, ? examples/s]

In [193]:
emotion_dataset = Dataset.load_from_disk('C:/Users/JGras/instagram-topics/instagram-topics/Data/dair-ai-emotion-de')

In [127]:
train_dataset = final_splits['train']
test_dataset = final_splits['test']
validation_dataset = final_splits['validation']

In [142]:
train_dataset[5]

{'text': 'Ich habe gestern einige Zeit in der Schule verbracht, mit Leuten gesprochen und Fotos von meinen Töchtern gemacht, während sie den Spaß am letzten Schultag genossen haben. Ich bin beeindruckt und glücklich davon weggegangen, was ehrlich gesagt mein übliches Gefühl für diesen Ort ist.',
 'label': 5}