In [1]:
!pip install -U datasets

Collecting datasets
  Downloading datasets-4.1.1-py3-none-any.whl.metadata (18 kB)
Collecting filelock (from datasets)
  Using cached filelock-3.19.1-py3-none-any.whl.metadata (2.1 kB)
Collecting numpy>=1.17 (from datasets)
  Using cached numpy-2.2.6-cp310-cp310-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Using cached pyarrow-21.0.0-cp310-cp310-macosx_12_0_arm64.whl.metadata (3.3 kB)
Collecting dill<0.4.1,>=0.3.0 (from datasets)
  Using cached dill-0.4.0-py3-none-any.whl.metadata (10 kB)
Collecting pandas (from datasets)
  Using cached pandas-2.3.2-cp310-cp310-macosx_11_0_arm64.whl.metadata (91 kB)
Collecting requests>=2.32.2 (from datasets)
  Using cached requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting tqdm>=4.66.3 (from datasets)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting xxhash (from datasets)
  Using cached xxhash-3.5.0-cp310-cp310-macosx_11_0_arm64.whl.metadata (12 kB)
Collecting multiprocess<0.70.1

In [2]:
from transformers import BertTokenizer
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import torch
import torch.nn as nn 
from transformers import TrainingArguments, Trainer, BertForSequenceClassification
from transformers import BertModel

In [3]:
# ClassLabel is used later to declare the reduced label set.
from datasets import ClassLabel
# Load only the "train" split of the "emotion" dataset; the validation/test
# splits are generated on disk (see the progress output) but are not returned.
dataset = load_dataset("emotion", split = "train")

Generating train split: 100%|██████████| 16000/16000 [00:00<00:00, 1196897.82 examples/s]
Generating validation split: 100%|██████████| 2000/2000 [00:00<00:00, 630674.99 examples/s]
Generating test split: 100%|██████████| 2000/2000 [00:00<00:00, 940426.91 examples/s]


In [4]:
# Original label id -> new contiguous id. Only the original ids listed here
# survive the filter; the new ids 0..2 are named sadness/joy/anger below.
label_map = {0: 0, 1: 1, 3: 2}

# Drop every example whose label is not a key of label_map, then renumber
# the surviving labels so they run 0..2 without gaps.
dataset = (
    dataset
    .filter(lambda example: example["label"] in label_map)
    .map(lambda example: {"label": label_map[example["label"]]})
)

# Re-declare the label column so its metadata matches the three kept classes.
new_label_feature = ClassLabel(
    num_classes=3,
    names=["sadness", "joy", "anger"],
)
dataset = dataset.cast_column("label", new_label_feature)

# Sanity check: show the class names now attached to the label column.
print(dataset.features["label"].names)

Filter: 100%|██████████| 16000/16000 [00:00<00:00, 487076.14 examples/s]
Map: 100%|██████████| 12187/12187 [00:00<00:00, 49015.85 examples/s]
Casting the dataset: 100%|██████████| 12187/12187 [00:00<00:00, 4905564.57 examples/s]

['sadness', 'joy', 'anger']





In [5]:
# Hold out 20% of the examples for validation. The split shuffles the data,
# so a fixed seed is passed to make the train/test membership identical on
# every Restart & Run All (otherwise metrics are not comparable between runs).
dataset = dataset.train_test_split(test_size=0.2, seed=42)

In [6]:
# Display the split result: `dataset` is now a DatasetDict with "train" and
# "test" entries (no longer a single Dataset).
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 9749
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2438
    })
})

In [7]:
# Raw text column of the training split (kept for inspection in the cells below).
train_texts = dataset['train']['text']

In [8]:
# Peek at the training texts; the column's repr shows only the first few entries.
print(train_texts)

Column(['im so relieved and feel so much more like myself now that this is resolved this being almost nothing at all actually just some weird energy and i cant wait to be back at camp even though ill be hacking and coughing and spluttering all day long', 'im feeling lucky button after that you will go to the landing page where you will found the alternative google search engine homepage with colors theme depend on the keywords below', 'i just am so tired of feeling lonely and yet when someone comes along who can take away that feeling i run away', 'i like about this song is how it feels bouncy and matches tiggers bouncy personality', 'i feel so repressed when compared to dear a href http eurodancemix'])


In [9]:
# Number of training examples (should match num_rows of the "train" split).
print(len(train_texts))

9749


In [10]:
# Integer class ids of the training split (0..2 after the remapping above).
train_labels = dataset['train']['label']

In [11]:
# Display the label column; the repr shows only the first few entries.
train_labels

Column([1, 1, 0, 1, 0])

In [12]:
# Label count should equal the text count printed earlier.
print(len(train_labels))

9749


In [13]:
# The held-out "test" split produced by train_test_split is used as validation data.
val_texts, val_labels = dataset["test"]["text"], dataset["test"]["label"]

In [14]:
# Both counts should be equal: one label per validation text.
print(len(val_texts), len(val_labels), sep="\n")

2438
2438


In [15]:
# Load the pretrained tokenizer for the same checkpoint ("bert-base-uncased")
# that the model is later initialised from, so vocabulary ids line up.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [17]:
def tokenize(batch):
    """Tokenize the ``'text'`` entry of ``batch`` with the notebook's tokenizer.

    Args:
        batch: A mapping with a ``'text'`` key holding the text(s) to encode.

    Returns:
        The tokenizer's output for those texts, with padding and truncation
        enabled and values returned as PyTorch tensors.
    """
    texts = batch['text']
    encode_options = {
        "padding": True,
        "truncation": True,
        "return_tensors": 'pt',
    }
    return tokenizer(texts, **encode_options)

In [18]:
# Tokenize both splits in batches.
#
# padding="max_length" with no explicit max_length pads every example to the
# tokenizer's model maximum (512 tokens for bert-base-uncased), although the
# texts here are short single sentences. That makes every training step pay for
# hundreds of pad tokens per example. An explicit, much smaller max_length keeps
# fixed-size tensors (so batches still stack) without that cost.
# NOTE(review): 128 is assumed to be comfortably above the longest tokenized
# text in this dataset; raise it if truncation is observed.
MAX_LENGTH = 128

dataset = dataset.map(
    lambda x: tokenizer(
        x['text'],
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
    ),
    batched=True,
)

Map: 100%|██████████| 9749/9749 [00:02<00:00, 4254.16 examples/s]
Map: 100%|██████████| 2438/2438 [00:00<00:00, 4402.49 examples/s]


In [19]:
# Make indexing return torch tensors, restricted to the three columns listed
# here (token ids, attention mask, and the class label).
# NOTE(review): columns not listed (e.g. the raw text) are presumably omitted
# from returned items while this format is active; confirm against the datasets docs.
dataset.set_format(type = 'torch', columns = ['input_ids', 'attention_mask', 'label'])

In [20]:
# Build the classifier with a head sized to the dataset's label set.
# Without num_labels the classification head defaults to 2 outputs, but the
# labels here are 0, 1 and 2, so the loss would fail on label 2 during training.
# The class names are read from the label feature so the head size and the
# id <-> name mappings cannot drift from the data.
label_names = dataset["train"].features["label"].names

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_names),
    id2label=dict(enumerate(label_names)),
    label2id={name: idx for idx, name in enumerate(label_names)},
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
