In [None]:
# !pip install -U datasets



In [2]:
from transformers import BertTokenizer
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import torch
import torch.nn as nn 
from transformers import TrainingArguments, Trainer, BertForSequenceClassification
from transformers import BertModel

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from datasets import ClassLabel
dataset = load_dataset("emotion", split = "train")

In [4]:
dataset = dataset.filter(lambda x: x['label'] in [0,1,3])

label_map = {0:0, 1:1, 3:2}
dataset = dataset.map(lambda x: {"label": label_map[x["label"]]})

new_label_feature = ClassLabel(num_classes = 3, names = ["sadness", "joy", "anger"])
dataset = dataset.cast_column("label", new_label_feature)

print(dataset.features["label"].names)

['sadness', 'joy', 'anger']


In [5]:
dataset = dataset.train_test_split(test_size=0.2)

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 9749
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2438
    })
})

In [7]:
train_texts = dataset['train']['text']

In [8]:
print(train_texts)

Column(['i feel those artistic yearnings in my music and i know that if i was to provide for a family and couldnt do so with the gift god has given me it would be very very hard', 'i feel so all alone no ones gonna fix me when im broke how do you cry with inanimate eyes', 'i feel so shitty right now i just arugh', 'i cant help but feel like im doing something dirty', 'i just feel so heartbroken out of loneliness'])


In [9]:
print(len(train_texts))

9749


In [10]:
train_labels = dataset['train']['label']

In [11]:
train_labels

Column([1, 0, 0, 0, 0])

In [12]:
print(len(train_labels))

9749


In [13]:
val_texts = dataset['test']['text']
val_labels = dataset['test']['label']

In [14]:
print(len(val_texts))
print(len(val_labels))

2438
2438


In [15]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [16]:
def tokenize(batch):
    return tokenizer(batch['text'], padding = True, truncation = True, return_tensors = 'pt')

In [17]:
dataset = dataset.map(lambda x: tokenizer(x['text'], padding = "max_length", truncation = True), batched = True)

Map: 100%|██████████| 9749/9749 [00:02<00:00, 4411.85 examples/s]
Map: 100%|██████████| 2438/2438 [00:00<00:00, 4534.78 examples/s]


In [18]:
dataset.set_format(type = 'torch', columns = ['input_ids', 'attention_mask', 'label'])

In [19]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
for name, param in model.named_parameters():
    print(f"{name:50} | Shape: {tuple(param.shape)} | Trainable: {param.requires_grad}")

bert.embeddings.word_embeddings.weight             | Shape: (30522, 768) | Trainable: True
bert.embeddings.position_embeddings.weight         | Shape: (512, 768) | Trainable: True
bert.embeddings.token_type_embeddings.weight       | Shape: (2, 768) | Trainable: True
bert.embeddings.LayerNorm.weight                   | Shape: (768,) | Trainable: True
bert.embeddings.LayerNorm.bias                     | Shape: (768,) | Trainable: True
bert.encoder.layer.0.attention.self.query.weight   | Shape: (768, 768) | Trainable: True
bert.encoder.layer.0.attention.self.query.bias     | Shape: (768,) | Trainable: True
bert.encoder.layer.0.attention.self.key.weight     | Shape: (768, 768) | Trainable: True
bert.encoder.layer.0.attention.self.key.bias       | Shape: (768,) | Trainable: True
bert.encoder.layer.0.attention.self.value.weight   | Shape: (768, 768) | Trainable: True
bert.encoder.layer.0.attention.self.value.bias     | Shape: (768,) | Trainable: True
bert.encoder.layer.0.attention.output.den

In [21]:
model_for_cls = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels = 3
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
for name, param in model_for_cls.named_parameters():
    print(f"{name:50} | Shape: {tuple(param.shape)} | Trainable: {param.requires_grad}")

bert.embeddings.word_embeddings.weight             | Shape: (30522, 768) | Trainable: True
bert.embeddings.position_embeddings.weight         | Shape: (512, 768) | Trainable: True
bert.embeddings.token_type_embeddings.weight       | Shape: (2, 768) | Trainable: True
bert.embeddings.LayerNorm.weight                   | Shape: (768,) | Trainable: True
bert.embeddings.LayerNorm.bias                     | Shape: (768,) | Trainable: True
bert.encoder.layer.0.attention.self.query.weight   | Shape: (768, 768) | Trainable: True
bert.encoder.layer.0.attention.self.query.bias     | Shape: (768,) | Trainable: True
bert.encoder.layer.0.attention.self.key.weight     | Shape: (768, 768) | Trainable: True
bert.encoder.layer.0.attention.self.key.bias       | Shape: (768,) | Trainable: True
bert.encoder.layer.0.attention.self.value.weight   | Shape: (768, 768) | Trainable: True
bert.encoder.layer.0.attention.self.value.bias     | Shape: (768,) | Trainable: True
bert.encoder.layer.0.attention.output.den

In [23]:
trainable_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Trainable Parameters: {trainable_parameters}")

Trainable Parameters: 109483778


In [24]:
print(model_for_cls.bert.embeddings)

BertEmbeddings(
  (word_embeddings): Embedding(30522, 768, padding_idx=0)
  (position_embeddings): Embedding(512, 768)
  (token_type_embeddings): Embedding(2, 768)
  (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
)


In [25]:
print(model_for_cls.bert.encoder.layer[11])

BertLayer(
  (attention): BertAttention(
    (self): BertSdpaSelfAttention(
      (query): Linear(in_features=768, out_features=768, bias=True)
      (key): Linear(in_features=768, out_features=768, bias=True)
      (value): Linear(in_features=768, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (output): BertSelfOutput(
      (dense): Linear(in_features=768, out_features=768, bias=True)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (intermediate): BertIntermediate(
    (dense): Linear(in_features=768, out_features=3072, bias=True)
    (intermediate_act_fn): GELUActivation()
  )
  (output): BertOutput(
    (dense): Linear(in_features=3072, out_features=768, bias=True)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
)


In [26]:
print(model_for_cls.classifier)

Linear(in_features=768, out_features=3, bias=True)


In [27]:
print(model_for_cls.config)

BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "dtype": "float32",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.56.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [28]:
import accelerate, transformers, torch
print("accelerate:", accelerate.__version__)
print("transformers:", transformers.__version__)
print("torch:", torch.__version__)

accelerate: 1.10.1
transformers: 4.56.2
torch: 2.8.0


In [29]:
training_args = TrainingArguments(
    output_dir="./bert_output_1",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    logging_dir="./logs",
    report_to="none"
)

In [30]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [31]:
trainer = Trainer(
    model = model_for_cls,
    args = training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)

In [32]:
trainer.train()



Step,Training Loss
500,0.3856
1000,0.1454


TrainOutput(global_step=1219, training_loss=0.23719414107813222, metrics={'train_runtime': 1319.091, 'train_samples_per_second': 7.391, 'train_steps_per_second': 0.924, 'total_flos': 2565092709430272.0, 'train_loss': 0.23719414107813222, 'epoch': 1.0})