In [60]:
!pip install datasets -q

In [61]:
!pip install accelerate -U -q

In [62]:
import warnings
warnings.filterwarnings('ignore')

import tensorflow as tf
from transformers import AutoTokenizer, BertModel
from datasets import load_dataset

from transformers import DataCollatorWithPadding

In [63]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
data_coll = DataCollatorWithPadding(tokenizer = tokenizer)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.38.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594/vocab.txt
loading file to

In [64]:
dataset = load_dataset("SetFit/emotion")


In [65]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 2000
    })
})

In [66]:
def tokenize_function(example):
  return tokenizer(example['text'], truncation = True)

emotions = dataset.map(tokenize_function, batched = True)

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

In [67]:
emotions

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label', 'label_text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label', 'label_text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
})

Now i can use:

1) Trainer API

2) Use Default Pytorch way

#### 1) Trainer API

In [68]:
!pip install evaluate -q

In [69]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels = 6)


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "tra

In [70]:
import evaluate
import numpy as np
import torch

def compute_metric(eval_preds):
  metric = evaluate.load('accuracy')
  logits, labels = eval_preds
  predictions = np.argmax(logits, axis=-1)
  return metric.compute(predictions=predictions, references=labels)

In [71]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",  # Output directory for checkpoints etc.
    num_train_epochs=3,        # Number of training epochs
    per_device_train_batch_size=64,  # Batch size per device (GPU/TPU)
    per_device_eval_batch_size=64,   # Batch size per device for evaluation
    logging_steps=50,           # Logging frequency in training steps
    evaluation_strategy="steps",  # Evaluation strategy (steps or epoch)
    eval_steps=50,             # Evaluation frequency in steps (if strategy is "steps")
    load_best_model_at_end=True,  # Load the best model based on eval metric
)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [72]:
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset = emotions['train'],
    eval_dataset = emotions['validation'],
    data_collator = data_coll,
    tokenizer = tokenizer,
    compute_metrics = compute_metric
)

In [73]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: label_text, text. If label_text, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 16,000
  Num Epochs = 3
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 750
  Number of trainable parameters = 109,486,854


Step,Training Loss,Validation Loss,Accuracy
50,1.2539,0.797731,0.745
100,0.5615,0.329311,0.9
150,0.3101,0.235663,0.92
200,0.2308,0.208,0.923
250,0.2079,0.170741,0.9335
300,0.1371,0.172014,0.9355
350,0.131,0.163933,0.9405
400,0.1299,0.157216,0.932
450,0.1366,0.153768,0.932
500,0.1236,0.146355,0.939


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: label_text, text. If label_text, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 64
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: label_text, text. If label_text, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 64
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: label_text, text. If label_text, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Runnin

TrainOutput(global_step=750, training_loss=0.24280954583485923, metrics={'train_runtime': 581.8283, 'train_samples_per_second': 82.499, 'train_steps_per_second': 1.289, 'total_flos': 1394933356366848.0, 'train_loss': 0.24280954583485923, 'epoch': 3.0})

In [74]:
res= trainer.predict(emotions['test'])

The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: label_text, text. If label_text, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 2000
  Batch size = 64


In [77]:
preds = np.argmax(res.predictions,axis = -1)

In [78]:
preds

array([0, 0, 0, ..., 1, 1, 4])

In [79]:
model.save_pretrained("save_directory")
tokenizer.save_pretrained("save_directory")


Configuration saved in save_directory/config.json
Model weights saved in save_directory/model.safetensors
tokenizer config file saved in save_directory/tokenizer_config.json
Special tokens file saved in save_directory/special_tokens_map.json


('save_directory/tokenizer_config.json',
 'save_directory/special_tokens_map.json',
 'save_directory/vocab.txt',
 'save_directory/added_tokens.json',
 'save_directory/tokenizer.json')

In [81]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [82]:
model.push_to_hub('new_emotion')
tokenizer.push_to_hub('new_emotion')

Configuration saved in /tmp/tmpv2d4pjb_/config.json
Model weights saved in /tmp/tmpv2d4pjb_/model.safetensors
Uploading the following files to Arjun4707/new_emotion: model.safetensors,README.md,config.json


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

tokenizer config file saved in /tmp/tmphzqnszcw/tokenizer_config.json
Special tokens file saved in /tmp/tmphzqnszcw/special_tokens_map.json
Uploading the following files to Arjun4707/new_emotion: vocab.txt,tokenizer_config.json,special_tokens_map.json,tokenizer.json,README.md


CommitInfo(commit_url='https://huggingface.co/Arjun4707/new_emotion/commit/af2bbf5d965daedee0f33162d12318d974e03920', commit_message='Upload tokenizer', commit_description='', oid='af2bbf5d965daedee0f33162d12318d974e03920', pr_url=None, pr_revision=None, pr_num=None)

## Inferencing

In [98]:
inp = emotions['test'][102]['text']
inp

'i started out feeling discouraged this morning'

In [104]:
toke_inps = tokenizer(inp, return_tensors= 'pt')
toke_inps

{'input_ids': tensor([[  101,  1045,  2318,  2041,  3110, 22585,  2023,  2851,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [106]:
device = model.device
toke_inps = toke_inps.to(device)

In [108]:
out = model(**toke_inps).logits
res = torch.argmax(out, dim =-1).cpu().item()

In [109]:
print(emotions['test'][102]['text'])
print(emotions['test'][102]['label'])
print('predicted_label:', res )

i started out feeling discouraged this morning
0
predicted_label: 0
