In [67]:
from transformers import RobertaForSequenceClassification, RobertaTokenizer
import torch
import transformers


In [83]:
model = RobertaForSequenceClassification.from_pretrained('roberta-large')
model.train()
tokenizer = RobertaTokenizer.from_pretrained('roberta-large')

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.weight', 'classif

In [85]:
#testing what inputs inputs and how model takes
inputs = tokenizer(["Hello, my dog is cute", "Today is a wonderful day", "I hate my boss"],\
                   padding = True, truncation = True, return_tensors="pt")
labels = torch.tensor([1, 1, 0]).unsqueeze(0)  # Batch size 1
outputs = model(**inputs, labels=labels)
loss = outputs.loss
logits = outputs.logits

display(labels)
display(labels.shape)
display(print('**inputs upcoming'))
display(**inputs)
display(print('**inputs done'))
display(inputs['input_ids'])
display(inputs['input_ids'].shape)
display(outputs)

tensor([[1, 1, 0]])

torch.Size([1, 3])

**inputs upcoming


None

**inputs done


None

tensor([[    0, 31414,     6,   127,  2335,    16, 11962,     2],
        [    0,  5625,    16,    10,  4613,   183,     2,     1],
        [    0,   100,  4157,   127,  3504,     2,     1,     1]])

torch.Size([3, 8])

SequenceClassifierOutput(loss=tensor(0.7657, grad_fn=<NllLossBackward>), logits=tensor([[ 0.2197, -0.1938],
        [ 0.0477,  0.0622],
        [ 0.0104,  0.0040]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)

In [70]:
class EmotionDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [71]:
# load emotions dataset for fine-tuning model
from datasets import load_dataset
dataset = load_dataset("emotion")

Using custom data configuration default
Reusing dataset emotion (/home/advait/.cache/huggingface/datasets/emotion/default/0.0.0/aa34462255cd487d04be8387a2d572588f6ceee23f784f37365aa714afeb8fe6)


In [72]:
display(dataset)
train = dataset["train"]
test = dataset["test"]
validation = dataset["validation"]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [73]:
#tokenize dataset
train_encodings = tokenizer(train["text"], truncation=True, padding=True)
train.rename_column_("label", "labels")

validation_encodings = tokenizer(validation['text'], truncation = True, padding = True)
validation.rename_column_('label', 'labels')

display(train)
display(validation)

Dataset({
    features: ['text', 'labels'],
    num_rows: 16000
})

Dataset({
    features: ['text', 'labels'],
    num_rows: 2000
})

In [74]:
train.features

{'text': Value(dtype='string', id=None),
 'labels': ClassLabel(num_classes=6, names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'], names_file=None, id=None)}

In [75]:
for item in train_encodings['input_ids']:
    if len(item) != 88:
        print('length is: ', len(item))
        raise ValueError

In [76]:
train_dataset = EmotionDataset(
    train_encodings, 
    train['labels'], 
)
validation_dataset = EmotionDataset(
    validation_encodings,
    validation['labels'],
)

In [77]:
model.classifier

RobertaClassificationHead(
  (dense): Linear(in_features=1024, out_features=1024, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (out_proj): Linear(in_features=1024, out_features=2, bias=True)
)

In [78]:
#Change number of output labels
num_labels = 6
in_features = 1024

model.classifier = torch.nn.Linear(
    in_features = in_features,
    out_features = num_labels,
    bias = True
)
model.num_labels = num_labels
model.classifier

Linear(in_features=1024, out_features=6, bias=True)

In [59]:
model.to('cuda');
display(len(train_dataset.encodings['input_ids']))
len(train_dataset.labels)

16000

16000

In [60]:
model.config

RobertaConfig {
  "_name_or_path": "roberta-large",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.2.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

In [61]:
# from transformers import Trainer, TrainingArguments
CUDA_LAUNCH_BLOCKING=1


# training_args = TrainingArguments(
#     output_dir='./results',          # output directory
#     num_train_epochs=1,              # total number of training epochs
#     per_device_train_batch_size=16,  # batch size per device during training
#     per_device_eval_batch_size=64,   # batch size for evaluation
#     warmup_steps=500,                # number of warmup steps for learning rate scheduler
#     weight_decay=0.01,               # strength of weight decay
#     logging_dir='./logs',            # directory for storing logs
#     logging_steps=10,
# )

# trainer = Trainer(
#     model=model,
#     args = training_args,
#     train_dataset = train_dataset,
# )

# trainer.train();

In [62]:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle= True)

In [63]:
optim = transformers.AdamW(model.parameters(), lr = 5e-5)

In [64]:
for batch in train_loader:
#     print(batch['labels'].shape)
    batch['labels'] = batch['labels'].unsqueeze(0)
    print(batch['labels'].shape)
    break

torch.Size([1, 16])


In [81]:
train_dataset.__getitem__(0)

{'input_ids': tensor([    0,   118, 46405,   619, 32386,     2,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 

In [65]:
torch.cuda.empty_cache()
for epoch in range(3):
    for batch in train_loader:
        batch['labels'] = batch['labels'].unsqueeze(0)
        print(batch['input_ids'].shape)
        print(batch['attention_mask'].shape)
        print(batch['labels'].shape)
#         break
        optim.zero_grad()
        input_ids = batch['input_ids'].to('cuda')
        attention_mask = batch['attention_mask'].to('cuda')
        labels = batch['labels'].to('cuda')
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optim.step()
        break

torch.Size([16, 88])
torch.Size([16, 88])
torch.Size([1, 16])


ValueError: Expected input batch_size (1408) to match target batch_size (16).