In [1]:
from transformers import AutoTokenizer

# Manual tokenization pipeline, one step per line: text -> tokens -> ids -> model inputs.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
tokenized_inputs=tokenizer.tokenize('I am worried.. I learn so slow')
ids = tokenizer.convert_tokens_to_ids(tokenized_inputs)
# Despite its name, input_ids holds the whole encoding: in the recorded output it is a
# mapping with 'input_ids' (the ids wrapped in 101 ... 102), 'token_type_ids' and
# 'attention_mask'.
input_ids = tokenizer.prepare_for_model(ids)
# Bare last expression so the notebook displays the encoding.
input_ids

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': [101, 1045, 2572, 5191, 1012, 1012, 1045, 4553, 2061, 4030, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [2]:
from transformers import AutoTokenizer

# Tokenize two sentences in one call. In the printed result the shorter sentence is
# padded with id 0 up to the longer one and that position is 0 in attention_mask.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
inputs = tokenizer(["I am worried I learn so slow", 
                   "This is me one of the worst students"],
                   padding=True, truncation=True, return_tensors="pt")
print(inputs)

from transformers import AutoModel

# Headless model: the printed last_hidden_state shape is [2, 10, 768]
# for the 2 sentences of 10 token positions each.
model = AutoModel.from_pretrained("bert-base-uncased")
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)

from transformers import AutoModelForSequenceClassification

# `model` and `outputs` are rebound here to the sequence-classification variant.
# The recorded warning says classifier.weight / classifier.bias are newly initialized,
# so the logits and probabilities printed below come from an untrained head.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
outputs = model(**inputs)
print(outputs.logits)

import torch

# Softmax over the last dimension turns each row of logits into values that sum to 1.
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
print(predictions)

# Displayed as the cell result: the index -> label-name mapping
# ({0: 'LABEL_0', 1: 'LABEL_1'} in the recorded output).
model.config.id2label

{'input_ids': tensor([[ 101, 1045, 2572, 5191, 1045, 4553, 2061, 4030,  102,    0],
        [ 101, 2023, 2003, 2033, 2028, 1997, 1996, 5409, 2493,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
torch.Size([2, 10, 768])


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tensor([[-0.1298, -0.3846],
        [ 0.1664, -0.1490]], grad_fn=<AddmmBackward0>)
tensor([[0.5633, 0.4367],
        [0.5782, 0.4218]], grad_fn=<SoftmaxBackward0>)


{0: 'LABEL_0', 1: 'LABEL_1'}

In [3]:
from transformers import AutoConfig
from transformers import BertConfig
from transformers import BertModel

# Build BertModel instances from a configuration object instead of calling
# from_pretrained on the model class: once via BertConfig, once via AutoConfig.
# NOTE(review): a model constructed from a config alone presumably has randomly
# initialised weights rather than the checkpoint's — confirm before using it.
bert_config = BertConfig.from_pretrained("bert-base-uncased")
bert_model = BertModel(bert_config)

auto_config = AutoConfig.from_pretrained("bert-base-uncased")
bert_model_auto_config = BertModel(auto_config)

# Print the configuration loaded for the checkpoint (hidden size, layer count, vocab size, ...).
print(bert_config)

BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.34.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [4]:
# Derive a smaller BERT variant: start from the bert-base-uncased configuration and
# override the number of hidden layers and the vocabulary size.
new_bert_config = BertConfig.from_pretrained('bert-base-uncased', num_hidden_layers = 9, vocab_size=1000)
new_bert_model = BertModel(new_bert_config)

# save_pretrained is called purely for its side effect (writing the 'new_bert_model'
# directory). Its return value is intentionally not bound to a name: it carries no
# model or path, so keeping it in a variable only suggested otherwise.
new_bert_model.save_pretrained('new_bert_model')

# Round trip: reload the model from the directory written above.
load_new_bert_model = BertModel.from_pretrained('new_bert_model')

In [5]:
from datasets import load_dataset

# Load the MRPC configuration of GLUE and inspect the train split in one print:
# the split summary, its feature schema, the first row, and the first five rows.
raw_dataset = load_dataset("glue", "mrpc")
print(raw_dataset["train"], raw_dataset["train"].features, raw_dataset["train"][0], raw_dataset["train"][:5])

from transformers import AutoTokenizer
# The checkpoint name is kept in a variable so the tokenizer is loaded from one place.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    """Tokenize the sentence pair(s) held in ``example``.

    Args:
        example: Mapping providing ``"sentence1"`` and ``"sentence2"`` entries.

    Returns:
        The result of calling the notebook-level ``tokenizer`` on both entries
        with ``padding=True``, ``truncation=True`` and ``max_length=128``.
    """
    first_sentences = example["sentence1"]
    second_sentences = example["sentence2"]
    # Padding happens inside this call, i.e. per invocation of the tokenizer.
    encoding_options = {"padding": True, "truncation": True, "max_length": 128}
    return tokenizer(first_sentences, second_sentences, **encoding_options)

# Run tokenize_function over every split of the DatasetDict.
# NOTE(review): batched=True presumably passes a batch of rows per call, so
# example["sentence1"] is a list of strings rather than one string — confirm.
tokenized_dataset = raw_dataset.map(tokenize_function, batched=True)
# Displayed result: each split gains input_ids, token_type_ids and attention_mask columns.
tokenized_dataset

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 3668
}) {'sentence1': Value(dtype='string', id=None), 'sentence2': Value(dtype='string', id=None), 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None), 'idx': Value(dtype='int32', id=None)} {'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .', 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .', 'label': 1, 'idx': 0} {'sentence1': ['Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .', "Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .", 'They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added .', 'Around 0335 GMT , Tab shares were up 19 cents , or 4.4 % , at A $ 4.56 , having earlier set a record high of A $ 4.

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [6]:
# Reshape the tokenized splits in one chained expression: drop the raw text and
# index columns, rename 'label' to 'labels', and switch the output format to torch.
tokenized_dataset = (
    tokenized_dataset
    .remove_columns(['idx','sentence1','sentence2'])
    .rename_column('label','labels')
    .with_format('torch')
)

# Show the resulting splits and their remaining columns.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [7]:
# Inspect only the train split of the formatted dataset (columns and row count).
tokenized_dataset['train']

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 3668
})

In [18]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Fixed-length padding experiment: reload MRPC and the tokenizer so this cell
# does not depend on variables created by the cells above.
raw_datasets = load_dataset("glue", "mrpc")
checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    """Tokenize the sentence pair(s) in ``example`` using fixed-length padding.

    Args:
        example: Mapping providing ``'sentence1'`` and ``'sentence2'`` entries.

    Returns:
        The result of calling the notebook-level ``tokenizer`` on both entries
        with ``padding="max_length"``, ``truncation=True`` and ``max_length=128``.
    """
    first_sentences = example['sentence1']
    second_sentences = example['sentence2']
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(first_sentences, second_sentences, **encoding_options)
    
# Tokenize every split, then drop the raw text/index columns, rename 'label'
# to 'labels', and switch the output format to torch, all in one chain.
tokenized_datasets = (
    raw_datasets
    .map(tokenize_function, batched=True)
    .remove_columns(['idx','sentence1','sentence2'])
    .rename_column('label','labels')
    .with_format('torch')
)

from torch.utils.data import DataLoader

# Shuffled batches of 16 training rows, with no custom collate function.
train_dataloader = DataLoader(
    tokenized_datasets['train'],
    shuffle=True,
    batch_size=16,
)

# Print the input_ids shape of batches 0 through 6, then stop.
# In the recorded output every batch is [16, 128].
for step, batch in enumerate(train_dataloader):
    print(batch['input_ids'].shape)
    if step >= 6:
        break

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

torch.Size([16, 128])
torch.Size([16, 128])
torch.Size([16, 128])
torch.Size([16, 128])
torch.Size([16, 128])
torch.Size([16, 128])
torch.Size([16, 128])


In [12]:
from datasets import load_dataset

# Dynamic-padding experiment: reload MRPC and the tokenizer for a fresh start.
raw_datasets = load_dataset('glue', 'mrpc')
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(example):
    """Tokenize the sentence pair(s) in ``example`` without requesting padding.

    Args:
        example: Mapping providing ``'sentence1'`` and ``'sentence2'`` entries.

    Returns:
        The result of calling the notebook-level ``tokenizer`` on both entries
        with ``truncation=True`` only; no padding argument is passed here.
    """
    first_sentences = example['sentence1']
    second_sentences = example['sentence2']
    return tokenizer(first_sentences, second_sentences, truncation=True)

# Tokenize every split, then drop the raw text/index columns, rename 'label'
# to 'labels', and switch the output format to torch, all in one chain.
tokenized_datasets = (
    raw_datasets
    .map(tokenize_function, batched=True)
    .remove_columns(['idx','sentence1','sentence2'])
    .rename_column('label','labels')
    .with_format('torch')
)


In [13]:
# Display the splits to confirm which columns remain after the cleanup above.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [14]:
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

# The collator is given the tokenizer and is used as the DataLoader's collate_fn,
# so batches are assembled by it rather than by the default collation.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
train_dataloader = DataLoader(
    tokenized_datasets['train'],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=16,
)

# Print the input_ids shape of batches 0 through 6, then stop.
# In the recorded output the second dimension differs from batch to batch.
for step, batch in enumerate(train_dataloader):
    print(batch['input_ids'].shape)
    if step >= 6:
        break

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


torch.Size([16, 73])
torch.Size([16, 75])
torch.Size([16, 85])
torch.Size([16, 81])
torch.Size([16, 77])
torch.Size([16, 83])
torch.Size([16, 79])


In [20]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

# Fine-tuning setup: the same checkpoint name is used for the tokenizer here and
# for the classification model created further down in this cell.
checkpoint = 'bert-base-uncased'
raw_dataset = load_dataset('glue', 'mrpc')
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    """Tokenize the sentence pair(s) in ``example`` without requesting padding.

    Args:
        example: Mapping providing ``'sentence1'`` and ``'sentence2'`` entries.

    Returns:
        The result of calling the notebook-level ``tokenizer`` on both entries
        with ``truncation=True`` only; no padding argument is passed here.
    """
    first_sentences = example['sentence1']
    second_sentences = example['sentence2']
    return tokenizer(first_sentences, second_sentences, truncation=True)

# Tokenize every split. The text and idx columns are not removed in this cell.
tokenized_dataset = raw_dataset.map(tokenize_function, batched=True)
# Collator built from the tokenizer; it is handed to the Trainer below as data_collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

from transformers import AutoModelForSequenceClassification

# Classification model loaded from the same checkpoint, with two output labels.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# Training hyperparameters. "test-trainer" is the first positional argument
# (NOTE(review): presumably the output directory — confirm); the batch sizes, epoch
# count, learning rate and weight decay are set explicitly.
from transformers import TrainingArguments
training_args = TrainingArguments("test-trainer",
                                  per_device_train_batch_size=16,
                                  per_device_eval_batch_size=16,
                                  num_train_epochs=5,
                                  learning_rate=2e-5,
                                  weight_decay=0.01)

# Create the Trainer instance. No compute_metrics function is passed here.
from transformers import Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    data_collator=data_collator,
    tokenizer=tokenizer
)
# Start fine-tuning.
trainer.train()

# The training run above is somewhat anticlimactic: it only reports a training
# loss, which does not tell you how well the model is actually doing.
# To fix this, an evaluation metric has to be specified.

predictions = trainer.predict(tokenized_dataset['validation'])
print(predictions)
print(predictions.predictions.shape, predictions.label_ids.shape)

# predict() returns a named tuple with 3 elements: predictions, label_ids, metrics.
# The predictions are the model's logits for every sentence pair of the dataset,
# so a numpy array of shape (408, 2): one row per validation example.

# To match them with the labels, take the index of the largest logit for each
# prediction, using the argmax function.
import numpy as np
# NOTE(review): load_metric is reportedly deprecated in newer `datasets` releases in
# favour of the separate `evaluate` package — confirm against the installed version.
from datasets import load_metric

metric = load_metric('glue', 'mrpc')
preds = np.argmax(predictions.predictions, axis=-1)

# With the scores displayed, we can see that the model did learn something.
# The metric call is deliberately the cell's final expression: a notebook only
# displays the value of the last expression, so anything placed after it (such as
# a bare string literal) would be shown instead of the scores.
metric.compute(predictions=preds, references=predictions.label_ids)

In [None]:
# To monitor the metrics during training, define a compute-metrics function
# (the same computation that was done by hand above) and pass it to the Trainer.
metric = load_metric('glue','mrpc')
def compute_metrics(eval_preds):
    """Score one evaluation pass with the notebook-level ``metric``.

    Args:
        eval_preds: A two-item iterable that unpacks into ``(logits, labels)``.

    Returns:
        Whatever ``metric.compute`` returns for the argmax class of each logits
        row (taken over the last axis) compared against ``labels``.
    """
    logits, labels = eval_preds
    # Index of the largest logit per row is the predicted class id.
    predicted_classes = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predicted_classes, references=labels)

# Same "test-trainer" first argument as the earlier run, but only evaluation_strategy
# is set here; batch sizes, epochs, learning rate and weight decay are not passed.
# NOTE(review): evaluation_strategy='epoch' presumably triggers an evaluation pass at
# the end of every epoch — confirm against the installed transformers version.
training_args = TrainingArguments("test-trainer",
                                  evaluation_strategy='epoch')
# A new model is loaded from the checkpoint, so this run does not continue from
# the model object trained in the previous cell.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# Same Trainer wiring as before, plus compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Start fine-tuning with the metrics function attached.
trainer.train()
