# 1. Creating the model

In [12]:
from transformers import BertConfig, BertModel

# building the config
config = BertConfig()

model = BertModel(config=config)

# save the model
save_dir = "/media/dyd/UDISK/output_model/bert-model/"
model.save_pretrained(save_dir)

In [2]:
encoded_sequences = [
    [101, 7592, 999, 102],
    [101, 4658, 1012, 102],
    [101, 3835, 999, 102],
]

import torch

#  model expects a batch of inputs, therefore, two dimention
model_inputs = torch.tensor(encoded_sequences)

output = model(model_inputs)
output

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.7218,  1.1722, -1.2007,  ...,  0.5422,  0.9019,  0.0797],
         [-0.6419,  0.1116, -1.0009,  ..., -0.3035,  0.0134,  0.4281],
         [ 0.4921,  0.9032, -1.1902,  ...,  0.1121, -0.4422,  1.0324],
         [ 0.6242,  1.0374, -1.3022,  ...,  0.2696,  0.2177, -0.2742]],

        [[-0.0318,  0.9829, -1.5006,  ...,  0.8164,  0.1683,  0.1509],
         [-0.5306,  1.5387, -1.6326,  ..., -0.3804, -0.4625,  0.3979],
         [-0.1019, -0.0721,  0.6409,  ..., -0.1555, -1.1611,  0.0782],
         [ 0.6899,  1.0638, -0.8494,  ...,  0.7175, -0.2739,  0.6161]],

        [[-0.2035,  1.5311, -1.4224,  ...,  1.3440,  0.3439,  0.3108],
         [-1.1245,  0.9387,  0.3165,  ..., -0.5737, -0.2346,  1.0194],
         [ 0.4847,  0.8208, -0.5073,  ...,  0.2893, -0.4217,  0.7293],
         [ 0.1394,  1.3052, -1.9946,  ...,  0.8204, -0.0102,  0.2817]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[ 0.4619, -0.

# 2. Tokenizer

## 2.1 word based
Each word gets assigned an ID, starting from 0 and going up to the size of the vocabulary. The model uses these IDs to identify each word.<br>
we need a custom token to represent words that are not in our vocabulary. This is known as the “unknown” token, often represented as ”[UNK]” or ””. It’s generally a bad sign if you see that the tokenizer is producing a lot of these tokens, as it wasn’t able to retrieve a sensible representation of a word and you’re losing information along the way

In [3]:
# word based tokenizer
tokenizer_text1 = "jim henson was a puppeteer".split()
tokenizer_text1

['jim', 'henson', 'was', 'a', 'puppeteer']

## 2.2 char based
Character-based tokenizers split the text into characters, rather than words. This has two primary benefits:
1. The vocabulary is much smaller.
2. There are much fewer out-of-vocabulary (unknown) tokens, since every word can be built from characters.

## 2.3 subword tokenization
Subword tokenization algorithms rely on the principle that frequently used words should not be split into smaller subwords, but rare words should be decomposed into meaningful subwords.
1. For example, tokenization will be split into "token" and "ization".

## 2.4 load and save



In [6]:
from transformers import BertTokenizer

tokenizer2 = BertTokenizer.from_pretrained("bert-base-uncased")

tokenizer2("Using a Transformer network is simple")


  from .autonotebook import tqdm as notebook_tqdm


{'input_ids': [101, 2478, 1037, 10938, 2121, 2897, 2003, 3722, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [7]:
# encoding 
# Translating text to numbers 

seq= "using a transformer network is simple"

# use subword tokenization
# WordPiece, BPE, Unigram method
tokens = tokenizer2.tokenize(seq)


In [8]:
ids = tokenizer2.convert_tokens_to_ids(tokens)
ids

[2478, 1037, 10938, 2121, 2897, 2003, 3722]

In [9]:
# decoding
# from vocabulary indices, we want to get a string

decoded_string = tokenizer2.decode(ids)
decoded_string

'using a transformer network is simple'

# 3. Handling multiple sequences



In [2]:
# model expect a batch of inputs
import torch

input1 = torch.tensor([[101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102]])
model(input1)

NameError: name 'model' is not defined

In [11]:
# padding the inputs
# add a special word called the padding token.

batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer2.pad_token_id],
]
batched_ids

[[200, 200, 200], [200, 200, 0]]

# 3.1 attention masks
Attention masks are tensors with the exact same shape as the input IDs tensor, filled with 0s and 1s: 1s indicate the corresponding tokens should be attended to, and 0s indicate the corresponding tokens should not be attended to (i.e., they should be ignored by the attention layers of the model). <br>
[Example](https://huggingface.co/learn/nlp-course/chapter2/5?fw=pt)

In [13]:
# attention masks

sequence1_ids = [[200, 200, 200]]
sequence2_ids = [[200, 200]]
batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer2.pad_token_id],
]


attention_mask = torch.tensor([
    [1, 1, 1],
    [1, 1, 0],
])

AttributeError: 'BaseModelOutputWithPoolingAndCrossAttentions' object has no attribute 'logits'

# 4. Putting it all together



In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "bert-base-uncased"
tokenizer3 = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]

tokens =tokenizer3(sequences, padding=True, truncation=True, return_tensors="pt")
output = model(**tokens)
output

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

SequenceClassifierOutput(loss=None, logits=tensor([[-0.0500,  0.0015],
        [-0.1866, -0.0362]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)