In [16]:
import pandas as pd
from transformers import BertTokenizer, BertForTokenClassification, AdamW
import torch

# Load data
df = pd.DataFrame({
    'text': ['John lives in New York', 'Mary works in Paris'],
    'label': ['B-PER O O B-LOC I-LOC', 'B-PER O O B-LOC']
})

# Tokenize sentences and encode labels
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
tokenized_texts = [tokenizer.encode(text, add_special_tokens=True) for text in df['text']]
labels = [label.split() for label in df['label']]

# Pad and truncate sequences to a common length
MAX_LEN = max(len(seq) for seq in tokenized_texts)
padded_texts = torch.tensor([text + [0]*(MAX_LEN-len(text)) for text in tokenized_texts])
padded_labels = torch.tensor([[tokenizer.convert_tokens_to_ids(label) + [0]*(MAX_LEN-len(label)) 
                               for label in labels]])

# Define the model
model = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=4)
optimizer = AdamW(model.parameters(), lr=2e-5)

# Train the model
NUM_EPOCHS = 3
BATCH_SIZE = 2
for epoch in range(NUM_EPOCHS):
    for i in range(0, len(padded_texts), BATCH_SIZE):
        input_ids = padded_texts[i:i+BATCH_SIZE]
        labels = padded_labels[:,i:i+BATCH_SIZE]
        outputs = model(input_ids, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    print('Epoch:', epoch, 'Loss:', loss.item())

# Test the model
test_text = 'John works in London'
tokenized_text = tokenizer.encode(test_text, add_special_tokens=True)
input_ids = torch.tensor([tokenized_text])
with torch.no_grad():
    outputs = model(input_ids)
logits = outputs[0][0].detach().numpy()
predicted_labels = [list(p) for p in np.argmax(logits, axis=1)]
predicted_tags = [tokenizer.convert_ids_to_tokens(l) for l in predicted_labels]
print(predicted_tags)


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

IndexError: ignored

In [49]:
import pandas as pd
import torch
from transformers import BertForTokenClassification, BertTokenizer

# Define the dataset
df = pd.DataFrame({
    'text': ['John lives in New York', 'John works in Paris', 'John works in Paris', 'John works in Paris', 'Mary works in Paris', 'Mary works in Paris', 'Mary works in Paris', 'Mary works in Paris', 'Mary works in Paris', 'Mary works in Paris', 'Mary works in Paris', 'Mary works in Paris'],
    'label': ['B-PER O O B-LOC I-LOC', 'B-PER O O B-LOC', 'B-PER O O B-LOC', 'B-PER O O B-LOC', 'B-PER O O B-LOC', 'B-PER O O B-LOC', 'B-PER O O B-LOC', 'B-PER O O B-LOC', 'B-PER O O B-LOC', 'B-PER O O B-LOC', 'B-PER O O B-LOC', 'B-PER O O B-LOC']
})

# Define the model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=4)

# Encode the text and labels
tokenized_texts = [tokenizer.encode(text, add_special_tokens=True) for text in df['text']]
labels = [label.split() for label in df['label']]
label_map = {'B-PER': 0, 'B-LOC': 1, 'I-LOC': 2, 'O': 3}
labels = [[label_map[label] for label in doc_labels] for doc_labels in labels]
print(labels)
# Pad the tokenized texts and labels
max_len = max([len(text) for text in tokenized_texts])
input_ids = torch.tensor([text + [0]*(max_len-len(text)) for text in tokenized_texts])
attention_mask = torch.where(input_ids != 0, torch.tensor(1), torch.tensor(0))
labels = torch.tensor([label + [3]*(max_len-len(label)) for label in labels])

# Train the model
optimizer = torch.optim.Adam(model.parameters(), lr=3e-5)

for epoch in range(30):
    model.train()
    optimizer.zero_grad()
    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs[0]
    loss.backward()
    optimizer.step()
    
    print(f'Epoch: {epoch + 1}, Loss: {loss.item()}')

model.save_pretrained('/content/model1.bin')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

[[0, 3, 3, 1, 2], [0, 3, 3, 1], [0, 3, 3, 1], [0, 3, 3, 1], [0, 3, 3, 1], [0, 3, 3, 1], [0, 3, 3, 1], [0, 3, 3, 1], [0, 3, 3, 1], [0, 3, 3, 1], [0, 3, 3, 1], [0, 3, 3, 1]]
Epoch: 1, Loss: 1.4481223821640015
Epoch: 2, Loss: 1.1249089241027832
Epoch: 3, Loss: 0.9239179491996765
Epoch: 4, Loss: 0.7726314067840576
Epoch: 5, Loss: 0.6276735663414001
Epoch: 6, Loss: 0.5158246159553528
Epoch: 7, Loss: 0.39880701899528503
Epoch: 8, Loss: 0.3248632550239563
Epoch: 9, Loss: 0.22921359539031982
Epoch: 10, Loss: 0.19378255307674408
Epoch: 11, Loss: 0.14890868961811066
Epoch: 12, Loss: 0.11985602974891663
Epoch: 13, Loss: 0.08933725208044052
Epoch: 14, Loss: 0.07162939757108688
Epoch: 15, Loss: 0.06200927123427391
Epoch: 16, Loss: 0.04774283617734909
Epoch: 17, Loss: 0.04803113639354706
Epoch: 18, Loss: 0.03558838739991188
Epoch: 19, Loss: 0.03141646832227707
Epoch: 20, Loss: 0.024855613708496094
Epoch: 21, Loss: 0.021429499611258507
Epoch: 22, Loss: 0.02005799114704132
Epoch: 23, Loss: 0.017545912

In [35]:
# Define a new text sequence
model = BertForTokenClassification.from_pretrained('/content/model1.bin')
text = 'Jane is going to London'

# Encode the text sequence
tokenized_text = tokenizer.encode(text, add_special_tokens=True)
input_ids = torch.tensor([tokenized_text])

# Get the model's predictions
model.eval()
with torch.no_grad():
    outputs = model(input_ids)
    predictions = outputs[0].argmax(dim=2).squeeze().tolist()

# Decode the predicted labels
print(predictions)
# predicted_labels = [label_map[label_id] for label_id in predictions[1:-1]]

# Print the predicted labels
# print(predicted_labels)


TypeError: ignored

In [54]:
import torch
from transformers import BertForTokenClassification, BertTokenizer

# Load the trained model
model = BertForTokenClassification.from_pretrained('/content/model1.bin')

# Instantiate the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Test the model on a sample sentence
# text = "John lives in New York city"
text1 = 'John Lives in New York'
inputs = tokenizer.encode_plus(text1, add_special_tokens=True, return_tensors="pt")
outputs = model(**inputs)

# Get the predicted labels
predicted_labels = outputs.logits.argmax(dim=2)
predicted_labels = predicted_labels.squeeze().tolist()
labels = tokenizer.convert_ids_to_tokens(predicted_labels)
print(predicted_labels)


[0, 3, 3, 1, 2, 3, 3]
