In [None]:
from transformers import BertTokenizerFast, BertForTokenClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import torch
import json

# Load the annotated examples from disk.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so that
# non-ASCII characters in the texts decode the same way on every platform
# instead of depending on the locale default.
with open('/content/data.json', 'r', encoding='utf-8') as file:
    dataset = json.load(file)

def extract_labels(json_data):
    """Collect the BIO tag vocabulary used by the annotated examples.

    Args:
        json_data: Iterable of dicts, each with an 'entities' list whose
            items are dicts carrying a 'label' key.

    Returns:
        A sorted list containing "B-<label>" and "I-<label>" for every
        distinct entity label, plus "O" for non-entity tokens.

    The list is sorted because a label's position in it is used as its
    class id. Iterating a set of strings directly gives an order that can
    change between Python processes, which would silently remap the ids
    after a kernel restart.
    """
    labels = {"O"}  # 'O' marks non-entity tokens and is always present
    for item in json_data:
        for entity in item['entities']:
            labels.add(f"B-{entity['label']}")
            labels.add(f"I-{entity['label']}")
    return sorted(labels)

# Build the label vocabulary; a label's position in label_set is its class id
label_set = extract_labels(dataset)
num_labels = len(label_set)

# Explicit id <-> label maps. Passing them to from_pretrained() stores them in
# the model config, so save_pretrained() writes them out with every checkpoint
# and predictions can be decoded without this notebook's in-memory label_set.
id2label = dict(enumerate(label_set))
label2id = {label: idx for idx, label in id2label.items()}

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
model = BertForTokenClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

# Freeze the BERT encoder so that only the classification head is trained
for param in model.bert.parameters():
    param.requires_grad = False

# Define batch_size and learning rate
batch_size = 16 # Consider reducing this if you still run out of memory
# NOTE(review): with the encoder frozen only the classifier head learns, and the
# logged loss drops slowly (about 3.80 -> 3.21 over 5 epochs) - a larger rate
# may be worth trying; confirm experimentally.
learning_rate = 2e-5

def tokenize_and_format_data(dataset, tokenizer, entity_types, max_length=512):
    """Tokenize annotated texts and align character-span entities to BIO token labels.

    Args:
        dataset: Iterable of dicts with a 'text' string and an 'entities'
            list; each entity is a dict with character offsets 'start' and
            'end' (end is exclusive) and a 'label' name.
        tokenizer: Callable tokenizer that accepts
            ``return_offsets_mapping=True`` and returns 'input_ids',
            'attention_mask' and 'offset_mapping'.
        entity_types: Sequence of label strings; a label's position is its
            class id. Must contain "O" and the "B-"/"I-" form of every
            entity label that occurs in ``dataset``.
        max_length: Length every sequence is padded/truncated to.
            Defaults to 512.

    Returns:
        A TensorDataset of (input_ids, attention_mask, labels) tensors.
        Label positions that belong to special tokens or padding hold -100
        so that the token-classification loss skips them.

    Raises:
        ValueError: If a required label is missing from ``entity_types``.
    """
    ignore_index = -100  # default ignore_index of torch's cross-entropy loss
    label_to_id = {label: idx for idx, label in enumerate(entity_types)}

    def _label_id(label):
        # Dict lookup replaces a linear list.index() per token; the error
        # type stays ValueError, as list.index() raised before.
        try:
            return label_to_id[label]
        except KeyError:
            raise ValueError(f"{label!r} is not in entity_types") from None

    all_input_ids = []
    all_attention_masks = []
    all_label_ids = []

    for item in dataset:
        encodings = tokenizer(
            item['text'],
            is_split_into_words=False,
            truncation=True,
            padding='max_length',
            max_length=max_length,
            return_offsets_mapping=True,
        )
        # (char_start, char_end) per token; special and padding tokens
        # carry an empty span
        offsets = encodings['offset_mapping']
        attention_mask = encodings['attention_mask']

        # Every token starts as 'O' (Outside)
        labels = ['O'] * len(offsets)

        for entity in item['entities']:
            start, end, entity_type = entity['start'], entity['end'], entity['label']

            # Tokens whose character span overlaps [start, end). Matching on
            # overlap, not on exact boundary containment, keeps entities
            # whose tail was cut by truncation or whose annotated boundary
            # falls on whitespace, instead of silently dropping them.
            covered = [
                idx
                for idx, (tok_start, tok_end) in enumerate(offsets)
                if tok_end > tok_start and tok_start < end and tok_end > start
            ]
            if not covered:
                continue

            labels[covered[0]] = f"B-{entity_type}"
            for idx in covered[1:]:
                labels[idx] = f"I-{entity_type}"

        # Convert labels to ids. Positions with no real text behind them
        # ([CLS], [SEP], padding) get ignore_index so they do not dominate
        # the loss as spurious 'O' targets.
        label_ids = [
            _label_id(label) if mask == 1 and tok_end > tok_start else ignore_index
            for label, mask, (tok_start, tok_end) in zip(labels, attention_mask, offsets)
        ]

        all_input_ids.append(encodings['input_ids'])
        all_attention_masks.append(attention_mask)
        all_label_ids.append(label_ids)

    # Convert tokenized data to PyTorch dataset
    return TensorDataset(
        torch.tensor(all_input_ids, dtype=torch.long),
        torch.tensor(all_attention_masks, dtype=torch.long),
        torch.tensor(all_label_ids, dtype=torch.long),
    )


# Encode the annotated corpus into tensors; label ids follow label_set order
train_data = tokenize_and_format_data(
    dataset=dataset,
    tokenizer=tokenizer,
    entity_types=label_set,
)

# Serve shuffled mini-batches of batch_size examples for training
train_dataloader = DataLoader(
    dataset=train_data,
    batch_size=batch_size,
    shuffle=True,
)




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Move model to the appropriate device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimise only parameters that still require gradients; frozen weights never
# receive a gradient, so handing them to the optimizer is pure overhead.
trainable_params = [param for param in model.parameters() if param.requires_grad]

# torch.optim.AdamW replaces the deprecated transformers.AdamW.
# eps / weight_decay are set explicitly to the values the transformers
# implementation used by default (assumed 1e-6 / 0.0 - confirm against the
# installed transformers version if exact parity matters).
optimizer = torch.optim.AdamW(trainable_params, lr=learning_rate, eps=1e-6, weight_decay=0.0)
num_epochs = 5

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    for batch in tqdm(train_dataloader, desc="Training"):
        # Move tensors to the same device as the model
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()

        # Forward pass; passing labels makes the model return the loss
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass
        loss.backward()

        # Update parameters
        optimizer.step()

        # Accumulate loss
        epoch_loss += loss.item()

    # Print epoch statistics (mean loss per batch)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss / len(train_dataloader)}")

    # Optional: Save model checkpoint
    model.save_pretrained(f'checkpoint-{epoch + 1}')

Training: 100%|██████████| 35/35 [22:57<00:00, 39.35s/it]


Epoch 1/5, Loss: 3.796078477587019


Training: 100%|██████████| 35/35 [22:39<00:00, 38.85s/it]


Epoch 2/5, Loss: 3.6422064440590995


Training: 100%|██████████| 35/35 [22:32<00:00, 38.66s/it]


Epoch 3/5, Loss: 3.4922445501599992


Training: 100%|██████████| 35/35 [22:50<00:00, 39.15s/it]


Epoch 4/5, Loss: 3.34906576020377


Training: 100%|██████████| 35/35 [22:44<00:00, 38.98s/it]


Epoch 5/5, Loss: 3.2107688699449812


In [None]:
# Output directories for the fine-tuned weights and the tokenizer files
model_save_path = './trained_model'
tokenizer_save_path = './tokenizer'

# Persist the model first, then the tokenizer
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(tokenizer_save_path)

('./tokenizer/tokenizer_config.json',
 './tokenizer/special_tokens_map.json',
 './tokenizer/vocab.txt',
 './tokenizer/added_tokens.json',
 './tokenizer/tokenizer.json')

In [None]:
from transformers import (
    BertForTokenClassification,
    BertTokenizerFast,
)

# Restore the tokenizer and the fine-tuned model from the directories
# they were saved to
tokenizer = BertTokenizerFast.from_pretrained(tokenizer_save_path)
model = BertForTokenClassification.from_pretrained(model_save_path)

In [None]:
# Put the reloaded model in evaluation mode (turns off dropout) before inference
model.eval()

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [None]:
import torch

# Define your input sentence
sentence = "symptoms of acute myocardial infarction."

# Tokenize the sentence.
# No padding: a single sentence forms a batch of one, so padding it to 512
# positions only adds compute and produces meaningless predictions for [PAD].
encodings = tokenizer(sentence, is_split_into_words=False, truncation=True, max_length=512, return_offsets_mapping=True)

# Add the batch dimension the model expects
input_ids = torch.tensor([encodings['input_ids']])
attention_mask = torch.tensor([encodings['attention_mask']])

In [None]:
# Inference only: no gradients are needed
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    # Highest-scoring class id for every token position
    predictions = torch.argmax(outputs.logits, dim=-1)

# Convert predictions to labels.
# The predicted values are class ids, i.e. indices into label_set - not
# vocabulary ids - so they must not be passed to the tokenizer.
predicted_labels = [label_set[label_id] for label_id in predictions[0].tolist()]


In [None]:
# Translate each predicted class index of the first (only) sequence into its label string
predicted_labels = list(map(label_set.__getitem__, predictions[0].tolist()))

In [None]:
tokens = tokenizer.convert_ids_to_tokens(encodings['input_ids'])

# [CLS], [SEP] and [PAD] are not part of the sentence; skip them so the
# report is not swamped by predictions for structural tokens.
structural_tokens = {tokenizer.cls_token, tokenizer.sep_token, tokenizer.pad_token}

for token, label in zip(tokens, predicted_labels):
    if token in structural_tokens:
        continue
    print(f"Token: {token}, Label: {label}")

Token: [CLS], Label: I-PROCEDURE
Token: symptoms, Label: O
Token: of, Label: O
Token: acute, Label: I-PROBLEM
Token: my, Label: I-PROBLEM
Token: ##oca, Label: I-PROBLEM
Token: ##rdial, Label: I-PROBLEM
Token: in, Label: O
Token: ##far, Label: I-PROBLEM
Token: ##ction, Label: O
Token: ., Label: I-PROBLEM
Token: [SEP], Label: O
Token: [PAD], Label: O
Token: [PAD], Label: O
Token: [PAD], Label: O
Token: [PAD], Label: O
Token: [PAD], Label: O
Token: [PAD], Label: O
Token: [PAD], Label: O
Token: [PAD], Label: O
Token: [PAD], Label: O
Token: [PAD], Label: I-PROBLEM
Token: [PAD], Label: O
Token: [PAD], Label: O
Token: [PAD], Label: I-PROBLEM
Token: [PAD], Label: O
Token: [PAD], Label: O
Token: [PAD], Label: O
Token: [PAD], Label: O
Token: [PAD], Label: O
Token: [PAD], Label: O
Token: [PAD], Label: O
Token: [PAD], Label: O
Token: [PAD], Label: O
Token: [PAD], Label: O
Token: [PAD], Label: I-PROBLEM
Token: [PAD], Label: O
Token: [PAD], Label: I-PROBLEM
Token: [PAD], Label: I-PROBLEM
Token: [PAD