### Step 1: Imports and Setup


In [127]:
import json
import numpy as np
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel, AutoTokenizer, BertModel, TFBertForTokenClassification, AutoConfig, TFAutoModelForTokenClassification, AutoModel, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm , trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt

In [128]:
# install pytorch_pretrained_bert the previous version of Pytorch-Transformers
%pip install pytorch-pretrained-bert

Note: you may need to restart the kernel to use updated packages.


In [129]:
# Check to confirm the specific GPU model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

'NVIDIA GeForce GTX 1660 SUPER'

### Step 2: Load and Prepare Data and create BIO labels

In [130]:
# Load data from JSON file

with open('./data/classNERData.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Extract requests and entities
requests = [example["request"] for example in data]
entities = [example['entities'] for example in data]

# Function to create BIO format
def replace_words_with_indexes(sentence):
    indexes = ""
    counter = 0
    for word in sentence.split(" "):
        indexes += str(counter) + "-" + str(counter + len(word)) + " "
        counter += len(word) + 1
    return indexes.rstrip()

def create_BIO(data):
    bio_labels = []
    for item in data:
        bio = ["O"] * len(item['request'].split())
        indexes = replace_words_with_indexes(item['request'])
        for entity in item['entities']:
            start = entity['start']
            end = entity['end']
            label_type = entity['category']
            for i, index in enumerate(indexes.split(" ")):
                if int(index.split("-")[0]) >= start and int(index.split("-")[1]) <= end:
                    if int(index.split("-")[0]) == start:
                        bio[i] = 'B-' + label_type 
                    else:
                        bio[i] = 'I-' + label_type
        bio_labels.append(' '.join(bio))
    return bio_labels

# Convert entities to BIO format
bio_labels = create_BIO(data)

In [131]:
print(bio_labels)
print(len(bio_labels))

['O O O O O O O', 'O O O B-DESTINATION', 'O O O O B-DESTINATION', 'O O O O O O O', 'O O O O B-ORIGIN I-ORIGIN B-DESTINATION I-DESTINATION O O', 'O O O O O O B-ORIGIN O B-DESTINATION B-DATE', 'O B-DATE I-DATE O O O B-DESTINATION', 'O O O O O O O O O', 'O O O O O O', 'O O O B-DESTINATION O B-DATE', 'O O O O O', 'O O O O O O', 'O O O O B-DESTINATION', 'O O O O O O O', 'O O O B-DESTINATION O O O', 'O O O O B-DESTINATION', 'O O O O O O', 'O O O B-DATE O', 'O O O O O', 'O O O O O O O']
20


#### Label Maping

In [132]:
import numpy as np
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

# Corrected label mapping without trailing spaces
label_mapping = {
    'O': 0, 
    'B-DESTINATION': 1, 
    'I-DESTINATION': 2, 
    'B-ORIGIN': 3, 
    'I-ORIGIN': 4, 
    'B-DATE': 5, 
    'I-DATE': 6
}

# Split each sequence into individual labels and map them
mapped_labels = []
for sequence in bio_labels:
    mapped_sequence = [label_mapping[label] for label in sequence.split()]
    mapped_labels.append(mapped_sequence)


# Determine the maximum length of the sequences
max_len = max(len(seq) for seq in mapped_labels)

# Pad the sequences
padded_labels = pad_sequences(mapped_labels, maxlen=max_len, padding='post', value=label_mapping['O'])

# One-hot encode the labels
# one_hot_labels = [to_categorical(seq, num_classes=len(label_mapping)) for seq in padded_labels]

# Convert to numpy array
# one_hot_labels = np.array(one_hot_labels)

labels = np.array(padded_labels)
print(labels.shape)
print(labels[4])

(20, 10)
[0 0 0 0 3 4 1 2 0 0]


### Step 3: Tokenization and Padding


In [133]:
# Initialize the tokenizer
# model_name = "avichr/heBERT"
model_name = "bert-base-multilingual-cased"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case = False)
# Tokenize the requests
tokenized_texts = [tokenizer.tokenize(sent) for sent in requests]

In [134]:
# Padding Sentences
# Set the maximum sequence length. The longest sequence in our training set
# is 47, but we'll leave room on the end anyway.
# In the original paper, the authors used a length of 512.
MAX_LEN = 128

# Pad our input tokens
input_ids = pad_sequences(
    [tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts], 
    maxlen=MAX_LEN, dtype="long", truncating="post", padding="post"
    )

# Index Numbers and Padding
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]

# pad sentences
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, 
                          dtype ="long", truncating="post",padding ="post")

In [135]:
# Attention masks

# Create attention masks
attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
  seq_mask = [float(i > 0) for i in seq]
  attention_masks.append(seq_mask)


In [136]:
# aligned_labels = []
# for i, (request, labels) in enumerate(zip(tokenized_requests['input_ids'], labels)):
#     # Skip special tokens ([CLS], [SEP], [PAD])
#     tokens = tokenizer.convert_ids_to_tokens(request.numpy())[1:-1]
    
#     # Trim labels to match the number of tokens
#     trimmed_labels = labels[:len(tokens)]

#     aligned_labels.append(trimmed_labels)

# # Print the aligned labels for each request
# for i, labels in enumerate(aligned_labels):
#     print(f"Request {i+1}:")
#     print(labels)

# print(len(aligned_labels))



### Step 4: Train-Test Split and Create Dataloaders


In [137]:
# Train and Validation Set

train_inputs, validation_inputs, train_labels, validation_labels = \
  train_test_split(input_ids, labels, random_state=2018, test_size=0.1)

train_masks, validation_masks , _, _ = \
  train_test_split(attention_masks, input_ids, random_state=2018, test_size=0.1)
  
# transform all data into torch tensors

train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

In [138]:
# Preparation for Training

# Select a batch size for training. For fine tuning BERT on a
# specific task , BERT authors recommend a batch size of 16 or 32
batch_size = 32

# Create an iterator of our data with torch DataLoader
# This helps save on memory during training because, unlike a for loop, 
# with iterator the entire dataset does not need to be loaded into memory

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = \
  DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = \
  TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = \
  DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [139]:
# # Convert the tokenized input IDs and attention masks to lists
# input_ids = tokenized_requests['input_ids'].numpy()
# attention_masks = tokenized_requests['attention_mask'].numpy()

# # Convert aligned labels to a tensor
# # aligned_labels = np.array(aligned_labels)
# labels = np.array(labels)

# # Create a tf.data.Dataset object
# dataset = tf.data.Dataset.from_tensor_slices((input_ids, attention_masks, labels))

# # Function to split dataset
# def train_test_split_dataset(dataset, test_size=0.2):
#     # Calculate the size of the train and test splits
#     dataset_size = len(list(dataset))
#     test_size = int(test_size * dataset_size)
#     train_size = dataset_size - test_size

#     # Split the dataset
#     train_dataset = dataset.take(train_size)
#     test_dataset = dataset.skip(train_size)

#     return train_dataset, test_dataset

# # Split the dataset into training and testing sets
# train_dataset, test_dataset = train_test_split_dataset(dataset, test_size=0.2)

# # Batch and shuffle the datasets
# train_dataset = train_dataset.shuffle(buffer_size=1024).batch(16).prefetch(tf.data.AUTOTUNE)
# test_dataset = test_dataset.batch(16).prefetch(tf.data.AUTOTUNE)

# # Check shapes
# for batch in train_dataset.take(1):
#     input_ids_batch, attention_masks_batch, labels_batch = batch
#     print(f"input_ids_batch shape: {input_ids_batch.shape}")
#     print(f"attention_masks_batch shape: {attention_masks_batch.shape}")
#     print(f"labels_batch shape: {labels_batch.shape}")


### Cell 5: Model Definition


In [140]:
from torchinfo import summary
# Define the number of labels
num_labels = 7  # Number of unique labels (O, B-DESTINATION, I-DESTINATION, B-ORIGIN, I-ORIGIN, B-DATE, I-DATE)

# Load the configuration for the model
# config = AutoConfig.from_pretrained("avichr/heBERT", num_labels=num_labels)

# Load the BERT model with a token classification head
model = BertForSequenceClassification.from_pretrained(model_name,  num_labels=num_labels)
model.cuda()
# Print the model summary
# model.summary()



BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=F

In [141]:
# Hyperparameters
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']

optimizer_grouped_parameters = [
    {
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay_rate': 0.01
    },
    {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 
        'weight_decay_rate': 0.00
    }
  ]

optimizer = BertAdam(optimizer_grouped_parameters, lr=2e-5, warmup=0.1)

t_total value of -1 results in schedule not being applied


In [142]:
# calculates the accuracy of our predictions vs labels

def flat_accuracy(preds, labels):
  pred_flat  = np.argmax(preds , axis=1).flatten()
  labels_flat = labels.flatten()
  return np.sum(pred_flat == labels_flat)/len(labels_flat)

### Step 6: Model Compilation and Training


In [143]:
# Training Loop

# Store our loss and accuracy for plotting
train_loss_set = []

# Number of training epochs (authors recommend between 2 and 4)
epochs = 4
# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):
  # Training
  # Set our model to training mode (as opposed to evaluation mode)
  model.train()
  # Tracking variables
  tr_loss = 0
  nb_tr_examples, nb_tr_steps = 0, 0
  
  # Train the data for one epoch
  for step, batch in enumerate(train_dataloader):
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask , b_labels = batch
    # Clear out the gradients (by default they accumulate)
    optimizer.zero_grad()
    # Forward pass
    loss = model(b_input_ids, token_type_ids=None, 
                 attention_mask=b_input_mask, labels=b_labels)
    
    train_loss_set.append(loss.item())
    # Backward pass
    loss.backward()
    # Update parameters and take a step using the computed gradient
    optimizer.step()
    # Update tracking variables
    tr_loss += loss.item()
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1
  print("Train loss: {}".format(tr_loss/nb_tr_steps))
    
# Validation
# Put model in evaluation mode to evaluate loss on the validation set
model.eval()

# Tracking variables
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0

# Evaluate data for one epoch
for batch in validation_dataloader:
  # Add batch to GPU
  batch = tuple(t.to(device) for t in batch)
  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask, b_labels = batch
  
  # Telling the model not to compute or store gradients,
  # ve memory and speede up validation
  with torch.no_grad():
    # Forward pass, calculate logit predictions
    logits = model(b_input_ids, token_type_ids =None, attention_mask=b_input_mask)
    
  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()
  
  tmp_eval_accuracy = flat_accuracy(logits, label_ids)
  eval_accuracy += tmp_eval_accuracy
  nb_eval_steps += 1

print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))


Epoch:   0%|          | 0/4 [00:02<?, ?it/s]


ValueError: Expected input batch_size (18) to match target batch_size (180).

### Debug cell

In [None]:
print("Train input ids shape:", train_input_ids.shape)
print("Train attention mask shape:", train_attention_mask.shape)
print("Validation input ids shape:", val_input_ids.shape)
print("Validation attention mask shape:", val_attention_mask.shape)
print("Train labels shape:", train_labels.shape)
print("Validation labels shape:", val_labels.shape)


Train input ids shape: (16, 19)
Train attention mask shape: (16, 19)
Validation input ids shape: (4, 19)
Validation attention mask shape: (4, 19)
Train labels shape: (16,)
Validation labels shape: (4,)
