In [19]:
import json
from transformers import BertTokenizer
import tensorflow as tf
from sklearn.model_selection import train_test_split


In [20]:
with open('./data/classNERData.json', 'r', encoding='utf-8') as f:
    data = json.load(f)



In [21]:
# get an array of requests and entities
requests = []
entities = []
for example in data:
    requests.append(example["request"])
    entities.append(example['entities'])

In [22]:
#function for BIO format building
def replace_words_with_indexes(sentence):
    indexes = ""
    counter=0
    for word in sentence.split(" "):
        indexes += str(counter) + "-" + str(counter + len(word)) + " "
        counter += len(word) + 1
    return indexes.rstrip()

def create_BIO(data):
    bio_labels = []
    for item in data:
        bio = ["O"] * len(item['request'].split())
        indexes = replace_words_with_indexes(item['request'])
        for entity in item['entities']:
            start = entity['start']
            end = entity['end']
            label_type = entity['category']
            for i, index in enumerate(indexes.split(" ")):
                if int(index.split("-")[0]) >= start and int(index.split("-")[1]) <= end:
                    if int(index.split("-")[0]) == start:
                        bio[i] = 'B-' + label_type 
                    else:
                        bio[i] = 'I-' + label_type
        bio_labels.append(' '.join(bio))
    return bio_labels


In [23]:
#convert the entities to BIO format
bio_labels = create_BIO(data)
print(bio_labels)

['O O O O O O O', 'O O O B-DESTINATION', 'O O O O B-DESTINATION', 'O O O O O O O', 'O O O O B-ORIGIN I-ORIGIN B-DESTINATION I-DESTINATION O O', 'O O O O O O B-ORIGIN O B-DESTINATION B-DATE', 'O B-DATE I-DATE O O O B-DESTINATION', 'O O O O O O O O O', 'O O O O O O', 'O O O B-DESTINATION O B-DATE', 'O O O O O', 'O O O O O O', 'O O O O B-DESTINATION', 'O O O O O O O', 'O O O B-DESTINATION O O O', 'O O O O B-DESTINATION', 'O O O O O O', 'O O O B-DATE O', 'O O O O O', 'O O O O O O O']


In [7]:
# # Tokenize the sentences
# tokenizer = BertTokenizer.from_pretrained("avichr/heBERT_NER")
# max_length = 64
# tokenized_input = tokenizer(requests, truncation=True, padding=True, max_length=max_length, return_tensors='tf')





In [10]:
# # Convert tokenized input and BIO labels to TensorFlow Dataset
# def create_tf_dataset(tokenized_input, bio_labels):
#     input_ids = tokenized_input['input_ids']
#     attention_mask = tokenized_input['attention_mask']
    
#     dataset = tf.data.Dataset.from_tensor_slices(({
#         'input_ids': input_ids,
#         'attention_mask': attention_mask
#     }, {
#         'bio_labels': bio_labels
#     }))
    
#     return dataset

# # Create TensorFlow Dataset
# tf_dataset = create_tf_dataset(tokenized_input, bio_labels)

In [24]:
# Split the dataset into training and validation sets
train_requests, val_requests, train_bio_labels, val_bio_labels = train_test_split(requests, bio_labels, test_size=0.2, random_state=42)

max_length = 64
tokenizer = BertTokenizer.from_pretrained("avichr/heBERT_NER")
# Tokenize the training and validation sets
train_tokenized_input = tokenizer(train_requests, truncation=True, padding=True, max_length=max_length, return_tensors='tf')
val_tokenized_input = tokenizer(val_requests, truncation=True, padding=True, max_length=max_length, return_tensors='tf')

In [25]:
print(len(train_tokenized_input['input_ids']))
print(len(val_tokenized_input['input_ids']))
print(len(train_bio_labels))
print(len(val_bio_labels))

16
4
16
4


In [27]:
import numpy as np

# Convert tokenized input and BIO labels to NumPy arrays
train_input_ids = np.array(train_tokenized_input['input_ids'])
train_attention_mask = np.array(train_tokenized_input['attention_mask'])
train_labels = np.array(train_bio_labels)

val_input_ids = np.array(val_tokenized_input['input_ids'])
val_attention_mask = np.array(val_tokenized_input['attention_mask'])
val_labels = np.array(val_bio_labels)

# Print shapes for verification
print("Train input ids shape:", train_input_ids.shape)
print("Train attention mask shape:", train_attention_mask.shape)
print("Train labels shape:", train_labels.shape)

print("Validation input ids shape:", val_input_ids.shape)
print("Validation attention mask shape:", val_attention_mask.shape)
print("Validation labels shape:", val_labels.shape)


Train input ids shape: (16, 19)
Train attention mask shape: (16, 19)
Train labels shape: (16,)
Validation input ids shape: (4, 9)
Validation attention mask shape: (4, 9)
Validation labels shape: (4,)


In [28]:
from transformers import TFBertModel
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
num_labels = 3
num_epochs = 3

# Define the model architecture
def create_ner_model():
    # Load pre-trained BERT model
    bert_model = TFBertModel.from_pretrained("bert-base-multilingual-cased")
    
    # Freeze BERT layers
    for layer in bert_model.layers:
        layer.trainable = False
    
    # Input layer
    input_ids = Input(shape=(max_length,), dtype=tf.int32, name='input_ids')
    attention_mask = Input(shape=(max_length,), dtype=tf.int32, name='attention_mask')
    
    # BERT layer
    bert_output = bert_model(input_ids, attention_mask=attention_mask)[0]
    
    # Classification layer
    outputs = Dense(num_labels, activation='softmax')(bert_output)
    
    # Model
    model = Model(inputs=[input_ids, attention_mask], outputs=outputs)
    
    return model

# Create the NER model
ner_model = create_ner_model()

# Compile the model
ner_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])



# Train the model
history = ner_model.fit(
    [train_input_ids, train_attention_mask],
    train_labels,
    validation_data=(
        [val_input_ids, val_attention_mask],
        val_labels
    ),
    epochs=num_epochs
)


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Epoch 1/3


ValueError: in user code:

    File "c:\Users\nizan\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\training.py", line 1401, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\nizan\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\training.py", line 1384, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\nizan\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\training.py", line 1373, in run_step  **
        outputs = model.train_step(data)
    File "c:\Users\nizan\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\training.py", line 1150, in train_step
        y_pred = self(x, training=True)
    File "c:\Users\nizan\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "c:\Users\nizan\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\input_spec.py", line 298, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "model_2" is incompatible with the layer: expected shape=(None, 64), found shape=(None, 19)
