### Step 1: Imports and Setup


In [11]:
import json
import numpy as np
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical



### Step 2: Load and Prepare Data

In [12]:
# Load data from JSON file
with open('./data/classNERData.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Extract requests and entities
requests = [example["request"] for example in data]
entities = [example['entities'] for example in data]

# Function to create BIO format
def replace_words_with_indexes(sentence):
    indexes = ""
    counter = 0
    for word in sentence.split(" "):
        indexes += str(counter) + "-" + str(counter + len(word)) + " "
        counter += len(word) + 1
    return indexes.rstrip()

def create_BIO(data):
    bio_labels = []
    for item in data:
        bio = ["O"] * len(item['request'].split())
        indexes = replace_words_with_indexes(item['request'])
        for entity in item['entities']:
            start = entity['start']
            end = entity['end']
            label_type = entity['category']
            for i, index in enumerate(indexes.split(" ")):
                if int(index.split("-")[0]) >= start and int(index.split("-")[1]) <= end:
                    if int(index.split("-")[0]) == start:
                        bio[i] = 'B-' + label_type 
                    else:
                        bio[i] = 'I-' + label_type
        bio_labels.append(' '.join(bio))
    return bio_labels

# Convert entities to BIO format
bio_labels = create_BIO(data)

### Step 3: Tokenization


In [13]:
# Tokenize requests using BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("avichr/heBERT_NER")
max_length = 64
tokenized_input = tokenizer(requests, truncation=True, padding='max_length', max_length=max_length)

# Convert tokenized input to numpy arrays
input_ids = np.array(tokenized_input['input_ids'])
attention_mask = np.array(tokenized_input['attention_mask'])

# Convert labels to numpy array

labels = np.array(bio_labels)


In [14]:
# label_mapping = {'O': 0, 'B-ENTITY': 1, 'I-ENTITY': 2}
# labels = np.array([label_mapping[label] for label in bio_labels])
# labels = to_categorical(labels, num_classes=3)


### Step 4: Train-Test Split


In [15]:
# Split data into train and test sets
train_input_ids, val_input_ids, train_attention_mask, val_attention_mask, train_labels, val_labels = train_test_split(
    input_ids, attention_mask, labels, test_size=0.2, random_state=42
)


### Cell 5: Model Definition


In [16]:
# Define the model architecture
num_labels = 3
def create_ner_model():
    # Load pre-trained BERT model
    bert_model = TFBertModel.from_pretrained("bert-base-multilingual-cased")
    
    # Freeze BERT layers
    for layer in bert_model.layers:
        layer.trainable = False
    
    # Input layer
    input_ids = Input(shape=(max_length,), dtype=tf.int32, name='input_ids')
    attention_mask = Input(shape=(max_length,), dtype=tf.int32, name='attention_mask')
    
    # BERT layer
    bert_output = bert_model(input_ids, attention_mask=attention_mask)[0]
    
    # Classification layer
    outputs = Dense(num_labels, activation='softmax')(bert_output)
    
    # Model
    model = Model(inputs=[input_ids, attention_mask], outputs=outputs)
    
    return model

# Create NER model
ner_model = create_ner_model()





Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

KeyboardInterrupt: 

### Step 6: Model Compilation and Training


In [None]:
# Compile the model
ner_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
num_epochs = 3


# Train the model
history = ner_model.fit(
    [train_input_ids, train_attention_mask],
    train_labels_encoded,
    validation_data=([val_input_ids, val_attention_mask], val_labels_encoded),
    epochs=num_epochs
)

# Train the model
history = ner_model.fit(
    [train_input_ids, train_attention_mask],
    train_labels,
    validation_data=([val_input_ids, val_attention_mask], val_labels),
    epochs=num_epochs
)


Epoch 1/3


ValueError: in user code:

    File "c:\Users\nizan\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\training.py", line 1401, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\nizan\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\training.py", line 1384, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\nizan\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\training.py", line 1373, in run_step  **
        outputs = model.train_step(data)
    File "c:\Users\nizan\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\training.py", line 1151, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "c:\Users\nizan\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\training.py", line 1209, in compute_loss
        return self.compiled_loss(
    File "c:\Users\nizan\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\compile_utils.py", line 277, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "c:\Users\nizan\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\losses.py", line 143, in __call__
        losses = call_fn(y_true, y_pred)
    File "c:\Users\nizan\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\losses.py", line 270, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "c:\Users\nizan\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\losses.py", line 2221, in categorical_crossentropy
        return backend.categorical_crossentropy(
    File "c:\Users\nizan\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\backend.py", line 5573, in categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)

    ValueError: Shapes (None,) and (None, 64, 3) are incompatible


### Debug cell

In [None]:
print("Train input ids shape:", train_input_ids.shape)
print("Train attention mask shape:", train_attention_mask.shape)
print("Validation input ids shape:", val_input_ids.shape)
print("Validation attention mask shape:", val_attention_mask.shape)
print("Train labels shape:", train_labels.shape)
print("Validation labels shape:", val_labels.shape)


Train input ids shape: (16, 19)
Train attention mask shape: (16, 19)
Validation input ids shape: (4, 19)
Validation attention mask shape: (4, 19)
Train labels shape: (16,)
Validation labels shape: (4,)
