In [None]:
import os

# TF_CPP_MIN_LOG_LEVEL controls TensorFlow's native (C++) logging and has to be
# in the environment BEFORE `import tensorflow`, otherwise the messages emitted
# while the library loads are not filtered.
# Levels: 0 = show everything, 1 = hide INFO, 2 = hide INFO and WARNING,
# 3 = hide INFO, WARNING and ERROR.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import tensorflow as tf
import csv
import numpy as np
import pandas as pd
from transformers import BertTokenizer
from transformers import TFBertModel
from tensorflow.keras.layers import Dropout, Dense
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import SparseCategoricalAccuracy
from sklearn.model_selection import train_test_split

tf.get_logger().setLevel('ERROR')         # Python-side TensorFlow logger: errors only

# Pretrained checkpoint shared by the tokenizer and the encoder
model_name = "bert-base-german-cased"
tokenizer = BertTokenizer.from_pretrained(model_name)
base_bert_model = TFBertModel.from_pretrained(model_name)


In [None]:
# Width (in token ids) of the padded per-title arrays built later in the notebook
max_seq = 60
data_path = "Train_Tagged_Titles.tsv"

# Read every column as text. Only the empty string is treated as missing
# (pandas' default NA markers are disabled), and quote characters are not
# interpreted as field delimiters.
df = pd.read_csv(
    data_path,
    sep="\t",
    dtype=str,
    keep_default_na=False,
    na_values=[""],
    quoting=csv.QUOTE_NONE,
)


In [None]:
def process_row(row, last_non_nan_entity):
    """Convert one token row's raw tag into a B-/I- prefixed label.

    Args:
        row: Mapping-like row that exposes ``row['Tag']``.
        last_non_nan_entity: One-element list used as mutable state across
            calls. Slot 0 holds the most recent non-missing tag, or None when
            no tag has been seen yet. It is updated in place.

    Returns:
        ``'B-<tag>'`` when the row carries a tag; ``'I-<last tag>'`` when the
        tag is missing and an earlier tag is known; otherwise the (missing)
        tag value unchanged.
    """
    tag = row['Tag']
    if pd.isna(tag):
        # Inspect the stored value, not the container: the list itself is never
        # None, and 'I-' + None would raise TypeError for a leading untagged row.
        if last_non_nan_entity is None or last_non_nan_entity[0] is None:
            return tag
        return 'I-' + last_non_nan_entity[0]

    last_non_nan_entity[0] = tag
    return 'B-' + tag

In [None]:
# One-element list so process_row can remember the last seen tag between rows
last_non_nan_entity = [None]

# Row-wise pass: tagged rows become 'B-<tag>', untagged rows inherit 'I-<previous tag>'
df['mod_Tag'] = df.apply(lambda row: process_row(row, last_non_nan_entity), axis=1)

# Explicit copy: columns are added to df_entities later, and assigning into a
# plain column subset of df triggers pandas' SettingWithCopyWarning.
df_entities = df[['Record Number','Token','mod_Tag']].copy()

# Label vocabulary: id 0 is reserved for '[PAD]'; the remaining ids follow the
# order in which labels first appear in the data.
vocab = ['[PAD]'] + df_entities['mod_Tag'].unique().tolist()
voc_map = {label: label_id for label_id, label in enumerate(vocab)}


In [None]:
# One title per record, in file order. De-duplicating on the title text itself
# would merge distinct records that happen to share a title and shift every
# later row relative to the record numbers used when building the labels.
train_seq = df.drop_duplicates(subset='Record Number')['Title'].tolist()

# Zero-initialised, so positions past the end of a title stay 0 (treated as padding)
token_ids = np.zeros(shape=(len(train_seq), max_seq), dtype=np.int32)

for i, text in enumerate(train_seq):
    # Truncate to max_seq so an over-long title cannot overflow its row
    encoded = tokenizer.encode(text, max_length=max_seq, truncation=True)
    token_ids[i, :len(encoded)] = encoded

# 1 where a real token id is present, 0 on padding positions
attention_masks = (token_ids != 0).astype(np.int32)


In [None]:
# Number of wordpieces each source token expands to. assign() builds a new
# frame, so no chained-assignment warning is raised on the column subset.
df_entities = df_entities.assign(
    Tokenized_Length=df_entities['Token'].apply(lambda x: len(tokenizer.tokenize(x)))
)

# Group the DataFrame by 'Record Number'
grouped_entities = df_entities.groupby('Record Number')

num_records = len(train_seq)
# Row i of the label matrix is filled from record number i + 1, so the number of
# titles has to equal the number of records or labels end up on the wrong rows.
assert grouped_entities.ngroups == num_records, (
    f"{grouped_entities.ngroups} records but {num_records} titles; "
    "token_labels rows would not line up with token_ids rows"
)

# 0 everywhere by default; 0 is the '[PAD]' label id
token_labels = np.zeros(shape=(num_records, max_seq), dtype=np.int32)

for i in range(num_records):
    record_id = str(i + 1)  # record numbers were read as strings
    if record_id not in grouped_entities.groups:
        continue
    curr_entities = grouped_entities.get_group(record_id)
    # Start at 1: position 0 holds the first id emitted by tokenizer.encode
    # (presumably the [CLS] marker) and keeps label 0.
    pointer = 1
    for token_len, tag in zip(curr_entities['Tokenized_Length'], curr_entities['mod_Tag']):
        # Every wordpiece of a token receives that token's label; slices that
        # run past max_seq are clipped by numpy.
        token_labels[i, pointer:pointer + token_len] = voc_map[tag]
        pointer += token_len


In [None]:
class TokenLevelF1Score(tf.keras.metrics.Metric):
    """Macro-averaged F1 over class ids, accumulated across batches.

    Per-class true-positive, false-positive and false-negative counts are kept
    as metric state. ``result`` converts them to a per-class F1 and returns the
    unweighted mean over all ``num_classes`` ids, so every class id (including
    id 0) contributes equally.

    Args:
        num_classes: Number of distinct class ids; also the length of each
            count vector.
        name: Metric name reported by Keras.
        **kwargs: Forwarded to ``tf.keras.metrics.Metric``.
    """

    def __init__(self, num_classes, name='f1_score', **kwargs):
        super(TokenLevelF1Score, self).__init__(name=name, **kwargs)
        self.num_classes = num_classes
        self.true_positives = self.add_weight(name='tp', shape=(num_classes,), initializer='zeros')
        self.false_positives = self.add_weight(name='fp', shape=(num_classes,), initializer='zeros')
        self.false_negatives = self.add_weight(name='fn', shape=(num_classes,), initializer='zeros')

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Add one batch to the running per-class counts.

        Args:
            y_true: Integer class ids.
            y_pred: Scores with the class axis last; the argmax over that axis
                is taken as the predicted id.
            sample_weight: Accepted for Keras API compatibility; not used.
        """
        predicted_ids = tf.reshape(tf.cast(tf.argmax(y_pred, axis=-1), tf.int32), [-1])
        actual_ids = tf.reshape(tf.cast(y_true, tf.int32), [-1])

        # One-hot rows make the per-class counts plain column sums, replacing a
        # Python loop that issued three scatter/assign ops per class.
        actual_onehot = tf.one_hot(actual_ids, self.num_classes, dtype=tf.float32)
        predicted_onehot = tf.one_hot(predicted_ids, self.num_classes, dtype=tf.float32)

        tp = tf.reduce_sum(actual_onehot * predicted_onehot, axis=0)
        fp = tf.reduce_sum(predicted_onehot, axis=0) - tp
        fn = tf.reduce_sum(actual_onehot, axis=0) - tp

        self.true_positives.assign_add(tp)
        self.false_positives.assign_add(fp)
        self.false_negatives.assign_add(fn)

    def result(self):
        """Return the mean of the per-class F1 scores computed from the counts."""
        # divide_no_nan yields 0 for classes with no predictions / no occurrences
        precision = tf.math.divide_no_nan(self.true_positives, self.true_positives + self.false_positives)
        recall = tf.math.divide_no_nan(self.true_positives, self.true_positives + self.false_negatives)
        # epsilon keeps the denominator non-zero when precision and recall are both 0
        f1 = 2 * (precision * recall) / (precision + recall + tf.keras.backend.epsilon())
        return tf.reduce_mean(f1)

    def reset_state(self):
        """Zero all accumulated counts."""
        self.true_positives.assign(tf.zeros_like(self.true_positives))
        self.false_positives.assign(tf.zeros_like(self.false_positives))
        self.false_negatives.assign(tf.zeros_like(self.false_negatives))

    def get_config(self):
        """Return the constructor arguments so the metric can be re-created.

        ``num_classes`` has no default, so without it in the config the metric
        cannot be rebuilt from a saved, compiled model.
        """
        config = super().get_config()
        config.update({"num_classes": self.num_classes})
        return config

In [None]:
# Hold out 5% of the rows for validation; the fixed seed makes the split repeatable
train_ids, val_ids, train_labels, val_labels = train_test_split(
    token_ids,
    token_labels,
    test_size=0.05,
    random_state=42,
)

class EntityNamingModel(tf.keras.Model):
    """BERT encoder followed by dropout and a per-token linear classifier.

    Args:
        entity_labels: Number of output classes of the classification layer.
        dropout_prob: Dropout rate applied to the encoder's per-token outputs.
        bert_model: Optional encoder instance to wrap. When omitted, a fresh
            copy of the pretrained ``model_name`` checkpoint is loaded, so
            separately constructed models never share one set of encoder
            weights and each starts from the pretrained state. Every instance
            then holds its own copy of the encoder in memory.
        pad_token_id: Input id treated as padding when the attention mask is
            derived from the inputs.
    """

    def __init__(self, entity_labels=None,
                dropout_prob=0.1, bert_model=None, pad_token_id=0):
        super().__init__(name="entity_namer")

        if bert_model is None:
            # Reusing one global encoder would let each newly built model
            # continue from the weights the previous one fine-tuned.
            bert_model = TFBertModel.from_pretrained(model_name)
        self.bert = bert_model
        self.pad_token_id = pad_token_id
        self.dropout = Dropout(dropout_prob)
        self.entity_classifier = Dense(entity_labels,
                                     name="entity_classifier")

    def call(self, inputs, training=False, **kwargs):
        """Return per-token class logits.

        Args:
            inputs: Token ids as a single tensor, or any input structure the
                wrapped encoder accepts.
            training: Whether dropout (here and inside the encoder) is active.
            **kwargs: Extra keyword arguments forwarded to the encoder, e.g.
                an explicit ``attention_mask``.
        """
        if tf.is_tensor(inputs) and "attention_mask" not in kwargs:
            # Without a mask the encoder attends to padding positions as if
            # they were real tokens; derive it from the padded ids.
            kwargs["attention_mask"] = tf.cast(
                tf.not_equal(inputs, self.pad_token_id), tf.int32
            )

        # return_dict=False yields a tuple; the first element is the per-token output
        tokens_output, _ = self.bert(inputs, training=training, return_dict=False, **kwargs)

        tokens_output = self.dropout(tokens_output, training=training)
        entity_logits = self.entity_classifier(tokens_output)

        return entity_logits



In [None]:
# Sweep over (epochs, dropout_prob) settings: train, evaluate and save one model per setting
histories = []
for run_idx, (epochs, dropout_prob) in enumerate([(5, 0.8), (5, 0.7), (5, 0.6), (4, 0.5), (3, 0.4)]):
    model = EntityNamingModel(entity_labels=len(voc_map), dropout_prob=dropout_prob)

    model.compile(optimizer=Adam(learning_rate=3e-5, epsilon=1e-08),
                  loss=[SparseCategoricalCrossentropy(from_logits=True)],
                  metrics=[SparseCategoricalAccuracy('accuracy'), TokenLevelF1Score(num_classes=len(voc_map))],
                  run_eagerly=True)

    with tf.device("/cpu:0"): #running with gpu on m3 mac causes training deficiencies
        history = model.fit(train_ids, train_labels,
            validation_data=(val_ids, val_labels),
            epochs=epochs, batch_size=4)
        histories.append(history)

    # Metrics come back in compile order: loss, accuracy, F1
    loss, accuracy, f1_score = model.evaluate(val_ids, val_labels)
    # f-strings: without the prefix the placeholders were printed literally
    print(f"Model Param: {(epochs, dropout_prob)}")
    print(f"Test Loss: {loss}")
    print(f"Test Accuracy: {accuracy}")
    print(f"Test F1_Score: {f1_score}")
    print(" ------ ")
    # One file per run; the un-formatted name made every run overwrite the same file
    model.save(f'model_{run_idx}.keras')

In [None]:
# Persist whichever model is currently bound to `model` under a fixed file name
final_model_path = 'model_11-22-23_9561_6070.keras'
model.save(final_model_path)

In [None]:
def show_predictions(text, entity_names):
    """Print the predicted entity label next to each wordpiece of ``text``.

    Args:
        text: Raw string to tag.
        entity_names: Mapping from predicted class id to label name.
    """
    # Encode the text and add a leading batch axis of size 1
    encoded_batch = tf.constant(tokenizer.encode(text))[None, :]
    entity_logits = model(encoded_batch)

    # Highest-scoring class per position; the first and last positions are
    # dropped (presumably the special tokens that encode() adds).
    predicted_ids = entity_logits.numpy().argmax(axis=-1)[0, 1:-1]

    print("## Entities:")
    wordpieces = tokenizer.tokenize(text)
    for wordpiece, predicted_id in zip(wordpieces, predicted_ids):
        label = entity_names[predicted_id]
        print(f"{wordpiece:>10} : {label}")

# Inverse of voc_map: class id -> label name
rev_map = {label_id: label for label, label_id in voc_map.items()}

In [None]:
# Tag the title stored under index label 0 as a quick sanity check
first_title = df.loc[0, "Title"]
show_predictions(first_title, rev_map)