In [None]:
!pip install numpy scikit-learn tensorflow



In [None]:
def transform_dataset(page_dataset, for_inference):
    """Convert raw annotated pages into per-word ``(word, label)`` sequences.

    Args:
        page_dataset: Iterable of page dicts. Every page must provide
            ``page["representativeData"]["page_data_words"]`` (list of words).
            In training mode the annotations are read from
            ``page["answers"][0]["answer"]``: a list of
            ``{"id": label, "data": [{"start": s, "end": e}, ...]}`` entries,
            where ``[s, e)`` is a half-open range of word indices.
        for_inference: When true, annotations are ignored, every word gets
            the placeholder label ``"O"`` and ``page["taskId"]`` is returned
            alongside the words.

    Returns:
        Training mode: ``[[(word, label), ...], ...]``.
        Inference mode: ``[(task_id, [(word, "O"), ...]), ...]``.
    """
    # Letter "O" (outside), the tag used everywhere else in this notebook.
    # The placeholder used to be the digit "0", which is not a known label.
    outside_label = "O"

    labeled_text_dataset = []
    for page in page_dataset:
        page_words = page["representativeData"]["page_data_words"]

        # word index -> entity label, only for annotated words
        geo_dictionary = {}
        if not for_inference:
            # A page with a missing/empty "answers" entry has no entities;
            # previously this crashed on ``None[0]``.
            page_answers = page.get("answers") or []
            annotations = page_answers[0]["answer"] if page_answers else []
            for page_answer in annotations:
                geo_label = page_answer["id"]
                for geo_part in page_answer["data"]:
                    for index in range(geo_part["start"], geo_part["end"]):
                        geo_dictionary[index] = geo_label

        # In inference mode geo_dictionary is empty, so every word gets "O".
        labeled_text = [
            (word, geo_dictionary.get(word_index, outside_label))
            for word_index, word in enumerate(page_words)
        ]

        if for_inference:
            labeled_text_dataset.append((page["taskId"], labeled_text))
        else:
            labeled_text_dataset.append(labeled_text)

    return labeled_text_dataset

In [None]:
import json

def get_labeled_dataset(dataset_path, for_inference=False):
    """Load a JSON dataset file and convert it to labeled word sequences.

    Args:
        dataset_path: Path to a JSON file whose pages live under
            ``["data"]["results"]``.
        for_inference: Forwarded to ``transform_dataset``; see that function
            for the two result shapes.

    Returns:
        Whatever ``transform_dataset`` returns for the loaded pages.
    """
    # Read as UTF-8 explicitly: the default encoding is platform dependent,
    # and this notebook writes its own JSON output as UTF-8 as well.
    with open(dataset_path, encoding="utf-8") as json_dataset:
        dataset = json.load(json_dataset)

    return transform_dataset(dataset["data"]["results"], for_inference)

In [None]:
def get_validation_result(X_validation, y_pred):
    """Turn per-word predicted labels into entity spans grouped by label.

    Args:
        X_validation: Sequence of ``(task_id, anything)`` pairs; only the
            task id is used.
        y_pred: Sequence parallel to ``X_validation``; each item is the
            sequence of predicted label names for that task's words.

    Returns:
        A list with one dict per task::

            {"taskId": task_id,
             "answer": [{"id": label,
                         "data": [{"start": s, "end": e}, ...]}, ...]}

        Each ``[s, e)`` is a maximal run of consecutive words carrying the
        same label. Runs labeled ``"O"`` (outside) are not reported. Labels
        appear in the order in which their first run ends.
    """
    # Letter "O"; the old code compared against the digit "0" in one place,
    # which left that branch effectively dead.
    outside_label = "O"

    validation_result = []
    for (task_id, _), predictions in zip(X_validation, y_pred):
        word_labels = list(predictions)
        segments_by_label = {}

        run_start = 0
        # Walk one position past the end so the final run is closed by the
        # same code path as every other run.
        for index in range(1, len(word_labels) + 1):
            if index < len(word_labels) and word_labels[index] == word_labels[run_start]:
                continue  # still inside the current run

            run_label = word_labels[run_start]
            if run_label != outside_label:
                segments_by_label.setdefault(run_label, []).append(
                    {"start": run_start, "end": index}
                )
            run_start = index

        validation_result.append({
            "taskId": task_id,
            "answer": [
                {"id": label, "data": segments}
                for label, segments in segments_by_label.items()
            ],
        })

    return validation_result

In [None]:
import tensorflow as tf

def focal_loss(alpha=0.25, gamma=2.):
    """Build a focal-loss function for one-hot targets and softmax outputs.

    Args:
        alpha: Scalar weight applied to the loss.
        gamma: Focusing exponent; larger values down-weight positions the
            model already predicts with high probability.

    Returns:
        A ``loss(y_true, y_pred)`` callable returning a scalar tensor.
    """
    def focal_loss_parametrized(y_true, y_pred):
        # Small constant keeps log() finite when a probability is exactly 0.
        e = 1.e-9
        y_true = tf.convert_to_tensor(y_true, tf.float32)
        y_pred = tf.convert_to_tensor(y_pred, tf.float32)

        model_output = tf.add(y_pred, e)
        # Cross-entropy term, non-zero only at the true class.
        ce = tf.multiply(y_true, -tf.math.log(model_output))
        # Modulating factor (1 - p)^gamma, again only at the true class.
        w = tf.multiply(y_true, tf.pow(tf.subtract(1., model_output), gamma))
        fl = tf.multiply(alpha, tf.multiply(w, ce))
        # Collapse the class axis (the last one). With one-hot targets only
        # one class entry is non-zero, so max picks that entry. The previous
        # ``axis=1`` was the class axis only for 2-D inputs; for the
        # (batch, sequence, classes) tensors produced by the tagger it took
        # the max over sequence positions instead.
        reduced_fl = tf.reduce_max(fl, axis=-1)
        return tf.reduce_mean(reduced_fl)

    return focal_loss_parametrized

2024-04-12 16:44:23.838704: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-12 16:44:23.838818: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-12 16:44:23.892771: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-12 16:44:24.072372: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
from google.colab import drive
# Mount Google Drive so the dataset files below are reachable (Colab only).
drive.mount('/content/drive')

# Training / test splits as lists of [(word, label), ...] texts.
# NOTE(review): these absolute Drive paths differ from the relative
# "datasets/..." paths used by later cells — confirm which location is current.
train_dataset = get_labeled_dataset("/content/drive/My Drive/Colab Notebooks/datasets/train_geo_extractor.json")
test_dataset = get_labeled_dataset("/content/drive/My Drive/Colab Notebooks/datasets/test_geo_extractor.json")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Number of texts in each split.
print(len(train_dataset))
print(len(test_dataset))

2788
930


In [None]:
# Keep only the first 200 / 50 texts (overwrites the full splits in place, so
# re-running the loading cell is required to get the full data back).
# NOTE(review): recorded outputs further down report 2788 / 930 rows, so this
# subsetting was apparently not in effect for those runs — confirm cell order.
train_dataset = train_dataset[0:200]
test_dataset = test_dataset[0:50]

print(len(train_dataset))
print(len(test_dataset))

200
50


In [None]:
# Reload the training split from a path relative to the working directory.
# NOTE(review): the recorded output of this cell is a FileNotFoundError; on a
# fresh run it stops "Run All" unless the file exists at this relative path.
train_dataset = get_labeled_dataset("datasets/train_geo_extractor.json")

FileNotFoundError: [Errno 2] No such file or directory: 'datasets/train_geo_extractor.json'

In [None]:
# Longest training text, in words; used as the fixed sequence length.
max_text_length = max(len(text) for text in train_dataset)

# Vocabulary in sorted order so word indices are reproducible between kernel
# sessions (iteration order of a set of strings is not stable across Python
# processes). The two service tokens are appended last, which guarantees that
# "ENDPAD" sits at index len(words) - 1 — the value the padding cells use.
words = sorted(
    {word for text in train_dataset for word, _ in text} - {"UNKNOWN", "ENDPAD"}
)
words.append("UNKNOWN")
words.append("ENDPAD")

# Sorted for the same reason: the label <-> index mapping must be identical
# when a saved model is reloaded in a new session.
labels = sorted({label for text in train_dataset for _, label in text})

In [None]:
# Lookup tables: each vocabulary entry / label maps to its position in the list.
word2index = dict(zip(words, range(len(words))))
label2index = dict(zip(labels, range(len(labels))))

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Encode every training text as a list of word indices, then right-pad to
# max_text_length.
# NOTE(review): len(words) - 1 is the "ENDPAD" index only if "ENDPAD" is the
# last entry of `words` — confirm against how `words` is built.
X_train = [[word2index[word] for word, _ in text] for text in train_dataset]
X_train = pad_sequences(maxlen=max_text_length, sequences=X_train, padding="post", value=len(words) - 1)

# Same for the labels; padded positions are tagged "O".
y_train = [[label2index[label] for _, label in text] for text in train_dataset]
y_train = pad_sequences(maxlen=max_text_length, sequences=y_train, padding="post", value=label2index["O"])

In [None]:
from tensorflow.keras.utils import to_categorical

# One-hot encode each padded label sequence.
# This rebinds y_train, so the cell is not idempotent: running it twice
# one-hot encodes the already encoded arrays.
y_train = [to_categorical(index, num_classes=len(labels)) for index in y_train]

In [None]:
import os
from uuid import uuid4
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, TimeDistributed, BatchNormalization
from tensorflow.keras.layers import Embedding, Dropout, Dense
from tensorflow.keras.layers import Bidirectional, LSTM

# BiLSTM tagger: embedding -> dropout -> bidirectional LSTM -> per-position softmax.
model_input = Input(shape=(max_text_length, ))
# NOTE(review): output_dim (the embedding size) is set to max_text_length, the
# sequence length — confirm this coupling is intended.
embedding_output = Embedding(input_dim=len(words), output_dim=max_text_length,
                             input_length=max_text_length)(model_input)
dropout_output = Dropout(0.1)(embedding_output)

lstm_output = Bidirectional(LSTM(units=300, return_sequences=True))(dropout_output)

# One probability distribution over `labels` for every sequence position.
model_output = TimeDistributed(Dense(len(labels), activation="softmax"))(lstm_output)

model = Model(model_input, model_output)
# focal_loss() is called with its default alpha / gamma.
model.compile(optimizer="adam", loss=focal_loss(), metrics=["accuracy"])

# NOTE(review): per the Keras docs use_multiprocessing / workers apply to
# generator or Sequence inputs — confirm they have any effect on arrays.
model.fit(X_train, np.array(y_train), batch_size=16, epochs=5, use_multiprocessing=True, workers=os.cpu_count())

# Save under a random UUID; the name is printed so it can be entered in the
# model-loading cell.
random_model_name = str(uuid4())
model.save(f"saved_model/{random_model_name}")
print(f"Model {random_model_name} has successfully been saved!")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
INFO:tensorflow:Assets written to: saved_model/a438be36-e0ab-4993-81c8-b17bace74911/assets


INFO:tensorflow:Assets written to: saved_model/a438be36-e0ab-4993-81c8-b17bace74911/assets


Model a438be36-e0ab-4993-81c8-b17bace74911 has successfully been saved!


In [None]:
!pip install numpy scikit-learn gensim tf2crf tensorflow

In [None]:
import os
from gensim.models import FastText

# Plain word lists (labels dropped) as the FastText training corpus.
sentences = [[word for word, _ in text] for text in train_dataset]

# Train FastText word vectors of size 100 on that corpus.
# NOTE: this rebinds `model`, which previously held the Keras tagger.
model = FastText(sentences, vector_size=100, window=3, min_count=1, workers=os.cpu_count(), sg=1)

In [None]:
import numpy as np

# One 100-dimensional row per vocabulary index (100 matches vector_size above).
embedding_matrix = np.zeros((len(word2index), 100))

# Fill each row with the FastText vector of the corresponding word. This also
# looks up the service tokens "UNKNOWN" and "ENDPAD" added to the vocabulary.
for word, index in word2index.items():
    embedding_vector = model.wv[word]
    embedding_matrix[index] = embedding_vector

In [None]:
from tensorflow.keras.layers import Embedding

# Frozen embedding layer initialised from the FastText matrix.
# NOTE(review): the CRF model cell below builds its own Embedding(...) and does
# not reference `embedding_layer` — confirm whether it was meant to be used.
embedding_layer = Embedding(input_dim=len(word2index),
                           output_dim=100,
                           weights=[embedding_matrix],
                           input_length=max_text_length,
                           trainable=False)

In [None]:
import os
from uuid import uuid4
import numpy as np
from tf2crf import CRF, ModelWithCRFLoss
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, TimeDistributed
from tensorflow.keras.layers import Embedding, Dropout, Dense
from tensorflow.keras.layers import Bidirectional, LSTM, Attention

# BiLSTM-CRF tagger: embedding -> dropout -> bidirectional LSTM -> CRF.
# The name `model` is reused for three different things in this cell: the
# embedding output tensor, the dropout output tensor and, finally, the
# ModelWithCRFLoss wrapper.
model_input = Input(shape=(max_text_length, ))
model = Embedding(input_dim=len(words), output_dim=max_text_length, input_length=max_text_length)(model_input)
model = Dropout(0.1)(model)

lstm_output = Bidirectional(LSTM(units=100, return_sequences=True))(model)

crf = CRF(dtype="float32")
model_output = crf(lstm_output)

hybrid_model = Model(model_input, model_output)
model = ModelWithCRFLoss(hybrid_model)
model.compile(optimizer="adam")

# NOTE(review): y_train is one-hot encoded at this point — confirm that
# ModelWithCRFLoss accepts targets in that format.
model.fit(X_train, np.array(y_train), batch_size=8, epochs=5, use_multiprocessing=True, workers=os.cpu_count())

# Save under a random UUID and print the name.
random_model_name = str(uuid4())
model.save(f"saved_model/{random_model_name}")
print(f"Model {random_model_name} has successfully been saved!")

In [None]:
# Reload the full test split (relative path; rebinds test_dataset).
test_dataset = get_labeled_dataset("datasets/test_geo_extractor.json")

In [None]:
import os
from tensorflow.keras import models

# Ask for the UUID printed by a training cell and locate the saved model.
model_name = input("Enter the model name: ")
model_path = "saved_model/" + model_name

if not os.path.exists(model_path):
    # Stop here with a clear message; previously the cell only printed the
    # message and then failed inside load_model anyway.
    raise FileNotFoundError(f"The model {model_name} does not exist!")

# The saved model refers to its loss by the inner function's name, so the
# registry must hold the closure returned by focal_loss() (same defaults as at
# compile time), not the focal_loss factory itself.
recognizer = models.load_model(model_path, custom_objects={"focal_loss_parametrized": focal_loss()})

Enter the model name: a438be36-e0ab-4993-81c8-b17bace74911


In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Encode test texts; words missing from the training vocabulary map to "UNKNOWN".
# NOTE(review): the pad value len(word2index) - 1 is the "ENDPAD" index only if
# "ENDPAD" is the last vocabulary entry — confirm.
X_test = [[word2index.get(word, word2index["UNKNOWN"]) for word, _ in text] for text in test_dataset]
X_test = pad_sequences(maxlen=max_text_length, sequences=X_test, padding="post", value=len(word2index) - 1)

In [None]:
# Model output for every padded test sequence.
# NOTE(review): assumes shape (texts, max_text_length, len(labels)) — confirm.
predictions = recognizer.predict(X_test, use_multiprocessing=True, workers=os.cpu_count())



In [None]:
import numpy as np

# Arg-max class per position, mapped back to a label name through `labels`.
# `labels` must be in the same order as when the loaded model was trained.
y_pad_pred_test = [[labels[np.argmax(prediction)] for prediction in text_prediction]
                   for text_prediction in predictions]

In [None]:
# Pair every test word with its predicted label, discarding the predictions
# made for padding positions.
y_pred_test = []

for i, text in enumerate(test_dataset):
    text_predictions = []
    for j, (word, _) in enumerate(text):
        if j < len(y_pad_pred_test[i]):
            text_predictions.append((word, y_pad_pred_test[i][j]))
        else:
            # The text is longer than the model's input window, so this word
            # was truncated and never scored. Record it as "O" instead of
            # dropping it; otherwise the flattened predictions end up shorter
            # than the flattened true labels in the evaluation cell.
            text_predictions.append((word, "O"))

    y_pred_test.append(text_predictions)

In [None]:
from sklearn.metrics import classification_report, matthews_corrcoef

# Flatten true and predicted labels to word level; both lists must have the
# same length for the sklearn metrics below.
y_test_flat = [label for text in test_dataset for _, label in text]
y_pred_flat = [label for text in y_pred_test for _, label in text]

# Per-label precision / recall / F1 plus the Matthews correlation coefficient.
print(classification_report(y_test_flat, y_pred_flat))
print(f"Matthews Correlation Coefficient: {matthews_corrcoef(y_test_flat, y_pred_flat)}")

                   precision    recall  f1-score   support

                O       0.98      1.00      0.99     62822
     central_city       0.31      0.60      0.41       184
      geo_address       0.91      0.58      0.71      1040
     geo_building       0.85      0.64      0.73       453
         geo_city       0.89      0.63      0.73      1433
     geo_district       0.89      0.70      0.79       387
geo_microdistrict       0.67      0.49      0.57       382
       geo_region       0.99      0.98      0.99      1733
geo_region_oblast       0.91      0.64      0.75       297
       geo_street       0.75      0.72      0.74      1059

         accuracy                           0.97     69790
        macro avg       0.82      0.70      0.74     69790
     weighted avg       0.97      0.97      0.97     69790

Matthews Correlation Coefficient: 0.8242336994803532


In [None]:
# Unlabeled validation pages: a list of (task_id, [(word, placeholder), ...]).
validation_dataset = get_labeled_dataset("datasets/val_no_answer_geo_extractor.json", for_inference=True)

In [None]:
# Encode validation texts like the test texts: unseen words -> "UNKNOWN",
# right-padded to max_text_length.
# NOTE(review): the pad value is the "ENDPAD" index only if "ENDPAD" is the
# last vocabulary entry — confirm.
X_validation = [[word2index.get(word, word2index["UNKNOWN"]) for word, _ in text]
                for task_id, text in validation_dataset]
X_validation = pad_sequences(maxlen=max_text_length, sequences=X_validation, padding="post",
                             value=len(word2index) - 1)

In [None]:
# Model output for the padded validation sequences.
y_pred_validation = recognizer.predict(X_validation)

# X_validation is rebound from the padded index array to the list of
# (task_id, text) pairs that get_validation_result expects. Re-running this
# cell therefore feeds the pairs, not the array, to predict().
X_validation = [(task_id, text) for task_id, text in validation_dataset]



In [None]:
# Arg-max class per position, mapped to label names through `labels`
# (same order requirement as at training time).
y_pad_pred_validation = [[labels[np.argmax(prediction)] for prediction in text_prediction]
              for text_prediction in y_pred_validation]

In [None]:
# Rebind y_pred_validation (so far the raw model output) to per-text lists of
# (word, predicted_label) pairs, dropping predictions for padding positions.
y_pred_validation = []

for i, text in enumerate(validation_dataset):
    text_predictions = []
    # text is (task_id, [(word, placeholder), ...]); text[1] is the word list.
    for j, (word, _) in enumerate(text[1]):
        # Words beyond the padded length have no prediction and are skipped.
        if j < len(y_pad_pred_validation[i]):
            text_predictions.append((word, y_pad_pred_validation[i][j]))

    y_pred_validation.append(text_predictions)

In [None]:
import json

# Keep only the label of each (word, label) pair.
# This rebinds y_pred_validation again, so the cell is not idempotent.
y_pred_validation = [[label for _, label in text] for text in y_pred_validation]

# Convert label sequences into per-task entity spans.
validation_result = get_validation_result(X_validation, y_pred_validation)

# Written as UTF-8 with ensure_ascii=False so non-ASCII text stays readable.
with open("lstm_validation_result.json", "w", encoding="utf-8") as file:
    json.dump(validation_result, file, ensure_ascii=False, indent=4)

print("Validation result has been saved!")

Validation result has been saved!


In [None]:
!pip install numpy scikit-learn torch datasets transformers==4.30



In [None]:
!pip install accelerate -U



In [None]:
import torch

# GPU sanity check: availability, number of devices, index of the current device.
print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.cuda.current_device())

True
1
0


In [None]:
from transformers import AutoModelForTokenClassification, AutoTokenizer

# Pretrained XLM-RoBERTa NER checkpoint with its classification head resized.
# num_labels=10 equals the number of entries in label_list, which is defined
# in the following cell; ignore_mismatched_sizes lets the head be
# re-initialised (see the recorded warning about classifier.bias/weight).
# NOTE: `model` and `model_name` rebind names used by the earlier Keras cells.
model_name = "xlm-roberta-large-finetuned-conll03-english"
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=10, ignore_mismatched_sizes=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-large-finetuned-conll03-english and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([8]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.weight: found shape torch.Size([8, 1024]) in the checkpoint and torch.Size([10, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Tag set of the geo extractor; the position in this list is the class id.
label_list = [
    "O",
    "central_city",
    "geo_address",
    "geo_building",
    "geo_city",
    "geo_district",
    "geo_microdistrict",
    "geo_region",
    "geo_region_oblast",
    "geo_street",
]

# Reverse lookup: label name -> class id.
label_dictionary = dict(zip(label_list, range(len(label_list))))

In [None]:
def tokenize_and_align_labels(dataset):
    """Tokenize pre-split texts and attach one label id to every token.

    ``dataset`` is a sequence of texts, each a sequence of ``(word, label)``
    pairs. The words go through the module-level ``tokenizer`` (padded,
    truncated, returned as PyTorch tensors); label names are converted with
    the module-level ``label_dictionary``.

    Labeling rule for each produced token:
      * the first token of a word receives that word's label id;
      * every other token — one whose word id is ``None``, or a further
        piece of the same word — receives the id of ``"O"``.

    NOTE(review): those other tokens get the real class "O" rather than an
    ignore index, so they take part in the loss and metrics — confirm intended.

    Returns:
        The tokenizer output with an added ``"labels"`` entry (a list of
        label-id lists, one per text).
    """
    word_lists = [[word for word, _ in text] for text in dataset]
    tokenized_inputs = tokenizer(
        word_lists,
        padding=True,
        truncation=True,
        is_split_into_words=True,
        return_tensors="pt",
    )

    encoded_labels = []
    for batch_index, text in enumerate(dataset):
        label_ids = []
        last_seen_word = None
        for word_idx in tokenized_inputs.word_ids(batch_index=batch_index):
            opens_new_word = word_idx is not None and word_idx != last_seen_word
            if opens_new_word:
                label_ids.append(label_dictionary[text[word_idx][1]])
            else:
                label_ids.append(label_dictionary["O"])
            last_seen_word = word_idx
        encoded_labels.append(label_ids)

    tokenized_inputs["labels"] = encoded_labels
    return tokenized_inputs

In [None]:
from datasets import Dataset

# Tokenize both splits and attach token-level label ids.
tokenized_train_dataset = tokenize_and_align_labels(train_dataset)
tokenized_test_dataset = tokenize_and_align_labels(test_dataset)

# Wrap the tokenizer outputs as Hugging Face Datasets for the Trainer.
transformed_train_dataset = Dataset.from_dict(tokenized_train_dataset)
transformed_test_dataset = Dataset.from_dict(tokenized_test_dataset)

# NOTE(review): the recorded output shows 2788 / 930 rows, i.e. the full splits
# rather than the 200 / 50 subset created in an earlier cell — confirm order.
print(transformed_train_dataset)
print(transformed_test_dataset)

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 2788
})
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 930
})


In [None]:
!pip install seqeval



In [None]:
import numpy as np
from datasets import load_metric
from transformers import DataCollatorForTokenClassification

# Batch collator for token classification, built around the tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer)

# seqeval metric used by compute_metrics.
# NOTE(review): the recorded output warns that `trust_remote_code=True` will
# become mandatory for loading this metric in the next major `datasets` release.
metric = load_metric("seqeval")

def compute_metrics(p):
    """Compute overall seqeval scores for a ``(predictions, labels)`` pair.

    ``p`` unpacks into raw model scores and reference label ids. The scores
    are arg-maxed over axis 2, both sides are translated to label names via
    the module-level ``label_list``, and the module-level ``metric`` is asked
    for the overall precision, recall, F1 and accuracy.

    Every position is scored; nothing is masked out.

    Returns:
        Dict with keys ``"precision"``, ``"recall"``, ``"f1"``, ``"accuracy"``.
    """
    scores, reference_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    for predicted_row, reference_row in zip(predicted_ids, reference_ids):
        row_names = []
        for predicted_id, _ in zip(predicted_row, reference_row):
            row_names.append(label_list[predicted_id])
        true_predictions.append(row_names)

    true_labels = []
    for predicted_row, reference_row in zip(predicted_ids, reference_ids):
        row_names = []
        for _, reference_id in zip(predicted_row, reference_row):
            row_names.append(label_list[reference_id])
        true_labels.append(row_names)

    results = metric.compute(predictions=true_predictions, references=true_labels, zero_division=0)

    summary = {}
    summary["precision"] = results["overall_precision"]
    summary["recall"] = results["overall_recall"]
    summary["f1"] = results["overall_f1"]
    summary["accuracy"] = results["overall_accuracy"]
    return summary

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [None]:
from transformers import TrainingArguments, Trainer

# Fine-tuning configuration: evaluate after every epoch, save no checkpoints,
# report to no external tracker. Outputs go to the "ner" directory.
training_args = TrainingArguments(
    "ner",
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy='no',
    report_to='none'
)

# NOTE(review): the test split is used as eval_dataset during training, so the
# per-epoch metrics and the final test metrics come from the same data.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=transformed_train_dataset,
    eval_dataset=transformed_test_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()



Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.008091,0.866376,0.912324,0.888756,0.997625
2,No log,0.007571,0.872519,0.909565,0.890657,0.997665
3,0.009600,0.00756,0.87577,0.915236,0.895068,0.997776




TrainOutput(global_step=525, training_loss=0.009391136424882072, metrics={'train_runtime': 693.4421, 'train_samples_per_second': 12.062, 'train_steps_per_second': 0.757, 'total_flos': 7767915974664192.0, 'train_loss': 0.009391136424882072, 'epoch': 3.0})

In [None]:
# Run the fine-tuned model on the test split (rebinds `predictions`, which
# earlier held the LSTM output).
predictions = trainer.predict(transformed_test_dataset)

In [None]:
import numpy as np

# Raw scores and reference label ids from the prediction output.
predictions_logits = predictions.predictions
true_labels = predictions.label_ids

# Most likely class id per token position.
predicted_labels = np.argmax(predictions_logits, axis=2)

# Convert the reference ids into plain nested lists.
true_labels = [[label for label in sentence] for sentence in true_labels]

In [None]:
# Rebuild predicted_labels as nested lists, keeping one prediction per zipped
# (prediction, reference) pair.
# NOTE(review): presumably both sides have the same length per row, so this
# only changes the container type — confirm. The loop variable `labels`
# rebinds the global of the same name used by the LSTM cells.
predicted_labels = [
    [p for (p, label) in zip(prediction, labels)]
    for prediction, labels in zip(predicted_labels, true_labels)
]

In [None]:
# Inverse of label_dictionary: class id -> label name.
# (`id` shadows the builtin only inside this comprehension.)
id_to_label = {id: label for label, id in label_dictionary.items()}

def labels_to_names(labels, id_to_label):
    """Translate nested label ids into label names.

    Args:
        labels: Iterable of sentences, each an iterable of label ids.
        id_to_label: Mapping from label id to label name.

    Returns:
        A list of lists with the same nesting, holding the names.

    Raises:
        KeyError: If an id is missing from ``id_to_label``.
    """
    named_sentences = []
    for sentence in labels:
        named_sentences.append(list(map(id_to_label.__getitem__, sentence)))
    return named_sentences

# Token-level label names for predictions and references (same nesting as the ids).
predicted_label_names = labels_to_names(predicted_labels, id_to_label)
true_label_names = labels_to_names(true_labels, id_to_label)

In [None]:
from collections import Counter

def tokens_to_words(dataset, predictions, label_list):
    """Collapse token-level label ids into one label name per word (majority vote).

    Args:
        dataset: Sequence of texts, each a sequence of ``(word, label)``
            pairs. The words of each text are joined with spaces and
            re-tokenized with the module-level ``tokenizer``.
        predictions: Flat iterable of label ids, consumed in step with the
            token word ids.
        label_list: List mapping a label id to its name.

    Returns:
        List of label names, one per collected word.

    NOTE(review): ``word_ids()`` is called without a batch index, so
    presumably only the first text's word ids are walked while ``predictions``
    covers the whole flattened dataset — confirm.
    NOTE(review): the texts are re-tokenized from joined strings
    (``is_split_into_words=False``), unlike ``tokenize_and_align_labels``;
    word boundaries may differ from the original word list — confirm.
    """
    texts = [" ".join([word for word, label in text]) for text in dataset]
    tokenized_inputs = tokenizer(texts, padding=True, truncation=True,
                                 is_split_into_words=False, return_tensors="pt")
    word_ids = tokenized_inputs.word_ids()
    # Label names of the tokens that belong to the word currently in progress.
    current_word_predictions = []
    # Final label per finished word; its length is the index of the word in progress.
    word_predictions = []

    for word_idx, prediction_idx in zip(word_ids, predictions):
        # Tokens without a word id are skipped.
        if word_idx is not None:
            # The token's word id differs from the index of the word in progress.
            if word_idx != len(word_predictions):
                if current_word_predictions:
                    # Close the word in progress with its most frequent token label.
                    most_common_label = Counter(current_word_predictions).most_common(1)[0][0]
                    word_predictions.append(most_common_label)
                    current_word_predictions = []
                else:
                    # Nothing collected yet: record this token's label directly.
                    word_predictions.append(label_list[prediction_idx])
            current_word_predictions.append(label_list[prediction_idx])

    # Close the last word.
    if current_word_predictions:
        most_common_label = Counter(current_word_predictions).most_common(1)[0][0]
        word_predictions.append(most_common_label)

    return word_predictions

In [None]:
import itertools

# Flatten the nested id lists into one long list each.
# This rebinds both names, so the cell is not idempotent.
predicted_labels = list(itertools.chain(*predicted_labels))
true_labels = list(itertools.chain(*true_labels))

# Word-level labels by majority vote over tokens.
# NOTE(review): pr_labels / tr_labels are not used by the evaluation cell
# below, which scores the token-level names instead — confirm intent.
pr_labels = tokens_to_words(test_dataset, predicted_labels, label_list)
tr_labels = tokens_to_words(test_dataset, true_labels, label_list)

In [None]:
from sklearn.metrics import classification_report, matthews_corrcoef
import itertools

# Flatten the token-level label names of all rows.
true_labels_flatten = list(itertools.chain(*true_label_names))
predicted_labels_flatten = list(itertools.chain(*predicted_label_names))

# NOTE(review): the recorded total support (476160) equals 930 rows x 512,
# which suggests every token position, padding included, is scored as "O" —
# this inflates accuracy and the "O" / weighted scores; confirm.
print(classification_report(true_labels_flatten, predicted_labels_flatten))
print(f"Matthews Correlation Coefficient: {matthews_corrcoef(true_labels_flatten, predicted_labels_flatten)}")

                   precision    recall  f1-score   support

                O       1.00      1.00      1.00    469195
     central_city       0.51      0.74      0.60       184
      geo_address       0.90      0.96      0.93      1037
     geo_building       0.80      0.90      0.85       453
         geo_city       0.90      0.93      0.91      1433
     geo_district       0.84      0.81      0.83       387
geo_microdistrict       0.63      0.74      0.68       382
       geo_region       0.99      0.99      0.99      1733
geo_region_oblast       0.90      0.93      0.91       297
       geo_street       0.91      0.91      0.91      1059

         accuracy                           1.00    476160
        macro avg       0.84      0.89      0.86    476160
     weighted avg       1.00      1.00      1.00    476160

Matthews Correlation Coefficient: 0.9253394423404717


In [None]:
!pip install numpy pandas tqdm scikit-learn tensorflow transformers



In [None]:
# Reload the full training split for the BERT experiment (relative path).
train_dataset = get_labeled_dataset("datasets/train_geo_extractor.json")

In [None]:
from sklearn import preprocessing

# Parallel lists: the words and the label names of each training text.
# NOTE: `sentences` and `labels` rebind names used by earlier cells (`labels`
# was the label vocabulary of the LSTM pipeline).
sentences = []
labels = []

# Tag set; the list position is the class id.
label_list = ["O", "central_city", "geo_address", "geo_building", "geo_city",
              "geo_district", "geo_microdistrict", "geo_region",
              "geo_region_oblast", "geo_street"]

label_dictionary = {label: i for i, label in enumerate(label_list)}

for text in train_dataset:
    sentences.append([word for word, _ in text])
    labels.append([label for _, label in text])

# Label names -> class ids.
encoded_labels = []

for labels_for_sentence in labels:
    # .get() yields None for a label that is missing from label_dictionary.
    encoded_labels.append([label_dictionary.get(label) for label in labels_for_sentence])

In [None]:
import numpy as np
from tqdm import tqdm
from transformers import BertTokenizerFast

# Rebinds `tokenizer` (previously the XLM-RoBERTa tokenizer).
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

# Longest training text measured in words.
# NOTE(review): this value is later passed as the tokenizer's max_length, which
# counts tokens, not words — confirm that long texts are not truncated.
max_sentence_length = max([len(text) for text in train_dataset])

def tokenize(data, max_len):
    """Encode pre-split sentences with the module-level ``tokenizer``.

    Each sentence is encoded with special tokens, then padded or truncated to
    exactly ``max_len`` tokens. A tqdm progress bar tracks the loop.

    Args:
        data: Indexable collection of sentences, each a list of words.
        max_len: Target length, in tokens, of every encoded sentence.

    Returns:
        ``(input_ids, attention_mask)`` — two arrays built by stacking the
        per-sentence results vertically.
    """
    encodings = [
        tokenizer.encode_plus(
            data[position],
            add_special_tokens=True,
            max_length=max_len,
            is_split_into_words=True,
            return_attention_mask=True,
            padding="max_length",
            truncation=True,
            return_tensors="np",
        )
        for position in tqdm(range(len(data)))
    ]

    stacked_ids = np.vstack([encoding["input_ids"] for encoding in encodings])
    stacked_mask = np.vstack([encoding["attention_mask"] for encoding in encodings])
    return stacked_ids, stacked_mask

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the training sentences for validation (fixed seed).
# NOTE: rebinds X_train / X_test / y_train / y_test from the LSTM pipeline.
X_train, X_test, y_train, y_test = train_test_split(sentences, encoded_labels, test_size=0.1, random_state=42)

# Token ids and attention masks for both parts.
input_ids, attention_mask = tokenize(X_train, max_len=max_sentence_length)
val_input_ids, val_attention_mask = tokenize(X_test, max_len=max_sentence_length)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2509/2509 [00:03<00:00, 828.57it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 279/279 [00:00<00:00, 796.66it/s]


In [None]:
def pad_labels(input_labels, max_len):
    """Right-pad every label-id list with zeros up to ``max_len``.

    Args:
        input_labels: Sequence of lists of label ids.
        max_len: Target length. Lists already longer than this are returned
            unchanged (no truncation).

    Returns:
        List of NumPy arrays, one per input list.
    """
    padded_labels = []
    for sentence_labels in input_labels:
        missing = max_len - len(sentence_labels)
        # A non-positive count makes the filler empty.
        filler = [0] * missing
        padded_labels.append(np.array(sentence_labels + filler))
    return padded_labels

In [None]:
# Pad the per-word label ids of both parts to max_sentence_length with 0
# (0 is the id of "O" in label_dictionary).
# NOTE(review): labels are per word while input_ids are per token (with
# special tokens added) — confirm that positions line up.
train_labels = pad_labels(y_train, max_sentence_length)
test_labels = pad_labels(y_test, max_sentence_length)

In [None]:
from tensorflow.keras.utils import to_categorical

# One-hot encode the padded label ids.
# This rebinds both names, so the cell is not idempotent.
train_labels = [to_categorical(index, num_classes=len(label_list)) for index in train_labels]
test_labels = [to_categorical(index, num_classes=len(label_list)) for index in test_labels]

2024-04-12 16:22:58.227231: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-12 16:22:58.227347: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-12 16:22:58.331483: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-12 16:22:58.532493: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Dropout, Dense
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import Accuracy

def instantiate_model(bert_model, max_len):
    """Build and compile a token classifier on top of a BERT encoder.

    Args:
        bert_model: Encoder called as
            ``bert_model(input_ids, attention_mask=..., return_dict=True)``;
            its ``"last_hidden_state"`` output is used.
        max_len: Fixed sequence length of both inputs.

    Returns:
        A compiled Keras model taking ``[input_ids, attention_mask]`` and
        producing, for every position, a softmax over ``label_list``.
        It expects one-hot encoded targets.
    """
    input_ids = Input(shape=(max_len, ), dtype="int32")
    attention_mask = Input(shape=(max_len, ), dtype="int32")
    bert_layer = bert_model(input_ids, attention_mask=attention_mask, return_dict=True)

    # Dropout on the per-token hidden states, then a per-token softmax head.
    embedding_layer = Dropout(0.3)(bert_layer["last_hidden_state"])
    output_layer = Dense(len(label_list), activation="softmax")(embedding_layer)

    model = Model(inputs=[input_ids, attention_mask], outputs=[output_layer])

    # The targets fed to fit() in this notebook are one-hot encoded with
    # to_categorical, so the loss must be the categorical (not the sparse)
    # cross-entropy. The metric is categorical accuracy: the plain Accuracy
    # metric checks exact equality between targets and predicted
    # probabilities, which is not a classification accuracy.
    model.compile(optimizer=Adam(learning_rate=0.00001),
                  loss="categorical_crossentropy",
                  metrics=["categorical_accuracy"])

    return model

In [None]:
from transformers import TFBertModel

# Pretrained BERT encoder wrapped into the token classifier (rebinds `model`).
bert_model = TFBertModel.from_pretrained("bert-base-uncased")
model = instantiate_model(bert_model, max_sentence_length)

2024-04-12 16:23:12.269866: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-04-12 16:23:12.281327: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-04-12 16:23:12.282080: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

In [None]:
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping

# Stop training once the validation loss has failed to improve for 5
# consecutive epochs. `monitor` is spelled out so that mode="min" is visibly
# tied to the quantity it applies to (val_loss is also the Keras default).
early_stopping_callback = EarlyStopping(monitor="val_loss", mode="min", patience=5)

# The previous batch_size=32 run died with ResourceExhaustedError: a single
# attention-score tensor of shape [32, 12, 902, 902] in float32 is ~1.16 GiB,
# and one is needed per encoder layer. Attention memory scales linearly with
# batch size, so a small batch keeps the same model trainable on this GPU.
# Lower it further if the allocator still runs out of memory.
TRAIN_BATCH_SIZE = 4

bert_history = model.fit(
    [input_ids, attention_mask],
    np.array(train_labels),
    validation_data=([val_input_ids, val_attention_mask], np.array(test_labels)),
    epochs=25,
    batch_size=TRAIN_BATCH_SIZE,
    # `callbacks` is documented as a list of callback instances.
    callbacks=[early_stopping_callback],
)

2024-04-12 16:23:31.690298: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 18104944 exceeds 10% of free system memory.
2024-04-12 16:23:31.733990: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 18104944 exceeds 10% of free system memory.


Epoch 1/25


2024-04-12 16:23:31.987557: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 90524720 exceeds 10% of free system memory.
2024-04-12 16:23:32.108140: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 18104944 exceeds 10% of free system memory.




2024-04-12 16:23:55.225094: I external/local_tsl/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2024-04-12 16:24:06.739342: W external/local_tsl/tsl/framework/bfc_allocator.cc:485] Allocator (GPU_0_bfc) ran out of memory trying to allocate 1.16GiB (rounded to 1249695744)requested by op model/tf_bert_model/bert/encoder/layer_._0/attention/self/MatMul
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. 
Current allocation summary follows.
Current allocation summary follows.
2024-04-12 16:24:06.739421: I external/local_tsl/tsl/framework/bfc_allocator.cc:1039] BFCAllocator dump for GPU_0_bfc
2024-04-12 16:24:06.739450: I external/local_tsl/tsl/framework/bfc_allocator.cc:1046] Bin (256): 	Total Chunks: 69, Chunks in use: 68. 17.2KiB allocated for chunks. 17.0KiB in use in bin. 420B client-requested in use in bin.
2024-04-12 16:24:06.739469: I external/local_t

2024-04-12 16:24:06.744257: I external/local_tsl/tsl/framework/bfc_allocator.cc:1095] InUse at 7f684ee45200 of size 2359296 next 102
2024-04-12 16:24:06.744264: I external/local_tsl/tsl/framework/bfc_allocator.cc:1095] InUse at 7f684f085200 of size 2359296 next 104
2024-04-12 16:24:06.744271: I external/local_tsl/tsl/framework/bfc_allocator.cc:1095] InUse at 7f684f2c5200 of size 2359296 next 77
2024-04-12 16:24:06.744287: I external/local_tsl/tsl/framework/bfc_allocator.cc:1095] InUse at 7f684f505200 of size 9437184 next 78
2024-04-12 16:24:06.744294: I external/local_tsl/tsl/framework/bfc_allocator.cc:1095] InUse at 7f684fe05200 of size 9437184 next 80
2024-04-12 16:24:06.744301: I external/local_tsl/tsl/framework/bfc_allocator.cc:1095] InUse at 7f6850705200 of size 2359296 next 115
2024-04-12 16:24:06.744308: I external/local_tsl/tsl/framework/bfc_allocator.cc:1095] InUse at 7f6850945200 of size 2359296 next 117
2024-04-12 16:24:06.744315: I external/local_tsl/tsl/framework/bfc_alloc

ResourceExhaustedError: Graph execution error:

Detected at node model/tf_bert_model/bert/encoder/layer_._0/attention/self/MatMul defined at (most recent call last):
  File "/home/alex_braun/anaconda3/lib/python3.9/runpy.py", line 197, in _run_module_as_main

  File "/home/alex_braun/anaconda3/lib/python3.9/runpy.py", line 87, in _run_code

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/ipykernel_launcher.py", line 17, in <module>

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/traitlets/config/application.py", line 846, in launch_instance

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/ipykernel/kernelapp.py", line 712, in start

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/tornado/platform/asyncio.py", line 199, in start

  File "/home/alex_braun/anaconda3/lib/python3.9/asyncio/base_events.py", line 601, in run_forever

  File "/home/alex_braun/anaconda3/lib/python3.9/asyncio/base_events.py", line 1905, in _run_once

  File "/home/alex_braun/anaconda3/lib/python3.9/asyncio/events.py", line 80, in _run

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 510, in dispatch_queue

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 499, in process_one

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 406, in dispatch_shell

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 730, in execute_request

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/ipykernel/ipkernel.py", line 390, in do_execute

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/ipykernel/zmqshell.py", line 528, in run_cell

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 2914, in run_cell

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 2960, in _run_cell

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/IPython/core/async_helpers.py", line 78, in _pseudo_sync_runner

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3185, in run_cell_async

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3377, in run_ast_nodes

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3457, in run_code

  File "/tmp/ipykernel_8515/3556622529.py", line 6, in <module>

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/keras/src/engine/training.py", line 1807, in fit

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/keras/src/engine/training.py", line 1401, in train_function

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/keras/src/engine/training.py", line 1384, in step_function

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/keras/src/engine/training.py", line 1373, in run_step

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/keras/src/engine/training.py", line 1150, in train_step

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/keras/src/engine/training.py", line 590, in __call__

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/keras/src/engine/base_layer.py", line 1149, in __call__

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/keras/src/utils/traceback_utils.py", line 96, in error_handler

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/keras/src/engine/functional.py", line 515, in call

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/keras/src/engine/functional.py", line 672, in _run_internal_graph

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/keras/src/engine/training.py", line 590, in __call__

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/keras/src/engine/base_layer.py", line 1149, in __call__

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/keras/src/utils/traceback_utils.py", line 96, in error_handler

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/transformers/modeling_tf_utils.py", line 1207, in run_call_with_unpacked_inputs

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/transformers/models/bert/modeling_tf_bert.py", line 1234, in call

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/keras/src/engine/base_layer.py", line 1149, in __call__

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/keras/src/utils/traceback_utils.py", line 96, in error_handler

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/transformers/modeling_tf_utils.py", line 1207, in run_call_with_unpacked_inputs

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/transformers/models/bert/modeling_tf_bert.py", line 994, in call

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/keras/src/engine/base_layer.py", line 1149, in __call__

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/keras/src/utils/traceback_utils.py", line 96, in error_handler

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/transformers/models/bert/modeling_tf_bert.py", line 628, in call

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/transformers/models/bert/modeling_tf_bert.py", line 634, in call

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/keras/src/engine/base_layer.py", line 1149, in __call__

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/keras/src/utils/traceback_utils.py", line 96, in error_handler

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/transformers/models/bert/modeling_tf_bert.py", line 527, in call

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/keras/src/engine/base_layer.py", line 1149, in __call__

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/keras/src/utils/traceback_utils.py", line 96, in error_handler

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/transformers/models/bert/modeling_tf_bert.py", line 411, in call

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/keras/src/engine/base_layer.py", line 1149, in __call__

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/keras/src/utils/traceback_utils.py", line 96, in error_handler

  File "/home/alex_braun/anaconda3/lib/python3.9/site-packages/transformers/models/bert/modeling_tf_bert.py", line 315, in call

OOM when allocating tensor with shape[32,12,902,902] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node model/tf_bert_model/bert/encoder/layer_._0/attention/self/MatMul}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_train_function_34705]