<a href="https://colab.research.google.com/github/667029/KVP10k/blob/main/layoutlmv3_kvp10k_relation_layer_v3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets torch transformers accelerate numpy tqdm tensorflow scikit-learn



In [None]:
# Define the Google Drive mount location.
drive_mount_path='/content/drive'

In [None]:
# Shared hyperparameters read by the cells below.
hidden_size = 768           # Size of the hidden vectors.
batch_size = 32             # Batch size.
test_size_percentage = 20   # Percentage of the dataset held out for testing.
seed = 42                   # Seed for reproducibility.

In [None]:
import torch
from datasets import load_from_disk
import tensorflow as tf
import numpy as np


class DataUtil:
    """Helpers for loading relation-extraction data and feeding it to TensorFlow."""

    @staticmethod
    def get_output(dataset_path=None, model_output=None, drive_mount_path=drive_mount_path,
                   extract_hidden=False, verbose=True):
        """
        Return a dataset loaded from disk, or pass through a given model output.

        Args:
            dataset_path: Path handed to ``load_from_disk``. When given, Google
                Drive is mounted first (best effort) and the loaded dataset is
                returned; ``model_output`` is ignored.
            model_output: Object returned as-is when ``dataset_path`` is None
                (optionally reduced, see ``extract_hidden``).
            drive_mount_path: Mount point passed to ``drive.mount``.
            extract_hidden: If True, a dict containing ``"hidden_states"`` is
                replaced by that entry, and a list/tuple by its last element.
            verbose: Print progress information.

        Returns:
            The loaded dataset, or the (possibly reduced) ``model_output``.

        Raises:
            ValueError: If neither ``dataset_path`` nor ``model_output`` is given.
        """
        if dataset_path is not None:
            # Mounting is best effort: outside Colab the import fails, and the
            # dataset may still be reachable on a local path.
            try:
                from google.colab import drive
            except ImportError:
                if verbose:
                    print("google.colab is not available; skipping Google Drive mount.")
            else:
                try:
                    drive.mount(drive_mount_path, force_remount=False)
                except Exception as e:
                    # Report the real cause instead of guessing, but keep going so
                    # that load_from_disk can still succeed or fail on its own.
                    if verbose:
                        print(f"Could not mount Google Drive ({e!r}); "
                              "it may already be mounted. Continuing...")
            dataset = load_from_disk(dataset_path)
            if verbose:
                print("Loaded dataset from drive.")
                print("Number of samples:", len(dataset))
                print("Column names:", dataset.column_names)
                print("First sample:", dataset[0])
            return dataset
        elif model_output is not None:
            if extract_hidden:
                if isinstance(model_output, dict) and "hidden_states" in model_output:
                    model_output = model_output["hidden_states"]
                elif isinstance(model_output, (list, tuple)):
                    model_output = model_output[-1]
            if verbose:
                print("Provided model output:")
                print(model_output)
            return model_output
        else:
            raise ValueError("Please provide either a dataset_path or a model_output.")

    @staticmethod
    def to_tf_dataset(dataset, hidden_size, batch_size=32, shuffle_buffer=1000, verbose=True):
        """
        Convert a Hugging Face dataset into a shuffled, batched tf.data.Dataset.

        Each sample must contain the keys "h_i", "h_j" and "label". If the label
        is a sequence, its first element is used. The rank of the first sample's
        "h_i" selects the batching strategy:

        * 1D vectors of length ``hidden_size`` are stacked with ``batch``.
        * 2D sequences of shape (seq_len, hidden_size) are batched with
          ``padded_batch`` so that differing ``seq_len`` values are padded to
          the longest sequence in each batch.

        Args:
            dataset: Indexable, iterable dataset of dict-like samples.
            hidden_size: Length of the last feature dimension.
            batch_size: Number of samples per batch.
            shuffle_buffer: Buffer size passed to ``Dataset.shuffle``.
            verbose: Print a summary of the created pipeline.

        Returns:
            A tf.data.Dataset yielding ``(h_i, h_j, label)`` batches.

        Raises:
            ValueError: If the first sample's "h_i" is neither 1D nor 2D.
        """
        def generator():
            for sample in dataset:
                h_i = np.array(sample["h_i"], dtype=np.float32)
                h_j = np.array(sample["h_j"], dtype=np.float32)
                label_arr = np.array(sample["label"], dtype=np.int32)
                # Flattening first yields the first element for scalar and
                # sequence labels alike.
                yield (h_i, h_j, label_arr.reshape(-1)[0])

        # Use the first sample to determine the feature rank.
        h_i_first = np.array(dataset[0]["h_i"], dtype=np.float32)
        if h_i_first.ndim == 1:
            feature_shape = (hidden_size,)
        elif h_i_first.ndim == 2:
            # seq_len may vary between samples, so leave it unspecified.
            feature_shape = (None, hidden_size)
        else:
            raise ValueError("Expected h_i to be 1D or 2D; got shape " + str(h_i_first.shape))

        output_signature = (
            tf.TensorSpec(shape=feature_shape, dtype=tf.float32),
            tf.TensorSpec(shape=feature_shape, dtype=tf.float32),
            tf.TensorSpec(shape=(), dtype=tf.int32),
        )
        # Build the pipeline exactly once and return what was built; the batching
        # step is the only part that depends on the feature rank.
        tf_dataset = tf.data.Dataset.from_generator(
            generator, output_signature=output_signature
        ).shuffle(shuffle_buffer)
        if h_i_first.ndim == 1:
            tf_dataset = tf_dataset.batch(batch_size)
        else:
            tf_dataset = tf_dataset.padded_batch(
                batch_size,
                padded_shapes=([None, hidden_size], [None, hidden_size], []),
            )

        if verbose:
            print(f"Converted dataset to tf.data.Dataset with batch size {batch_size} and h_i shape {feature_shape}.")
        return tf_dataset



In [None]:
# 1. Load the dataset from Google Drive and split it into train/test subsets.
dataset_path = "/content/drive/MyDrive/RE_ready/re_dataset_test_combined"
loaded_dataset = DataUtil.get_output(dataset_path=dataset_path)
split_dataset = loaded_dataset.train_test_split(test_size=test_size_percentage/100, seed=seed)
train_dataset_hf = split_dataset["train"]
test_dataset_hf = split_dataset["test"]

# 2. Alternative (disabled): build the dataset from the LayoutLMv3 model output.
# NOTE(review): no save_output_as_dataset method is visible on DataUtil here — confirm before re-enabling.
# model_output = outputs
# dataset = DataUtil.save_output_as_dataset(model_output=model_output, verbose=True)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loaded dataset from drive.
Number of samples: 143476
Column names: ['h_i', 'h_j', 'label']
First sample: {'h_i': [-0.03574247285723686, -0.015568173490464687, 0.04433594271540642, -0.2551768124103546, 0.5431303381919861, -0.052284833043813705, 0.1384091079235077, 0.27487412095069885, -0.17961081862449646, -0.10797158628702164, -0.46370697021484375, -0.14926613867282867, -0.09300041943788528, 0.06408052891492844, 0.12591618299484253, -0.3600463271141052, -0.06332714110612869, -0.20246948301792145, -0.25451621413230896, 0.36145108938217163, -0.012236740440130234, -0.0203297920525074, 0.12791365385055542, -0.06409116834402084, -0.023888980969786644, 0.1217065379023552, 0.14684250950813293, -0.341422438621521, -0.08902009576559067, 0.03113497979938984, -0.3619805574417114, -0.2470576912164688, -0.39857348799705505, -0.16561433672904968, -0.2645741105079651, -0.19

In [None]:
import numpy as np
import tensorflow as tf

def _print_first_batch_shapes(tf_dataset):
    """Print the tensor shapes of the first batch of a (h_i, h_j, label) dataset."""
    for h_i_batch, h_j_batch, labels_batch in tf_dataset.take(1):
        print("h_i batch shape:", h_i_batch.shape)  # Expect (batch_size, 768)
        print("h_j batch shape:", h_j_batch.shape)  # Expect (batch_size, 768)
        print("labels batch shape:", labels_batch.shape)  # Expect (batch_size,)


# Create the data loaders. The shared batch_size hyperparameter is used (rather
# than a literal) so the batching agrees with the batch counts computed from
# batch_size elsewhere in the notebook.
train_dataset = DataUtil.to_tf_dataset(train_dataset_hf, hidden_size=hidden_size, batch_size=batch_size)
# Visual check that the dataset layout matches expectations.
_print_first_batch_shapes(train_dataset)

test_dataset = DataUtil.to_tf_dataset(test_dataset_hf, hidden_size=hidden_size, batch_size=batch_size)
_print_first_batch_shapes(test_dataset)

Converted dataset to tf.data.Dataset with batch size 32 and h_i shape (768,).
h_i batch shape: (32, 768)
h_j batch shape: (32, 768)
labels batch shape: (32,)
Converted dataset to tf.data.Dataset with batch size 32 and h_i shape (768,).
h_i batch shape: (32, 768)
h_j batch shape: (32, 768)
labels batch shape: (32,)


In [None]:
import tensorflow as tf

# Define the relation extraction model as a subclass of tf.keras.Model.
class RelationExtractionHead(tf.keras.Model):
    """Relation classifier over a pair of entity representations.

    The head and tail vectors are each passed through their own ReLU dense
    projection, combined by element-wise multiplication, and mapped to
    ``num_relations`` logits by a final dense layer.
    """

    def __init__(self, hidden_size, num_relations):
        super().__init__()
        # Separate projections for the two sides of the pair.
        self.head_proj = tf.keras.layers.Dense(
            hidden_size, activation='relu', name="head_proj"
        )
        self.tail_proj = tf.keras.layers.Dense(
            hidden_size, activation='relu', name="tail_proj"
        )
        # Classifier producing unnormalized scores (no activation).
        self.combined_dense = tf.keras.layers.Dense(
            num_relations, name="combined_dense"
        )

    def call(self, inputs, training=False):
        """Return relation logits for ``inputs``, a ``(h_i, h_j)`` pair.

        ``training`` is accepted for Keras API compatibility and is not used.
        """
        head_repr, tail_repr = inputs
        # Project each side, then fuse them with an element-wise product.
        interaction = self.head_proj(head_repr) * self.tail_proj(tail_repr)
        return self.combined_dense(interaction)

In [None]:
import tensorflow as tf
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import random


num_relations = 5                                 # Number of relation classes.
num_epochs = 5                                    # Number of training epochs.
learning_rate = 1e-4                              # Learning rate.

num_samples = len(train_dataset_hf)
# Each epoch trains on a random 65-75% subsample of the training set.
# random.randint requires integer bounds (non-integral floats are rejected on
# recent Python versions), so the bounds are truncated to int here.
sample_range_min = max(1, int(num_samples * 0.65))                  # Minimum samples per epoch.
sample_range_max = max(sample_range_min, int(num_samples * 0.75))   # Maximum samples per epoch.

model_tf = RelationExtractionHead(hidden_size=hidden_size, num_relations=num_relations)

# The model outputs raw logits, hence from_logits=True.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# Batch count for a full pass; recomputed per epoch for the sampled subset.
total_batches = int(np.ceil(num_samples / batch_size))
loss_history = []

for epoch in range(num_epochs):
    samples_this_epoch = random.randint(sample_range_min, sample_range_max)
    total_batches = int(np.ceil(samples_this_epoch / batch_size))
    # Re-shuffle at sample level, keep only this epoch's subsample, re-batch.
    epoch_dataset = (
        train_dataset.unbatch()
        .shuffle(buffer_size=10000, reshuffle_each_iteration=True)
        .take(samples_this_epoch)
        .batch(batch_size)
    )

    epoch_loss = 0.0
    batches = 0

    progress_bar = tqdm(epoch_dataset, desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch", total=total_batches)
    for h_i_batch, h_j_batch, labels_batch in progress_bar:
        with tf.GradientTape() as tape:
            logits = model_tf((h_i_batch, h_j_batch), training=True)
            loss_value = loss_fn(labels_batch, logits)
        grads = tape.gradient(loss_value, model_tf.trainable_variables)
        optimizer.apply_gradients(zip(grads, model_tf.trainable_variables))

        # Convert the loss tensor once and reuse the value.
        batch_loss = float(loss_value.numpy())
        epoch_loss += batch_loss
        batches += 1
        progress_bar.set_postfix(loss=batch_loss)

    if batches == 0:
        # An empty dataset yields no batches; avoid dividing by zero.
        print(f"Epoch {epoch+1}/{num_epochs}: no batches were produced; skipping.")
        continue

    avg_loss = epoch_loss / batches
    loss_history.append(avg_loss)
    print(f"Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

plt.figure(figsize=(8, 5))
# Size the x-axis from the recorded history so x and y always have equal length.
plt.plot(range(1, len(loss_history) + 1), loss_history, marker='o', linestyle='-')
plt.title("Training Loss Over Epochs")
plt.xlabel("Epoch")
plt.ylabel("Average Loss")
plt.grid(True)
plt.show()


Epoch 1/5:  68%|██████▊   | 1741/2578 [02:30<01:04, 12.95batch/s, loss=0.000275]

In [None]:
import tensorflow as tf
import numpy as np
from sklearn.metrics import classification_report


class RelationModelTesterTF:
    """Evaluate a relation classifier on a dataset of ``(h_i, h_j, label)`` tuples."""

    def __init__(self, model, dataset, batch_size=32):
        """
        Args:
            model: Callable taking ``((h_i, h_j), training=False)`` and returning
                logits whose last axis indexes the relation classes.
            dataset: Dataset yielding ``(h_i, h_j, label)``. It may be unbatched
                (it is then batched with ``batch_size``) or already batched
                (it is then used unchanged and ``batch_size`` is ignored).
            batch_size: Batch size applied to an unbatched ``dataset``.
        """
        self.model = model
        # Batching an already batched dataset would add a second batch axis, so
        # only batch when the elements are still single samples.
        if self._is_already_batched(dataset):
            self.dataset = dataset
        else:
            self.dataset = dataset.batch(batch_size)

    @staticmethod
    def _is_already_batched(dataset):
        """Return True if the label component of ``dataset`` has a batch axis.

        A per-sample label is a scalar (rank 0); a label spec of rank >= 1
        therefore means the dataset has been batched. Datasets that do not
        expose a tuple ``element_spec`` are treated as unbatched.
        """
        spec = getattr(dataset, "element_spec", None)
        if not isinstance(spec, (tuple, list)) or not spec:
            return False
        label_rank = getattr(getattr(spec[-1], "shape", None), "rank", None)
        return label_rank is not None and label_rank >= 1

    def evaluate(self, total_batches=None):
        """
        Run the model over the dataset and print a classification report.

        Args:
            total_batches: Optional batch count, used only as the progress-bar
                total. When None the bar runs without a total rather than
                making an extra pass over the dataset just to count batches.

        Returns:
            dict with ``"predictions"`` and ``"ground_truth"`` (1D arrays),
            ``"probabilities"`` (softmax over the logits, one row per sample)
            and ``"accuracy"`` (fraction of correct predictions).
        """
        all_predictions = []
        all_ground_truths = []
        probability_chunks = []

        progress_bar = tqdm(self.dataset, desc="Evaluating", unit="batch", total=total_batches)
        for h_i_batch, h_j_batch, labels_batch in progress_bar:
            logits = self.model((h_i_batch, h_j_batch), training=False)
            preds = tf.argmax(logits, axis=-1)
            all_predictions.extend(preds.numpy().tolist())
            all_ground_truths.extend(labels_batch.numpy().tolist())
            probability_chunks.append(tf.nn.softmax(logits, axis=-1).numpy())

        all_predictions = np.array(all_predictions)
        all_ground_truths = np.array(all_ground_truths)
        if probability_chunks:
            all_probabilities = np.concatenate(probability_chunks, axis=0)
        else:
            all_probabilities = np.empty((0, 0), dtype=np.float32)

        accuracy = np.mean(all_predictions == all_ground_truths)
        report = classification_report(all_ground_truths, all_predictions, digits=4)
        print("Classification Report:\n", report)
        print("Overall Accuracy: {:.2f}%".format(accuracy * 100))

        return {
            "predictions": all_predictions,
            "ground_truth": all_ground_truths,
            "probabilities": all_probabilities,
            "accuracy": accuracy
        }

In [None]:
# Evaluate on the held-out split so the reported "Test Accuracy" is really a
# test metric. test_dataset is already batched, so it is unbatched before being
# handed to the tester, which applies its own batching.
tester = RelationModelTesterTF(model_tf, test_dataset.unbatch(), batch_size=batch_size)
results = tester.evaluate()

print("Test Accuracy: {:.2f}%".format(results["accuracy"] * 100))
print("Example Predictions (first 10 samples):", results["predictions"][:10])

# Print out a mapping for a few samples (e.g., the first 10).
# Probabilities are shown only when the evaluation result provides them.
probabilities = results.get("probabilities")
print("Sample-wise Predictions:")
for i, (true_label, predicted_label) in enumerate(
    zip(results["ground_truth"][:10], results["predictions"][:10])
):
    line = f"Sample {i}: True Label = {true_label} | Predicted Label = {predicted_label}"
    if probabilities is not None:
        line += f" | Probabilities = {probabilities[i]}"
    print(line)