<a href="https://colab.research.google.com/github/667029/KVP10k/blob/main/layoutlmv3_kvp10k_relation_layer_v6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets torch transformers accelerate numpy tqdm tensorflow scikit-learn

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_c

In [None]:
# Directory where Google Drive is mounted; passed to drive.mount() by DataUtil.get_output.
drive_mount_path = '/content/drive'

In [None]:
# Experiment-wide settings read by the cells below.
hidden_size = 768  # Length of each h_i / h_j feature vector.
batch_size = 32  # Number of samples per batch.
test_size_percentage = 10  # Share (in %) of the training data held out by train_test_split.
seed = 42  # Seed passed to train_test_split for a reproducible split.

In [None]:
import torch
from datasets import load_from_disk
import tensorflow as tf
import numpy as np


class DataUtil:
    """Helpers for loading saved datasets and converting them to `tf.data` pipelines."""

    @staticmethod
    def get_output(dataset_path=None, model_output=None, drive_mount_path=drive_mount_path, extract_hidden=False, verbose=True):
        """Return a dataset loaded from disk, or pass a model output through.

        Args:
            dataset_path: Path handed to `datasets.load_from_disk`. When given, it
                takes precedence over `model_output`.
            model_output: An already-computed model output to return instead.
            drive_mount_path: Mount point used for the best-effort Google Drive mount.
            extract_hidden: If True, a dict containing "hidden_states" is replaced
                by that entry; a list/tuple is replaced by its last element.
            verbose: Print progress and a short summary of what was loaded.

        Returns:
            The loaded dataset, or the (optionally reduced) `model_output`.

        Raises:
            ValueError: If neither `dataset_path` nor `model_output` is provided.
        """
        if dataset_path is not None:
            # Mounting is best-effort: outside Colab there is nothing to mount, and
            # the path may already be reachable. Report the real reason instead of
            # guessing, then let load_from_disk decide whether the path is usable.
            try:
                from google.colab import drive
            except ImportError:
                if verbose:
                    print("google.colab is not available; skipping Google Drive mount.")
            else:
                try:
                    drive.mount(drive_mount_path, force_remount=False)
                except Exception as e:
                    if verbose:
                        print(f"Could not mount Google Drive ({e!r}). Continuing...")

            dataset = load_from_disk(dataset_path)
            if verbose:
                print("Loaded dataset from drive.")
                print("Number of samples:", len(dataset))
                print("Column names:", dataset.column_names)
                # Guard: indexing an empty dataset would raise.
                if len(dataset) > 0:
                    print("First sample:", dataset[0])
            return dataset

        if model_output is not None:
            if extract_hidden:
                if isinstance(model_output, dict) and "hidden_states" in model_output:
                    model_output = model_output["hidden_states"]
                elif isinstance(model_output, (list, tuple)):
                    model_output = model_output[-1]
            if verbose:
                print("Provided model output:")
                print(model_output)
            return model_output

        raise ValueError("Please provide either a dataset_path or a model_output.")

    @staticmethod
    def to_tf_dataset(dataset, hidden_size, batch_size=32, shuffle_buffer=1000, verbose=True, shuffle=True):
        """Convert a dataset of relation samples into a batched `tf.data.Dataset`.

        Each sample must provide "h_i", "h_j" and "label"; "key" is optional and
        defaults to "unknown". Elements are yielded as (h_i, h_j, label, key).

        Args:
            dataset: Indexable, iterable collection of sample dicts.
            hidden_size: Size of the last axis of h_i / h_j.
            batch_size: Number of samples per batch.
            shuffle_buffer: Buffer size passed to `Dataset.shuffle`.
            verbose: Print a confirmation message.
            shuffle: Set to False to keep the samples in their original order
                (needed when predictions must be matched back to `dataset`).

        Returns:
            A batched `tf.data.Dataset`. Rank-1 features are batched directly;
            rank-2 features are padded along their first axis per batch.

        Raises:
            ValueError: If `dataset` is empty or h_i is neither 1D nor 2D.
        """
        if len(dataset) == 0:
            raise ValueError("Cannot convert an empty dataset to a tf.data.Dataset.")

        def generator():
            for sample in dataset:
                h_i = np.array(sample["h_i"], dtype=np.float32)
                h_j = np.array(sample["h_j"], dtype=np.float32)
                label_arr = np.array(sample["label"], dtype=np.int32)
                # Labels may be stored as a scalar or as a sequence; use the first entry.
                label = label_arr[0] if label_arr.ndim > 0 else label_arr
                key = sample["key"] if "key" in sample else "unknown"
                yield (h_i, h_j, label, key)

        # The first sample decides whether features are vectors or sequences.
        first_shape = np.array(dataset[0]["h_i"], dtype=np.float32).shape
        rank = len(first_shape)
        if rank == 1:
            feature_shape = (hidden_size,)
        elif rank == 2:
            feature_shape = (None, hidden_size)
        else:
            raise ValueError("Expected h_i to be 1D or 2D; got shape " + str(first_shape))

        output_signature = (
            tf.TensorSpec(shape=feature_shape, dtype=tf.float32),
            tf.TensorSpec(shape=feature_shape, dtype=tf.float32),
            tf.TensorSpec(shape=(), dtype=tf.int32),
            tf.TensorSpec(shape=(), dtype=tf.string),
        )
        ds = tf.data.Dataset.from_generator(generator, output_signature=output_signature)
        if shuffle:
            ds = ds.shuffle(shuffle_buffer)

        if rank == 1:
            ds = ds.batch(batch_size)
        else:
            # Sequences can differ in length, so pad each batch to its longest member.
            ds = ds.padded_batch(
                batch_size,
                padded_shapes=([None, hidden_size], [None, hidden_size], [], [])
            )

        if verbose:
            print(f"Converted dataset to tf.data.Dataset with batch size {batch_size}.")
        return ds



In [None]:
# 1. Load the datasets from Google Drive.
train_dataset_path = "/content/drive/MyDrive/RE_ready/re_dataset_train_combined"
test_dataset_path = "/content/drive/MyDrive/RE_ready/re_dataset_test_combined"

# Training data: loaded once, then split into train / eval parts.
# test_size_percentage is a percentage, train_test_split expects a fraction.
train_loaded_dataset = DataUtil.get_output(dataset_path=train_dataset_path)
split_dataset = train_loaded_dataset.train_test_split(
    test_size=test_size_percentage/100,
    seed=seed,
)
train_dataset_hf = split_dataset["train"]  # used for training
eval_dataset_hf = split_dataset["test"]    # held-out part of the training data

# Separate test dataset, used for inference / final evaluation.
test_loaded_dataset = DataUtil.get_output(dataset_path=test_dataset_path)
test_dataset_hf = test_loaded_dataset


# 2. Alternative: build the dataset from the output of the LayoutLMv3 model.
# NOTE(review): DataUtil as defined in this notebook has no save_output_as_dataset
# method, so the lines below would fail if uncommented.
# model_output = outputs
# dataset = DataUtil.save_output_as_dataset(model_output=model_output, verbose=True)


In [None]:
import numpy as np
import tensorflow as tf

def preview_first_batch(tf_dataset):
    """Print the tensor shapes of the first (h_i, h_j, labels, keys) batch."""
    for h_i_batch, h_j_batch, labels_batch, keys_batch in tf_dataset.take(1):
        print("h_i batch shape:", h_i_batch.shape)
        print("h_j batch shape:", h_j_batch.shape)
        print("labels batch shape:", labels_batch.shape)
        print("keys batch shape:", keys_batch.shape)


# Build the training and test pipelines with the batch size from the config cell.
train_dataset = DataUtil.to_tf_dataset(train_dataset_hf, hidden_size=hidden_size, batch_size=batch_size)
preview_first_batch(train_dataset)

# The test pipeline must keep the order of test_dataset_hf, because predictions
# are later matched back to its rows. A shuffle buffer of 1 leaves the order
# unchanged.
test_dataset = DataUtil.to_tf_dataset(
    test_dataset_hf,
    hidden_size=hidden_size,
    batch_size=batch_size,
    shuffle_buffer=1,
)
preview_first_batch(test_dataset)

In [None]:
import numpy as np
import tensorflow as tf

class Bilinear(tf.keras.layers.Layer):
    """Bilinear scorer: logits[b, r] = head[b]^T · W[:, r, :] · tail[b] + bias[r].

    Args:
        hidden_size: Size of the head and tail vectors.
        num_relations: Number of relation classes (size of the output logits).
        **kwargs: Forwarded to `tf.keras.layers.Layer` (e.g. `name`).
    """

    def __init__(self, hidden_size, num_relations, **kwargs):
        super().__init__(**kwargs)
        self.hidden_size = hidden_size
        self.num_relations = num_relations

    def build(self, input_shape):
        """Create the weight tensor and bias."""
        # Weight tensor of shape (hidden_size, num_relations, hidden_size).
        self.W = self.add_weight(
            shape=(self.hidden_size, self.num_relations, self.hidden_size),
            initializer='glorot_uniform',
            trainable=True,
            name="bilinear_W"
        )
        # Bias vector of shape (num_relations,).
        self.bias = self.add_weight(
            shape=(self.num_relations,),
            initializer='zeros',
            trainable=True,
            name="bilinear_bias"
        )
        super().build(input_shape)

    def call(self, inputs):
        """Score a (head, tail) pair; both are expected as (batch, hidden_size).

        Returns:
            Logits of shape (batch, num_relations).
        """
        head, tail = inputs
        # Indices: b = batch, i = head feature, r = relation, j = tail feature.
        # The tail must share index j with W's last axis. With two different
        # letters there, einsum sums W and tail independently and the result is
        # no longer head^T W tail.
        logits = tf.einsum('bi,irj,bj->br', head, self.W, tail)
        return logits + self.bias

In [None]:
import tensorflow as tf

# Relation-extraction head: project both inputs, then score the pair bilinearly.
class RelationExtractionHead(tf.keras.Model):
    """Keras model mapping a pair of feature tensors to relation logits.

    Args:
        hidden_size: Output width of both Dense projections and input width of
            the bilinear layer.
        num_relations: Number of relation classes produced by the bilinear layer.
    """

    def __init__(self, hidden_size, num_relations):
        super().__init__()
        self.head_proj = tf.keras.layers.Dense(units=hidden_size, name="head_proj")
        self.tail_proj = tf.keras.layers.Dense(units=hidden_size, name="tail_proj")
        self.bilinear = Bilinear(hidden_size, num_relations)

    def call(self, inputs, training=False):
        """Return logits for `inputs = (h_i, h_j)`.

        Both tensors are expected to be (batch, hidden_size); the result is
        (batch, num_relations). `training` is accepted but not used here.
        """
        head_features, tail_features = inputs
        projected_head = self.head_proj(head_features)
        projected_tail = self.tail_proj(tail_features)
        return self.bilinear((projected_head, projected_tail))

In [None]:
import tensorflow as tf
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import random


# --- Hyperparameters ---
# NOTE(review): the ID2REL table in the last cell lists six classes (0-5) while
# only five logits are produced here; confirm that no label equals 5.
num_relations = 5                                         # Number of relation classes.
num_epochs = 1                                            # Number of training epochs.
learning_rate = 1e-4                                      # Learning rate.

# Seed the RNGs used below so the per-epoch sample count and the weight
# initialisation are repeatable across runs.
random.seed(seed)
tf.random.set_seed(seed)

# --- Epoch sizing ---
# Each epoch draws a sample count between 95% and 110% of an even per-epoch share.
num_train_samples = len(train_dataset_hf)
num_samples = num_train_samples
batches_per_pass = int(np.ceil(num_train_samples / batch_size))
sample_base = int(np.ceil(num_train_samples / num_epochs))
sample_range_min = int(sample_base * 0.95)
sample_range_max = int(sample_base * 1.10)

# --- Model, loss, optimizer ---
model_tf = RelationExtractionHead(hidden_size=hidden_size, num_relations=num_relations)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

loss_history = []

for epoch in range(num_epochs):
    samples_this_epoch = random.randint(sample_range_min, sample_range_max)
    # train_dataset yields batches, so `take` must be given a batch count, not a
    # sample count. One pass cannot deliver more than batches_per_pass batches.
    # NOTE(review): assumes train_dataset was built with the same batch_size.
    total_batches = min(int(np.ceil(samples_this_epoch / batch_size)), batches_per_pass)

    print(f"Epoch {epoch+1}/{num_epochs}, sampling {samples_this_epoch} examples...")
    epoch_loss = []
    epoch_batches = tqdm(
        train_dataset.take(total_batches),
        total=total_batches,
        desc=f"Epoch {epoch+1}",
        unit="batch",
    )
    for h_i_batch, h_j_batch, labels_batch, keys_batch in epoch_batches:
        # Sequence features (batch, seq, hidden) are mean-pooled to (batch, hidden).
        # NOTE(review): padded positions added by padded_batch are included in the mean.
        if len(h_i_batch.shape) == 3:
            h_i_batch = tf.reduce_mean(h_i_batch, axis=1)
            h_j_batch = tf.reduce_mean(h_j_batch, axis=1)

        with tf.GradientTape() as tape:
            logits = model_tf((h_i_batch, h_j_batch), training=True)
            loss = loss_fn(labels_batch, logits)
        grads = tape.gradient(loss, model_tf.trainable_variables)
        optimizer.apply_gradients(zip(grads, model_tf.trainable_variables))
        epoch_loss.append(loss.numpy())

    # An epoch without batches has no meaningful loss; record NaN instead of
    # letting np.mean warn on an empty list.
    avg_loss = float(np.mean(epoch_loss)) if epoch_loss else float("nan")
    loss_history.append(avg_loss)
    print(f"Epoch {epoch+1}, Average Loss: {avg_loss:.4f}")


# --- Loss curve ---
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(range(1, num_epochs+1), loss_history, marker='o', linestyle='-')
ax.set_title("Training Loss Over Epochs")
ax.set_xlabel("Epoch")
ax.set_ylabel("Average Loss")
ax.grid(True)
plt.show()

In [None]:
from sklearn.metrics import classification_report
import numpy as np
from tqdm import tqdm
import tensorflow as tf

class RelationModelTesterTF:
    """Runs a relation model over a batched dataset and collects its predictions.

    Args:
        model: Callable taking `((h_i, h_j), training=False)` and returning logits.
        dataset: Iterable of `(h_i, h_j, labels, keys)` batches.
        batch_size: Batch size used to derive the batch count from `num_samples`.
        num_samples: Total number of samples. When omitted, `dataset` is iterated
            once to count both samples and batches.
    """

    def __init__(self, model, dataset, batch_size=32, num_samples=None):
        self.model = model
        self.dataset = dataset
        self.batch_size = batch_size
        if num_samples:
            self.num_samples = num_samples
            self.total_batches = int(np.ceil(self.num_samples / batch_size))
        else:
            # Iterating the dataset yields batches, so count samples through the
            # leading axis of each label batch and count batches separately.
            sample_count = 0
            batch_count = 0
            for _, _, labels_batch, _ in dataset:
                sample_count += int(labels_batch.shape[0])
                batch_count += 1
            self.num_samples = sample_count
            self.total_batches = batch_count

    @staticmethod
    def _pool(features):
        """Mean-pool rank-3 (batch, seq, hidden) features over axis 1; pass others through."""
        if len(features.shape) == 3:
            return tf.reduce_mean(features, axis=1)
        return features

    def evaluate(self):
        """Predict every batch and return predictions together with their inputs.

        Returns:
            dict with "predictions", "ground_truth", "probabilities", "keys",
            "h_i", "h_j" (lists with one entry per sample) and "accuracy"
            (fraction of predictions equal to the ground truth; NaN when the
            dataset is empty).
        """
        all_predictions = []
        all_ground_truths = []
        all_probabilities = []
        all_keys = []
        all_h_i = []
        all_h_j = []

        progress_bar = tqdm(self.dataset, desc="Evaluating", unit="batch", total=self.total_batches)

        for h_i_batch, h_j_batch, labels_batch, keys_batch in progress_bar:
            # Pool both sides the same way; previously only h_j was pooled for
            # sequence inputs and h_i_pooled was left undefined.
            h_i_pooled = self._pool(h_i_batch)
            h_j_pooled = self._pool(h_j_batch)

            logits = self.model((h_i_pooled, h_j_pooled), training=False)
            probabilities = tf.nn.softmax(logits, axis=-1)
            predictions = tf.argmax(probabilities, axis=-1)

            all_predictions.extend(predictions.numpy())
            all_ground_truths.extend(labels_batch.numpy())
            all_probabilities.extend(probabilities.numpy())
            all_keys.extend(keys_batch.numpy())
            all_h_i.extend(h_i_pooled.numpy())
            all_h_j.extend(h_j_pooled.numpy())

        if all_predictions:
            accuracy = np.mean(np.array(all_predictions) == np.array(all_ground_truths))
        else:
            accuracy = float("nan")
        return {
            "predictions": all_predictions,
            "ground_truth": all_ground_truths,
            "accuracy": accuracy,
            "probabilities": all_probabilities,
            "keys": all_keys,
            "h_i": all_h_i,
            "h_j": all_h_j
        }

In [None]:
# Evaluate the trained model on the test pipeline built earlier.
tester = RelationModelTesterTF(
    model_tf,
    test_dataset,
    batch_size=32,
    num_samples=len(test_dataset_hf),
)
results = tester.evaluate()

# Headline number plus the first few raw predictions.
print("Test Accuracy: {:.2f}%".format(results["accuracy"] * 100))
print("Example Predictions (first 10 samples):", results["predictions"][:10])

# Per-sample view of up to ten results: key, labels, class probabilities and
# the leading components of both (pooled) input vectors.
print("Sample-wise Predictions:")
preview_count = min(10, len(results["predictions"]))
for i in range(preview_count):
    print(f"\nSample {i} (Key: {results['keys'][i]})")
    print("True Label      :", results["ground_truth"][i])
    print("Predicted Label :", results["predictions"][i])
    print("Probabilities   :", np.round(results["probabilities"][i], 4))
    print("h_i (first 5)   :", np.round(results["h_i"][i][:5], 4))
    print("h_j (first 5)   :", np.round(results["h_j"][i][:5], 4))

In [None]:
import os
import json
from datasets import load_from_disk
from collections import defaultdict
from data_util import DataUtil  # your utility class

# === CONFIG ===
# OCR JSON folder (mounted under MyDrive)
OCR_JSON_DIR = '/content/drive/MyDrive/KVP10K-dataset/kvp10k/test/ocrs'

# map integer label → relation name
# NOTE(review): six names are listed, but the model is built with
# num_relations = 5, so label 5 can never be predicted — confirm which is right.
ID2REL = {
    0: 'no_relation',
    1: 'parent_identifier',
    2: 'invoice_date',
    3: 'order_number',
    4: 'sum',
    5: 'customer_name',
}

# --- RUN INFERENCE ---
# Predictions are matched to rows of test_dataset_hf by position, so the input
# pipeline must keep the dataset order. to_tf_dataset shuffles by default; a
# shuffle buffer of 1 leaves the order unchanged.
ordered_test_dataset = DataUtil.to_tf_dataset(
    test_dataset_hf,
    hidden_size=hidden_size,
    batch_size=batch_size,
    shuffle_buffer=1,
    verbose=False,
)
ordered_tester = RelationModelTesterTF(
    model_tf,
    ordered_test_dataset,
    batch_size=batch_size,
    num_samples=len(test_dataset_hf),
)
results = ordered_tester.evaluate()
assert len(results['predictions']) == len(test_dataset_hf), \
    "number of predictions does not match the number of test samples"

# --- AGGREGATE TEXTUAL KEY/VALUE PAIRS ---
# Many pairs come from the same document; read each OCR file once.
_ocr_lines_by_doc = {}


def load_ocr_lines(doc_id):
    """Return the 'form' entries of the OCR JSON belonging to `doc_id`."""
    if doc_id not in _ocr_lines_by_doc:
        ocr_path = os.path.join(OCR_JSON_DIR, f"{doc_id}.json")
        # Context manager so the file handle is closed right after parsing.
        with open(ocr_path, encoding='utf-8') as ocr_file:
            _ocr_lines_by_doc[doc_id] = json.load(ocr_file)['form']
    return _ocr_lines_by_doc[doc_id]


matches = defaultdict(lambda: defaultdict(list))

for example, pred_id, prob_dist in zip(
        test_dataset_hf,
        results['predictions'],
        results['probabilities']
    ):
    pred_id  = int(pred_id)  # plain int for the dict lookup and indexing
    doc_id   = example['doc_id']
    head_idx = example['head_idx']
    tail_idx = example['tail_idx']
    rel_name = ID2REL[pred_id]
    score    = round(float(prob_dist[pred_id]), 4)

    # Look up the text of the head (key) and tail (value) lines.
    lines      = load_ocr_lines(doc_id)
    key_text   = lines[head_idx]['text']
    value_text = lines[tail_idx]['text']

    matches[doc_id][rel_name].append({
        'key': key_text,
        'value': value_text,
        'score': score,
    })

# --- PRINT TOP-SCORING PAIR PER DOC ---
for doc_id, rels in matches.items():
    print(f"\n📄 Document: {doc_id}")
    for rel, items in rels.items():
        best = max(items, key=lambda x: x['score'])
        print(f"{rel}: {best['value']}   "
              f"(key='{best['key']}', score={best['score']})")
