<a href="https://colab.research.google.com/github/667029/KVP10k/blob/main/layoutlmv3_kvp10k_relation_layer_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets torch transformers accelerate numpy tqdm tensorflow scikit-learn

In [2]:
# Location where Google Drive is mounted.
drive_mount_path = "/content/drive"

# Hidden-state constant: length of each h_i / h_j vector.
hidden_size = 768

In [3]:
import torch
from datasets import load_from_disk
import tensorflow as tf
import numpy as np


class DataUtil:
    """Helpers for loading relation-extraction data and feeding it to TensorFlow."""

    @staticmethod
    def _mount_drive(mount_path, verbose):
        """
        Try to mount Google Drive at ``mount_path``; never raises.

        Mounting is best effort: outside Colab, or when the mount fails, the
        caller still proceeds to ``load_from_disk``, which reports a missing
        path on its own.
        """
        try:
            from google.colab import drive
        except ImportError:
            if verbose:
                print("google.colab is not available; skipping Google Drive mount.")
            return
        try:
            drive.mount(mount_path, force_remount=False)
        except Exception as exc:  # best effort, but say what actually went wrong
            if verbose:
                print(f"Could not mount Google Drive at {mount_path}: {exc!r}. Continuing...")

    @staticmethod
    def get_output(dataset_path=None, model_output=None, drive_mount_path=drive_mount_path,
                   extract_hidden=False, verbose=True):
        """
        Return a Hugging Face dataset loaded from disk, or a given model output.

        Args:
            dataset_path: Path handed to ``load_from_disk``. When provided it
                takes precedence over ``model_output``.
            model_output: Returned (optionally unwrapped) when ``dataset_path``
                is None.
            drive_mount_path: Where Google Drive is mounted before loading.
            extract_hidden: If True, a dict containing "hidden_states" is
                replaced by that entry; otherwise a list/tuple is replaced by
                its last element.
            verbose: Print progress information.

        Returns:
            The loaded dataset, or the (possibly unwrapped) model output.

        Raises:
            ValueError: If neither ``dataset_path`` nor ``model_output`` is given.
        """
        if dataset_path is not None:
            DataUtil._mount_drive(drive_mount_path, verbose)
            dataset = load_from_disk(dataset_path)
            if verbose:
                print("Loaded dataset from drive.")
                print("Number of samples:", len(dataset))
                print("Column names:", dataset.column_names)
                print("First sample:", dataset[0])
            return dataset
        elif model_output is not None:
            if extract_hidden:
                # NOTE(review): a dict yields its whole "hidden_states" entry while a
                # list/tuple yields only its last element - confirm this asymmetry
                # is intended.
                if isinstance(model_output, dict) and "hidden_states" in model_output:
                    model_output = model_output["hidden_states"]
                elif isinstance(model_output, (list, tuple)):
                    model_output = model_output[-1]
            if verbose:
                print("Provided model output:")
                print(model_output)
            return model_output
        else:
            raise ValueError("Please provide either a dataset_path or a model_output.")

    @staticmethod
    def to_tf_dataset(dataset, hidden_size, batch_size=32, shuffle_buffer=1000, verbose=True):
        """
        Convert a Hugging Face dataset into a shuffled, batched tf.data.Dataset.

        Each sample must contain the keys "h_i", "h_j" and "label". "h_i" and
        "h_j" must be 1D vectors of length ``hidden_size``. If the label is a
        sequence, its first element is used.

        Args:
            dataset: Indexable, iterable collection of samples.
            hidden_size: Expected length of the "h_i" and "h_j" vectors.
            batch_size: Number of samples per emitted batch.
            shuffle_buffer: Buffer size passed to ``Dataset.shuffle``.
            verbose: Print a summary of the conversion.

        Returns:
            A tf.data.Dataset yielding ``(h_i, h_j, label)`` batches.

        Raises:
            ValueError: If "h_i" or "h_j" of the first sample is not a 1D
                vector of length ``hidden_size``.
        """
        def generator():
            for sample in dataset:
                h_i = np.asarray(sample["h_i"], dtype=np.float32)  # shape: (hidden_size,)
                h_j = np.asarray(sample["h_j"], dtype=np.float32)  # shape: (hidden_size,)
                label_arr = np.asarray(sample["label"], dtype=np.int32)
                label = label_arr[0] if label_arr.ndim > 0 else label_arr
                yield (h_i, h_j, label)

        # Validate the first sample up front so a shape mismatch is reported here
        # rather than from inside the generator while the dataset is iterated.
        first_sample = dataset[0]
        expected_shape = (hidden_size,)
        for key in ("h_i", "h_j"):
            vector = np.asarray(first_sample[key], dtype=np.float32)
            if vector.ndim != 1:
                raise ValueError(f"Expected {key} to be a 1D vector; got shape {vector.shape}")
            if vector.shape != expected_shape:
                raise ValueError(
                    f"Expected {key} to have shape {expected_shape} (hidden_size={hidden_size}); "
                    f"got shape {vector.shape}"
                )

        output_signature = (
            tf.TensorSpec(shape=(hidden_size,), dtype=tf.float32),
            tf.TensorSpec(shape=(hidden_size,), dtype=tf.float32),
            tf.TensorSpec(shape=(), dtype=tf.int32)
        )

        tf_dataset = tf.data.Dataset.from_generator(generator, output_signature=output_signature)
        tf_dataset = tf_dataset.shuffle(shuffle_buffer).batch(batch_size)
        if verbose:
            print(f"Converted dataset to tf.data.Dataset with batch size {batch_size} and h_i shape {(hidden_size,)}.")
        return tf_dataset



ModuleNotFoundError: No module named 'datasets'

In [None]:
# 1. Load the dataset from Google Drive.
dataset_path = "/content/drive/MyDrive/RE_ready/re_dataset_test_combined"
loaded_dataset = DataUtil.get_output(dataset_path=dataset_path)


# 2. Alternative: build the dataset from the output of the LayoutLMv3 model.
# NOTE(review): DataUtil as defined in this notebook has no
# save_output_as_dataset method - confirm where it is meant to come from.
# model_output = outputs
# dataset = DataUtil.save_output_as_dataset(model_output=model_output, verbose=True)


In [None]:
import numpy as np
import tensorflow as tf

# Create the dataset loader: a shuffled, batched tf.data.Dataset built from
# the loaded Hugging Face dataset.
train_dataset_tf = DataUtil.to_tf_dataset(loaded_dataset, hidden_size=hidden_size, batch_size=32)

# Visual check that the structure of one batch matches expectations.
for batch in train_dataset_tf.take(1):
    h_i_batch, h_j_batch, labels_batch = batch
    print("h_i batch shape:", h_i_batch.shape)  # Expect (batch_size, 768)
    print("h_j batch shape:", h_j_batch.shape)  # Expect (batch_size, 768)
    print("labels batch shape:", labels_batch.shape)  # Expect (batch_size,)

In [None]:
# Training batches: DataUtil.to_tf_dataset already shuffles and batches, so the
# pipeline is used as-is. Shuffling/batching again here would nest the batches.
train_dataset = train_dataset_tf

In [None]:
import tensorflow as tf

# Define the relation extraction model as a subclass of tf.keras.Model.
class RelationExtractionHead(tf.keras.Model):
    """
    Relation classifier over a (head, tail) pair of vectors.

    Each vector goes through its own ReLU dense projection of width
    ``hidden_size``; the projections are multiplied element-wise and a final
    dense layer maps the product to ``num_relations`` logits.
    """

    def __init__(self, hidden_size, num_relations):
        super().__init__()
        make_dense = tf.keras.layers.Dense
        self.head_proj = make_dense(hidden_size, activation='relu', name="head_proj")
        self.tail_proj = make_dense(hidden_size, activation='relu', name="tail_proj")
        self.combined_dense = make_dense(num_relations, name="combined_dense")

    def call(self, inputs, training=False):
        """Return logits of shape (batch, num_relations) for ``inputs = (h_i, h_j)``."""
        head_vectors, tail_vectors = inputs  # Each: (batch, hidden_size)
        # Element-wise product of the two projections.
        interaction = self.head_proj(head_vectors) * self.tail_proj(tail_vectors)
        return self.combined_dense(interaction)

In [None]:
import tensorflow as tf
from tqdm import tqdm  # For progress visualization.
import matplotlib.pyplot as plt
import numpy as np

# Hyperparameters.
num_relations = 5       # Number of relation classes.
num_epochs = 2          # Number of training epochs.
learning_rate = 1e-4    # Learning rate.
hidden_size = 768       # Size of the hidden vectors.
batch_size = 32         # Only used for the progress-bar total below; keep in sync
                        # with the batch size passed to DataUtil.to_tf_dataset.

# Relation head: projects both vectors, multiplies them element-wise and
# classifies the product.
model_tf = RelationExtractionHead(hidden_size=hidden_size, num_relations=num_relations)

# The model outputs raw logits, hence from_logits=True.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# train_dataset is the batched tf.data.Dataset; loaded_dataset is the Hugging
# Face dataset loaded with DataUtil.get_output. Its length is used to estimate
# the number of batches per epoch for the progress bar.
num_samples = len(loaded_dataset)
total_batches = int(np.ceil(num_samples / batch_size))

loss_history = []  # Average loss of each completed epoch.

for epoch in range(num_epochs):
    epoch_loss = 0.0
    batches = 0
    progress_bar = tqdm(train_dataset, desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch", total=total_batches)
    for h_i_batch, h_j_batch, labels_batch in progress_bar:
        with tf.GradientTape() as tape:
            logits = model_tf((h_i_batch, h_j_batch), training=True)
            loss_value = loss_fn(labels_batch, logits)
        grads = tape.gradient(loss_value, model_tf.trainable_variables)
        optimizer.apply_gradients(zip(grads, model_tf.trainable_variables))
        # Convert once to a Python float and reuse it for the sum and the display.
        step_loss = float(loss_value.numpy())
        epoch_loss += step_loss
        batches += 1
        progress_bar.set_postfix(loss=step_loss)
    if batches == 0:
        # Without this guard an empty pipeline fails with a bare ZeroDivisionError.
        raise RuntimeError("train_dataset produced no batches; cannot compute an average loss.")
    avg_loss = epoch_loss / batches
    loss_history.append(avg_loss)
    print(f"Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

plt.figure(figsize=(8, 5))
# The x-axis follows the recorded history so both sequences always match in length.
plt.plot(range(1, len(loss_history) + 1), loss_history, marker='o', linestyle='-')
plt.title("Training Loss Over Epochs")
plt.xlabel("Epoch")
plt.ylabel("Average Loss")
plt.grid(True)
plt.show()

In [None]:
import tensorflow as tf
import numpy as np
from sklearn.metrics import classification_report


class RelationModelTesterTF:
    """Runs a relation model over a dataset and reports classification metrics."""

    def __init__(self, model, dataset, batch_size=32):
        """Store the model and batch ``dataset`` with ``batch_size``."""
        self.model = model
        # Batching happens here, so evaluate() iterates over batches.
        self.dataset = dataset.batch(batch_size)

    def evaluate(self):
        """
        Predict every batch, print a classification report and the accuracy.

        Returns:
            A dict with the keys "predictions", "ground_truth" (NumPy arrays)
            and "accuracy" (fraction of predictions equal to the labels).
        """
        predicted, expected = [], []
        for head_vectors, tail_vectors, gold_labels in self.dataset:
            scores = self.model((head_vectors, tail_vectors), training=False)
            # The predicted class is the index of the highest logit.
            predicted += tf.argmax(scores, axis=-1).numpy().tolist()
            expected += gold_labels.numpy().tolist()

        predicted = np.array(predicted)
        expected = np.array(expected)
        accuracy = np.mean(predicted == expected)

        print("Classification Report:\n", classification_report(expected, predicted, digits=4))
        print("Overall Accuracy: {:.2f}%".format(accuracy * 100))
        return dict(predictions=predicted, ground_truth=expected, accuracy=accuracy)

In [None]:
# RelationModelTesterTF batches its input itself, so it must receive individual
# samples: train_dataset is already batched and is therefore unbatched first.
# NOTE: this evaluates on the training data, so the figure printed below as
# "Test Accuracy" is the accuracy on the training set.
tester_tf = RelationModelTesterTF(model_tf, train_dataset.unbatch(), batch_size=32)
results_tf = tester_tf.evaluate()

print("Test Accuracy: {:.2f}%".format(results_tf["accuracy"] * 100))
print("Example Predictions (first 10 samples):", results_tf["predictions"][:10])