<a href="https://colab.research.google.com/github/667029/KVP10k/blob/main/Relation_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports and Setup
In this section, we import necessary libraries (e.g., PyTorch, TensorFlow, and Hugging Face `datasets`) and configure any global settings.

In [None]:
!pip install datasets torch transformers accelerate numpy tqdm tensorflow scikit-learn

In [None]:
# Google Drive mount point; used as the default mount path by the data-loading helper.
drive_mount_path='/content/drive'

hidden_size = 768           # Size of the hidden vectors.
batch_size = 32             # Batch size.
test_size_percentage = 10   # Percentage of the training data held out by train_test_split (named the evaluation split later).
seed = 42                   # Seed for reproducibility.


# Paths to your datasets
train_dataset_path = "/content/drive/MyDrive/RE_ready/re_dataset_train_combined"
test_dataset_path = "/content/drive/MyDrive/RE_ready/re_dataset_test_combined"


## Data Loading
Load the prepared training, evaluation, and test datasets from disk using HF `load_from_disk`.

In [None]:
import torch
from datasets import load_from_disk
import tensorflow as tf
import numpy as np


class DataUtil:
    """Helpers for loading relation-extraction data and converting it to ``tf.data``."""

    @staticmethod
    def get_output(dataset_path=None, model_output=None, drive_mount_path=drive_mount_path, extract_hidden=False, verbose=True):
        """Return a dataset loaded from disk, or a passed-in model output.

        Args:
            dataset_path: Path handed to ``load_from_disk``. When given, it
                takes precedence over ``model_output``.
            model_output: Object returned when no ``dataset_path`` is given.
            drive_mount_path: Where Google Drive is mounted before loading.
            extract_hidden: When true, a dict containing ``"hidden_states"``
                is replaced by that entry, and a list/tuple by its last item.
            verbose: Print progress messages and a preview of the result.

        Returns:
            The loaded dataset, or the (possibly reduced) ``model_output``.

        Raises:
            ValueError: If neither ``dataset_path`` nor ``model_output`` is given.
        """
        if dataset_path is not None:
            # Mounting is best effort: outside Colab the import fails, and the
            # path may still be readable, so report the reason and carry on.
            try:
                from google.colab import drive
                drive.mount(drive_mount_path, force_remount=False)
            except Exception as e:
                if verbose:
                    print("Google Drive may already be mounted. Continuing...")
                    print(f"Drive mount was skipped: {e!r}")
            dataset = load_from_disk(dataset_path)
            if verbose:
                print("Loaded dataset from drive.")
                print("Number of samples:", len(dataset))
                print("Column names:", dataset.column_names)
                if len(dataset) > 0:
                    print("First sample:", dataset[0])
            return dataset

        if model_output is None:
            raise ValueError("Please provide either a dataset_path or a model_output.")

        if extract_hidden:
            if isinstance(model_output, dict) and "hidden_states" in model_output:
                model_output = model_output["hidden_states"]
            elif isinstance(model_output, (list, tuple)):
                model_output = model_output[-1]
        if verbose:
            print("Provided model output:")
            print(model_output)
        return model_output

    # Backward-compatible alias: the loader used to be defined under this name.
    c = get_output

    @staticmethod
    def to_tf_dataset(dataset, hidden_size, batch_size=32, shuffle_buffer=1000, verbose=True):
        """Wrap ``dataset`` in a shuffled, batched ``tf.data.Dataset``.

        Each sample must provide ``"h_i"``, ``"h_j"`` and ``"label"``; a missing
        ``"key"`` is replaced by ``"unknown"``. Elements of the result are
        ``(h_i, h_j, label, key)`` batches.

        Args:
            dataset: Indexable, iterable collection of sample dicts.
            hidden_size: Size of the last axis of ``h_i`` / ``h_j``.
            batch_size: Number of samples per batch.
            shuffle_buffer: Shuffle buffer size; a falsy value (``None`` or 0)
                disables shuffling.
            verbose: Print a confirmation message.

        Returns:
            A ``tf.data.Dataset``. 1-D features are batched as-is; 2-D
            features are padded along their first axis within each batch.

        Raises:
            ValueError: If ``dataset`` is empty, if ``h_i`` of the first sample
                is neither 1-D nor 2-D, or if its last axis is not ``hidden_size``.
        """
        if len(dataset) == 0:
            raise ValueError("Cannot convert an empty dataset to a tf.data.Dataset.")

        def generator():
            for sample in dataset:
                h_i = np.array(sample["h_i"], dtype=np.float32)
                h_j = np.array(sample["h_j"], dtype=np.float32)
                # Labels may be stored as a scalar or wrapped in (nested) lists;
                # flatten so the yielded label is always a scalar.
                label = np.array(sample["label"], dtype=np.int32).reshape(-1)[0]
                key = sample["key"] if "key" in sample else "unknown"
                yield (h_i, h_j, label, key)

        # The first sample decides whether features are vectors or sequences.
        h_i_first = np.array(dataset[0]["h_i"], dtype=np.float32)
        if h_i_first.ndim == 1:
            feature_shape = (hidden_size,)
        elif h_i_first.ndim == 2:
            feature_shape = (None, hidden_size)
        else:
            raise ValueError("Expected h_i to be 1D or 2D; got shape " + str(h_i_first.shape))

        if h_i_first.shape[-1] != hidden_size:
            raise ValueError(
                f"hidden_size={hidden_size} does not match h_i shape {h_i_first.shape}"
            )

        output_signature = (
            tf.TensorSpec(shape=feature_shape, dtype=tf.float32),
            tf.TensorSpec(shape=feature_shape, dtype=tf.float32),
            tf.TensorSpec(shape=(), dtype=tf.int32),
            tf.TensorSpec(shape=(), dtype=tf.string),
        )
        ds = tf.data.Dataset.from_generator(generator, output_signature=output_signature)
        if shuffle_buffer:
            ds = ds.shuffle(shuffle_buffer)

        if h_i_first.ndim == 1:
            ds = ds.batch(batch_size)
        else:
            # Sequences can differ in length, so pad them per batch.
            ds = ds.padded_batch(
                batch_size,
                padded_shapes=([None, hidden_size], [None, hidden_size], [], [])
            )

        if verbose:
            print(f"Converted dataset to tf.data.Dataset with batch size {batch_size}.")
        return ds



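## Data Preparation for Training
Load the training and test datasets, split an evaluation set off the training data, convert the splits to `tf.data` pipelines, and define the bilinear scoring layer used by the model.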

In [None]:
# Load training dataset
train_loaded_dataset = DataUtil.get_output(dataset_path=train_dataset_path)
# Hold out test_size_percentage percent of the loaded training data; the split is seeded.
split_dataset = train_loaded_dataset.train_test_split(test_size=test_size_percentage/100, seed=seed)

## Alternative: get the dataset from the output of the "layoutmvl3" model (presumably LayoutLMv3 — confirm)
# train_loaded_dataset = DataUtil.get_output(model_output=outputs, verbose=True)
# split_dataset = train_loaded_dataset.train_test_split(test_size=0.2, seed=seed)


# Explicitly naming sets: the held-out "test" part of the split serves as the evaluation set.
train_dataset_hf = split_dataset["train"]
eval_dataset_hf = split_dataset["test"]


# Load your separate test dataset for inference/evaluation
test_loaded_dataset = DataUtil.get_output(dataset_path=test_dataset_path)
test_dataset_hf = test_loaded_dataset


In [None]:
import numpy as np
import tensorflow as tf

def _print_first_batch_shapes(ds):
    """Print the tensor shapes of the first batch of ``ds`` as a sanity check."""
    for h_i_batch, h_j_batch, labels_batch, keys_batch in ds.take(1):
        print("h_i batch shape:", h_i_batch.shape)
        print("h_j batch shape:", h_j_batch.shape)
        print("labels batch shape:", labels_batch.shape)
        print("keys batch shape:", keys_batch.shape)


# Create the training and test sets. The batch size comes from the shared
# `batch_size` setting so these pipelines agree with the training loop.
train_dataset = DataUtil.to_tf_dataset(train_dataset_hf, hidden_size=hidden_size, batch_size=batch_size)
_print_first_batch_shapes(train_dataset)

test_dataset = DataUtil.to_tf_dataset(test_dataset_hf, hidden_size=hidden_size, batch_size=batch_size)
_print_first_batch_shapes(test_dataset)

In [None]:
import numpy as np
import tensorflow as tf

class Bilinear(tf.keras.layers.Layer):
    """Bilinear scoring layer: one ``head^T W_r tail + b_r`` logit per relation.

    Args:
        hidden_size: Size of the last axis of both the head and tail inputs.
        num_relations: Number of relation logits produced per sample.
        **kwargs: Forwarded to ``tf.keras.layers.Layer`` (e.g. ``name``).
    """

    def __init__(self, hidden_size, num_relations, **kwargs):
        super().__init__(**kwargs)
        self.hidden_size = hidden_size
        self.num_relations = num_relations

    def build(self, input_shape):
        # Weight tensor of shape (hidden_size, num_relations, hidden_size):
        # one (hidden_size x hidden_size) matrix per relation.
        self.W = self.add_weight(
            shape=(self.hidden_size, self.num_relations, self.hidden_size),
            initializer='glorot_uniform',
            trainable=True,
            name="bilinear_W"
        )
        # Bias vector of shape (num_relations,)
        self.bias = self.add_weight(
            shape=(self.num_relations,),
            initializer='zeros',
            trainable=True,
            name="bilinear_bias"
        )
        super().build(input_shape)

    def call(self, inputs):
        """Return logits of shape (batch, num_relations) for ``(head, tail)``."""
        head, tail = inputs
        # Indices: b = batch, i = head hidden dim, j = tail hidden dim,
        # r = relation. The last axis of W must share the tail's index `j`;
        # giving it its own index would sum W and tail independently and
        # reduce the score to (head . sum_k W) * sum_j tail, which is not a
        # bilinear form.
        logits = tf.einsum('bi,irj,bj->br', head, self.W, tail)
        return logits + self.bias

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update(
            {"hidden_size": self.hidden_size, "num_relations": self.num_relations}
        )
        return config

## Model Definition
Define the Relation Link layer/model components, including custom layers and heads.

In [None]:
import tensorflow as tf

class RelationExtractionHead(tf.keras.Model):
    """Relation classifier: project head and tail vectors, then score them.

    The head and tail inputs each pass through their own dense projection of
    width ``hidden_size``; the projected pair is handed to a ``Bilinear``
    layer that returns ``num_relations`` logits per sample.
    """

    def __init__(self, hidden_size, num_relations):
        super().__init__()
        dense = tf.keras.layers.Dense
        self.head_proj = dense(hidden_size, name="head_proj")
        self.tail_proj = dense(hidden_size, name="tail_proj")
        self.bilinear = Bilinear(hidden_size, num_relations)

    def call(self, inputs, training=False):
        """Return relation logits for ``inputs = (h_i, h_j)``.

        ``training`` is accepted for Keras compatibility and is not used.
        """
        head_input, tail_input = inputs
        projected_pair = (self.head_proj(head_input), self.tail_proj(tail_input))
        return self.bilinear(projected_pair)

## Train the model

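Set the hyperparameters, then run a custom training loop: iterate over epochs and batches, compute the loss, update the model weights, and plot the average loss per epoch.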

In [None]:
import numpy as np

# Hyperparameters
num_relations = 5        # Number of relation logits produced by the model head.
num_epochs = 1
learning_rate = 1e-4     # Learning rate passed to the Adam optimizer.
num_train_samples = len(train_dataset_hf)
# Nominal number of samples per epoch: training-set size divided by the epoch count, rounded up.
sample_base = int(np.ceil(num_train_samples / num_epochs))
# The training loop draws each epoch's sample count at random from
# [95%, 110%] of the nominal count (both bounds truncated to int).
sample_range_min = int(sample_base * 0.95)
sample_range_max = int(sample_base * 1.10)

In [None]:
import tensorflow as tf
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import random

# Model, loss and optimizer. The loss takes raw logits (from_logits=True).
model_tf = RelationExtractionHead(hidden_size=hidden_size, num_relations=num_relations)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# Average training loss of each epoch, in order.
loss_history = []

for epoch in range(num_epochs):
    # The drawn count can exceed the dataset size (the range goes up to 110%),
    # but take() cannot yield more samples than exist. Cap it so the printed
    # count and the progress-bar total match what is actually processed.
    samples_this_epoch = min(
        random.randint(sample_range_min, sample_range_max),
        num_train_samples,
    )
    total_batches = int(np.ceil(samples_this_epoch / batch_size))

    # Re-sample at sample level: undo the existing batching, reshuffle, keep
    # the requested number of samples, then batch again.
    # NOTE(review): for 2-D (sequence) features the unbatched samples keep the
    # padding of their original batch, so re-batching presumably fails when
    # lengths differ across batches — confirm if sequence features are used.
    epoch_dataset = (
        train_dataset.unbatch()
        .shuffle(buffer_size=10000, reshuffle_each_iteration=True)
        .take(samples_this_epoch)
        .batch(batch_size)
    )

    print(f"Epoch {epoch+1}/{num_epochs}, sampling {samples_this_epoch} examples...")
    epoch_loss = []

    progress = tqdm(epoch_dataset, total=total_batches, desc=f"Epoch {epoch+1}", unit="batch")
    for h_i_batch, h_j_batch, labels_batch, keys_batch in progress:
        # Sequence features (batch, seq, hidden) are mean-pooled to one vector
        # per sample.
        # NOTE(review): the mean presumably includes padded positions — confirm.
        if len(h_i_batch.shape) == 3:
            h_i_batch = tf.reduce_mean(h_i_batch, axis=1)
            h_j_batch = tf.reduce_mean(h_j_batch, axis=1)

        with tf.GradientTape() as tape:
            logits = model_tf((h_i_batch, h_j_batch), training=True)
            loss = loss_fn(labels_batch, logits)

        grads = tape.gradient(loss, model_tf.trainable_variables)
        optimizer.apply_gradients(zip(grads, model_tf.trainable_variables))
        epoch_loss.append(loss.numpy())

    # An epoch without batches has no loss; record NaN explicitly instead of
    # triggering numpy's empty-mean warning.
    avg_loss = np.mean(epoch_loss) if epoch_loss else float("nan")
    loss_history.append(avg_loss)
    print(f"Epoch {epoch+1}, Average Loss: {avg_loss:.4f}")

# Plot training loss
plt.figure(figsize=(8, 5))
plt.plot(range(1, num_epochs + 1), loss_history, marker='o', linestyle='-')
plt.title("Training Loss Over Epochs")
plt.xlabel("Epoch")
plt.ylabel("Average Loss")
plt.grid(True)
plt.show()


## Evaluation
Run model evaluation on the test dataset, computing predictions, probabilities, and performance metrics.

In [None]:
from sklearn.metrics import classification_report, precision_recall_fscore_support
import numpy as np
from tqdm import tqdm
import tensorflow as tf

class RelationModelTesterTF:
    """Runs a relation model over a batched dataset and collects metrics.

    Args:
        model: Callable invoked as ``model((h_i, h_j), training=False)`` that
            returns logits.
        dataset: Iterable of ``(h_i, h_j, labels, keys)`` batches.
        batch_size: Batch size of ``dataset``; used only to derive the
            progress-bar total when ``num_samples`` is given.
        num_samples: Total number of samples. When omitted, the dataset is
            iterated once to count its batches and samples.
    """

    def __init__(self, model, dataset, batch_size=32, num_samples=None):
        self.model = model
        self.dataset = dataset
        self.batch_size = batch_size
        if num_samples is None:
            # The dataset yields batches, so counting its elements gives the
            # number of batches, not samples. Count both in one pass.
            batch_count = 0
            sample_count = 0
            for batch in dataset:
                batch_count += 1
                sample_count += int(batch[2].shape[0])  # batch[2] holds the labels
            self.num_samples = sample_count
            self.total_batches = batch_count
        else:
            self.num_samples = num_samples
            self.total_batches = int(np.ceil(num_samples / batch_size))

    def evaluate(self):
        """Predict every batch and return predictions plus summary metrics.

        Returns:
            Dict with per-sample lists ``predictions``, ``ground_truth``,
            ``probabilities``, ``keys``, ``h_i``, ``h_j``, the scalar
            ``accuracy``, weighted ``precision`` / ``recall`` / ``f1``, and the
            text ``classification_report``.
        """
        all_preds = []
        all_labels = []
        all_probs = []
        all_keys = []
        all_h_i = []
        all_h_j = []

        bar = tqdm(self.dataset, desc="Evaluating", total=self.total_batches, unit="batch")
        for h_i, h_j, labels, keys in bar:
            # Pool sequence features (rank 3) to one vector per sample.
            if len(h_i.shape) == 3:
                h_i = tf.reduce_mean(h_i, axis=1)
                h_j = tf.reduce_mean(h_j, axis=1)

            logits = self.model((h_i, h_j), training=False)
            probs = tf.nn.softmax(logits, axis=-1)
            preds = tf.argmax(probs, axis=-1)

            all_preds.extend(preds.numpy())
            all_labels.extend(labels.numpy())
            all_probs.extend(probs.numpy())
            all_keys.extend(keys.numpy())
            # The stored features are the pooled ones when pooling happened.
            all_h_i.extend(h_i.numpy())
            all_h_j.extend(h_j.numpy())

        # Core metrics. zero_division=0 yields the same values as the default
        # but without a warning for classes that were never predicted.
        accuracy = np.mean(np.array(all_preds) == np.array(all_labels))
        precision, recall, f1, _ = precision_recall_fscore_support(
            all_labels, all_preds, average="weighted", zero_division=0
        )
        report_str = classification_report(all_labels, all_preds, zero_division=0)

        return {
            "predictions": all_preds,
            "ground_truth": all_labels,
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "classification_report": report_str,
            "probabilities": all_probs,
            "keys": all_keys,
            "h_i": all_h_i,
            "h_j": all_h_j,
        }


In [None]:
# Run evaluation. The batch size comes from the shared `batch_size` setting
# rather than a second hard-coded copy of its value.
tester = RelationModelTesterTF(
    model_tf,
    test_dataset,
    batch_size=batch_size,
    num_samples=len(test_dataset_hf),
)
results = tester.evaluate()

# Print overall metrics
print(f"Test Accuracy       : {results['accuracy']:.2%}")
print(f"Weighted Precision  : {results['precision']:.4f}")
print(f"Weighted Recall     : {results['recall']:.4f}")
print(f"Weighted F1‑score   : {results['f1']:.4f}\n")

# Print some example predictions
print("Example Predictions (first 10 samples):", results["predictions"][:10])

# Detailed sample-wise breakdown of up to ten samples. The order is the order
# in which the dataset yielded them, which is shuffled if the pipeline shuffles.
print("\nSample-wise Predictions:")
preview = zip(
    results["ground_truth"][:10],
    results["predictions"][:10],
    results["probabilities"][:10],
    results["h_i"][:10],
    results["h_j"][:10],
)
for i, (true_label, pred_label, probs, h_i_vec, h_j_vec) in enumerate(preview):
    print(f"\nSample {i}")
    print("  True Label      :", true_label)
    print("  Predicted Label :", pred_label)
    print("  Probabilities   :", np.round(probs, 4))
    print("  h_i (first 5)   :", np.round(h_i_vec[:5], 4))
    print("  h_j (first 5)   :", np.round(h_j_vec[:5], 4))
