<a href="https://colab.research.google.com/github/667029/KVP10k/blob/main/Dataset_processor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install transformers, datasets and seqeval (-q keeps pip's output quiet).
!pip install -q transformers datasets seqeval
# Install the evaluate package.
!pip install evaluate

In [None]:
import os              #navigere mapper og filer, hente filbaner
from PIL import Image  #åpne, vise og manipulere bilder
import json            #lese/skrive til JSON-filer
from transformers import LayoutLMv3Processor
import torch
from google.colab import drive

In [None]:
# Mount Google Drive inside the Colab runtime so the dataset folder is reachable.
drive.mount('/content/drive')
# Root folder of the KVP10k dataset on Drive.
base_path = "/content/drive/MyDrive/DAT255/KVP10k-dataset/kvp10k/"
# Print the folder contents as a quick check that the path resolves.
print(os.listdir(base_path))

In [None]:
# Load the LayoutLMv3 processor with its built-in OCR step disabled.
processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False) # <-- Important: OCR has already been run on the images, so text and bboxes are supplied by us

In [None]:
# === Imports ===
from datasets import Dataset, DatasetDict, Features, Sequence, Value, Array2D, Array3D, Image as HFImage
from tqdm import tqdm
from PIL import Image
import json
import os

# === Label map for KVP ===
# Tag-name -> integer id for the IOB tagging scheme (keys and values).
# Ids follow the listing order below, starting at 0 for the "outside" tag.
label_map = {
    tag: tag_id
    for tag_id, tag in enumerate(("O", "B-KEY", "I-KEY", "B-VALUE", "I-VALUE"))
}

# === BBox helpers ===
def normalize_bbox(bbox, width, height):
    """Scale a box from page units to the 0-1000 integer grid.

    Args:
        bbox: Sequence of four numbers; indices 0 and 2 are scaled by
            ``width``, indices 1 and 3 by ``height`` (i.e. x0, y0, x1, y1).
        width: Page width in the same units as ``bbox``; must be non-zero.
        height: Page height in the same units as ``bbox``; must be non-zero.

    Returns:
        A list of four ints, truncated toward zero and clamped to [0, 1000].

    Raises:
        ZeroDivisionError: If ``width`` or ``height`` is zero.
    """
    def _scale(value, extent):
        # OCR boxes can poke slightly outside the page; LayoutLM-style models
        # expect coordinates on a 0-1000 grid, so out-of-range values are
        # clamped instead of being passed through.
        return min(1000, max(0, int(1000 * (value / extent))))

    return [
        _scale(bbox[0], width),
        _scale(bbox[1], height),
        _scale(bbox[2], width),
        _scale(bbox[3], height),
    ]

def box_overlap(box1, box2):
    """Return the intersection area of two ``[x0, y0, x1, y1]`` boxes.

    The result is 0 when the boxes are disjoint or merely share an edge.
    """
    left, top = max(box1[0], box2[0]), max(box1[1], box2[1])
    right, bottom = min(box1[2], box2[2]), min(box1[3], box2[3])
    # A negative extent means no overlap along that axis; floor it at zero.
    overlap_w = max(0, right - left)
    overlap_h = max(0, bottom - top)
    return overlap_w * overlap_h

def assign_label_for_box(box, boxes, label_type):
    """Tag every token box that overlaps ``box`` with an IOB label.

    Args:
        box: The annotated region, ``[x0, y0, x1, y1]``.
        boxes: Token boxes, in token order.
        label_type: Label suffix, e.g. ``"KEY"`` or ``"VALUE"``.

    Returns:
        A list of ``(token_index, tag)`` pairs in ascending index order. The
        first overlapping token gets ``B-<label_type>``, every later one
        ``I-<label_type>``. Empty when nothing overlaps.
    """
    tagged = []
    for token_idx, token_box in enumerate(boxes):
        if box_overlap(box, token_box) > 0:
            # Only the very first hit opens the entity; the rest continue it.
            prefix = "I" if tagged else "B"
            tagged.append((token_idx, f"{prefix}-{label_type}"))
    return tagged

def iob_from_kvps(words, boxes, kvps):
    """Derive one IOB tag per word from key/value box annotations.

    Args:
        words: The words of the page; only their count is used.
        boxes: One box per word, in the same coordinate space as the
            annotation boxes.
        kvps: Iterable of dicts that may hold ``"key"`` and/or ``"value"``
            entries, each optionally carrying a ``"bbox"``.

    Returns:
        A list of tag strings, ``"O"`` by default. Later annotations overwrite
        earlier ones on the same word, and within one pair the value is
        applied after the key.
    """
    labels = ["O"] * len(words)
    for kvp in kvps:
        for field, label_type in (("key", "KEY"), ("value", "VALUE")):
            # Skip pairs that lack this side or have no box for it.
            if field not in kvp or "bbox" not in kvp[field]:
                continue
            for idx, tag in assign_label_for_box(kvp[field]["bbox"], boxes, label_type):
                labels[idx] = tag
    return labels

# === Example loader ===
def load_example(doc_id, base_path):
    """Load one document and convert it into word-level model inputs.

    Reads ``images/<doc_id>.png``, ``ocrs/<doc_id>.json`` and
    ``gts/<doc_id>.json`` below ``base_path``. Only the first OCR page is used.

    Args:
        doc_id: Document id (file name without extension).
        base_path: Folder containing the ``images``, ``ocrs`` and ``gts``
            sub-folders.

    Returns:
        A tuple ``(words, boxes, label_ids, image)``: the OCR words, their
        boxes normalized with ``normalize_bbox``, the integer label ids from
        ``label_map``, and the page image converted to RGB.
    """
    def _read_json(path):
        with open(path, "r", encoding="utf-8") as handle:
            return json.load(handle)

    image = Image.open(os.path.join(base_path, "images", f"{doc_id}.png")).convert("RGB")
    ocr_data = _read_json(os.path.join(base_path, "ocrs", f"{doc_id}.json"))
    gt_data = _read_json(os.path.join(base_path, "gts", f"{doc_id}.json"))

    page = ocr_data["pages"][0]
    page_width, page_height = page["width"], page["height"]
    ocr_words = page["words"]
    words = [entry["text"] for entry in ocr_words]
    raw_boxes = [entry["bbox"] for entry in ocr_words]
    boxes = [normalize_bbox(raw, page_width, page_height) for raw in raw_boxes]

    # Labels are assigned on the raw (un-normalized) boxes, i.e. in the same
    # coordinate space the OCR boxes come in.
    tags = iob_from_kvps(words, raw_boxes, gt_data["kvps_list"])
    label_ids = [label_map[tag] for tag in tags]

    return words, boxes, label_ids, image

# === Prepare function for batching ===
def prepare_examples(examples):
    """Encode a batch of raw examples with the module-level ``processor``.

    Args:
        examples: Batch mapping with the columns ``"image"``, ``"tokens"``,
            ``"bboxes"`` and ``"ner_tags"``.

    Returns:
        A dict of plain Python lists with the keys ``input_ids``,
        ``attention_mask``, ``bbox``, ``labels`` and ``pixel_values``.
        Sequences are padded to the maximum length and truncated when longer.
    """
    encoding = processor(
        examples["image"],
        examples["tokens"],
        boxes=examples["bboxes"],
        word_labels=examples["ner_tags"],
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )

    # Tensors are converted to nested lists so the result can be stored as
    # regular dataset columns.
    output_columns = ("input_ids", "attention_mask", "bbox", "labels", "pixel_values")
    return {column: encoding[column].tolist() for column in output_columns}

# Schema of the encoded dataset produced by ``prepare_examples``.
# The fixed shapes imply 3x224x224 images and 512 boxes per example
# (presumably the processor's defaults for this checkpoint; confirm).
features = Features(
    dict(
        pixel_values=Array3D(dtype="float32", shape=(3, 224, 224)),
        input_ids=Sequence(Value("int64")),
        attention_mask=Sequence(Value("int64")),
        bbox=Array2D(dtype="int64", shape=(512, 4)),
        labels=Sequence(Value("int64")),
    )
)





# Viktig!
Dette er grunnen til at vi bruker **gts-mappen** for å finne dokumenter for KVP-modellering. Vi ser at:
- 📄 Antall treningsdokumenter: 7843
- 🧪 Antall testdokumenter: 828

Dette betyr altså at ikke alle dokumenter i KVP10k-datasettet er beregnet for KVP-predikering.
gts-mappen er den som faktisk inneholder ground truth-annotasjoner for NER (KEY/VALUE).

In [None]:
import os

# Dataset root on Drive.
base_path = "/content/drive/MyDrive/DAT255/KVP10k-dataset/kvp10k"

# Ground-truth annotation folders of the two splits.
train_gt_dir = os.path.join(base_path, "train", "gts")
test_gt_dir = os.path.join(base_path, "test", "gts")

# Every ".json" file in a gts folder is counted as one document.
num_train_docs = sum(name.endswith(".json") for name in os.listdir(train_gt_dir))
num_test_docs = sum(name.endswith(".json") for name in os.listdir(test_gt_dir))

print(f"📄 Antall treningsdokumenter: {num_train_docs}")
print(f"🧪 Antall testdokumenter: {num_test_docs}")

In [None]:
from datasets import Dataset, DatasetDict, concatenate_datasets
from datasets import Features, Sequence, Value, Array2D, Array3D, Image as HFImage
from tqdm import tqdm
import os
from PIL import Image
import json
# ADJUST THESE: upper bounds on how many train / test documents are processed.
NUM_TRAIN_DOCS = 7843
NUM_TEST_DOCS = 828

# === Batch loader for the given IDs
def load_examples_from_ids(doc_ids, split_path):
    """Load the given documents of one split into a ``Dataset``.

    Each document is read through ``load_example`` so that both loaders share
    a single implementation of the file layout and labelling logic. Loading is
    best effort: a document that fails is reported on stdout and skipped, so
    the result may hold fewer rows than ``doc_ids``.

    Args:
        doc_ids: Iterable of document ids (file names without extension).
        split_path: Folder containing the ``images``, ``ocrs`` and ``gts``
            sub-folders of the split.

    Returns:
        A ``Dataset`` with the columns ``id``, ``tokens``, ``bboxes``,
        ``ner_tags`` and ``image``; the ``image`` column is cast to
        ``HFImage(decode=True)``.
    """
    examples = []
    for doc_id in tqdm(doc_ids, desc="Laster batch"):
        try:
            words, boxes, label_ids, image = load_example(doc_id, split_path)
        except Exception as e:
            # Deliberately broad: missing files, malformed JSON, unexpected
            # schema and image decoding errors should all skip the document
            # rather than abort the whole batch.
            print(f"❌ Feil i {doc_id}: {e}")
        else:
            examples.append({
                "id": doc_id,
                "tokens": words,
                "bboxes": boxes,
                "ner_tags": label_ids,
                "image": image,
            })

    return Dataset.from_list(examples).cast_column("image", HFImage(decode=True))


# === Batch-wise processing and storing
train_path = os.path.join(base_path, "train")
gts_dir = os.path.join(train_path, "gts")
# Document ids are the ground-truth file names without ".json", sorted for a stable order.
doc_ids = sorted(f.replace(".json", "") for f in os.listdir(gts_dir) if f.endswith(".json"))
# Keep at most NUM_TRAIN_DOCS documents.
doc_ids = doc_ids[:NUM_TRAIN_DOCS]

# Number of documents loaded per outer batch.
batch_size = 500
processed_batches = []

for i in range(0, len(doc_ids), batch_size):
    print(f"\n🔁 Behandler batch {i // batch_size + 1}")
    batch_ids = doc_ids[i:i+batch_size]

    # Load the raw examples of this batch, then encode them 32 at a time and
    # drop the raw columns so only the encoded features remain.
    raw = load_examples_from_ids(batch_ids, train_path)
    processed = raw.map(
        prepare_examples,
        batched=True,
        batch_size=32,
        remove_columns=["tokens", "bboxes", "image", "ner_tags", "id"],
        features=features,
        load_from_cache_file=True
    )
    processed_batches.append(processed)

# === Merge the batches and split (20 % held out, fixed seed)
full_train_dataset = concatenate_datasets(processed_batches)
split = full_train_dataset.train_test_split(test_size=0.2, seed=42)

# === Load and process the test set separately
test_path = os.path.join(base_path, "test")
gts_test_dir = os.path.join(test_path, "gts")
# Document ids are the ground-truth file names without ".json", sorted for a stable order.
test_doc_ids = sorted(f.replace(".json", "") for f in os.listdir(gts_test_dir) if f.endswith(".json"))
# Keep at most NUM_TEST_DOCS documents.
test_doc_ids = test_doc_ids[:NUM_TEST_DOCS]

# The whole test split is loaded in one go, then encoded 32 examples at a time.
raw_test = load_examples_from_ids(test_doc_ids, test_path)
test_dataset = raw_test.map(
    prepare_examples,
    batched=True,
    batch_size=32,
    remove_columns=["tokens", "bboxes", "image", "ner_tags", "id"],
    features=features,
    desc="Preprosesserer test",
    load_from_cache_file=True
)

# === Create and store the whole dataset
# "train" and "eval" come from the 80/20 split of the training data;
# "test" is the separately processed test split.
final_dataset = DatasetDict({
    "train": split["train"],
    "eval": split["test"],
    "test": test_dataset
})

In [None]:
import os

# Target folder on Drive for the processed dataset; created if missing.
output_dir = "/content/drive/MyDrive/KVP10k_processed_ready/dataset_all_gts"
os.makedirs(output_dir, exist_ok=True)

# Write the DatasetDict (train / eval / test) to that folder.
final_dataset.save_to_disk(output_dir)


In [None]:
# List the contents of the output folder on Drive.
!ls -lh "/content/drive/MyDrive/KVP10k_processed_ready"


In [None]:
# NOTE(review): this lists ".../dataset", whereas the save cell in this notebook
# writes to ".../dataset_all_gts" -- confirm which folder is meant.
!ls -lh /content/drive/MyDrive/KVP10k_processed_ready/dataset


In [None]:
from google.colab import drive
import shutil
import os

# Mount Google Drive
drive.mount("/content/drive", force_remount=True)

# Delete the existing folder if it exists
# NOTE(review): this removes the whole KVP10k_processed_ready folder, which
# includes the dataset_all_gts copy written by the earlier save cell.
output_path = "/content/drive/MyDrive/KVP10k_processed_ready"
shutil.rmtree(output_path, ignore_errors=True)

# Save to local disk first
local_tmp_path = "/content/KVP10k_tmp"
shutil.rmtree(local_tmp_path, ignore_errors=True)
final_dataset.save_to_disk(local_tmp_path)

# Copy to Drive
shutil.copytree(local_tmp_path, output_path)

# Verify
print("✅ Datasett kopiert til Drive:")
!ls -lh "$output_path"


In [None]:
from collections import Counter
from itertools import chain

def tell_labels(dataset, label_column="labels", label_map=label_map):
    """Count how often each label occurs in a dataset column.

    Args:
        dataset: Mapping-like dataset; ``dataset[label_column]`` must yield
            one sequence of label ids per example.
        label_column: Name of the column holding the label ids.
        label_map: Mapping from label name to label id.

    Returns:
        A dict mapping label name to its number of occurrences. Ids that are
        not present in ``label_map`` are reported as ``"UNK(<id>)"``.
    """
    counts = Counter(chain.from_iterable(dataset[label_column]))
    id_to_name = {label_id: name for name, label_id in label_map.items()}
    return {
        id_to_name.get(label_id, f"UNK({label_id})"): count
        for label_id, count in counts.items()
    }

# Show the label distribution of the training split.
print("Train labels:", tell_labels(final_dataset["train"]))


In [None]:
# Sanity check on the first training example: the number of input ids and the
# number of labels are printed side by side so a mismatch is easy to spot.
example = final_dataset["train"][0]
print("Tokens:", len(example["input_ids"]))
print("Labels:", len(example["labels"]))
