<a href="https://colab.research.google.com/github/Arnavvv16/Project_task3/blob/main/LayoutLM_Implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# installing
!pip install transformers datasets pandas pyarrow pillow -q

#Imports
import pandas as pd
import torch
from PIL import Image
from datasets import Dataset
from transformers import LayoutLMTokenizer, LayoutLMForSequenceClassification
import io
import numpy as np


In [None]:
!pip install --upgrade datasets fsspec aiohttp

Collecting fsspec
  Using cached fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)


In [None]:
from datasets import load_dataset

# Prefer a CUDA device when one is visible to torch; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Fetch the dataset by its Hub id into a notebook-local cache directory.
# (Alternative kept for reference: read a manually uploaded parquet file with
#  pd.read_parquet("/content/train-00000-of-00001.parquet") and slice it.)
df = load_dataset("dvgodoy/rvl_cdip_mini", cache_dir="./new_hf_cache")

# Restrict the training split to its first 30 examples; the other splits are
# left untouched.
first_rows = range(30)
df["train"] = df["train"].select(first_rows)

# Normalizing bounding boxes to 0–1000
def normalize_box(box, width, height):
    """Rescale a box to the 0-1000 grid and clamp it to that range.

    Args:
        box: Sequence of four numbers. Indices 0 and 2 are scaled by ``width``
            and indices 1 and 3 by ``height`` (assumed to be
            ``[x0, y0, x1, y1]`` in pixels -- confirm against the dataset).
        width: Page width in the same units as the horizontal coordinates.
        height: Page height in the same units as the vertical coordinates.

    Returns:
        A list of four ints, each within ``[0, 1000]``.

    Raises:
        ZeroDivisionError: If ``width`` or ``height`` is zero.
    """
    def _scale(value, extent):
        # OCR boxes can overshoot the page or go slightly negative; clamping
        # keeps every coordinate a valid index for the model's 2-D position
        # embeddings, which only cover 0..1000.
        return max(0, min(1000, int(1000 * value / extent)))

    return [
        _scale(box[0], width),
        _scale(box[1], height),
        _scale(box[2], width),
        _scale(box[3], height),
    ]

# Preprocessing
def preprocess_examples(example):
    """Collect words, normalised word boxes and labels for a batch of rows.

    Intended for ``map(..., batched=True)``: every value in ``example`` is a
    list with one entry per document.

    Args:
        example: Mapping with the columns ``"image"`` (objects exposing a
            ``.size`` of ``(width, height)``), ``"ocr_words"``,
            ``"word_boxes"`` and ``"label"``.

    Returns:
        Dict with three parallel lists: ``"words"`` (the OCR words per
        document), ``"boxes"`` (each word box passed through
        ``normalize_box``) and ``"labels"`` (the per-document label).
    """
    words, boxes, labels = [], [], []

    rows = zip(
        example["image"],
        example["ocr_words"],
        example["word_boxes"],
        example["label"],
    )
    for image, doc_words, doc_boxes, label in rows:
        # Only the page dimensions are needed. An image's size does not depend
        # on its colour mode, so converting to RGB first would only force a
        # full decode and copy of every page.
        w, h = image.size
        words.append(doc_words)
        boxes.append([normalize_box(b, w, h) for b in doc_boxes])
        labels.append(label)

    return {"words": words, "boxes": boxes, "labels": labels}

# Show the split names, columns and row counts of the loaded dataset.
print(df)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/341M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/42.4M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/41.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3200 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/400 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/400 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['image', 'width', 'height', 'category', 'ocr_words', 'word_boxes', 'ocr_paragraphs', 'paragraph_boxes', 'label'],
        num_rows: 2
    })
    validation: Dataset({
        features: ['image', 'width', 'height', 'category', 'ocr_words', 'word_boxes', 'ocr_paragraphs', 'paragraph_boxes', 'label'],
        num_rows: 400
    })
    test: Dataset({
        features: ['image', 'width', 'height', 'category', 'ocr_words', 'word_boxes', 'ocr_paragraphs', 'paragraph_boxes', 'label'],
        num_rows: 400
    })
})


In [None]:
# Tokenizer
tokenizer = LayoutLMTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")

# Size the classification head from the label schema rather than from the
# number of distinct labels that happen to appear in the (sub-sampled) training
# split: the head needs one output per label id, so a subset containing e.g.
# only the ids {3, 15} still requires 16 outputs, not 2.
label_feature = df["train"].features["label"]
num_labels = getattr(label_feature, "num_classes", None)
if num_labels is None:
    # Not a ClassLabel column: fall back to the largest label id present.
    num_labels = max(df["train"]["label"]) + 1

model = LayoutLMForSequenceClassification.from_pretrained(
    "microsoft/layoutlm-base-uncased", num_labels=num_labels
).to(device)

# Tokenization
def encode_examples(example):
    """Turn a batch of (words, boxes, labels) into fixed-length model inputs.

    ``LayoutLMTokenizer`` is a plain word-piece tokenizer and does not align
    bounding boxes itself, so the alignment is done here: every word piece
    inherits the box of the word it came from.

    Args:
        example: Batched mapping with ``"words"`` (list of word lists),
            ``"boxes"`` (list of per-word box lists, already on the 0-1000
            grid) and ``"labels"``.

    Returns:
        Dict with ``"input_ids"``, ``"token_type_ids"``, ``"attention_mask"``
        and ``"bbox"`` (each a list of length-512 sequences, one per
        document) plus ``"labels"`` copied from the input.
    """
    max_length = 512
    # Box conventions for special tokens, as used in the LayoutLM usage example.
    cls_box = [0, 0, 0, 0]
    sep_box = [1000, 1000, 1000, 1000]
    pad_box = [0, 0, 0, 0]

    encoded = {"input_ids": [], "token_type_ids": [], "attention_mask": [], "bbox": []}

    for doc_words, doc_boxes in zip(example["words"], example["boxes"]):
        tokens, token_boxes = [], []
        for word, box in zip(doc_words, doc_boxes):
            pieces = tokenizer.tokenize(word)
            tokens.extend(pieces)
            # One copy of the word's box per word piece keeps ids and boxes aligned.
            token_boxes.extend([box] * len(pieces))

        # Leave room for [CLS] and [SEP].
        tokens = tokens[: max_length - 2]
        token_boxes = token_boxes[: max_length - 2]

        input_ids = (
            [tokenizer.cls_token_id]
            + tokenizer.convert_tokens_to_ids(tokens)
            + [tokenizer.sep_token_id]
        )
        bbox = [cls_box] + token_boxes + [sep_box]

        n_real = len(input_ids)
        n_pad = max_length - n_real

        encoded["input_ids"].append(input_ids + [tokenizer.pad_token_id] * n_pad)
        encoded["token_type_ids"].append([0] * max_length)
        encoded["attention_mask"].append([1] * n_real + [0] * n_pad)
        encoded["bbox"].append(bbox + [pad_box] * n_pad)

    encoded["labels"] = example["labels"]
    return encoded

# Process
hf_dataset = df.map(preprocess_examples, batched=True)

# Drop every pre-existing column while encoding so that only the values
# returned by encode_examples remain. Otherwise the raw columns (PIL images,
# word strings, ...) would be handed to torch.tensor() and to model(**batch).
# Columns are removed before the function output is merged back in, so the
# "labels" column returned by encode_examples is kept.
hf_dataset = hf_dataset.map(
    encode_examples,
    batched=True,
    remove_columns=hf_dataset["train"].column_names,
)


# PyTorch Dataset
class LayoutLMDataset(torch.utils.data.Dataset):
    """Map-style dataset that converts sequence-like fields of a row to tensors.

    ``dataset`` only needs to support ``len()`` and integer indexing that
    returns a mapping of field name -> value.
    """

    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return row ``idx`` with list/tuple/ndarray values as tensors.

        A ``"labels"`` value that is a list becomes a ``torch.long`` tensor;
        any other list, tuple or ndarray goes through ``torch.tensor`` with
        its inferred dtype; everything else is returned unchanged.
        """
        record = self.dataset[idx]
        converted = {}
        for key, value in record.items():
            if key == "labels" and isinstance(value, list):
                converted[key] = torch.tensor(value, dtype=torch.long)
            elif isinstance(value, (list, tuple, np.ndarray)):
                converted[key] = torch.tensor(value)
            else:
                converted[key] = value
        return converted

# Wrap the encoded training split and serve it in shuffled batches of two rows.
train_dataset = LayoutLMDataset(hf_dataset["train"]) # Use the train split for the dataset
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=2, shuffle=True)

# Training loop
# AdamW over all model parameters with a fixed learning rate (no scheduler).
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Put the model in training mode, then run three passes over the loader.
model.train()
for epoch in range(3):
    print(f"\nEpoch {epoch+1}")
    for batch in train_loader:
        # Move every entry to the target device. This requires each value in
        # the batch to be a tensor (anything without .to() fails here).
        batch = {k: v.to(device) for k, v in batch.items()}
        # Every batch key is forwarded as a keyword argument, so the batch must
        # contain only names the model's forward() accepts.
        outputs = model(**batch)
        loss = outputs.loss
        print("Loss:", loss.item())
        loss.backward()
        optimizer.step()
        # Clear gradients after the update so they do not accumulate into the
        # next step.
        optimizer.zero_grad()

# Saving model
# Write the model first, then the tokenizer, into the same output directory.
output_dir = "layoutlm-v1-colab-model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)
print("Model and tokenizer saved.")