## Mount Drive

## Installing requirements

In [None]:
# Install the Python dependencies. %pip installs into the environment of the
# running kernel, whereas `!pip3` runs whichever pip3 is first on PATH.
# NOTE(review): transformers and datasets are installed from the HEAD of their
# default branches, so this cell is not reproducible over time — TODO pin a
# release or commit known to work with this notebook.
%pip install -q git+https://github.com/huggingface/transformers.git
%pip install -q git+https://github.com/huggingface/datasets.git "dill<0.3.5" seqeval
# System packages for Tesseract OCR. -y answers the confirmation prompt so the
# cell does not stall or abort waiting for input.
! apt-get install -y tesseract-ocr libtesseract-dev
%pip install pytesseract

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
[K     |████████████████████████████████| 120 kB 15.4 MB/s 
[K     |████████████████████████████████| 6.6 MB 71.3 MB/s 
[?25h  Building wheel for transformers (PEP 517) ... [?25l[?25hdone
[K     |████████████████████████████████| 86 kB 2.9 MB/s 
[K     |████████████████████████████████| 43 kB 36 kB/s 
[K     |████████████████████████████████| 212 kB 66.8 MB/s 
[K     |████████████████████████████████| 115 kB 74.1 MB/s 
[K     |████████████████████████████████| 127 kB 53.0 MB/s 
[K     |████████████████████████████████| 112 kB 77.1 MB/s 
[?25h  Building wheel for datasets (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [None]:
from datasets import load_metric
from transformers import TrainingArguments, Trainer
from transformers import LayoutLMv3ForTokenClassification,AutoProcessor
from transformers.data.data_collator import default_data_collator
import torch
from datasets import load_from_disk


## Loading dataset from Drive

In [None]:
# Unpack the IOB dataset archive from Google Drive into /content/data.
# Google Drive must already be mounted at /content/drive for the copy to work.
IOB_DATA_PATH = "/content/drive/MyDrive/test_502.zip"
# Every `!` line runs in its own subshell, so `! cd ...` has no effect on the
# lines that follow it; %cd changes the kernel's working directory instead.
%cd /content
# -f and -p keep the cell re-runnable whether or not data/ already exists.
! rm -rf data
! mkdir -p data
! cp "$IOB_DATA_PATH" data/dataset.zip
! cd data && unzip -q dataset.zip && rm dataset.zip

## defining preprocessing params and running the script

In [None]:
# Preprocessing arguments forwarded to LayoutLMV3/preprocess.py.
DATA_OUTPUT_PATH = "/content/"  # passed as --output_path
TEST_SIZE = 0.33                # passed as --valid_size

In [None]:
# Fetch a fresh copy of the LayoutLMV3 helper scripts.
# `git clone` checks the repository out into ./LayoutLMV3 and refuses to clone
# into an existing non-empty directory, so that directory has to be removed
# for the cell to be re-runnable. The `layoutlmv3FineTuning` name the cell
# removed before is still cleaned up, in case such a folder exists.
! rm -rf layoutlmv3FineTuning LayoutLMV3
! git clone -b master https://github.com/12levoav/LayoutLMV3

In [None]:
# Run the repository's preprocessing script (presumably it writes the
# train_split / eval_split folders under --output_path — confirm in preprocess.py).
# The interpolated values are quoted, like $IOB_DATA_PATH in the cp command,
# so a value containing spaces is still passed as a single argument.
! python3 LayoutLMV3/preprocess.py --valid_size "$TEST_SIZE" --output_path "$DATA_OUTPUT_PATH"

100% 1/1 [00:12<00:00, 12.60s/ba]
Downloading preprocessor_config.json: 100% 275/275 [00:00<00:00, 347kB/s]
Downloading tokenizer_config.json: 100% 1.12k/1.12k [00:00<00:00, 1.56MB/s]
Downloading config.json: 100% 856/856 [00:00<00:00, 869kB/s]
Downloading vocab.json: 100% 878k/878k [00:00<00:00, 39.5MB/s]
Downloading merges.txt: 100% 446k/446k [00:00<00:00, 22.0MB/s]
100% 1/1 [00:14<00:00, 14.10s/ba]
100% 1/1 [00:08<00:00,  8.15s/ba]
Flattening the indices: 100% 1/1 [00:00<00:00,  1.68ba/s]
Flattening the indices: 100% 1/1 [00:00<00:00,  4.69ba/s]


In [None]:
# Read the two preprocessed splits back from disk.
# Plain string literals: the paths contain no placeholders to format.
train_dataset = load_from_disk("/content/train_split")
eval_dataset = load_from_disk("/content/eval_split")

In [None]:
# Label names come from the training split's "labels" feature; the two lookup
# tables map a label name to its index and back.
label_list = train_dataset.features["labels"].feature.names
num_labels = len(label_list)
label2id = {name: idx for idx, name in enumerate(label_list)}
id2label = {idx: name for idx, name in enumerate(label_list)}

## Defining metric

In [None]:
import numpy as np

# seqeval metric object used by compute_metrics.
metric = load_metric("seqeval")

# True: compute_metrics returns every seqeval entry, with nested per-entity
# dictionaries flattened. False: it returns only the four overall_* scores.
return_entity_level_metrics = False

def compute_metrics(p):
    """Score token-classification predictions with the seqeval metric.

    Reads the module-level ``metric``, ``label_list`` and
    ``return_entity_level_metrics``.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` holds per-class
            scores and is reduced with ``np.argmax(..., axis=2)``; ``labels``
            holds indices into ``label_list``, with ``-100`` marking the
            positions that are left out of the score.

    Returns:
        dict: When ``return_entity_level_metrics`` is false, the keys
        ``precision``, ``recall``, ``f1`` and ``accuracy`` taken from the
        metric's ``overall_*`` entries. Otherwise every entry of the metric
        result, with nested dictionaries flattened to ``"<key>_<subkey>"``.
    """
    logits, label_ids = p
    predicted_ids = np.argmax(logits, axis=2)

    # Drop the ignored positions (label -100) and map ids to label names.
    true_predictions = []
    true_labels = []
    for pred_row, label_row in zip(predicted_ids, label_ids):
        kept = [(pred, gold) for pred, gold in zip(pred_row, label_row) if gold != -100]
        true_predictions.append([label_list[pred] for pred, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    # zero_division has to be the integer 0: seqeval documents "warn", 0 or 1,
    # and the string '0' is none of them.
    # NOTE(review): seqeval appears to treat unrecognised values like 1, which
    # would score undefined precision/recall as 1.0 instead of 0 — confirm.
    results = metric.compute(
        predictions=true_predictions,
        references=true_labels,
        zero_division=0,
    )

    if not return_entity_level_metrics:
        return {
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"],
        }

    # Unpack nested dictionaries into flat "<key>_<subkey>" entries.
    final_results = {}
    for key, value in results.items():
        if isinstance(value, dict):
            for n, v in value.items():
                final_results[f"{key}_{n}"] = v
        else:
            final_results[key] = value
    return final_results

Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

## Loading model and preprocessor (also required for Hugging face trainer)

In [None]:
# One checkpoint name shared by the model and the processor.
MODEL_CHECKPOINT = "microsoft/layoutlmv3-large"

# Token-classification model, given this dataset's label <-> id tables.
model = LayoutLMv3ForTokenClassification.from_pretrained(
    MODEL_CHECKPOINT,
    id2label=id2label,
    label2id=label2id,
)

# The processor is created with its own OCR step switched off (apply_ocr=False).
processor = AutoProcessor.from_pretrained(MODEL_CHECKPOINT, apply_ocr=False)

Downloading pytorch_model.bin:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of LayoutLMv3ForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## let's train the model

In [None]:
# Training hyper-parameters passed to TrainingArguments.
LEARNING_RATE = 4e-5
NUM_TRAIN_EPOCHS = 40
PER_DEVICE_EVAL_BATCH_SIZE = 1
PER_DEVICE_TRAIN_BATCH_SIZE = 1

In [None]:
# Training configuration: logs once per epoch; evaluation and checkpoint
# saving during training are both switched off.
#
# load_best_model_at_end=True and metric_for_best_model="f1" were removed.
# With evaluation_strategy and save_strategy both "no", the Trainer never
# evaluates or saves a checkpoint while training, so there is no "best"
# checkpoint to reload; the two arguments had no effect and only suggested a
# model selection that does not happen. To really keep the best-F1 model, set
# both strategies to "epoch" and pass those two arguments again.
training_args = TrainingArguments(
    output_dir="test",
    num_train_epochs=NUM_TRAIN_EPOCHS,
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    logging_strategy="epoch",
    evaluation_strategy="no",
    save_strategy="no",
    save_total_limit=1,
)

In [None]:
# Wire the model, the two dataset splits and the metric function into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=processor,  # the processor is handed over through the tokenizer argument
    data_collator=default_data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the model with the arguments and datasets given to the Trainer.
trainer.train()

In [None]:
# Evaluate on the eval_dataset given to the Trainer; compute_metrics supplies the scores.
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 96
  Batch size = 1


{'epoch': 50.0,
 'eval_accuracy': 0.9467279435562173,
 'eval_f1': 0.7112092766427388,
 'eval_loss': 0.521497368812561,
 'eval_precision': 0.6531440162271805,
 'eval_recall': 0.7806060606060606,
 'eval_runtime': 11.2146,
 'eval_samples_per_second': 8.56,
 'eval_steps_per_second': 8.56}

### Save the model for upcoming fine-tuning/inference

In [None]:
# Serialise the whole `model` object (not just its weights) to a .pth file on Google Drive.
# NOTE(review): the inference section reads ".../layout_models/layoutlmv3.pth", not this
# "layoutlmv3_720.pth" file — confirm which model inference is meant to use.
torch.save(model,'/content/drive/MyDrive/Sosias/layout_models/layoutlmv3_720.pth')

NameError: ignored

## Inference

In [None]:
import os
import warnings
from PIL import Image
# Suppress all Python warnings from this point on.
warnings.filterwarnings('ignore')

In [None]:
# Move every image file found directly in /content into /content/images.
import shutil

SOURCE_DIR = '/content'
IMAGES_DIR = '/content/images'

os.makedirs(IMAGES_DIR, exist_ok=True)
for entry in os.listdir(SOURCE_DIR):
  src = os.path.join(SOURCE_DIR, entry)
  # Sub-folders (including the images folder itself) are never images.
  if not os.path.isfile(src):
    continue
  try:
    # A file counts as an image when PIL can open it. The `with` block closes
    # the file handle again before the file is moved.
    with Image.open(src):
      pass
  except (OSError, Image.DecompressionBombError):
    # Not an image PIL can open (UnidentifiedImageError is an OSError), or one
    # PIL refuses as too large: leave the file where it is.
    continue
  # shutil.move instead of a shell `mv`, so file names containing quotes,
  # `$` or other shell metacharacters are handled safely.
  shutil.move(src, os.path.join(IMAGES_DIR, entry))

In [None]:
# Inference inputs.
imag_path = "/content/images"  # folder with the images to run inference on
model_path = "/content/drive/MyDrive/Sosias/layout_models/layoutlmv3.pth"  # LayoutLMv3 model file

In [None]:
# If the inference model is a pickled .pth file, re-export it in the
# save_pretrained() directory format and point model_path at that directory.
if model_path.endswith('.pth'):
  # SECURITY: torch.load unpickles arbitrary Python objects — only load model
  # files from a source you trust.
  # map_location='cpu' lets the conversion run without the device the model was
  # saved from; the object is only re-serialised here, never executed.
  layoutlmv3_model = torch.load(model_path, map_location='cpu')
  model_path = '/content/lilt_model'
  layoutlmv3_model.save_pretrained(model_path)

In [None]:
# Run the repository's inference script on the image folder. The script path
# must be one word: with spaces inside it, python3 is asked to run a file
# named "La" instead of LayoutLMV3/run_inference.py.
! python3 LayoutLMV3/run_inference.py --model_path "$model_path" --images_path "$imag_path"