[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DiTo97/binarization-segformer/blob/main/fine-tuning.ipynb)

# Fine-tuning Segformer for Document Image Binarization

A notebook by F. Minutoli ([@DiTo97](https://github.com/DiTo97)) that fine-tunes a Segformer model for document image binarization

In [1]:
requirements = " ".join([
    "absl-py==1.4.0",
    "accelerate==0.18.0",
    "albumentations==1.3.0",
    "datasets==2.11.0",
    "deepspeed==0.9.5",
    "evaluate==0.4.0",
    "huggingface-hub==0.13.4",
    "tensorboardX==2.6.1",
    "transformers==4.27.4"
])

In [None]:
%%bash -s {requirements}
set -e


requirements=$@

python -m pip install --upgrade pip
python -m pip install $requirements

In [1]:
from huggingface_hub import notebook_login

In [2]:
notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /home/fminutoli/.cache/huggingface/token
Login successful


## 1. Dataset

In [3]:
%%bash
set -e


if [ ! -d SauvolaNet ]; then
    git clone https://github.com/Leedeng/SauvolaNet.git
fi

In [4]:
import sys
import typing
from typing import Any

import datasets
import numpy as np
from PIL import Image
from tqdm.auto import tqdm

In [5]:
def normalize(bitmap: Image.Image) -> Image.Image:
    bitmap = bitmap.convert("L")
    bitmap = np.array(bitmap).astype(np.uint8)
    condition = bitmap < np.max(bitmap)
    bitmap = np.where(condition, 1, 0).astype(np.bool_)
    bitmap = Image.fromarray(bitmap)

    return bitmap

In [6]:
def prepare_examples(
    batch: typing.Dict[str, typing.List[Any]]
) -> typing.Dict[str, typing.List[Any]]:
    """It prepares a batch of examples for semantic segmentation"""
    sources = batch["source"]
    targets = batch["target"]

    batch = {
        "labelmap": [normalize(Image.open(tgt)) for tgt in targets],
        "pixelmap": [Image.open(src) for src in sources]
    }

    return batch

In [7]:
sauvolanet_src = "SauvolaNet/SauvolaDocBin"
sauvolanet_dataset = "SauvolaNet/Dataset"

sys.path.insert(0, sauvolanet_src)
from dataUtils import collect_binarization_by_dataset

collection = collect_binarization_by_dataset(sauvolanet_dataset)

sys.path.remove(sauvolanet_src)

del sauvolanet_src
del sauvolanet_dataset
del collect_binarization_by_dataset

features = datasets.Features({
    "ensemble": datasets.Value("string"),
    "source": datasets.Value("string"),
    "target": datasets.Value("string"),
})

for name, examples in tqdm(collection.items(), desc="Loading datasets"):
    sources, targets = zip(*examples)

    sources = sorted(sources)
    targets = sorted(targets)

    dataset = {"source": sources, "target": targets, "ensemble": [name] * len(sources)}
    dataset = datasets.Dataset.from_dict(dataset, features)

    collection[name] = dataset

collection = datasets.concatenate_datasets([
    dataset for _, dataset in collection.items()
])

features = datasets.Features({
    "ensemble": datasets.Value("string"),
    "labelmap": datasets.Image(),
    "pixelmap": datasets.Image(),
})

collection = collection.map(
    prepare_examples, 
    batched=True,
    features=features, 
    remove_columns=["source", "target"]
)

collection = collection.class_encode_column("ensemble")

del features

collection = collection.train_test_split(
    seed=10,
    shuffle=True,
    stratify_by_column="ensemble",
    train_size=0.75
)

train_dataset = collection["train"]
test_dataset  = collection[ "test"]

del collection

Loading datasets:   0%|          | 0/14 [00:00<?, ?it/s]

Map:   0%|          | 0/207 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/207 [00:00<?, ? examples/s]

In [8]:
labels = ["background", "text"]
num_labels = len(labels)

id2label = {key: val for key, val in enumerate(labels)}
label2id = {val: key for key, val in enumerate(labels)}

del labels

## 2. Augmentation

In [9]:
import albumentations
import cv2 as opencv
import transformers
from transformers import set_seed

In [10]:
set_seed(10)

In [11]:
base_model_name = "nvidia/segformer-b3-finetuned-cityscapes-1024-1024"
base_model_size = {"height": 1024, "width": 1024}

processor = transformers.SegformerImageProcessor.from_pretrained(base_model_name)
processor.size.update(base_model_size)



In [12]:
FLAGS = {
    # The general kwargs
    "border_mode": opencv.BORDER_CONSTANT,
    "fill_value": 255,
    "mask_fill_value": 0,
    "proba": 0.1,

    # The color kwargs
    "brightness": 0.25, 
    "contrast": 0.25, 
    "saturation": 0.25, 
    "hue": 0.1,
    
    # The crop kwargs
    "min_height": processor.size["height"],
    "min_width" : processor.size[ "width"],
    
    # The geometric kwargs
    "rotate": (-90, 90),
    "translate_percent": 0.1
}

image_transform = albumentations.Compose([
    albumentations.ColorJitter(
        brightness=FLAGS["brightness"], 
        contrast=FLAGS["contrast"], 
        saturation=FLAGS["saturation"], 
        hue=FLAGS["hue"]
    )
])

image_and_mask_transform = albumentations.Compose([
    albumentations.Flip(p=FLAGS["proba"]),
    albumentations.Affine(
        p=FLAGS["proba"],
        cval=FLAGS["fill_value"],
        cval_mask=FLAGS["mask_fill_value"],
        mode=FLAGS["border_mode"],
        rotate=FLAGS["rotate"], 
        translate_percent=FLAGS["translate_percent"],
    ),
    albumentations.PadIfNeeded(
        border_mode=FLAGS["border_mode"],
        mask_value=FLAGS["mask_fill_value"],
        min_height=FLAGS["min_height"], 
        min_width=FLAGS["min_width"], 
        value=FLAGS["fill_value"],
    ),
    albumentations.RandomCrop(
        p=FLAGS["proba"],
        height=FLAGS["min_height"], 
        width=FLAGS["min_width"],
    )
])

def train_transform(
    batch: typing.Dict[str, typing.List[Any]]
) -> transformers.BatchFeature:
    images = [image.convert("RGB") for image in batch["pixelmap"]]
    images = [np.array(image) for image in images]
    images = [image_transform(image=image)["image"] for image in images]

    labels = [np.array(label).astype(np.uint8) for label in batch["labelmap"]]

    examples = [
        image_and_mask_transform(image=image, mask=mask) 
        for image, mask in zip(images, labels)
    ]

    images = [example["image"] for example in examples]
    labels = [example[ "mask"] for example in examples]

    encoding = processor(images, labels)
    return encoding

def  test_transform(
    batch: typing.Dict[str, typing.List[Any]]
) -> transformers.BatchFeature:
    images = [image.convert("RGB") for image in batch["pixelmap"]]
    labels = [label for label in batch["labelmap"]]

    encoding = processor(images, labels)
    return encoding

In [13]:
train_dataset.set_transform(train_transform)
test_dataset.set_transform(test_transform)

## 3. Training

In [14]:
import os

In [15]:
kwargs = {
    "master-addr": "localhost",
    "master-port": "9994",  # modify if RuntimeError: Address already in use
    "rank": "0",
    "local-rank": "0",
    "world-size": "1",
}

In [16]:
for key, val in kwargs.items():
    key = key.replace("-", "_")
    key = key.upper()

    os.environ[key] = val

In [17]:
# devices = ["0", "1"]

In [18]:
# def _setup_env_variables() -> None:
#     kwargs = {
#         "cuda-visible-devices": ",".join(devices),
#         "tokenizers-parallelism": "false"
#     }

#     for key, val in kwargs.items():
#         key = key.replace("-", "_")
#         key = key.upper()

#         os.environ[key] = val

In [19]:
# _setup_env_variables()

In [20]:
import torch
import torch.nn.functional as F
# from accelerate import notebook_launcher
from transformers.trainer_utils import get_last_checkpoint

import metrics

In [21]:
datasets.logging.set_verbosity_error()
transformers.logging.set_verbosity_error()

In [22]:
metric = metrics.DIBCO()
logger = transformers.logging.get_logger()

In [23]:
def compute_metrics(outputs: transformers.EvalPrediction) -> typing.Dict[str, float]:
    with torch.no_grad():
        logits, references = outputs
               
        print(logits.min(), logits.max(), logits.dtype, logits.shape, references.min(), references.max(), references.dtype, references.shape)

        logits = torch.from_numpy(logits)
        logits = torch.float32(logits)

        # It upscales the logits to the size of the label
        preds = F.interpolate(
            logits,
            size=references.shape[-2:],
            mode="bilinear",
            align_corners=False,
        ).argmax(dim=1)

        preds = preds.detach().cpu().numpy()

        # references = references.astype(np.float(32))
        # preds = preds.astype(np.float(32))

        batch_metrics = metric(references, preds)
        return batch_metrics

In [24]:
def train_segformer() -> None:
    FLAGS = {
        "accumulation_steps": 16,
        "base_model_name": base_model_name,
        "batch_size": 1,
        "model_name": "binarization-segformer-b3"
    }
    # FLAGS = {
    #     "accumulation_steps": 4 / len(devices),
    #     "base_model_name": base_model_name,
    #     "batch_size": 1,
    #     "fp16": torch.cuda.is_available(),
    #     "learning_rate": 5e-5,
    #     "metric": "dibco",
    #     "model_name": "binarization-segformer-b3",
    #     "num_epochs": 50,
    #     "optimizer": "adamw_torch",
    #     "scheduler_type": "cosine"
    # }

    model_kwargs = {
        "id2label": id2label, 
        "label2id": label2id,
        "ignore_mismatched_sizes": True,
    }

    model = transformers.SegformerForSemanticSegmentation.from_pretrained(FLAGS["base_model_name"], **model_kwargs)

    training_args = transformers.TrainingArguments(
        # auto_find_batch_size=True,
        deepspeed="ds-config-zero2.json",
        # eval_accumulation_steps=FLAGS["accumulation_steps"],
        eval_steps=10,
        evaluation_strategy="steps",
        fp16=torch.cuda.is_available(),
        full_determinism=False,
        gradient_accumulation_steps=FLAGS["accumulation_steps"],
        # gradient_checkpointing=True,
        hub_model_id=FLAGS["model_name"],
        hub_strategy="end",
        # learning_rate=FLAGS["learning_rate"],
        load_best_model_at_end=True,
        logging_steps=10,
        # lr_scheduler_type=FLAGS["scheduler_type"],
        num_train_epochs=50,
        # optim=FLAGS["optimizer"],
        output_dir=FLAGS["model_name"],
        per_device_eval_batch_size=FLAGS["batch_size"],
        per_device_train_batch_size=FLAGS["batch_size"],
        push_to_hub=True,
        remove_unused_columns=False,  # https://discuss.huggingface.co/t/divide-by-zero-error-when-following-ch7-tutorial/18393/6
        report_to="tensorboard",
        save_steps=10,
        save_strategy="steps",
        save_total_limit=3,
        seed=10,
        # warmup_steps=50,
    )
    
    callbacks = [
        transformers.EarlyStoppingCallback(early_stopping_patience=5)
    ]

    trainer = transformers.Trainer(
        args=training_args,
        callbacks=callbacks,
        compute_metrics=compute_metrics,
        eval_dataset=test_dataset,
        model=model,    
        train_dataset=train_dataset,
    )

    try:
        checkpoint = get_last_checkpoint(FLAGS["model_name"])
    except FileNotFoundError:
        logger.debug("No checkpoint")
        checkpoint = None

    resume_from_checkpoint = checkpoint is not None

    trainer.train(resume_from_checkpoint=resume_from_checkpoint)

    trainer.save_model()  # It saves the tokenizer too for easy upload
    trainer.save_state()   

    kwargs = {
        "finetuned_from": FLAGS["base_model_name"],
        "tags": [
            "document-image-binarization"
            "image-segmentation"
        ]
    }

    processor.push_to_hub(FLAGS["model_name"])
    trainer.push_to_hub(**kwargs)

In [25]:
train_segformer()

[2023-07-10 12:10:07,948] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2023-07-10 12:10:14,066] [INFO] [comm.py:594:init_distributed] cdb=None
[2023-07-10 12:10:14,067] [INFO] [comm.py:625:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl


/home/fminutoli/Personal/binarization-segformer/binarization-segformer-b3 is already a clone of https://huggingface.co/DiTo97/binarization-segformer-b3. Make sure you pull the latest changes with `repo.git_pull()`.


Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination
Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination


Using /home/fminutoli/.cache/torch_extensions/py37_cu117 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /home/fminutoli/.cache/torch_extensions/py37_cu117/cpu_adam/build.ninja...
Building extension module cpu_adam...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)


ninja: no work to do.
Time to load cpu_adam op: 2.6278932094573975 seconds


Loading extension module cpu_adam...


Rank: 0 partition count [1] and sizes[(47224002, False)] 
{'loss': 0.0, 'learning_rate': 5e-05, 'epoch': 1.03}
-1.7705 0.945 float16 (52, 2, 256, 256) 0 1 int64 (52, 1024, 1024)


TypeError: 'torch.dtype' object is not callable

In [None]:

# # metric = evaluate.load(FLAGS["metric"])






#         batch_size, height, width = logits.shape

#         # npixel = height*width

#         # fmeasures = []
#         # pfmeasures = []
#         # psnrs = []
#         # drds = []

#         # for idx in range(batch_size):
#         #     im = predictions[idx]
#         #     im_gt = labels[idx]

#         #     fmeasure = Fmeasure(im, im_gt)
#         #     psnr = Psnr(im, im_gt)
#         #     pfmeasure = Pfmeasure(im, im_gt)
#         #     drd = DRD(im, im_gt)

#         #     fmeasures.append(fmeasure)
#         #     pfmeasures.append(pfmeasure)
#         #     psnrs.append(psnr)
#         #     drds.append(drd)

#         # batch_fmeasure = np.mean(fmeasures)
#         # batch_pfmeasure = np.mean(pfmeasures)
#         # batch_psnr = np.mean(psnrs)
#         # batch_drd = np.mean(drds)

#         # metrics = {
#         #     "fmeasure": batch_fmeasure,
#         #     "pfmeasure": batch_pfmeasure,
#         #     "psnr": batch_psnr,
#         #     "drd": batch_drd
#         # }

#         # return metrics

#         # # FIXME: For more information, see
#         # # https://github.com/huggingface/evaluate/pull/328#issuecomment-1286866576
#         # metrics = metric._compute(
#         #         predictions=predictions,
#         #         references=labels,
#         #         num_labels=num_labels,
#         #         ignore_index=0,  # The background info is ignored
#         #         reduce_labels=processor.do_reduce_labels,
#         #     )
        
#         # # It adds per-category metrics as separate key-val pairs
#         # per_category_accuracy = metrics.pop("per_category_accuracy").tolist()
#         # per_category_iou = metrics.pop("per_category_iou").tolist()

#         # metrics.update({f"accuracy_{id2label[key]}": val for key, val in enumerate(per_category_accuracy)})
#         # metrics.update({f"iou_{id2label[key]}": val for key, val in enumerate(per_category_iou)})
        
#         # return metrics




# # decay_parameters = get_parameter_names(model, [nn.LayerNorm])
# # decay_parameters = [name for name in decay_parameters if "bias" not in name]

# # c = [
# #     {
# #         "params": [
# #              param for name, param in model.named_parameters() 
# #              if name in decay_parameters
# #         ],
# #         "weight_decay": training_args.weight_decay,
# #     },
# #     {
# #         "params": [
# #              param for name, param in model.named_parameters() 
# #              if name in decay_parameters
# #         ],
# #         "weight_decay": 0.0,
# #     },
# # ]

# # optim_kwargs = {
# #     "betas": (training_args.adam_beta1, training_args.adam_beta2),
# #     "eps": training_args.adam_epsilon,
# # }
# # optim_kwargs["lr"] = training_args.learning_rate

# # adam_8bit_optim = bitsandbytes.optim.Adam8bit(
# #     optim_kwargs,
# #     betas=(training_args.adam_beta1, training_args.adam_beta2),
# #     eps=training_args.adam_epsilon,
# #     lr=training_args.learning_rate,
# # )

# callbacks = [
#     transformers.EarlyStoppingCallback(early_stopping_patience=5)
# ]

# trainer = transformers.Trainer(
#     args=training_args,
#     callbacks=callbacks,
#     compute_metrics=compute_metrics,
#     eval_dataset=test_dataset,
#     model=model,    
#     train_dataset=train_dataset,
#     # optimizers=(adam_8bit_optim, None)
# )

# try:
#     checkpoint = get_last_checkpoint(FLAGS["model_name"])
# except FileNotFoundError:
#     logger.debug("No checkpoint")
#     checkpoint = None

# resume_from_checkpoint = checkpoint is not None

# trainer.train(resume_from_checkpoint=resume_from_checkpoint)

# trainer.save_model()  # It saves the tokenizer too for easy upload
# trainer.save_state()   

# kwargs = {
#     "finetuned_from": FLAGS["base_model_name"],
#     "tags": [
#         "document-image-binarization"
#         "image-segmentation"
#     ]
# }

# processor.push_to_hub(FLAGS["model_name"])
# trainer.push_to_hub(**kwargs)

## 4. Inference

For a complete example, see T. Cornille's official Segformer [blog post](https://huggingface.co/blog/fine-tune-segformer)